diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1136,14 +1136,19 @@ /// Returns sum of the base pointer and offset. SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, const SDLoc &DL); - SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, - SDValue Mask, SDValue Src0, EVT MemVT, - MachineMemOperand *MMO, ISD::LoadExtType, - bool IsExpanding = false); + SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, + SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, + MachineMemOperand *MMO, ISD::MemIndexedMode AM, + ISD::LoadExtType, bool IsExpanding = false); + SDValue getIndexedMaskedLoad(SDValue OrigLoad, const SDLoc &dl, SDValue Base, + SDValue Offset, ISD::MemIndexedMode AM); SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, - SDValue Ptr, SDValue Mask, EVT MemVT, - MachineMemOperand *MMO, bool IsTruncating = false, - bool IsCompressing = false); + SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, + MachineMemOperand *MMO, ISD::MemIndexedMode AM, + bool IsTruncating = false, bool IsCompressing = false); + SDValue getIndexedMaskedStore(SDValue OrigStore, const SDLoc &dl, + SDValue Base, SDValue Offset, + ISD::MemIndexedMode AM); SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType); diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -553,6 +553,7 @@ class LSBaseSDNodeBitfields { friend class LSBaseSDNode; + friend class MaskedLoadStoreSDNode; friend class MaskedGatherScatterSDNode; uint16_t : NumMemSDNodeBits; @@ -560,6 +561,7 @@ // This storage is shared between disparate class hierarchies to hold an // enumeration specific to the class hierarchy in use. // LSBaseSDNode => enum ISD::MemIndexedMode + // MaskedLoadStoreBaseSDNode => enum ISD::MemIndexedMode // MaskedGatherScatterSDNode => enum ISD::MemIndexType uint16_t AddressingMode : 3; }; @@ -2273,19 +2275,38 @@ friend class SelectionDAG; MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, - const DebugLoc &dl, SDVTList VTs, EVT MemVT, + const DebugLoc &dl, SDVTList VTs, + ISD::MemIndexedMode AM, EVT MemVT, MachineMemOperand *MMO) - : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {} + : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { + LSBaseSDNodeBits.AddressingMode = AM; + assert(getAddressingMode() == AM && "Value truncated"); + } - // MaskedLoadSDNode (Chain, ptr, mask, passthru) - // MaskedStoreSDNode (Chain, data, ptr, mask) + // MaskedLoadSDNode (Chain, ptr, offset, mask, passthru) + // MaskedStoreSDNode (Chain, data, ptr, offset, mask) // Mask is a vector of i1 elements const SDValue &getBasePtr() const { return getOperand(getOpcode() == ISD::MLOAD ? 1 : 2); } - const SDValue &getMask() const { + const SDValue &getOffset() const { return getOperand(getOpcode() == ISD::MLOAD ? 2 : 3); } + const SDValue &getMask() const { + return getOperand(getOpcode() == ISD::MLOAD ? 3 : 4); + } + + /// Return the addressing mode for this load or store: + /// unindexed, pre-inc, pre-dec, post-inc, or post-dec. + ISD::MemIndexedMode getAddressingMode() const { + return static_cast(LSBaseSDNodeBits.AddressingMode); + } + + /// Return true if this is a pre/post inc/dec load/store. 
+ bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; } + + /// Return true if this is NOT a pre/post inc/dec load/store. + bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MLOAD || @@ -2299,9 +2320,9 @@ friend class SelectionDAG; MaskedLoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - ISD::LoadExtType ETy, bool IsExpanding, EVT MemVT, - MachineMemOperand *MMO) - : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, VTs, MemVT, MMO) { + ISD::MemIndexedMode AM, ISD::LoadExtType ETy, + bool IsExpanding, EVT MemVT, MachineMemOperand *MMO) + : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, VTs, AM, MemVT, MMO) { LoadSDNodeBits.ExtTy = ETy; LoadSDNodeBits.IsExpanding = IsExpanding; } @@ -2311,8 +2332,9 @@ } const SDValue &getBasePtr() const { return getOperand(1); } - const SDValue &getMask() const { return getOperand(2); } - const SDValue &getPassThru() const { return getOperand(3); } + const SDValue &getOffset() const { return getOperand(2); } + const SDValue &getMask() const { return getOperand(3); } + const SDValue &getPassThru() const { return getOperand(4); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MLOAD; @@ -2327,9 +2349,9 @@ friend class SelectionDAG; MaskedStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - bool isTrunc, bool isCompressing, EVT MemVT, - MachineMemOperand *MMO) - : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, MemVT, MMO) { + ISD::MemIndexedMode AM, bool isTrunc, bool isCompressing, + EVT MemVT, MachineMemOperand *MMO) + : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, AM, MemVT, MMO) { StoreSDNodeBits.IsTruncating = isTrunc; StoreSDNodeBits.IsCompressing = isCompressing; } @@ -2345,9 +2367,10 @@ /// memory at base_addr. bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; } - const SDValue &getValue() const { return getOperand(1); } + const SDValue &getValue() const { return getOperand(1); } const SDValue &getBasePtr() const { return getOperand(2); } - const SDValue &getMask() const { return getOperand(3); } + const SDValue &getOffset() const { return getOperand(3); } + const SDValue &getMask() const { return getOperand(4); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MSTORE; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1110,12 +1110,8 @@ /// Return how the indexed load should be treated: either it is legal, needs /// to be promoted to a larger size, needs to be expanded to some other code /// sequence, or the target has a custom expander for it. - LegalizeAction - getIndexedLoadAction(unsigned IdxMode, MVT VT) const { - assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() && - "Table isn't big enough!"); - unsigned Ty = (unsigned)VT.SimpleTy; - return (LegalizeAction)((IndexedModeActions[Ty][IdxMode] & 0xf0) >> 4); + LegalizeAction getIndexedLoadAction(unsigned IdxMode, MVT VT) const { + return getIndexedModeAction(IdxMode, VT, IMAB_Load); } /// Return true if the specified indexed load is legal on this target. @@ -1128,12 +1124,8 @@ /// Return how the indexed store should be treated: either it is legal, needs /// to be promoted to a larger size, needs to be expanded to some other code /// sequence, or the target has a custom expander for it. 
- LegalizeAction - getIndexedStoreAction(unsigned IdxMode, MVT VT) const { - assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() && - "Table isn't big enough!"); - unsigned Ty = (unsigned)VT.SimpleTy; - return (LegalizeAction)(IndexedModeActions[Ty][IdxMode] & 0x0f); + LegalizeAction getIndexedStoreAction(unsigned IdxMode, MVT VT) const { + return getIndexedModeAction(IdxMode, VT, IMAB_Store); } /// Return true if the specified indexed load is legal on this target. @@ -1143,6 +1135,34 @@ getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Custom); } + /// Return how the indexed load should be treated: either it is legal, needs + /// to be promoted to a larger size, needs to be expanded to some other code + /// sequence, or the target has a custom expander for it. + LegalizeAction getIndexedMaskedLoadAction(unsigned IdxMode, MVT VT) const { + return getIndexedModeAction(IdxMode, VT, IMAB_MaskedLoad); + } + + /// Return true if the specified indexed load is legal on this target. + bool isIndexedMaskedLoadLegal(unsigned IdxMode, EVT VT) const { + return VT.isSimple() && + (getIndexedMaskedLoadAction(IdxMode, VT.getSimpleVT()) == Legal || + getIndexedMaskedLoadAction(IdxMode, VT.getSimpleVT()) == Custom); + } + + /// Return how the indexed store should be treated: either it is legal, needs + /// to be promoted to a larger size, needs to be expanded to some other code + /// sequence, or the target has a custom expander for it. + LegalizeAction getIndexedMaskedStoreAction(unsigned IdxMode, MVT VT) const { + return getIndexedModeAction(IdxMode, VT, IMAB_MaskedStore); + } + + /// Return true if the specified indexed load is legal on this target. + bool isIndexedMaskedStoreLegal(unsigned IdxMode, EVT VT) const { + return VT.isSimple() && + (getIndexedMaskedStoreAction(IdxMode, VT.getSimpleVT()) == Legal || + getIndexedMaskedStoreAction(IdxMode, VT.getSimpleVT()) == Custom); + } + /// Return how the condition code should be treated: either it is legal, needs /// to be expanded to some other code sequence, or the target has a custom /// expander for it. @@ -2030,13 +2050,8 @@ /// /// NOTE: All indexed mode loads are initialized to Expand in /// TargetLowering.cpp - void setIndexedLoadAction(unsigned IdxMode, MVT VT, - LegalizeAction Action) { - assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE && - (unsigned)Action < 0xf && "Table isn't big enough!"); - // Load action are kept in the upper half. - IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0xf0; - IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action) <<4; + void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action) { + setIndexedModeAction(IdxMode, VT, IMAB_Load, Action); } /// Indicate that the specified indexed store does or does not work with the @@ -2044,13 +2059,28 @@ /// /// NOTE: All indexed mode stores are initialized to Expand in /// TargetLowering.cpp - void setIndexedStoreAction(unsigned IdxMode, MVT VT, - LegalizeAction Action) { - assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE && - (unsigned)Action < 0xf && "Table isn't big enough!"); - // Store action are kept in the lower half. 
- IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0x0f; - IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action); + void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action) { + setIndexedModeAction(IdxMode, VT, IMAB_Store, Action); + } + + /// Indicate that the specified indexed masked load does or does not work with + /// the specified type and indicate what to do about it. + /// + /// NOTE: All indexed mode masked loads are initialized to Expand in + /// TargetLowering.cpp + void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, + LegalizeAction Action) { + setIndexedModeAction(IdxMode, VT, IMAB_MaskedLoad, Action); + } + + /// Indicate that the specified indexed masked store does or does not work + /// with the specified type and indicate what to do about it. + /// + /// NOTE: All indexed mode masked stores are initialized to Expand in + /// TargetLowering.cpp + void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, + LegalizeAction Action) { + setIndexedModeAction(IdxMode, VT, IMAB_MaskedStore, Action); } /// Indicate that the specified condition code is or isn't supported on the @@ -2763,13 +2793,13 @@ /// truncating store of a specific value type and truncating type is legal. LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE]; - /// For each indexed mode and each value type, keep a pair of LegalizeAction + /// For each indexed mode and each value type, keep a quad of LegalizeAction /// that indicates how instruction selection should deal with the load / - /// store. + /// store / maskedload / maskedstore. /// /// The first dimension is the value_type for the reference. The second /// dimension represents the various modes for load store. - uint8_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE]; + uint16_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE]; /// For each condition code (ISD::CondCode) keep a LegalizeAction that /// indicates how instruction selection should deal with the condition code. @@ -2812,6 +2842,32 @@ /// Set default libcall names and calling conventions. void InitLibcalls(const Triple &TT); + /// The bits of IndexedModeActions used to store the legalisation actions + /// We store the data as | ML | MS | L | S | each taking 4 bits. + enum IndexedModeActionsBits { + IMAB_Store = 0, + IMAB_Load = 4, + IMAB_MaskedStore = 8, + IMAB_MaskedLoad = 12 + }; + + void setIndexedModeAction(unsigned IdxMode, MVT VT, unsigned Shift, + LegalizeAction Action) { + assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE && + (unsigned)Action < 0xf && "Table isn't big enough!"); + unsigned Ty = (unsigned)VT.SimpleTy; + IndexedModeActions[Ty][IdxMode] &= ~(0xf << Shift); + IndexedModeActions[Ty][IdxMode] |= ((uint16_t)Action) << Shift; + } + + LegalizeAction getIndexedModeAction(unsigned IdxMode, MVT VT, + unsigned Shift) const { + assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() && + "Table isn't big enough!"); + unsigned Ty = (unsigned)VT.SimpleTy; + return (LegalizeAction)((IndexedModeActions[Ty][IdxMode] >> Shift) & 0xf); + } + protected: /// Return true if the extension represented by \p I is free. 
/// \pre \p I is a sign, zero, or fp extension and diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -224,13 +224,13 @@ SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3> ]>; -def SDTMaskedStore: SDTypeProfile<0, 3, [ // masked store - SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2> +def SDTMaskedStore: SDTypeProfile<0, 4, [ // masked store + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisSameNumEltsAs<0, 3> ]>; -def SDTMaskedLoad: SDTypeProfile<1, 3, [ // masked load - SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>, - SDTCisSameNumEltsAs<0, 2> +def SDTMaskedLoad: SDTypeProfile<1, 4, [ // masked load + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisSameAs<0, 4>, + SDTCisSameNumEltsAs<0, 3> ]>; def SDTVecShuffle : SDTypeProfile<1, 2, [ diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8724,6 +8724,10 @@ if (ISD::isBuildVectorAllZeros(Mask.getNode())) return Chain; + // Try transforming N to an indexed store. + if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) + return SDValue(N, 0); + return SDValue(); } @@ -8748,6 +8752,10 @@ if (ISD::isBuildVectorAllZeros(Mask.getNode())) return CombineTo(N, MLD->getPassThru(), MLD->getChain()); + // Try transforming N to an indexed load. + if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) + return SDValue(N, 0); + return SDValue(); } @@ -9506,11 +9514,10 @@ SDLoc dl(Ld); SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru()); - SDValue NewLoad = DAG.getMaskedLoad(VT, dl, Ld->getChain(), - Ld->getBasePtr(), Ld->getMask(), - PassThru, Ld->getMemoryVT(), - Ld->getMemOperand(), ExtLoadType, - Ld->isExpandingLoad()); + SDValue NewLoad = DAG.getMaskedLoad( + VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(), + PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(), + ExtLoadType, Ld->isExpandingLoad()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1)); return NewLoad; } @@ -13612,12 +13619,22 @@ EVT VT; unsigned AS; - if (LoadSDNode *LD = dyn_cast(Use)) { + if (LoadSDNode *LD = dyn_cast(Use)) { if (LD->isIndexed() || LD->getBasePtr().getNode() != N) return false; VT = LD->getMemoryVT(); AS = LD->getAddressSpace(); - } else if (StoreSDNode *ST = dyn_cast(Use)) { + } else if (StoreSDNode *ST = dyn_cast(Use)) { + if (ST->isIndexed() || ST->getBasePtr().getNode() != N) + return false; + VT = ST->getMemoryVT(); + AS = ST->getAddressSpace(); + } else if (MaskedLoadSDNode *LD = dyn_cast(Use)) { + if (LD->isIndexed() || LD->getBasePtr().getNode() != N) + return false; + VT = LD->getMemoryVT(); + AS = LD->getAddressSpace(); + } else if (MaskedStoreSDNode *ST = dyn_cast(Use)) { if (ST->isIndexed() || ST->getBasePtr().getNode() != N) return false; VT = ST->getMemoryVT(); @@ -13651,38 +13668,64 @@ VT.getTypeForEVT(*DAG.getContext()), AS); } -/// Try turning a load/store into a pre-indexed load/store when the base -/// pointer is an add or subtract and it has other uses besides the load/store. 
-/// After the transformation, the new indexed load/store has effectively folded -/// the add/subtract in and all of its other uses are redirected to the -/// new load/store. -bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { - if (Level < AfterLegalizeDAG) - return false; - - bool isLoad = true; - SDValue Ptr; - EVT VT; - if (LoadSDNode *LD = dyn_cast(N)) { +static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec, + bool &IsLoad, bool &IsMasked, SDValue &Ptr, + const TargetLowering &TLI) { + if (LoadSDNode *LD = dyn_cast(N)) { if (LD->isIndexed()) return false; - VT = LD->getMemoryVT(); - if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) && - !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT)) + EVT VT = LD->getMemoryVT(); + if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT)) return false; Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { + } else if (StoreSDNode *ST = dyn_cast(N)) { if (ST->isIndexed()) return false; - VT = ST->getMemoryVT(); - if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) && - !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT)) + EVT VT = ST->getMemoryVT(); + if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT)) return false; Ptr = ST->getBasePtr(); - isLoad = false; + IsLoad = false; + } else if (MaskedLoadSDNode *LD = dyn_cast(N)) { + if (LD->isIndexed()) + return false; + EVT VT = LD->getMemoryVT(); + if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) && + !TLI.isIndexedMaskedLoadLegal(Dec, VT)) + return false; + Ptr = LD->getBasePtr(); + IsMasked = true; + } else if (MaskedStoreSDNode *ST = dyn_cast(N)) { + if (ST->isIndexed()) + return false; + EVT VT = ST->getMemoryVT(); + if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) && + !TLI.isIndexedMaskedStoreLegal(Dec, VT)) + return false; + Ptr = ST->getBasePtr(); + IsLoad = false; + IsMasked = true; } else { return false; } + return true; +} + +/// Try turning a load/store into a pre-indexed load/store when the base +/// pointer is an add or subtract and it has other uses besides the load/store. +/// After the transformation, the new indexed load/store has effectively folded +/// the add/subtract in and all of its other uses are redirected to the +/// new load/store. +bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { + if (Level < AfterLegalizeDAG) + return false; + + bool IsLoad = true; + bool IsMasked = false; + SDValue Ptr; + if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked, + Ptr, TLI)) + return false; // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail // out. There is no reason to make this a preinc/predec. @@ -13724,8 +13767,9 @@ return false; // Check #2. - if (!isLoad) { - SDValue Val = cast(N)->getValue(); + if (!IsLoad) { + SDValue Val = IsMasked ? cast(N)->getValue() + : cast(N)->getValue(); // Would require a copy. 
if (Val == BasePtr) @@ -13801,18 +13845,26 @@ return false; SDValue Result; - if (isLoad) - Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), - BasePtr, Offset, AM); - else - Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N), - BasePtr, Offset, AM); + if (!IsMasked) { + if (IsLoad) + Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM); + else + Result = + DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM); + } else { + if (IsLoad) + Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr, + Offset, AM); + else + Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr, + Offset, AM); + } ++PreIndexedNodes; ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); - if (isLoad) { + if (IsLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { @@ -13866,7 +13918,7 @@ // We can now generate the new expression. SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0)); - SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0); + SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0); SDValue NewUse = DAG.getNode(Opcode, DL, @@ -13876,7 +13928,7 @@ } // Replace the uses of Ptr with uses of the updated base value. - DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0)); + DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0)); deleteAndRecombine(Ptr.getNode()); AddToWorklist(Result.getNode()); @@ -13891,29 +13943,12 @@ if (Level < AfterLegalizeDAG) return false; - bool isLoad = true; + bool IsLoad = true; + bool IsMasked = false; SDValue Ptr; - EVT VT; - if (LoadSDNode *LD = dyn_cast(N)) { - if (LD->isIndexed()) - return false; - VT = LD->getMemoryVT(); - if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) && - !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT)) - return false; - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { - if (ST->isIndexed()) - return false; - VT = ST->getMemoryVT(); - if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) && - !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT)) - return false; - Ptr = ST->getBasePtr(); - isLoad = false; - } else { + if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked, + Ptr, TLI)) return false; - } if (Ptr.getNode()->hasOneUse()) return false; @@ -13949,7 +13984,7 @@ // If all the uses are load / store addresses, then don't do the // transformation. - if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){ + if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { bool RealUse = false; for (SDNode *UseUse : Use->uses()) { if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) @@ -13975,18 +14010,24 @@ Worklist.push_back(Op); if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) && !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) { - SDValue Result = isLoad - ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), - BasePtr, Offset, AM) - : DAG.getIndexedStore(SDValue(N,0), SDLoc(N), - BasePtr, Offset, AM); + SDValue Result; + if (!IsMasked) + Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, + Offset, AM) + : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM); + else + Result = IsLoad ? 
DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM) + : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM); ++PostIndexedNodes; ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); - if (isLoad) { + if (IsLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { @@ -13998,7 +14039,7 @@ // Replace the uses of Use with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), - Result.getValue(isLoad ? 1 : 0)); + Result.getValue(IsLoad ? 1 : 0)); deleteAndRecombine(Op); return true; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -592,8 +592,9 @@ SDLoc dl(N); SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), - N->getMask(), ExtPassThru, N->getMemoryVT(), - N->getMemOperand(), ISD::EXTLOAD); + N->getOffset(), N->getMask(), ExtPassThru, + N->getMemoryVT(), N->getMemOperand(), + N->getAddressingMode(), ISD::EXTLOAD); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); @@ -1485,11 +1486,11 @@ SDLoc dl(N); bool TruncateStore = false; - if (OpNo == 3) { + if (OpNo == 4) { Mask = PromoteTargetBoolean(Mask, DataVT); // Update in place. SmallVector NewOps(N->op_begin(), N->op_end()); - NewOps[3] = Mask; + NewOps[4] = Mask; return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } else { // Data operand assert(OpNo == 1 && "Unexpected operand for promotion"); @@ -1497,14 +1498,15 @@ TruncateStore = true; } - return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask, - N->getMemoryVT(), N->getMemOperand(), + return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), + N->getOffset(), Mask, N->getMemoryVT(), + N->getMemOperand(), N->getAddressingMode(), TruncateStore, N->isCompressingStore()); } SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo) { - assert(OpNo == 2 && "Only know how to promote the mask!"); + assert(OpNo == 3 && "Only know how to promote the mask!"); EVT DataVT = N->getValueType(0); SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); SmallVector NewOps(N->op_begin(), N->op_end()); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1541,12 +1541,15 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi) { + assert(MLD->isUnindexed() && "Indexed masked load during type legalization!"); EVT LoVT, HiVT; SDLoc dl(MLD); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); SDValue Ch = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); + SDValue Offset = MLD->getOffset(); + assert(Offset.isUndef() && "Unexpected indexed masked load offset"); SDValue Mask = MLD->getMask(); SDValue PassThru = MLD->getPassThru(); unsigned Alignment = MLD->getOriginalAlignment(); @@ -1578,8 +1581,9 @@ MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); - Lo = 
DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, PassThruLo, LoMemVT, MMO, - ExtType, MLD->isExpandingLoad()); + Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT, + MMO, MLD->getAddressingMode(), ExtType, + MLD->isExpandingLoad()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG, MLD->isExpandingLoad()); @@ -1590,8 +1594,9 @@ HiMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); - Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, PassThruHi, HiMemVT, MMO, - ExtType, MLD->isExpandingLoad()); + Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi, HiMemVT, + MMO, MLD->getAddressingMode(), ExtType, + MLD->isExpandingLoad()); // Build a factor node to remember that this load is independent of the // other one. @@ -2326,8 +2331,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo) { + assert(N->isUnindexed() && "Indexed masked store of vector?"); SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); + SDValue Offset = N->getOffset(); + assert(Offset.isUndef() && "Unexpected indexed masked store offset"); SDValue Mask = N->getMask(); SDValue Data = N->getValue(); EVT MemoryVT = N->getMemoryVT(); @@ -2361,8 +2369,8 @@ MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO, - N->isTruncatingStore(), + Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO, + N->getAddressingMode(), N->isTruncatingStore(), N->isCompressingStore()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, @@ -2374,8 +2382,9 @@ HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, - N->isTruncatingStore(), N->isCompressingStore()); + Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO, + N->getAddressingMode(), N->isTruncatingStore(), + N->isCompressingStore()); // Build a factor node to remember that this store is independent of the // other one. @@ -3699,10 +3708,10 @@ WidenVT.getVectorNumElements()); Mask = ModifyToType(Mask, WideMaskVT, true); - SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(), - Mask, PassThru, N->getMemoryVT(), - N->getMemOperand(), ExtType, - N->isExpandingLoad()); + SDValue Res = DAG.getMaskedLoad( + WidenVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, + PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), + ExtType, N->isExpandingLoad()); // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); @@ -4447,7 +4456,8 @@ StVal.getValueType().getVectorNumElements() && "Mask and data vectors should have the same number of elements"); return DAG.getMaskedStore(MST->getChain(), dl, StVal, MST->getBasePtr(), - Mask, MST->getMemoryVT(), MST->getMemOperand(), + MST->getOffset(), Mask, MST->getMemoryVT(), + MST->getMemOperand(), MST->getAddressingMode(), false, MST->isCompressingStore()); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6975,16 +6975,22 @@ } SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, - SDValue Ptr, SDValue Mask, SDValue PassThru, - EVT MemVT, MachineMemOperand *MMO, + SDValue Base, SDValue Offset, SDValue Mask, + SDValue PassThru, EVT MemVT, + MachineMemOperand *MMO, + ISD::MemIndexedMode AM, ISD::LoadExtType ExtTy, bool isExpanding) { - SDVTList VTs = getVTList(VT, MVT::Other); - SDValue Ops[] = { Chain, Ptr, Mask, PassThru }; + bool Indexed = AM != ISD::UNINDEXED; + assert((Indexed || Offset.isUndef()) && + "Unindexed masked load with an offset!"); + SDVTList VTs = Indexed ? getVTList(VT, Base.getValueType(), MVT::Other) + : getVTList(VT, MVT::Other); + SDValue Ops[] = {Chain, Base, Offset, Mask, PassThru}; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops); ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( - dl.getIROrder(), VTs, ExtTy, isExpanding, MemVT, MMO)); + dl.getIROrder(), VTs, AM, ExtTy, isExpanding, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { @@ -6992,7 +6998,7 @@ return SDValue(E, 0); } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, - ExtTy, isExpanding, MemVT, MMO); + AM, ExtTy, isExpanding, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); @@ -7002,27 +7008,45 @@ return V; } +SDValue SelectionDAG::getIndexedMaskedLoad(SDValue OrigLoad, const SDLoc &dl, + SDValue Base, SDValue Offset, + ISD::MemIndexedMode AM) { + MaskedLoadSDNode *LD = cast(OrigLoad); + assert(LD->getOffset().isUndef() && "Masked load is already a indexed load!"); + return getMaskedLoad(OrigLoad.getValueType(), dl, LD->getChain(), Base, + Offset, LD->getMask(), LD->getPassThru(), + LD->getMemoryVT(), LD->getMemOperand(), AM, + LD->getExtensionType(), LD->isExpandingLoad()); +} + SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl, - SDValue Val, SDValue Ptr, SDValue Mask, - EVT MemVT, MachineMemOperand *MMO, - bool IsTruncating, bool IsCompressing) { + SDValue Val, SDValue Base, SDValue Offset, + SDValue Mask, EVT MemVT, + MachineMemOperand *MMO, + ISD::MemIndexedMode AM, bool IsTruncating, + bool IsCompressing) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - SDVTList VTs = getVTList(MVT::Other); - SDValue Ops[] = { Chain, Val, Ptr, Mask }; + bool Indexed = AM != ISD::UNINDEXED; + assert((Indexed || Offset.isUndef()) && + "Unindexed masked store with an offset!"); + SDVTList VTs = Indexed ? 
getVTList(Base.getValueType(), MVT::Other) + : getVTList(MVT::Other); + SDValue Ops[] = {Chain, Val, Base, Offset, Mask}; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops); ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( - dl.getIROrder(), VTs, IsTruncating, IsCompressing, MemVT, MMO)); + dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } - auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, - IsTruncating, IsCompressing, MemVT, MMO); + auto *N = + newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, AM, + IsTruncating, IsCompressing, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); @@ -7032,6 +7056,17 @@ return V; } +SDValue SelectionDAG::getIndexedMaskedStore(SDValue OrigStore, const SDLoc &dl, + SDValue Base, SDValue Offset, + ISD::MemIndexedMode AM) { + MaskedStoreSDNode *ST = cast(OrigStore); + assert(ST->getOffset().isUndef() && + "Masked store is already a indexed store!"); + return getMaskedStore(ST->getChain(), dl, ST->getValue(), Base, Offset, + ST->getMask(), ST->getMemoryVT(), ST->getMemOperand(), + AM, ST->isTruncatingStore(), ST->isCompressingStore()); +} + SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4295,6 +4295,7 @@ SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(Src0Operand); SDValue Mask = getValue(MaskOperand); + SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); EVT VT = Src0.getValueType(); if (!Alignment) @@ -4311,9 +4312,9 @@ // vectors. 
VT.getStoreSize().getKnownMinSize(), Alignment, AAInfo); - SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT, - MMO, false /* Truncating */, - IsCompressing); + SDValue StoreNode = + DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO, + ISD::UNINDEXED, false /* Truncating */, IsCompressing); DAG.setRoot(StoreNode); setValue(&I, StoreNode); } @@ -4461,6 +4462,7 @@ SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(Src0Operand); SDValue Mask = getValue(MaskOperand); + SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); EVT VT = Src0.getValueType(); if (!Alignment) @@ -4491,8 +4493,9 @@ VT.getStoreSize().getKnownMinSize(), Alignment, AAInfo, Ranges); - SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO, - ISD::NON_EXTLOAD, IsExpanding); + SDValue Load = + DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO, + ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding); if (AddToChain) PendingLoads.push_back(Load.getValue(1)); setValue(&I, Load); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -685,6 +685,10 @@ if (doExt) OS << " from " << MLd->getMemoryVT().getEVTString(); + const char *AM = getIndexedModeName(MLd->getAddressingMode()); + if (*AM) + OS << ", " << AM; + if (MLd->isExpandingLoad()) OS << ", expanding"; @@ -696,6 +700,10 @@ if (MSt->isTruncatingStore()) OS << ", trunc to " << MSt->getMemoryVT().getEVTString(); + const char *AM = getIndexedModeName(MSt->getAddressingMode()); + if (*AM) + OS << ", " << AM; + if (MSt->isCompressingStore()) OS << ", compressing"; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -633,6 +633,8 @@ IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) { setIndexedLoadAction(IM, VT, Expand); setIndexedStoreAction(IM, VT, Expand); + setIndexedMaskedLoadAction(IM, VT, Expand); + setIndexedMaskedStoreAction(IM, VT, Expand); } // Most backends expect to see the node which just returns the value loaded. diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -262,15 +262,17 @@ // non-extending masked load fragment. def nonext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (masked_ld node:$ptr, node:$pred, node:$def), [{ - return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; + (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ + return cast(N)->getExtensionType() == ISD::NON_EXTLOAD && + cast(N)->isUnindexed(); }]>; // sign extending masked load fragments. def asext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (masked_ld node:$ptr, node:$pred, node:$def),[{ - return cast(N)->getExtensionType() == ISD::EXTLOAD || - cast(N)->getExtensionType() == ISD::SEXTLOAD; + (masked_ld node:$ptr, undef, node:$pred, node:$def),[{ + return (cast(N)->getExtensionType() == ISD::EXTLOAD || + cast(N)->getExtensionType() == ISD::SEXTLOAD) && + cast(N)->isUnindexed(); }]>; def asext_masked_load_i8 : PatFrag<(ops node:$ptr, node:$pred, node:$def), @@ -290,8 +292,9 @@ // zero extending masked load fragments. 
def zext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (masked_ld node:$ptr, node:$pred, node:$def), [{ - return cast(N)->getExtensionType() == ISD::ZEXTLOAD; + (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ + return cast(N)->getExtensionType() == ISD::ZEXTLOAD && + cast(N)->isUnindexed(); }]>; def zext_masked_load_i8 : PatFrag<(ops node:$ptr, node:$pred, node:$def), @@ -312,14 +315,16 @@ // non-truncating masked store fragment. def nontrunc_masked_store : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ - return !cast(N)->isTruncatingStore(); + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + return !cast(N)->isTruncatingStore() && + cast(N)->isUnindexed(); }]>; // truncating masked store fragments. def trunc_masked_store : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ - return cast(N)->isTruncatingStore(); + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + return cast(N)->isTruncatingStore() && + cast(N)->isUnindexed(); }]>; def trunc_masked_store_i8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1351,11 +1351,27 @@ SDValue &OffImm, unsigned Shift) { unsigned Opcode = Op->getOpcode(); - ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) - ? cast(Op)->getAddressingMode() - : cast(Op)->getAddressingMode(); + ISD::MemIndexedMode AM; + switch (Opcode) { + case ISD::LOAD: + AM = cast(Op)->getAddressingMode(); + break; + case ISD::STORE: + AM = cast(Op)->getAddressingMode(); + break; + case ISD::MLOAD: + AM = cast(Op)->getAddressingMode(); + break; + case ISD::MSTORE: + AM = cast(Op)->getAddressingMode(); + break; + default: + llvm_unreachable("Unexpected Opcode for Imm7Offset"); + } + int RHSC; - if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits. + // 7 bit constant, shifted by Shift. + if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) ? 
CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32) @@ -1625,58 +1641,93 @@ } bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { - LoadSDNode *LD = cast(N); - ISD::MemIndexedMode AM = LD->getAddressingMode(); - if (AM == ISD::UNINDEXED) - return false; - EVT LoadedVT = LD->getMemoryVT(); - if (!LoadedVT.isVector()) - return false; - bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; - SDValue Offset; - bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + EVT LoadedVT; unsigned Opcode = 0; - unsigned Align = LD->getAlignment(); - bool IsLE = Subtarget->isLittle(); + bool isSExtLd, isPre; + unsigned Align; + ARMVCC::VPTCodes Pred; + SDValue PredReg; + SDValue Chain, Base, Offset; + + if (LoadSDNode *LD = dyn_cast(N)) { + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return false; + LoadedVT = LD->getMemoryVT(); + if (!LoadedVT.isVector()) + return false; + + Chain = LD->getChain(); + Base = LD->getBasePtr(); + Offset = LD->getOffset(); + Align = LD->getAlignment(); + isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + Pred = ARMVCC::None; + PredReg = CurDAG->getRegister(0, MVT::i32); + } else if (MaskedLoadSDNode *LD = dyn_cast(N)) { + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return false; + LoadedVT = LD->getMemoryVT(); + if (!LoadedVT.isVector()) + return false; + Chain = LD->getChain(); + Base = LD->getBasePtr(); + Offset = LD->getOffset(); + Align = LD->getAlignment(); + isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + Pred = ARMVCC::Then; + PredReg = LD->getMask(); + } else + llvm_unreachable("Expected a Load or a Masked Load!"); + + // We allow LE non-masked loads to change the type (for example use a vldrb.8 + // as opposed to a vldrw.32). This can allow extra addressing modes or + // alignments for what is otherwise an equivalent instruction. + bool CanChangeType = Subtarget->isLittle() && !isa(N); + + SDValue NewOffset; if (Align >= 2 && LoadedVT == MVT::v4i16 && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) { + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post; else Opcode = isPre ? ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post; } else if (LoadedVT == MVT::v8i8 && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post; else Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post; } else if (LoadedVT == MVT::v4i8 && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post; else Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post; } else if (Align >= 4 && - (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2)) + (CanChangeType || LoadedVT == MVT::v4i32 || + LoadedVT == MVT::v4f32) && + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 2)) Opcode = isPre ? 
ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post; else if (Align >= 2 && - (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) + (CanChangeType || LoadedVT == MVT::v8i16 || + LoadedVT == MVT::v8f16) && + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post; - else if ((IsLE || LoadedVT == MVT::v16i8) && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) + else if ((CanChangeType || LoadedVT == MVT::v16i8) && + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post; else return false; - SDValue Chain = LD->getChain(); - SDValue Base = LD->getBasePtr(); - SDValue Ops[] = {Base, Offset, - CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32), - CurDAG->getRegister(0, MVT::i32), Chain}; - SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0), + SDValue Ops[] = {Base, NewOffset, + CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg, + Chain}; + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0), MVT::i32, MVT::Other, Ops); transferMemOperands(N, New); ReplaceUses(SDValue(N, 0), SDValue(New, 1)); @@ -3292,6 +3343,11 @@ // Other cases are autogenerated. break; } + case ISD::MLOAD: + if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N)) + return; + // Other cases are autogenerated. + break; case ARMISD::WLS: case ARMISD::LE: { SDValue Ops[] = { N->getOperand(1), diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -296,6 +296,8 @@ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); + setIndexedMaskedLoadAction(im, VT, Legal); + setIndexedMaskedStoreAction(im, VT, Legal); } } @@ -322,6 +324,8 @@ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); + setIndexedMaskedLoadAction(im, VT, Legal); + setIndexedMaskedStoreAction(im, VT, Legal); } if (HasMVEFP) { @@ -374,12 +378,12 @@ // Pre and Post inc on these are legal, given the correct extends for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, MVT::v8i8, Legal); - setIndexedStoreAction(im, MVT::v8i8, Legal); - setIndexedLoadAction(im, MVT::v4i8, Legal); - setIndexedStoreAction(im, MVT::v4i8, Legal); - setIndexedLoadAction(im, MVT::v4i16, Legal); - setIndexedStoreAction(im, MVT::v4i16, Legal); + for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) { + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); + setIndexedMaskedLoadAction(im, VT, Legal); + setIndexedMaskedStoreAction(im, VT, Legal); + } } // Predicate types @@ -9013,8 +9017,9 @@ SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(0, dl, MVT::i32)); SDValue NewLoad = DAG.getMaskedLoad( - VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(), - N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); + VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec, + N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), + N->getExtensionType(), N->isExpandingLoad()); SDValue Combo = NewLoad; if (!PassThru.isUndef() && (PassThru.getOpcode() != ISD::BITCAST || @@ -15192,14 +15197,19 @@ } static bool 
getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, - bool isSEXTLoad, bool isLE, SDValue &Base, - SDValue &Offset, bool &isInc, - SelectionDAG &DAG) { + bool isSEXTLoad, bool IsMasked, bool isLE, + SDValue &Base, SDValue &Offset, + bool &isInc, SelectionDAG &DAG) { if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) return false; if (!isa(Ptr->getOperand(1))) return false; + // We allow LE non-masked loads to change the type (for example use a vldrb.8 + // as opposed to a vldrw.32). This can allow extra addressing modes or + // alignments for what is otherwise an equivalent instruction. + bool CanChangeType = isLE && !IsMasked; + ConstantSDNode *RHS = cast(Ptr->getOperand(1)); int RHSC = (int)RHS->getZExtValue(); @@ -15218,7 +15228,7 @@ }; // Try to find a matching instruction based on s/zext, Alignment, Offset and - // (in BE) type. + // (in BE/masked) type. Base = Ptr->getOperand(0); if (VT == MVT::v4i16) { if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) @@ -15226,13 +15236,15 @@ } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { if (IsInRange(RHSC, 0x80, 1)) return true; - } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) && + } else if (Align >= 4 && + (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && IsInRange(RHSC, 0x80, 4)) return true; - else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) && + else if (Align >= 2 && + (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && IsInRange(RHSC, 0x80, 2)) return true; - else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) + else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) return true; return false; } @@ -15252,6 +15264,7 @@ SDValue Ptr; unsigned Align; bool isSEXTLoad = false; + bool IsMasked = false; if (LoadSDNode *LD = dyn_cast(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); @@ -15261,6 +15274,17 @@ Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); Align = ST->getAlignment(); + } else if (MaskedLoadSDNode *LD = dyn_cast(N)) { + Ptr = LD->getBasePtr(); + VT = LD->getMemoryVT(); + Align = LD->getAlignment(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + IsMasked = true; + } else if (MaskedStoreSDNode *ST = dyn_cast(N)) { + Ptr = ST->getBasePtr(); + VT = ST->getMemoryVT(); + Align = ST->getAlignment(); + IsMasked = true; } else return false; @@ -15269,8 +15293,8 @@ if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, - Subtarget->isLittle(), Base, Offset, - isInc, DAG); + IsMasked, Subtarget->isLittle(), Base, + Offset, isInc, DAG); else { if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, @@ -15298,6 +15322,7 @@ SDValue Ptr; unsigned Align; bool isSEXTLoad = false, isNonExt; + bool IsMasked = false; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); @@ -15309,6 +15334,19 @@ Ptr = ST->getBasePtr(); Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); + } else if (MaskedLoadSDNode *LD = dyn_cast(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + Align = LD->getAlignment(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; + IsMasked = true; + } else if (MaskedStoreSDNode *ST = dyn_cast(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + Align = ST->getAlignment(); + isNonExt = !ST->isTruncatingStore(); + IsMasked = true; } else return false; @@ -15332,7 +15370,7 @@ bool 
isLegal = false; if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, + getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked, Subtarget->isLittle(), Base, Offset, isInc, DAG); else { diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -5332,6 +5332,10 @@ PatFrag StoreKind, int shift> : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr), (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr)>; +class MVE_vector_offset_maskedstore_typed + : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr, VCCR:$pred), + (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr, (i32 1), VCCR:$pred)>; multiclass MVE_vector_offset_store { @@ -5363,7 +5367,7 @@ def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{ auto *Ld = cast(N); return Ld->getMemoryVT().getScalarType() == MVT::i8; }]>; @@ -5382,7 +5386,7 @@ return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; }]>; def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2; @@ -5402,14 +5406,14 @@ return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; }]>; def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4; }]>; def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ return cast(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), @@ -5417,7 +5421,7 @@ return cast(N)->isTruncatingStore(); }]>; def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ auto *St = cast(N); EVT ScalarVT = St->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; @@ -5428,12 +5432,41 @@ return cast(N)->isTruncatingStore(); }]>; def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ auto *St = cast(N); EVT ScalarVT = St->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; }]>; + +def pre_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask), + (masked_st node:$val, node:$base, node:$offset, node:$mask), [{ + ISD::MemIndexedMode AM = cast(N)->getAddressingMode(); + return AM == ISD::PRE_INC || AM == ISD::PRE_DEC; +}]>; +def post_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask), + (masked_st node:$val, node:$base, node:$offset, node:$mask), 
[{ + ISD::MemIndexedMode AM = cast(N)->getAddressingMode(); + return AM == ISD::POST_INC || AM == ISD::POST_DEC; +}]>; +def aligned32_pre_maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + return cast(N)->getAlignment() >= 4; +}]>; +def aligned32_post_maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + return cast(N)->getAlignment() >= 4; +}]>; +def aligned16_pre_maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + return cast(N)->getAlignment() >= 2; +}]>; +def aligned16_post_maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + return cast(N)->getAlignment() >= 2; +}]>; + + let Predicates = [HasMVEInt, IsLE] in { // Stores defm : MVE_vector_store; @@ -5515,19 +5548,26 @@ def : MVE_vector_maskedstore_typed; def : MVE_vector_maskedstore_typed; def : MVE_vector_maskedstore_typed; - // Truncating stores - def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), - (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; - def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), - (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; - def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred), - (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>; + + // Pre/Post inc masked stores + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + // Aligned masked loads def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; + // Extending masked loads. 
def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))), @@ -5569,6 +5609,37 @@ (pre_truncstvi16 node:$val, node:$base, node:$offset)>; } +def pre_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred), + (masked_st node:$val, node:$base, node:$offset, node:$pred), [{ + ISD::MemIndexedMode AM = cast(N)->getAddressingMode(); + return AM == ISD::PRE_INC || AM == ISD::PRE_DEC; +}]>; +def pre_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred), + (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def pre_truncmaskedstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred), + (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; +def post_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd), + (masked_st node:$val, node:$base, node:$offset, node:$postd), [{ + ISD::MemIndexedMode AM = cast(N)->getAddressingMode(); + return AM == ISD::POST_INC || AM == ISD::POST_DEC; +}]>; +def post_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd), + (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def post_truncmaskedstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd), + (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; + let Predicates = [HasMVEInt] in { def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr), (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>; @@ -5590,6 +5661,27 @@ (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr), (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>; + + def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr, VCCR:$pred), + (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; + def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr, VCCR:$pred), + (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; + def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr, VCCR:$pred), + (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>; + + def : Pat<(post_truncmaskedstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr, VCCR:$pred), + (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr, (i32 1), VCCR:$pred)>; + def : Pat<(post_truncmaskedstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr, VCCR:$pred), + (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr, (i32 1), VCCR:$pred)>; + def : Pat<(post_truncmaskedstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr, VCCR:$pred), + (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr, (i32 1), VCCR:$pred)>; + + def : Pat<(pre_truncmaskedstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr, VCCR:$pred), + (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr, (i32 1), VCCR:$pred)>; 
+  def : Pat<(pre_truncmaskedstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr, VCCR:$pred),
+            (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(pre_truncmaskedstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr, VCCR:$pred),
+            (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr, (i32 1), VCCR:$pred)>;
 }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24280,9 +24280,11 @@
   MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+  SDValue Offset = DAG.getUNDEF(VMask.getValueType());
-  return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
-                            MemIntr->getMemOperand(), true /* truncating */);
+  return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
+                            MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
+                            true /* truncating */);
 }
 case X86ISD::VTRUNCUS:
 case X86ISD::VTRUNCS: {
@@ -27593,12 +27595,11 @@
   if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
     return Op;
-  SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
-                                      N->getBasePtr(), Mask,
-                                      getZeroVector(VT, Subtarget, DAG, dl),
-                                      N->getMemoryVT(), N->getMemOperand(),
-                                      N->getExtensionType(),
-                                      N->isExpandingLoad());
+  SDValue NewLoad = DAG.getMaskedLoad(
+      VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+      getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
+      N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
+      N->isExpandingLoad());
   // Emit a blend.
   SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, PassThru);
@@ -27632,11 +27633,10 @@
   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
-  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
-                                      N->getBasePtr(), Mask, PassThru,
-                                      N->getMemoryVT(), N->getMemOperand(),
-                                      N->getExtensionType(),
-                                      N->isExpandingLoad());
+  SDValue NewLoad = DAG.getMaskedLoad(
+      WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+      PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+      N->getExtensionType(), N->isExpandingLoad());
   SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
@@ -27682,7 +27682,8 @@
   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
-                            Mask, N->getMemoryVT(), N->getMemOperand(),
+                            N->getOffset(), Mask, N->getMemoryVT(),
+                            N->getMemOperand(), N->getAddressingMode(),
                             N->isTruncatingStore(), N->isCompressingStore());
 }
@@ -40453,6 +40454,7 @@
 static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML,
                                             SelectionDAG &DAG,
                                             TargetLowering::DAGCombinerInfo &DCI) {
+  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
   // However, some target hooks may need to be added to know when the transform
   // is profitable. Endianness would also have to be considered.
@@ -40480,6 +40482,7 @@
 static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML,
                                              SelectionDAG &DAG,
                                              TargetLowering::DAGCombinerInfo &DCI) {
+  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
     return SDValue();
@@ -40515,10 +40518,10 @@
   // The new masked load has an undef pass-through operand. The select uses the
   // original pass-through operand.
-  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
-                                    ML->getMask(), DAG.getUNDEF(VT),
-                                    ML->getMemoryVT(), ML->getMemOperand(),
-                                    ML->getExtensionType());
+  SDValue NewML = DAG.getMaskedLoad(
+      VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
+      DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
+      ML->getAddressingMode(), ML->getExtensionType());
   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
                                 ML->getPassThru());
@@ -40604,8 +40607,9 @@
       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
                             Mst->getMemoryVT())) {
     return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
-                              Mst->getBasePtr(), Mask,
-                              Mst->getMemoryVT(), Mst->getMemOperand(), true);
+                              Mst->getBasePtr(), Mst->getOffset(), Mask,
+                              Mst->getMemoryVT(), Mst->getMemOperand(),
+                              Mst->getAddressingMode(), true);
   }

   return SDValue();
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -706,6 +706,10 @@
 def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
 def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>;

+def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store
+  SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>
+]>;
+
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
 //===----------------------------------------------------------------------===//
@@ -1040,9 +1044,10 @@
                                    INSERT_get_vinsert256_imm>;

 def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                          (masked_ld node:$src1, node:$src2, node:$src3), [{
+                          (masked_ld node:$src1, undef, node:$src2, node:$src3), [{
   return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
-         cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+         cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+         cast<MaskedLoadSDNode>(N)->isUnindexed();
 }]>;

 def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1055,17 +1060,19 @@
 }]>;

 def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                                (masked_ld node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedLoadSDNode>(N)->isExpandingLoad();
+                                (masked_ld node:$src1, undef, node:$src2, node:$src3), [{
+  return cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+         cast<MaskedLoadSDNode>(N)->isUnindexed();
 }]>;

 // Masked store fragments.
 // X86mstore can't be implemented in core DAG files because some targets
 // do not support vector types (llvm-tblgen will fail).
 def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                           (masked_st node:$src1, node:$src2, node:$src3), [{
-  return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
-         (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+                           (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+  return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+         !cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed();
 }]>;

 def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1078,16 +1085,18 @@
 }]>;

 def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                                   (masked_st node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedStoreSDNode>(N)->isCompressingStore();
+                                   (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed();
 }]>;

 // masked truncstore fragments
 // X86mtruncstore can't be implemented in core DAG files because some targets
 // doesn't support vector type ( llvm-tblgen will fail)
 def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                             (masked_st node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+                             (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed();
 }]>;
 def masked_truncstorevi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1111,10 +1120,10 @@
 def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore,
                              [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

-def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTMaskedStore,
+def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTX86MaskedStore,
                              [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

-def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTMaskedStore,
+def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTX86MaskedStore,
                              [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

 def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -154,11 +154,11 @@
 ; CHECK-NEXT: vldrwt.u32
 ; CHECK-NEXT: vldrwt.u32
 ; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4
 ; CHECK: vpsttt
 ; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
-; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4
 ; CHECK: le lr, [[LOOP]]
 ; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK: vpsel
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -39,14 +39,11 @@
 ; CHECK-NEXT: dlstp.32 lr, r3
 ; CHECK-NEXT: .LBB0_5: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: vmul.f32 q0, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: adds r1, #16
-; CHECK-NEXT: adds r2, #16
-; CHECK-NEXT: adds r0, #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r2], #16
 ; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: vmul.f32 q0, q1, q0
+; CHECK-NEXT: vstrw.32 q0, [r0], #16
 ; CHECK-NEXT: letp lr, .LBB0_5
 ; CHECK-NEXT: b .LBB0_11
 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new
@@ -236,13 +233,11 @@
; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q2, [r0] -; CHECK-NEXT: vldrwt.u32 q3, [r1] ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: adds r0, #16 -; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q3, [r1], #16 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vfma.f32 q0, q3, q2 ; CHECK-NEXT: le lr, .LBB1_2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -88,10 +88,9 @@ ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q2, [r1] ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldrh.s32 q2, [r1], #8 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: letp lr, .LBB1_1 @@ -229,10 +228,9 @@ ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q2, [r1] ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldrh.u32 q2, [r1], #8 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: letp lr, .LBB3_1 @@ -295,10 +293,9 @@ ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: letp lr, .LBB4_1 @@ -390,11 +387,10 @@ ; CHECK-NEXT: adds r5, r1, r4 ; CHECK-NEXT: vldrb.u32 q1, [r5] ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_12 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new @@ -594,15 +590,12 @@ ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q0, [r0] -; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrh.s32 q0, [r0], #8 +; CHECK-NEXT: vldrh.s32 q1, [r1], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} @@ -691,11 +684,10 @@ ; CHECK-NEXT: adds r5, r1, r4 ; CHECK-NEXT: vldrb.u32 q1, [r5] ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_12 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new @@ -895,15 +887,12 @@ ; CHECK-NEXT: dlstp.32 
lr, r12 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrh.u32 q0, [r0], #8 +; CHECK-NEXT: vldrh.u32 q1, [r1], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} @@ -988,15 +977,12 @@ ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r0, #16 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new @@ -1189,12 +1175,11 @@ ; CHECK-NEXT: add.w r4, r1, r12 ; CHECK-NEXT: vldrb.u16 q0, [r4] ; CHECK-NEXT: add.w r4, r2, r12 -; CHECK-NEXT: vldrb.u16 q1, [r4] -; CHECK-NEXT: vmul.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vldrb.u16 q1, [r4] +; CHECK-NEXT: vmul.i16 q0, q1, q0 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -9,23 +9,21 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: adds r0, #16 -; CHECK-NEXT: adds r1, #16 +; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -83,8 +81,7 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB1_1 @@ -144,8 +141,7 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; 
CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB2_1 @@ -201,12 +197,10 @@ ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vmul.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vmul.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -255,12 +249,10 @@ ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -369,14 +361,11 @@ ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: vldrh.u16 q1, [r2] -; CHECK-NEXT: vmul.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 +; CHECK-NEXT: vldrh.u16 q1, [r2], #16 ; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmul.i16 q0, q1, q0 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll @@ -7,8 +7,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -67,8 +66,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vldrwt.u32 q0, [r0], #508 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -107,8 +105,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vldrwt.u32 q0, [r0], #-508 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -147,8 +144,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.u32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrht.u32 q0, [r0], #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -189,8 +185,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.u32 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrht.u32 q0, [r0], #2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -210,8 +205,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, 
[r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.u32 q0, [r0] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrht.u32 q0, [r0], #254 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -252,8 +246,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.u32 q0, [r0] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vldrht.u32 q0, [r0], #-254 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -294,8 +287,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.s32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrht.s32 q0, [r0], #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -336,8 +328,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.s32 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrht.s32 q0, [r0], #2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -357,8 +348,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.s32 q0, [r0] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrht.s32 q0, [r0], #254 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -399,8 +389,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.s32 q0, [r0] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vldrht.s32 q0, [r0], #-254 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -441,8 +430,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -481,8 +469,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrht.u16 q0, [r0], #2 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -501,8 +488,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrht.u16 q0, [r0], #254 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -541,8 +527,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vldrht.u16 q0, [r0], #-254 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -581,8 +566,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.u32 q0, [r0], #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -602,8 +586,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.u32 q0, [r0], #3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -623,8 +606,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.u32 q0, [r0], #2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -644,8 +626,7 @@ ; CHECK: @ %bb.0: @ %entry ; 
CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.u32 q0, [r0], #127 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -686,8 +667,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.u32 q0, [r0], #-127 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -728,8 +708,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.s32 q0, [r0], #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -749,8 +728,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.s32 q0, [r0], #3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -770,8 +748,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.s32 q0, [r0], #2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -791,8 +768,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.s32 q0, [r0], #127 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -833,8 +809,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.s32 q0, [r0], #-127 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -875,8 +850,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.u16 q0, [r0], #4 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -896,8 +870,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.u16 q0, [r0], #3 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -917,8 +890,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.u16 q0, [r0], #2 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -938,8 +910,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.u16 q0, [r0], #127 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -980,8 +951,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.u16 q0, [r0], #-127 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1022,8 +992,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.s16 q0, [r0], #4 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1043,8 +1012,7 @@ ; 
CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.s16 q0, [r0], #3 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1064,8 +1032,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.s16 q0, [r0], #2 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1085,8 +1052,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.s16 q0, [r0], #127 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1127,8 +1093,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.s16 q0, [r0], #-127 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1169,8 +1134,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.u8 q0, [r0], #4 ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1189,8 +1153,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.u8 q0, [r0], #3 ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1209,8 +1172,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.u8 q0, [r0], #2 ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1229,8 +1191,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.u8 q0, [r0], #127 ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1269,8 +1230,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.u8 q0, [r0], #-127 ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1309,8 +1269,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1369,8 +1328,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vldrwt.u32 q0, [r0], #508 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1409,8 +1367,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vldrwt.u32 q0, [r0], #-508 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1449,8 +1406,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx 
lr entry: @@ -1489,8 +1445,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrht.u16 q0, [r0], #2 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1509,8 +1464,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrht.u16 q0, [r0], #254 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1549,8 +1503,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vldrht.u16 q0, [r0], #-254 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1593,8 +1546,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrwt.32 q0, [r0], #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1653,8 +1605,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vstrwt.32 q0, [r0], #508 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 508 @@ -1693,8 +1644,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vstrwt.32 q0, [r0], #-508 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -508 @@ -1733,8 +1683,7 @@ ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrht.32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrht.32 q0, [r0], #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1773,8 +1722,7 @@ ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrht.32 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrht.32 q0, [r0], #2 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -1793,8 +1741,7 @@ ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrht.32 q0, [r0] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vstrht.32 q0, [r0], #254 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 254 @@ -1833,8 +1780,7 @@ ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrht.32 q0, [r0] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vstrht.32 q0, [r0], #-254 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -254 @@ -1873,8 +1819,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrht.16 q0, [r0], #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1913,8 +1858,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrht.16 q0, [r0], #2 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -1933,8 +1877,7 @@ ; CHECK-NEXT: 
vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vstrht.16 q0, [r0], #254 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 254 @@ -1973,8 +1916,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vstrht.16 q0, [r0], #-254 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -254 @@ -2013,8 +1955,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrbt.32 q0, [r0], #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2033,8 +1974,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrbt.32 q0, [r0], #3 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -2053,8 +1993,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrbt.32 q0, [r0], #2 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -2073,8 +2012,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrbt.32 q0, [r0], #127 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -2113,8 +2051,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrbt.32 q0, [r0], #-127 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -127 @@ -2153,8 +2090,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrbt.16 q0, [r0], #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2173,8 +2109,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrbt.16 q0, [r0], #3 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -2193,8 +2128,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrbt.16 q0, [r0], #2 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -2213,8 +2147,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrbt.16 q0, [r0], #127 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -2253,8 +2186,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrbt.16 q0, [r0], #-127 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -127 @@ -2293,8 +2225,7 @@ ; 
CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrbt.8 q0, [r0], #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2313,8 +2244,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrbt.8 q0, [r0], #3 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -2333,8 +2263,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrbt.8 q0, [r0], #2 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -2353,8 +2282,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrbt.8 q0, [r0], #127 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -2393,8 +2321,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrbt.8 q0, [r0], #-127 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -127 @@ -2433,8 +2360,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrwt.32 q0, [r0], #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2493,8 +2419,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vstrwt.32 q0, [r0], #508 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 508 @@ -2533,8 +2458,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vstrwt.32 q0, [r0], #-508 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -508 @@ -2573,8 +2497,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrht.16 q0, [r0], #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2613,8 +2536,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrht.16 q0, [r0], #2 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -2633,8 +2555,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vstrht.16 q0, [r0], #254 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 254 @@ -2673,8 +2594,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vstrht.16 q0, [r0], #-254 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -254 diff --git 
a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll @@ -7,8 +7,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -67,8 +66,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0, #508] -; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -107,8 +105,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508] -; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -147,8 +144,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.u32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrht.u32 q0, [r0, #4]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -189,8 +185,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.u32 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrht.u32 q0, [r0, #2]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -210,8 +205,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.u32 q0, [r0, #254] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrht.u32 q0, [r0, #254]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -252,8 +246,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.u32 q0, [r0, #-254] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vldrht.u32 q0, [r0, #-254]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -294,8 +287,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.s32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrht.s32 q0, [r0, #4]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -336,8 +328,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.s32 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrht.s32 q0, [r0, #2]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -357,8 +348,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.s32 q0, [r0, #254] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrht.s32 q0, [r0, #254]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -399,8 +389,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrht.s32 q0, [r0, #-254] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vldrht.s32 q0, [r0, #-254]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -441,8 +430,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrht.u16 q0, [r0, #4]! 
; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -481,8 +469,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrht.u16 q0, [r0, #2]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -501,8 +488,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0, #254] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrht.u16 q0, [r0, #254]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -541,8 +527,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0, #-254] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -581,8 +566,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.u32 q0, [r0, #4]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -602,8 +586,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0, #3] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.u32 q0, [r0, #3]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -623,8 +606,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.u32 q0, [r0, #2]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -644,8 +626,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0, #127] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.u32 q0, [r0, #127]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -686,8 +667,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.u32 q0, [r0, #-127] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.u32 q0, [r0, #-127]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -728,8 +708,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.s32 q0, [r0, #4]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -749,8 +728,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0, #3] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.s32 q0, [r0, #3]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -770,8 +748,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.s32 q0, [r0, #2]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -791,8 +768,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0, #127] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.s32 q0, [r0, #127]! 
; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -833,8 +809,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrbt.s32 q0, [r0, #-127] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.s32 q0, [r0, #-127]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -875,8 +850,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.u16 q0, [r0, #4]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -896,8 +870,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0, #3] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.u16 q0, [r0, #3]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -917,8 +890,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.u16 q0, [r0, #2]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -938,8 +910,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0, #127] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.u16 q0, [r0, #127]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -980,8 +951,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.u16 q0, [r0, #-127] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.u16 q0, [r0, #-127]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1022,8 +992,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.s16 q0, [r0, #4]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1043,8 +1012,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0, #3] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.s16 q0, [r0, #3]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1064,8 +1032,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.s16 q0, [r0, #2]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1085,8 +1052,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0, #127] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.s16 q0, [r0, #127]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1127,8 +1093,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1169,8 +1134,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrbt.u8 q0, [r0, #4]! 
; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1189,8 +1153,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0, #3] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrbt.u8 q0, [r0, #3]! ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1209,8 +1172,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrbt.u8 q0, [r0, #2]! ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1229,8 +1191,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0, #127] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrbt.u8 q0, [r0, #127]! ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1269,8 +1230,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr -; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127]! ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1309,8 +1269,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1369,8 +1328,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0, #508] -; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1409,8 +1367,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508] -; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]! ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1449,8 +1406,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldrht.u16 q0, [r0, #4]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1489,8 +1445,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrht.u16 q0, [r0, #2]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1509,8 +1464,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0, #254] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrht.u16 q0, [r0, #254]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1549,8 +1503,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0, #-254] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]! ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1593,8 +1546,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrwt.32 q0, [r0, #4]! 
; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1653,8 +1605,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0, #508] -; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vstrwt.32 q0, [r0, #508]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 508 @@ -1693,8 +1644,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0, #-508] -; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -508 @@ -1733,8 +1683,7 @@ ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrht.32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrht.32 q0, [r0, #4]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1773,8 +1722,7 @@ ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrht.32 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrht.32 q0, [r0, #2]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -1793,8 +1741,7 @@ ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrht.32 q0, [r0, #254] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vstrht.32 q0, [r0, #254]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 254 @@ -1833,8 +1780,7 @@ ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrht.32 q0, [r0, #-254] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vstrht.32 q0, [r0, #-254]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -254 @@ -1873,8 +1819,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrht.16 q0, [r0, #4]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1913,8 +1858,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrht.16 q0, [r0, #2]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -1933,8 +1877,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0, #254] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vstrht.16 q0, [r0, #254]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 254 @@ -1973,8 +1916,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0, #-254] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vstrht.16 q0, [r0, #-254]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -254 @@ -2013,8 +1955,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrbt.32 q0, [r0, #4]! 
; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2033,8 +1974,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0, #3] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrbt.32 q0, [r0, #3]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -2053,8 +1993,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrbt.32 q0, [r0, #2]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -2073,8 +2012,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0, #127] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrbt.32 q0, [r0, #127]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -2113,8 +2051,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrbt.32 q0, [r0, #-127] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrbt.32 q0, [r0, #-127]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -127 @@ -2153,8 +2090,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrbt.16 q0, [r0, #4]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2173,8 +2109,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0, #3] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrbt.16 q0, [r0, #3]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -2193,8 +2128,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrbt.16 q0, [r0, #2]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -2213,8 +2147,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0, #127] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrbt.16 q0, [r0, #127]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -2253,8 +2186,7 @@ ; CHECK-NEXT: vldrb.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrbt.16 q0, [r0, #-127] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrbt.16 q0, [r0, #-127]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -127 @@ -2293,8 +2225,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrbt.8 q0, [r0, #4]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2313,8 +2244,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0, #3] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrbt.8 q0, [r0, #3]! 
; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -2333,8 +2263,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrbt.8 q0, [r0, #2]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -2353,8 +2282,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0, #127] -; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrbt.8 q0, [r0, #127]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -2393,8 +2321,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vpt.i8 ne, q1, zr -; CHECK-NEXT: vstrbt.8 q0, [r0, #-127] -; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrbt.8 q0, [r0, #-127]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -127 @@ -2433,8 +2360,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrwt.32 q0, [r0, #4]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2493,8 +2419,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0, #508] -; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vstrwt.32 q0, [r0, #508]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 508 @@ -2533,8 +2458,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vpt.i32 ne, q1, zr -; CHECK-NEXT: vstrwt.32 q0, [r0, #-508] -; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -508 @@ -2573,8 +2497,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0, #4] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrht.16 q0, [r0, #4]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -2613,8 +2536,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0, #2] -; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrht.16 q0, [r0, #2]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -2633,8 +2555,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0, #254] -; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vstrht.16 q0, [r0, #254]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 254 @@ -2673,8 +2594,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vpt.i16 ne, q1, zr -; CHECK-NEXT: vstrht.16 q0, [r0, #-254] -; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vstrht.16 q0, [r0, #-254]! ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 -254 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -468,8 +468,7 @@ ; CHECK-LE-LABEL: masked_v4i32_preinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4]! 
; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -477,8 +476,7 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4]! ; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -495,8 +493,7 @@ ; CHECK-LE-LABEL: masked_v4i32_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -504,8 +501,7 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1032,8 +1028,7 @@ ; CHECK-LE-NEXT: vldr d1, [sp] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4]! ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1043,8 +1038,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4]! ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1061,8 +1055,7 @@ ; CHECK-LE-LABEL: masked_v8i16_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrht.u16 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1070,8 +1063,7 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1151,8 +1143,7 @@ ; CHECK-LE-LABEL: masked_v16i8_preinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr -; CHECK-LE-NEXT: vldrbt.u8 q0, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrbt.u8 q0, [r0, #4]! ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1160,8 +1151,7 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q1, zr -; CHECK-BE-NEXT: vldrbt.u8 q0, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrbt.u8 q0, [r0, #4]! 
; CHECK-BE-NEXT: vstrb.8 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1178,8 +1168,7 @@ ; CHECK-LE-LABEL: masked_v16i8_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr -; CHECK-LE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrbt.u8 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1187,8 +1176,7 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q1, zr -; CHECK-BE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrbt.u8 q0, [r0], #4 ; CHECK-BE-NEXT: vstrb.8 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1355,8 +1343,7 @@ ; CHECK-LE-LABEL: masked_v4f32_preinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4]! ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1364,8 +1351,7 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4]! ; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1382,8 +1368,7 @@ ; CHECK-LE-LABEL: masked_v4f32_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1391,8 +1376,7 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1724,8 +1708,7 @@ ; CHECK-LE-LABEL: masked_v8f16_preinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4]! ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1733,8 +1716,7 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4]! ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1751,8 +1733,7 @@ ; CHECK-LE-LABEL: masked_v8f16_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrht.u16 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1760,8 +1741,7 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -111,8 +111,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]! 
; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4i32_pre: @@ -122,8 +121,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr -; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -142,8 +140,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vstrwt.32 q1, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4i32_post: @@ -153,8 +150,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr -; CHECK-BE-NEXT: vstrwt.32 q1, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -334,8 +330,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_pre: @@ -345,8 +340,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr -; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]! ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -365,8 +359,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vstrht.16 q1, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_post: @@ -376,8 +369,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr -; CHECK-BE-NEXT: vstrht.16 q1, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -416,8 +408,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr -; CHECK-LE-NEXT: vstrbt.8 q1, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrbt.8 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v16i8_pre: @@ -427,8 +418,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.8 q2, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q2, zr -; CHECK-BE-NEXT: vstrbt.8 q1, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrbt.8 q1, [r0, #4]! 
; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -447,8 +437,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr -; CHECK-LE-NEXT: vstrbt.8 q1, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrbt.8 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v16i8_post: @@ -458,8 +447,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.8 q2, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q2, zr -; CHECK-BE-NEXT: vstrbt.8 q1, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrbt.8 q1, [r0], #4 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -591,8 +579,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4f32_pre: @@ -602,8 +589,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr -; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -622,8 +608,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vstrwt.32 q1, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4f32_post: @@ -633,8 +618,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr -; CHECK-BE-NEXT: vstrwt.32 q1, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -904,8 +888,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8f16_pre: @@ -915,8 +898,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr -; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]! ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -935,8 +917,7 @@ ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vstrht.16 q1, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8f16_post: @@ -946,8 +927,7 @@ ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr -; CHECK-BE-NEXT: vstrht.16 q1, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4
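
For reference, the IR shape these CHECK updates correspond to is a masked load or store whose pointer operand is a constant-offset GEP of the incoming base, with the incremented pointer (pre-inc) or the original base (post-inc) returned alongside the access. The following is a minimal sketch of the pre-increment load case, not copied from the test files: the function name, the intrinsic usage, and the assumed MVE target (e.g. thumbv8.1m.main with +mve, as in the tests' RUN lines) are illustrative only.

; Sketch only (hypothetical test, not part of this patch): a pre-increment
; masked load. With this change the DAG forms an indexed MLOAD, so the MVE
; backend can emit "vldrwt.u32 q0, [r0, #4]!" instead of a plain
; "vldrwt.u32 q0, [r0, #4]" followed by "adds r0, #4".
define i8* @masked_v4i32_preinc_sketch(i8* %x, i8* %y, <4 x i32> %a) {
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4   ; incremented pointer, also returned
  %c = icmp sgt <4 x i32> %a, zeroinitializer     ; predicate driving the VPT block
  %zv = bitcast i8* %z to <4 x i32>*
  %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %zv, i32 4, <4 x i1> %c, <4 x i32> undef)
  %yv = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %l, <4 x i32>* %yv, align 4
  ret i8* %z                                      ; returning %z makes this a pre-increment
}

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)

The post-increment variants in the tests differ only in that the access uses the original base %x and the function returns the GEP result, which is what the checks above now match as "vldrwt.u32 q0, [r0], #4" rather than a separate add.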