Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -10993,6 +10993,121 @@
                               DAG.getUNDEF(VT), NewMask);
 }
 
+static bool isUpdatingVLDorVST(SDNode *Inst) {
+  switch (Inst->getOpcode()) {
+  case ARMISD::VLD1_UPD:
+  case ARMISD::VST1_UPD:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static ConstantSDNode *tryGetConstOperand(SDNode *Inst, unsigned NOp) {
+  return dyn_cast<ConstantSDNode>(Inst->getOperand(NOp).getNode());
+}
+
+static SDValue getIncrementWithOffset(SelectionDAG &DAG, SDValue C,
+                                      unsigned Offset, SDLoc DL) {
+  // If Offset is zero, then C may or may not be constant.
+  if (!Offset)
+    return C;
+
+  // If Offset is not zero, C is always a constant.
+  unsigned NewVal =
+      cast<ConstantSDNode>(C.getNode())->getZExtValue() - Offset;
+
+  return DAG.getConstant(NewVal, DL, C.getValueType());
+}
+
+static std::pair<SDValue, unsigned> checkedGetIncrement(SDValue Addr,
+                                                        SDNode *Inst,
+                                                        unsigned AccessSize,
+                                                        unsigned Offset) {
+  // If the increment is a constant, it must match the memory ref size.
+  SDValue Inc = Inst->getOperand(Inst->getOperand(0) == Addr ? 1 : 0);
+  auto *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+
+  // Don't select a non-constant increment if we would have to subtract a
+  // constant from it; that may result in additional register pressure.
+  if (!CInc && Offset)
+    return {SDValue(), 0};
+
+  unsigned CIncSize = CInc ? CInc->getZExtValue() : 0;
+  if (AccessSize >= 3 * 16 && CIncSize != AccessSize) {
+    // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+    // separate instructions that make it harder to use a non-constant update.
+    return {SDValue(), 0};
+  }
+
+  // If the increment is not greater than the offset introduced by VLD/VST
+  // nodes higher in the chain, we will not be able to fold it.
+  if (CInc && CIncSize <= Offset)
+    return {SDValue(), 0};
+
+  return {Inc, CIncSize};
+}
+
+// Find an address-updating instruction that we can fold into the load/store,
+// creating a VLD{X}_UPD or VST{X}_UPD.
+static std::pair<SDNode *, SDValue>
+findAddressUpdateToFold(SelectionDAG &DAG, SDNode *N, SDValue Addr,
+                        unsigned AccessSize) {
+  unsigned Offset = 0;
+  SDLoc DL(N);
+  struct Match {
+    SDNode *UInst; // Address update instruction
+    SDValue Inc;   // Address increment
+    unsigned Off;  // Offset introduced by cascaded vld/vst
+  } M = {};
+
+  while (true) {
+    for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+                              UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User->getOpcode() != ISD::ADD ||
+          UI.getUse().getResNo() != Addr.getResNo())
+        continue;
+
+      // Check that the add is independent of the load/store. Otherwise,
+      // folding it would create a cycle.
+      if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+        continue;
+
+      // We can fold the following kinds of address increment:
+      // 1. Non-constant, when Offset == 0
+      // 2. Constant, when Inc.second >= Offset
+      auto Inc = checkedGetIncrement(Addr, User, AccessSize, Offset);
+      if (Inc.first.getNode())
+        M = {User, Inc.first, Offset};
+
+      if (Inc.second == AccessSize + Offset)
+        // We've found the best possible match.
+        return {M.UInst, getIncrementWithOffset(DAG, Inc.first, Offset, DL)};
+    }
+
+    // If 'Addr' is produced by a VLD{X}_UPD or VST{X}_UPD with a fixed
+    // post-increment, examine its parent address operand as well, keeping
+    // track of the post-increment value.
+    if (!isUpdatingVLDorVST(Addr.getNode()))
+      break;
+
+    // Get the post-increment value of the VLD{X}_UPD or VST{X}_UPD. If it
+    // is not a constant, don't bother: otherwise we would introduce an
+    // extra register operation, because we would need to subtract the
+    // constant from the register increment.
+    auto *CInc = tryGetConstOperand(Addr.getNode(), 2);
+    if (!CInc)
+      break;
+
+    // Update the offset with the post-increment of the instruction higher
+    // in the chain.
+    Offset += CInc->getZExtValue();
+    Addr = Addr.getOperand(1);
+  }
+  return {M.UInst, getIncrementWithOffset(DAG, M.Inc, M.Off, DL)};
+}
+
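(Note, not part of the patch: the Offset bookkeeping above is easiest to see with concrete numbers. The standalone C++ sketch below only mirrors what checkedGetIncrement/getIncrementWithOffset compute for a chain of 16-byte q-register accesses; the file name, the helper name and the simplified list of already-folded increments are illustrative assumptions, not LLVM API.)

    // cascade_fold_sketch.cpp -- illustrative only, not part of the patch.
    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Walk a chain of already-folded constant post-increments (innermost
    // first) and compute the increment that still has to be attached to the
    // candidate ADD, i.e. CInc - Offset in getIncrementWithOffset().
    static unsigned remainingIncrement(unsigned CInc,
                                       const std::vector<unsigned> &Folded) {
      unsigned Offset = 0;
      for (unsigned F : Folded)
        Offset += F;            // Offset += CInc->getZExtValue();
      assert(CInc > Offset && "checkedGetIncrement() rejects CInc <= Offset");
      return CInc - Offset;
    }

    int main() {
      // Four consecutive 16-byte accesses: the last load sees an ADD of 64,
      // but 48 bytes were already consumed by three earlier VLD1_UPDs, so
      // only a post-increment of 16 remains -- exactly the access size, the
      // "best possible match" case in the loop above.
      unsigned AccessSize = 16;
      unsigned Remaining = remainingIncrement(/*CInc=*/64, {16, 16, 16});
      std::printf("remaining increment = %u\n", Remaining);
      assert(Remaining == AccessSize);
      return 0;
    }
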
 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
 /// NEON load/store intrinsics, and generic vector load/stores, to merge
 /// base address updates.
@@ -11009,195 +11124,175 @@
   MemSDNode *MemN = cast<MemSDNode>(N);
   SDLoc dl(N);
 
-  // Search for a use of the address operand that is an increment.
-  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
-       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
-    SDNode *User = *UI;
-    if (User->getOpcode() != ISD::ADD ||
-        UI.getUse().getResNo() != Addr.getResNo())
-      continue;
-
-    // Check that the add is independent of the load/store. Otherwise, folding
-    // it would create a cycle.
-    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
-      continue;
-
-    // Find the new opcode for the updating load/store.
-    bool isLoadOp = true;
-    bool isLaneOp = false;
-    unsigned NewOpc = 0;
-    unsigned NumVecs = 0;
-    if (isIntrinsic) {
-      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-      switch (IntNo) {
-      default: llvm_unreachable("unexpected intrinsic for Neon base update");
-      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
-        NumVecs = 1; break;
-      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
-        NumVecs = 2; break;
-      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
-        NumVecs = 3; break;
-      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
-        NumVecs = 4; break;
-      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
-        NumVecs = 2; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
-        NumVecs = 3; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
-        NumVecs = 4; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
-        NumVecs = 1; isLoadOp = false; break;
-      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
-        NumVecs = 2; isLoadOp = false; break;
-      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
-        NumVecs = 3; isLoadOp = false; break;
-      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
-        NumVecs = 4; isLoadOp = false; break;
-      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
-        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
-        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
-        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
-      }
-    } else {
-      isLaneOp = true;
-      switch (N->getOpcode()) {
-      default: llvm_unreachable("unexpected opcode for Neon base update");
-      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
-      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
-      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
-      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
-      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
-        NumVecs = 1; isLaneOp = false; break;
-      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
-        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
-      }
-    }
-
-    // Find the size of memory referenced by the load/store.
-    EVT VecTy;
-    if (isLoadOp) {
-      VecTy = N->getValueType(0);
-    } else if (isIntrinsic) {
-      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
-    } else {
-      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
-      VecTy = N->getOperand(1).getValueType();
+  // Find the new opcode for the updating load/store.
+  bool isLoadOp = true;
+  bool isLaneOp = false;
+  unsigned NewOpc = 0;
+  unsigned NumVecs = 0;
+  if (isIntrinsic) {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: llvm_unreachable("unexpected intrinsic for Neon base update");
+    case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
+      NumVecs = 1; break;
+    case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
+      NumVecs = 2; break;
+    case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
+      NumVecs = 3; break;
+    case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
+      NumVecs = 4; break;
+    case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+      NumVecs = 2; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+      NumVecs = 3; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+      NumVecs = 4; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
+      NumVecs = 1; isLoadOp = false; break;
+    case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
+      NumVecs = 2; isLoadOp = false; break;
+    case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
+      NumVecs = 3; isLoadOp = false; break;
+    case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
+      NumVecs = 4; isLoadOp = false; break;
+    case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+      NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+      NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+      NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
     }
-
-    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
-    if (isLaneOp)
-      NumBytes /= VecTy.getVectorNumElements();
-
-    // If the increment is a constant, it must match the memory ref size.
-    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
-    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
-    if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
-      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
-      // separate instructions that make it harder to use a non-constant update.
-      continue;
+  } else {
+    isLaneOp = true;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("unexpected opcode for Neon base update");
+    case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
+    case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+    case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+    case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+    case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
+      NumVecs = 1; isLaneOp = false; break;
+    case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
+      NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
    }
+  }
-
-    // OK, we found an ADD we can fold into the base update.
-    // Now, create a _UPD node, taking care of not breaking alignment.
-
-    EVT AlignedVecTy = VecTy;
-    unsigned Alignment = MemN->getAlignment();
+  // Find the size of memory referenced by the load/store.
+  EVT VecTy;
+  if (isLoadOp) {
+    VecTy = N->getValueType(0);
+  } else if (isIntrinsic) {
+    VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+  } else {
+    assert(isStore && "Node has to be a load, a store, or an intrinsic!");
+    VecTy = N->getOperand(1).getValueType();
+  }
-    // If this is a less-than-standard-aligned load/store, change the type to
-    // match the standard alignment.
-    // The alignment is overlooked when selecting _UPD variants; and it's
-    // easier to introduce bitcasts here than fix that.
-    // There are 3 ways to get to this base-update combine:
-    // - intrinsics: they are assumed to be properly aligned (to the standard
-    //   alignment of the memory type), so we don't need to do anything.
-    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
-    //   intrinsics, so, likewise, there's nothing to do.
-    // - generic load/store instructions: the alignment is specified as an
-    //   explicit operand, rather than implicitly as the standard alignment
-    //   of the memory type (like the intrisics). We need to change the
-    //   memory type to match the explicit alignment. That way, we don't
-    //   generate non-standard-aligned ARMISD::VLDx nodes.
-    if (isa<LSBaseSDNode>(N)) {
-      if (Alignment == 0)
-        Alignment = 1;
-      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
-        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
-        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
-        assert(!isLaneOp && "Unexpected generic load/store lane.");
-        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
-        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
-      }
-      // Don't set an explicit alignment on regular load/stores that we want
-      // to transform to VLD/VST 1_UPD nodes.
-      // This matches the behavior of regular load/stores, which only get an
-      // explicit alignment if the MMO alignment is larger than the standard
-      // alignment of the memory type.
-      // Intrinsics, however, always get an explicit alignment, set to the
-      // alignment of the MMO.
+  unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+  if (isLaneOp)
+    NumBytes /= VecTy.getVectorNumElements();
+
+  auto AU = findAddressUpdateToFold(DAG, N, Addr, NumBytes);
+  if (!AU.first)
+    return SDValue();
+
+  // OK, we found an ADD we can fold into the base update.
+  // Now, create a _UPD node, taking care of not breaking alignment.
+
+  EVT AlignedVecTy = VecTy;
+  unsigned Alignment = MemN->getAlignment();
+
+  // If this is a less-than-standard-aligned load/store, change the type to
+  // match the standard alignment.
+  // The alignment is overlooked when selecting _UPD variants; and it's
+  // easier to introduce bitcasts here than fix that.
+  // There are 3 ways to get to this base-update combine:
+  // - intrinsics: they are assumed to be properly aligned (to the standard
+  //   alignment of the memory type), so we don't need to do anything.
+  // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+  //   intrinsics, so, likewise, there's nothing to do.
+  // - generic load/store instructions: the alignment is specified as an
+  //   explicit operand, rather than implicitly as the standard alignment
+  //   of the memory type (like the intrinsics). We need to change the
+  //   memory type to match the explicit alignment. That way, we don't
+  //   generate non-standard-aligned ARMISD::VLDx nodes.
+  if (isa<LSBaseSDNode>(N)) {
+    if (Alignment == 0)
       Alignment = 1;
-    }
-
-    // Create the new updating load/store node.
-    // First, create an SDVTList for the new updating node's results.
-    EVT Tys[6];
-    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
-    unsigned n;
-    for (n = 0; n < NumResultVecs; ++n)
-      Tys[n] = AlignedVecTy;
-    Tys[n++] = MVT::i32;
-    Tys[n] = MVT::Other;
-    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
-
-    // Then, gather the new node's operands.
-    SmallVector<SDValue, 8> Ops;
-    Ops.push_back(N->getOperand(0)); // incoming chain
-    Ops.push_back(N->getOperand(AddrOpIdx));
-    Ops.push_back(Inc);
-
-    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
-      // Try to match the intrinsic's signature
-      Ops.push_back(StN->getValue());
-    } else {
-      // Loads (and of course intrinsics) match the intrinsics' signature,
-      // so just add all but the alignment operand.
-      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
-        Ops.push_back(N->getOperand(i));
-    }
+    if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+      MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+      assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+      assert(!isLaneOp && "Unexpected generic load/store lane.");
+      unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+      AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+    }
+    // Don't set an explicit alignment on regular load/stores that we want
+    // to transform to VLD/VST 1_UPD nodes.
+    // This matches the behavior of regular load/stores, which only get an
+    // explicit alignment if the MMO alignment is larger than the standard
+    // alignment of the memory type.
+    // Intrinsics, however, always get an explicit alignment, set to the
+    // alignment of the MMO.
+    Alignment = 1;
+  }
+
+  // Create the new updating load/store node.
+  // First, create an SDVTList for the new updating node's results.
+  EVT Tys[6];
+  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+  unsigned n;
+  for (n = 0; n < NumResultVecs; ++n)
+    Tys[n] = AlignedVecTy;
+  Tys[n++] = MVT::i32;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
-    // For all node types, the alignment operand is always the last one.
-    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+  // Then, gather the new node's operands.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(N->getOperand(0)); // incoming chain
+  Ops.push_back(N->getOperand(AddrOpIdx));
+  Ops.push_back(AU.second);
+
+  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+    // Try to match the intrinsic's signature
+    Ops.push_back(StN->getValue());
+  } else {
+    // Loads (and of course intrinsics) match the intrinsics' signature,
+    // so just add all but the alignment operand.
+    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
+      Ops.push_back(N->getOperand(i));
+  }
-    // If this is a non-standard-aligned STORE, the penultimate operand is the
-    // stored value. Bitcast it to the aligned type.
-    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
-      SDValue &StVal = Ops[Ops.size()-2];
-      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
-    }
+  // For all node types, the alignment operand is always the last one.
+  Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
-    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
-    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
-                                           MemN->getMemOperand());
+  // If this is a non-standard-aligned STORE, the penultimate operand is the
+  // stored value. Bitcast it to the aligned type.
+  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+    SDValue &StVal = Ops[Ops.size()-2];
+    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+  }
-    // Update the uses.
-    SmallVector<SDValue, 5> NewResults;
-    for (unsigned i = 0; i < NumResultVecs; ++i)
-      NewResults.push_back(SDValue(UpdN.getNode(), i));
+  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
+                                         MemN->getMemOperand());
-    // If this is an non-standard-aligned LOAD, the first result is the loaded
-    // value. Bitcast it to the expected result type.
-    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
-      SDValue &LdVal = NewResults[0];
-      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
-    }
+  // Update the uses.
+  SmallVector<SDValue, 5> NewResults;
+  for (unsigned i = 0; i < NumResultVecs; ++i)
+    NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+  // If this is a non-standard-aligned LOAD, the first result is the loaded
+  // value. Bitcast it to the expected result type.
+  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
+    SDValue &LdVal = NewResults[0];
+    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
+  }
-    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
-    DCI.CombineTo(N, NewResults);
-    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+  DCI.CombineTo(N, NewResults);
+  DCI.CombineTo(AU.first, SDValue(UpdN.getNode(), NumResultVecs));
-    break;
-  }
   return SDValue();
 }
Index: test/CodeGen/ARM/alloc-no-stack-realign.ll
===================================================================
--- test/CodeGen/ARM/alloc-no-stack-realign.ll
+++ test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -8,31 +8,26 @@
 define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" {
 entry:
 ; CHECK-LABEL: test1:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], #32
-; CHECK: mov r[[R2:[0-9]+]], sp
-; CHECK: mov r[[R3:[0-9]+]], r[[R2]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128], r[[R1]]
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: mov r[[R1]], sp
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128] +; CHECK: add sp, sp, #64 +; CHECK: bx lr %retval = alloca <16 x float>, align 16 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16 store <16 x float> %0, <16 x float>* %retval @@ -44,32 +39,26 @@ define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp { entry: ; CHECK-LABEL: test2: -; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]] -; CHECK: mov r[[R2:[0-9]+]], r[[R1]] -; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]! -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48 -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32 -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] -; CHECK: mov r[[R1:[0-9]+]], #32 -; CHECK: mov r[[R2:[0-9]+]], sp -; CHECK: mov r[[R3:[0-9]+]], r[[R2]] -; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128], r[[R1]] -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] -; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]! -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] -; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]! -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #48 -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #32 -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] -; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128] - +; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: mov r[[R1]], sp +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! 
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128] +; CHECK: mov sp, r7 +; CHECK: pop {r7, pc} %retval = alloca <16 x float>, align 16 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16 Index: test/CodeGen/ARM/cascade-vld-vst.ll =================================================================== --- test/CodeGen/ARM/cascade-vld-vst.ll +++ test/CodeGen/ARM/cascade-vld-vst.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s + +%M = type { [4 x <4 x float>] } + +; Function Attrs: noimplicitfloat noinline norecurse nounwind uwtable +define void @_test_vld1_vst1(%M* %A, %M *%B) { +entry: + %v0p = getelementptr inbounds %M, %M* %A, i32 0, i32 0, i32 0 + %v0 = load <4 x float>, <4 x float>* %v0p + %v1p = getelementptr inbounds %M, %M* %A, i32 0, i32 0, i32 1 + %v1 = load <4 x float>, <4 x float>* %v1p + %v2p = getelementptr inbounds %M, %M* %A, i32 0, i32 0, i32 2 + %v2 = load <4 x float>, <4 x float>* %v2p + %v3p = getelementptr inbounds %M, %M* %A, i32 0, i32 0, i32 3 + %v3 = load <4 x float>, <4 x float>* %v3p + + %s0p = getelementptr inbounds %M, %M* %B, i32 0, i32 0, i32 0 + store <4 x float> %v0, <4 x float>* %s0p + %s1p = getelementptr inbounds %M, %M* %B, i32 0, i32 0, i32 1 + store <4 x float> %v1, <4 x float>* %s1p + %s2p = getelementptr inbounds %M, %M* %B, i32 0, i32 0, i32 2 + store <4 x float> %v2, <4 x float>* %s2p + %s3p = getelementptr inbounds %M, %M* %B, i32 0, i32 0, i32 3 + store <4 x float> %v3, <4 x float>* %s3p + ret void +} + +; CHECK: vld1.32 {d16, d17}, [r0]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r0]! +; CHECK-NEXT: vld1.32 {d20, d21}, [r0]! +; CHECK-NEXT: vld1.64 {d22, d23}, [r0] +; CHECK-NEXT: vst1.32 {d16, d17}, [r1]! +; CHECK-NEXT: vst1.32 {d18, d19}, [r1]! +; CHECK-NEXT: vst1.32 {d20, d21}, [r1]! +; CHECK-NEXT: vst1.64 {d22, d23}, [r1] +; CHECK-NEXT: mov pc, lr + Index: test/CodeGen/ARM/memcpy-inline.ll =================================================================== --- test/CodeGen/ARM/memcpy-inline.ll +++ test/CodeGen/ARM/memcpy-inline.ll @@ -44,15 +44,14 @@ define void @t2(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t2: -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! -; CHECK: movs [[INC:r[0-9]+]], #32 -; CHECK: add.w r3, r0, #16 -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]] -; CHECK: movw [[REG2:r[0-9]+]], #16716 -; CHECK: movt [[REG2:r[0-9]+]], #72 -; CHECK: str [[REG2]], [r0] -; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3] +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! +; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! 
+; CHECK: movw r1, #16716 +; CHECK: movt r1, #72 +; CHECK: str r1, [r0] +; CHECK: bx lr ; CHECK-T1-LABEL: t2: ; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false) Index: test/CodeGen/ARM/misched-fusion-aes.ll =================================================================== --- test/CodeGen/ARM/misched-fusion-aes.ll +++ test/CodeGen/ARM/misched-fusion-aes.ll @@ -74,19 +74,19 @@ ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QA]] ; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]] -; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]] +; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]] +; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]] -; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QF]] +; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]] -; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]] } @@ -160,19 +160,19 @@ ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QA]] ; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]] -; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]] +; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]] +; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]] -; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QF]] +; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]] -; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]] } Index: test/CodeGen/ARM/vector-load.ll =================================================================== --- test/CodeGen/ARM/vector-load.ll +++ test/CodeGen/ARM/vector-load.ll @@ -253,10 +253,10 @@ } ; CHECK-LABEL: test_silly_load: -; CHECK: vldr d{{[0-9]+}}, [r0, #16] -; CHECK: movs r1, #24 -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1 -; CHECK: ldr {{r[0-9]+}}, [r0] +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]! +; CHECK-NEXT: vld1.8 {d{{[0-9]+}}}, [r0:64]! 
+; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: bx lr define void @test_silly_load(<28 x i8>* %addr) { load volatile <28 x i8>, <28 x i8>* %addr Index: test/CodeGen/ARM/vext.ll =================================================================== --- test/CodeGen/ARM/vext.ll +++ test/CodeGen/ARM/vext.ll @@ -216,21 +216,18 @@ define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind { ; CHECK-LABEL: test_multisource: ; CHECK: @ BB#0: -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: add r2, r0, #48 -; CHECK-NEXT: add r0, r0, #32 -; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]! -; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] -; CHECK-NEXT: vorr d24, d20, d20 -; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128] -; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128] -; CHECK-NEXT: vzip.16 d24, d18 -; CHECK-NEXT: vtrn.16 q8, q11 -; CHECK-NEXT: vext.16 d18, d20, d24, #2 -; CHECK-NEXT: vext.16 d16, d18, d16, #2 -; CHECK-NEXT: vext.16 d16, d16, d16, #2 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vld1.16 {d16, d17}, [r0:128]! +; CHECK-NEXT: vld1.16 {d18, d19}, [r0:128]! +; CHECK-NEXT: vld1.16 {d20, d21}, [r0:128]! +; CHECK-NEXT: vorr d24, d20, d20 +; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128] +; CHECK-NEXT: vzip.16 d24, d22 +; CHECK-NEXT: vtrn.16 q8, q9 +; CHECK-NEXT: vext.16 d18, d20, d24, #2 +; CHECK-NEXT: vext.16 d16, d18, d16, #2 +; CHECK-NEXT: vext.16 d16, d16, d16, #2 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <32 x i16>, <32 x i16>* %B %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> ret <4 x i16> %tmp2 Index: test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll =================================================================== --- test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll +++ test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll @@ -198,20 +198,12 @@ ; @testNeon is an important example of the nead for ivchains. ; -; Currently we have two extra add.w's that keep the store address -; live past the next increment because ISEL is unfortunately undoing -; the store chain. ISEL also fails to convert all but one of the stores to -; post-increment addressing. However, the loads should use -; post-increment addressing, no add's or add.w's beyond the three -; mentioned. Most importantly, there should be no spills or reloads! -; ; A9: testNeon: ; A9: %.lr.ph -; A9: add.w r +; A9: vst1.8 {{.*}}, [r{{[0-9]+}}]! ; A9-NOT: lsl.w ; A9-NOT: {{ldr|str|adds|add r}} ; A9: vst1.8 {{.*}} [r{{[0-9]+}}], r{{[0-9]+}} -; A9: add.w r ; A9-NOT: {{ldr|str|adds|add r}} ; A9-NOT: add.w r ; A9: bne
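
(Note, not part of the patch: at the source level, the pattern exercised by the new cascade-vld-vst.ll test corresponds roughly to a 64-byte aggregate copy like the C++ sketch below. This is only an illustration of the shape of code that should now use chained post-incrementing vld1/vst1; the exact instructions emitted depend on the triple and options used by the test.)

    // cascade_copy_example.cpp -- illustrative only, not part of the patch.
    // Copying four consecutive 16-byte rows is the pattern that should now
    // lower to a chain of post-incrementing vld1.32/vst1.32 pairs instead of
    // separate address arithmetic for every row.
    struct M {
      float Rows[4][4]; // 4 x 16 bytes, matching %M in cascade-vld-vst.ll
    };

    void copyMatrix(const M *A, M *B) {
      *B = *A; // memcpy-like copy of 64 bytes
    }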