diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2381,6 +2381,15 @@
                                      Type *Ty, unsigned AddrSpace,
                                      Instruction *I = nullptr) const;
 
+  /// Return true if it is beneficial to retain post-indexing-friendly patterns
+  /// while performing optimizations.
+  virtual bool shouldRetainImmediatePostIncrement(const DataLayout &DL,
+                                                  Type *Ty, CombineLevel Level,
+                                                  unsigned AddrSpace,
+                                                  int64_t Increment) const {
+    return false;
+  }
+
   /// Return the cost of the scaling factor used in the addressing mode
   /// represented by AM for this target, for a load/store of the specified type.
   ///
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -524,6 +524,7 @@
     bool reassociationCanBreakAddressingModePattern(unsigned Opc,
                                                     const SDLoc &DL, SDValue N0,
                                                     SDValue N1);
+    bool reassociationCanBreakPostIndexingPattern(SDNode *N);
     SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
                                       SDValue N1);
     SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
@@ -1053,6 +1054,37 @@
   return false;
 }
 
+bool DAGCombiner::reassociationCanBreakPostIndexingPattern(SDNode *N) {
+  const DataLayout &DL = DAG.getDataLayout();
+  if (N->getOpcode() != ISD::ADD)
+    return false;
+
+  auto Const = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!Const)
+    return false;
+
+  const APInt &APIntVal = Const->getAPIntValue();
+  if (APIntVal.getBitWidth() > 64)
+    return false;
+  const int64_t ConstValue = APIntVal.getSExtValue();
+
+  // Look for a load/store of x whose address can later be post-incremented
+  // by this (add x, const).
+
+  for (SDNode *Node : N->getOperand(0)->uses()) {
+    auto LoadStore = dyn_cast<LSBaseSDNode>(Node);
+    if (!LoadStore)
+      continue;
+
+    EVT VT = LoadStore->getMemoryVT();
+    Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
+    unsigned AS = LoadStore->getAddressSpace();
+    if (TLI.shouldRetainImmediatePostIncrement(DL, AccessTy, Level, AS,
+                                               ConstValue))
+      return true;
+  }
+  return false;
+}
+
 // Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
 // such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
 SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
@@ -2488,6 +2520,13 @@
   EVT VT = N0.getValueType();
   SDLoc DL(N);
 
+  // Prevent ADD reassociation as well as converting ADD -> OR
+  if (reassociationCanBreakPostIndexingPattern(N) ||
+      reassociationCanBreakPostIndexingPattern(N0.getNode()) ||
+      reassociationCanBreakPostIndexingPattern(N1.getNode())) {
+    return SDValue();
+  }
+
   if (SDValue Combined = visitADDLike(N))
     return Combined;
 
diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h
--- a/llvm/lib/Target/ARM/ARM.h
+++ b/llvm/lib/Target/ARM/ARM.h
@@ -43,6 +43,7 @@
                                           CodeGenOpt::Level OptLevel);
 FunctionPass *createA15SDOptimizerPass();
 FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
+FunctionPass *createARMPostIndexingOptimizationPass();
 FunctionPass *createARMExpandPseudoPass();
 FunctionPass *createARMConstantIslandPass();
 FunctionPass *createMLxExpansionPass();
@@ -65,6 +66,7 @@
 
 void initializeARMParallelDSPPass(PassRegistry &);
 void initializeARMLoadStoreOptPass(PassRegistry &);
+void initializeARMPostIndexingOptPass(PassRegistry &);
 void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
 void initializeARMConstantIslandsPass(PassRegistry &);
 void initializeARMExpandPseudoPass(PassRegistry &);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -469,6 +469,11 @@
                                Type *Ty, unsigned AS,
                                Instruction *I = nullptr) const override;
 
+    bool shouldRetainImmediatePostIncrement(const DataLayout &DL, Type *Ty,
+                                            CombineLevel Level,
+                                            unsigned AddrSpace,
+                                            int64_t Increment) const override;
+
     /// getScalingFactorCost - Return the cost of the scaling used in
     /// addressing mode represented by AM.
     /// If the AM is supported, the return value must be >= 0.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -18707,6 +18707,27 @@
   return true;
 }
 
+bool ARMTargetLowering::shouldRetainImmediatePostIncrement(
+    const DataLayout &DL, Type *Ty, CombineLevel Level, unsigned AddrSpace,
+    int64_t Increment) const {
+  // NEON has rather restricted address calculation for vector load / store
+  // instructions compared to MVE or AArch64 ASIMD.
+  if (Subtarget->hasMVEIntegerOps())
+    return false;
+
+  // If the first DAG optimization pass did not consume this increment,
+  // try combining as usual during subsequent optimization passes.
+  if (Level != CombineLevel::BeforeLegalizeTypes)
+    return false;
+
+  if (!Ty->isVectorTy())
+    return false;
+
+  unsigned BitSize = DL.getTypeSizeInBits(Ty);
+
+  return BitSize > 64 && isPowerOf2_32(BitSize);
+}
+
 /// isLegalICmpImmediate - Return true if the specified immediate is legal
 /// icmp immediate, that is the target has icmp instructions which can compare
 /// a register against the immediate without having to materialize the
diff --git a/llvm/lib/Target/ARM/ARMPostIndexingOptimizer.cpp b/llvm/lib/Target/ARM/ARMPostIndexingOptimizer.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMPostIndexingOptimizer.cpp
@@ -0,0 +1,335 @@
+//===- ARMPostIndexingOptimizer.cpp - Prepare ld/st for post-indexing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// The ARMPostIndexingOpt pass transforms address operands of load/store
+/// instructions to allow them to be emitted as NEON load/store with
+/// post-indexed addressing mode. For example:
+///
+///   %first = gep %base, %offset1
+///   load %first
+///   %second = gep %base, %offset2
+///   load %second
+///
+/// this sequence may be transformed into:
+///
+///   %first = gep %base, %offset1
+///   load %first
+///   %second = gep %first, (%offset2 - %offset1)
+///   load %second
+///
+/// The transformation is done only if:
+/// 1) GEP offsets are constant
+/// 2) Difference between offsets is compatible with either "[Rn]!"
+///    or "[Rn], Rm" addressing modes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-post-indexing-opt"
+#define PASS_DESC "ARM post-indexed access optimizer"
+
+namespace {
+
+// LdStInfo holds the base address of a load or store and an immediate offset
+// of the memory access, obtained by a best-effort heuristic, as well as other
+// useful instruction properties.
+struct LdStInfo {
+  LdStInfo(const DataLayout &DL, Instruction *LdSt, unsigned BaseOperandIndex,
+           int AccessSize);
+  // The memory access in question
+  Instruction *LdSt;
+  // An actual operand of LdSt can be updated throughout this pass execution,
+  // so store an index instead
+  unsigned BaseOperandIndex;
+  // A guessed base address
+  Value *IndirectBase;
+  // An immediate offset to add to IndirectBase
+  int32_t Offset;
+  // An access size that can be used for post-indexed addressing mode
+  int AccessSize;
+
+  // Returns current *direct* base operand
+  Value *getBaseOperand() { return LdSt->getOperand(BaseOperandIndex); }
+};
+
+struct ARMPostIndexingOpt : public FunctionPass {
+  static char ID;
+
+  ARMPostIndexingOpt() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override { return PASS_DESC; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char ARMPostIndexingOpt::ID = 0;
+INITIALIZE_PASS(ARMPostIndexingOpt, DEBUG_TYPE, PASS_DESC, false, false)
+
+// Returns (nullptr, 0) for instructions not handled by this pass
+static std::pair<Type *, unsigned> getDataTypeAndBaseIndex(Instruction *I) {
+  if (LoadInst *Load = dyn_cast<LoadInst>(I))
+    return std::make_pair(Load->getType(), 0);
+  if (StoreInst *Store = dyn_cast<StoreInst>(I))
+    return std::make_pair(Store->getValueOperand()->getType(), 1);
+
+  if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    case Intrinsic::arm_neon_vld1:
+    case Intrinsic::arm_neon_vld2:
+    case Intrinsic::arm_neon_vld3:
+    case Intrinsic::arm_neon_vld4:
+      return std::make_pair(Intrinsic->getType(), 0);
+    case Intrinsic::arm_neon_vst1:
+    case Intrinsic::arm_neon_vst2:
+    case Intrinsic::arm_neon_vst3:
+    case Intrinsic::arm_neon_vst4:
+      return std::make_pair(Intrinsic->getOperand(1)->getType(), 0);
+    default:
+      break;
+    }
+  }
+  return std::make_pair(nullptr, 0);
+}
+
+LdStInfo::LdStInfo(const DataLayout &DL, Instruction *LdSt,
+                   unsigned BaseOperandIndex, int AccessSize)
+    : LdSt(LdSt), BaseOperandIndex(BaseOperandIndex), Offset(0),
+      AccessSize(AccessSize) {
+  IndirectBase = LdSt->getOperand(BaseOperandIndex);
+  for (;;) {
+    IndirectBase = IndirectBase->stripPointerCasts();
+    // Match GetElementPtrInst as well as the corresponding ConstantExpr
+    if (auto *GEP = dyn_cast<GEPOperator>(IndirectBase)) {
+      APInt APOffset(32, 0, /* isSigned = */ true);
+      if (GEP->accumulateConstantOffset(DL, APOffset)) {
+        IndirectBase = GEP->getPointerOperand();
+        Offset += APOffset.getSExtValue();
+        continue;
+      }
+    }
+    return;
+  }
+}
+
+// Guess a common stride (aside from post-incrementing by access size) that
+// is suitable for "[Rn], Rm" addressing mode, if any
+static int32_t guessCustomAccessStride(ArrayRef<LdStInfo> Instructions) {
+  int32_t Stride = 0; // not decided
+  assert(!Instructions.empty());
+  for (auto I = Instructions.begin(), End = Instructions.end();
+       std::next(I) != End; ++I) {
+    int32_t ThisStride = std::next(I)->Offset - I->Offset;
+    // Check if "[Rn]!" addressing mode can be used
+    if (ThisStride == I->AccessSize)
+      continue;
+    // If this is the first instruction requiring a register operand,
+    // request this stride value
+    if (Stride == 0)
+      Stride = ThisStride;
+    // If multiple different stride values have to be used,
+    // conservatively refrain from using "[Rn], Rm" addressing mode
+    if (Stride != ThisStride)
+      return 0;
+  }
+  return Stride;
+}
+
+// Rewrite the memory address used by Second to use the address of First
+// incremented by a constant value
+static bool rewriteAddressCalculation(LdStInfo &First, LdStInfo &Second,
+                                      int32_t RegStride,
+                                      const TargetLibraryInfo &TLI) {
+  LLVM_DEBUG(dbgs() << "Rewriting load/store for post-indexing: ";
+             Second.LdSt->dump());
+
+  IRBuilder<> IRB(Second.LdSt);
+  const DataLayout &DL = Second.LdSt->getModule()->getDataLayout();
+
+  int32_t Stride = Second.Offset - First.Offset;
+  if (Stride != First.AccessSize && Stride != RegStride)
+    return false;
+
+  // In case GEPOperator matched a ConstantExpr, replace it by an instruction
+  // to prevent folding
+  if (auto Const = dyn_cast<ConstantExpr>(First.getBaseOperand())) {
+    auto Inst = Const->getAsInstruction();
+    Inst->insertBefore(First.LdSt);
+    First.LdSt->replaceUsesOfWith(Const, Inst);
+  }
+
+  Value *FirstBase = First.getBaseOperand();
+  Value *OldSecondBase = Second.getBaseOperand();
+  PointerType *FirstBaseTy = cast<PointerType>(FirstBase->getType());
+  PointerType *SecondBaseTy = cast<PointerType>(OldSecondBase->getType());
+  assert(FirstBaseTy->getAddressSpace() == 0 && "Unexpected address space");
+  assert(SecondBaseTy->getAddressSpace() == 0 && "Unexpected address space");
+
+  int32_t FirstElementSize =
+      DL.getTypeSizeInBits(FirstBaseTy->getElementType()) / 8;
+
+  Value *NewSecondBase;
+  if (FirstBaseTy == SecondBaseTy && Stride % FirstElementSize == 0) {
+    int32_t ElementStride = Stride / FirstElementSize;
+    Type *EltTy = FirstBaseTy->getPointerElementType();
+    NewSecondBase =
+        IRB.CreateConstGEP1_32(EltTy, FirstBase, ElementStride, "postinc");
+  } else {
+    Value *FirstBaseBytePtr =
+        IRB.CreateBitCast(FirstBase, IRB.getInt8PtrTy(), "oldbase.byteptr");
+    Value *NewSecondBaseBytePtr = IRB.CreateConstGEP1_32(
+        IRB.getInt8Ty(), FirstBaseBytePtr, Stride, "postinc.byteptr");
+    NewSecondBase =
+        IRB.CreateBitCast(NewSecondBaseBytePtr, SecondBaseTy, "postinc");
+  }
+
+  Second.LdSt->replaceUsesOfWith(OldSecondBase, NewSecondBase);
+  LLVM_DEBUG(dbgs() << "New load/store: "; Second.LdSt->dump());
+  RecursivelyDeleteTriviallyDeadInstructions(OldSecondBase, &TLI, nullptr);
+  return true;
+}
+
+static bool isProfitable(const SmallVectorImpl<LdStInfo> &Instructions,
+                         int32_t RegStride) {
+  if (Instructions.size() < 2)
+    return false;
+
+  unsigned Matches = 1; // start from 1 since we look for pairs
+  for (auto I = Instructions.begin(), End = Instructions.end();
+       std::next(I) != End; ++I) {
+    int32_t Stride = std::next(I)->Offset - I->Offset;
+    if (Stride == I->AccessSize || Stride == RegStride)
+      Matches++;
+  }
+
+  if (Matches < 4 && Instructions.size() > Matches) {
+    // Bail out if there are other users of the base pointer and not a
+    // lot of consecutive accesses.
+    return false;
+  }
+  return true;
+}
+
+static void refineIndirectBase(SmallVectorImpl<LdStInfo> &LdSt) {
+  // Match the following pattern:
+  //
+  //   %base1 = gep %truebase, %i
+  //   %ptr1 = gep %base1, 0
+  //   load %ptr1
+  //   %base2 = gep %truebase, %i
+  //   %ptr2 = gep %base2, 4
+  //   load %ptr2
+  //
+  // Here base1 and base2 are indirect bases in LdSt. Use the same
+  // value for all corresponding elements.
+  for (auto I = LdSt.begin(), End = LdSt.end(); std::next(I) != End; ++I) {
+    if (auto *GEP = dyn_cast<GEPOperator>(I->IndirectBase)) {
+      if (auto *GEPNext = dyn_cast<GEPOperator>(std::next(I)->IndirectBase)) {
+        if (GEP->getPointerOperand() != GEPNext->getPointerOperand() ||
+            GEP->getNumIndices() != GEPNext->getNumIndices() ||
+            !std::equal(GEP->idx_begin(), GEP->idx_end(), GEPNext->idx_begin()))
+          continue;
+
+        // GEPs are equal
+        std::next(I)->IndirectBase = GEP;
+      }
+    }
+  }
+}
+
+static bool runOnBasicBlock(BasicBlock &BB, const TargetLibraryInfo &TLI) {
+  const DataLayout &DL = BB.getModule()->getDataLayout();
+
+  // Collect relevant load/store instructions, grouped by guessed base address
+  SmallVector<LdStInfo, 8> LdSt;
+  for (Instruction &I : BB) {
+    Type *ValueTy;
+    unsigned BaseOperandIndex;
+    std::tie(ValueTy, BaseOperandIndex) = getDataTypeAndBaseIndex(&I);
+    if (!ValueTy || !ValueTy->isVectorTy())
+      continue;
+
+    unsigned AccessSize = DL.getTypeSizeInBits(ValueTy) / 8;
+    if (isPowerOf2_32(AccessSize))
+      LdSt.emplace_back(DL, &I, BaseOperandIndex, AccessSize);
+  }
+
+  if (LdSt.size() < 2)
+    return false;
+
+  refineIndirectBase(LdSt);
+
+  DenseMap<Value *, SmallVector<LdStInfo, 8>> LdStMap;
+  for (LdStInfo &LSI : LdSt) {
+    LdStMap[LSI.IndirectBase].push_back(std::move(LSI));
+    LLVM_DEBUG(dbgs() << "found load/store, base: " << LSI.IndirectBase
+                      << ", offset: " << LSI.Offset << '\n'
+                      << LSI.LdSt << '\n');
+  }
+  // For each group, form a chain of address increments
+  bool Modified = false;
+  for (auto BaseAndWorklist : LdStMap) {
+    auto Worklist = BaseAndWorklist.second;
+    assert(!Worklist.empty());
+    int32_t RegStride = guessCustomAccessStride(Worklist);
+    if (!isProfitable(Worklist, RegStride)) {
+      LLVM_DEBUG(dbgs() << "not profitable to transform this ld/st group: "
+                        << BaseAndWorklist.first << '\n');
+      continue;
+    }
+    for (auto I = Worklist.begin(), E = Worklist.end(); std::next(I) != E;
+         ++I) {
+      Modified |= rewriteAddressCalculation(*I, *std::next(I), RegStride, TLI);
+    }
+  }
+  return Modified;
+}
+
+bool ARMPostIndexingOpt::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+  // If MVE is available, skip this function.
+  const auto &TPC = getAnalysis<TargetPassConfig>();
+  const auto &TM = TPC.getTM<TargetMachine>();
+  const auto &STI = TM.getSubtarget<ARMSubtarget>(F);
+  if (STI.hasMVEIntegerOps())
+    return false;
+
+  const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+
+  bool Modified = false;
+  for (auto &BB : F)
+    Modified |= runOnBasicBlock(BB, TLI);
+  return Modified;
+}
+
+FunctionPass *llvm::createARMPostIndexingOptimizationPass() {
+  return new ARMPostIndexingOpt();
+}
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -90,6 +90,7 @@
   PassRegistry &Registry = *PassRegistry::getPassRegistry();
   initializeGlobalISel(Registry);
   initializeARMLoadStoreOptPass(Registry);
+  initializeARMPostIndexingOptPass(Registry);
   initializeARMPreAllocLoadStoreOptPass(Registry);
   initializeARMParallelDSPPass(Registry);
   initializeARMConstantIslandsPass(Registry);
@@ -471,6 +472,7 @@
     // any ISel takes place. We should have a more principled way of handling
     // this. See D99707 for more details.
     addPass(createBarrierNoopPass());
+    addPass(createARMPostIndexingOptimizationPass());
   }
 
   return false;
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -47,6 +47,7 @@
   ARMMacroFusion.cpp
   ARMRegisterInfo.cpp
   ARMOptimizeBarriersPass.cpp
+  ARMPostIndexingOptimizer.cpp
   ARMRegisterBankInfo.cpp
   ARMSelectionDAGInfo.cpp
   ARMSLSHardening.cpp
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -64,6 +64,7 @@
 ; CHECK-NEXT: Transform predicated vector loops to use MVE tail predication
 ; CHECK-NEXT: A No-Op Barrier Pass
 ; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: ARM post-indexed access optimizer
 ; CHECK-NEXT: Safe Stack instrumentation pass
 ; CHECK-NEXT: Insert stack protectors
 ; CHECK-NEXT: Module Verifier
diff --git a/llvm/test/CodeGen/ARM/arm-post-indexing-opt-ir.ll b/llvm/test/CodeGen/ARM/arm-post-indexing-opt-ir.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arm-post-indexing-opt-ir.ll
@@ -0,0 +1,354 @@
+; RUN: opt --arm-post-indexing-opt -S -o - < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-unknown-linux-gnueabihf"
+
+define <4 x float> @test(float* %A) {
+  %X.ptr = bitcast float* %A to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+; CHECK-LABEL: define <4 x float> @test(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_stride(float* %A) {
+  %X.ptr = bitcast float* %A to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 12
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_stride(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %oldbase.byteptr = bitcast <4 x float>* %X.ptr to i8*
+; CHECK-NEXT: %postinc.byteptr = getelementptr i8, i8* %oldbase.byteptr, i32 24
+; CHECK-NEXT: %postinc = bitcast i8* %postinc.byteptr to <4 x float>*
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %oldbase.byteptr1 = bitcast <4 x float>* %postinc to i8*
+; CHECK-NEXT: %postinc.byteptr2 = getelementptr i8, i8* %oldbase.byteptr1, i32 24
+; CHECK-NEXT: %postinc3 = bitcast i8* %postinc.byteptr2 to <4 x float>*
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc3, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_stride_mixed(float* %A) {
+  %X.ptr = bitcast float* %A to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 10
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_stride_mixed(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %oldbase.byteptr = bitcast <4 x float>* %X.ptr to i8*
+; CHECK-NEXT: %postinc.byteptr = getelementptr i8, i8* %oldbase.byteptr, i32 24
+; CHECK-NEXT: %postinc = bitcast i8* %postinc.byteptr to <4 x float>*
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+; Refrain from using multiple stride registers
+define <4 x float> @test_stride_noop(float* %A) {
+  %X.ptr = bitcast float* %A to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_stride_noop(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+; CHECK-NEXT: %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+; CHECK-NEXT: %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14
+; CHECK-NEXT: %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_positive_initial_offset(float* %A) {
+  %X.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+  %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 12
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 16
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_positive_initial_offset(float* %A) {
+; CHECK-NEXT: %X.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+; CHECK-NEXT: %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_negative_initial_offset(float* %A) {
+  %X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16
+  %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 -12
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 -8
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_negative_initial_offset(float* %A) {
+; CHECK-NEXT: %X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16
+; CHECK-NEXT: %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+@global_float_array = external global [128 x float], align 4
+define <4 x float> @test_global() {
+  %X = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>*), align 4
+  %Y = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 12) to <4 x float>*), align 4
+  %Z = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 16) to <4 x float>*), align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_global() {
+; CHECK-NEXT: %1 = bitcast float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %1, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %1, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_stack() {
+; Use huge alignment to test that ADD would not be converted to OR
+  %array = alloca [32 x float], align 128
+  %arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0
+  call void @external_function(float* %arraydecay)
+  %X.ptr = bitcast [32 x float]* %array to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 4
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 8
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_stack() {
+; CHECK-NEXT: %array = alloca [32 x float], align 128
+; CHECK-NEXT: %arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0
+; CHECK-NEXT: call void @external_function(float* %arraydecay)
+; CHECK-NEXT: %X.ptr = bitcast [32 x float]* %array to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <2 x double> @test_double(double* %A) {
+  %X.ptr.elt = getelementptr inbounds double, double* %A, i32 8
+  %X.ptr = bitcast double* %X.ptr.elt to <2 x double>*
+  %X = load <2 x double>, <2 x double>* %X.ptr, align 8
+  %Y.ptr.elt = getelementptr inbounds double, double* %A, i32 10
+  %Y.ptr = bitcast double* %Y.ptr.elt to <2 x double>*
+  %Y = load <2 x double>, <2 x double>* %Y.ptr, align 8
+  %Z.ptr.elt = getelementptr inbounds double, double* %A, i32 12
+  %Z.ptr = bitcast double* %Z.ptr.elt to <2 x double>*
+  %Z = load <2 x double>, <2 x double>* %Z.ptr, align 8
+  %tmp.sum = fadd <2 x double> %X, %Y
+  %sum = fadd <2 x double> %tmp.sum, %Z
+  ret <2 x double> %sum
+}
+
+; CHECK-LABEL: define <2 x double> @test_double(double* %A) {
+; CHECK-NEXT: %X.ptr.elt = getelementptr inbounds double, double* %A, i32 8
+; CHECK-NEXT: %X.ptr = bitcast double* %X.ptr.elt to <2 x double>*
+; CHECK-NEXT: %X = load <2 x double>, <2 x double>* %X.ptr, align 8
+; CHECK-NEXT: %postinc = getelementptr <2 x double>, <2 x double>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <2 x double>, <2 x double>* %postinc, align 8
+; CHECK-NEXT: %postinc1 = getelementptr <2 x double>, <2 x double>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <2 x double>, <2 x double>* %postinc1, align 8
+; CHECK-NEXT: %tmp.sum = fadd <2 x double> %X, %Y
+; CHECK-NEXT: %sum = fadd <2 x double> %tmp.sum, %Z
+; CHECK-NEXT: ret <2 x double> %sum
+; CHECK-NEXT: }
+
+define void @test_various_instructions(float* %A) {
+  %X.ptr = bitcast float* %A to i8*
+  %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1)
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+  %Z.ptr = bitcast float* %Z.ptr.elt to i8*
+  %Z = fadd <4 x float> %X, %Y
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %Z.ptr, <4 x float> %Z, i32 4)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_various_instructions(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to i8*
+; CHECK-NEXT: %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1)
+; CHECK-NEXT: %postinc.byteptr = getelementptr i8, i8* %X.ptr, i32 16
+; CHECK-NEXT: %postinc = bitcast i8* %postinc.byteptr to <4 x float>*
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %Z = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %oldbase.byteptr = bitcast <4 x float>* %postinc to i8*
+; CHECK-NEXT: %postinc.byteptr1 = getelementptr i8, i8* %oldbase.byteptr, i32 16
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %postinc.byteptr1, <4 x float> %Z, i32 4)
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+
+define void @test_lsr_geps(float* %a, float* %b, i32 %n) {
+entry:
+  %cmp61 = icmp sgt i32 %n, 0
+  br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %lsr.iv1 = phi i32 [ 0, %for.body.preheader ], [ %lsr.iv.next2, %for.body ]
+  %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+  %0 = bitcast float* %a to i8*
+  %1 = bitcast float* %b to i8*
+  %uglygep19 = getelementptr i8, i8* %0, i32 %lsr.iv1
+  %uglygep1920 = bitcast i8* %uglygep19 to <4 x float>*
+  %2 = load <4 x float>, <4 x float>* %uglygep1920, align 4
+  %uglygep16 = getelementptr i8, i8* %0, i32 %lsr.iv1
+  %uglygep1617 = bitcast i8* %uglygep16 to <4 x float>*
+  %scevgep18 = getelementptr <4 x float>, <4 x float>* %uglygep1617, i32 1
+  %3 = load <4 x float>, <4 x float>* %scevgep18, align 4
+  %uglygep13 = getelementptr i8, i8* %0, i32 %lsr.iv1
+  %uglygep1314 = bitcast i8* %uglygep13 to <4 x float>*
+  %scevgep15 = getelementptr <4 x float>, <4 x float>* %uglygep1314, i32 2
+  %4 = load <4 x float>, <4 x float>* %scevgep15, align 4
+  %uglygep10 = getelementptr i8, i8* %0, i32 %lsr.iv1
+  %uglygep1011 = bitcast i8* %uglygep10 to <4 x float>*
+  %scevgep12 = getelementptr <4 x float>, <4 x float>* %uglygep1011, i32 3
+  %5 = load <4 x float>, <4 x float>* %scevgep12, align 4
+  %uglygep8 = getelementptr i8, i8* %1, i32 %lsr.iv1
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %uglygep8, <4 x float> %2, i32 4)
+  %uglygep6 = getelementptr i8, i8* %1, i32 %lsr.iv1
+  %scevgep7 = getelementptr i8, i8* %uglygep6, i32 16
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep7, <4 x float> %3, i32 4)
+  %uglygep4 = getelementptr i8, i8* %1, i32 %lsr.iv1
+  %scevgep5 = getelementptr i8, i8* %uglygep4, i32 32
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep5, <4 x float> %4, i32 4)
+  %uglygep = getelementptr i8, i8* %1, i32 %lsr.iv1
+  %scevgep = getelementptr i8, i8* %uglygep, i32 48
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep, <4 x float> %5, i32 4)
+  %lsr.iv.next = add i32 %lsr.iv, -1
+  %lsr.iv.next2 = add nuw i32 %lsr.iv1, 64
+  %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: define void @test_lsr_geps(float* %a, float* %b, i32 %n)
+;
+; CHECK: %[[GEP:uglygep[0-9]+]] = getelementptr i8, i8* %0, i32 %lsr.iv1
+; CHECK-NEXT: %[[BASE:uglygep[0-9]+]] = bitcast i8* %[[GEP]] to <4 x float>*
+; CHECK-NEXT: load <4 x float>, <4 x float>* %[[BASE]], align 4
+; CHECK-NEXT: %[[POSTINC1:postinc[0-9]+]] = getelementptr <4 x float>, <4 x float>* %[[BASE]], i32 1
+; CHECK-NEXT: load <4 x float>, <4 x float>* %[[POSTINC1]], align 4
+; CHECK-NEXT: %[[POSTINC2:postinc[0-9]+]] = getelementptr <4 x float>, <4 x float>* %[[POSTINC1]], i32 1
+; CHECK-NEXT: load <4 x float>, <4 x float>* %[[POSTINC2]], align 4
+; CHECK-NEXT: %[[POSTINC3:postinc[0-9]+]] = getelementptr <4 x float>, <4 x float>* %[[POSTINC2]], i32 1
+; CHECK-NEXT: load <4 x float>, <4 x float>* %[[POSTINC3]], align 4
+;
+; CHECK-NEXT: %[[BASE:uglygep[0-9]+]] = getelementptr i8, i8* %1, i32 %lsr.iv1
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.{{.*}} %[[BASE]]
+; CHECK-NEXT: %[[POSTINC1:postinc]] = getelementptr i8, i8* %[[BASE]], i32 16
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.{{.*}} %[[POSTINC1]]
+; CHECK-NEXT: %[[POSTINC2:postinc[0-9]+]] = getelementptr i8, i8* %[[POSTINC1]], i32 16
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.{{.*}} %[[POSTINC2]]
+; CHECK-NEXT: %[[POSTINC3:postinc[0-9]+]] = getelementptr i8, i8* %[[POSTINC2]], i32 16
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.{{.*}} %[[POSTINC3]]
+
+declare void @external_function(float*)
+declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind argmemonly
diff --git a/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll
--- a/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll
+++ b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll
@@ -7,10 +7,8 @@
 define <4 x float> @test(float* %A) {
 ; CHECK-LABEL: test:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #16
-; CHECK-NEXT: mov r1, #32
-; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
@@ -31,10 +29,9 @@
 define <4 x float> @test_stride(float* %A) {
 ; CHECK-LABEL: test_stride:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #24
-; CHECK-NEXT: mov r1, #48
+; CHECK-NEXT: mov r1, #24
 ; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
@@ -55,10 +52,9 @@
 define <4 x float> @test_stride_mixed(float* %A) {
 ; CHECK-LABEL: test_stride_mixed:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #24
-; CHECK-NEXT: mov r1, #40
+; CHECK-NEXT: mov r1, #24
 ; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
@@ -80,10 +76,10 @@
 define <4 x float> @test_stride_noop(float* %A) {
 ; CHECK-LABEL: test_stride_noop:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #24
-; CHECK-NEXT: mov r1, #56
+; CHECK-NEXT: mov r1, #24
 ; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: mov r1, #32
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
@@ -104,12 +100,10 @@
 define <4 x float> @test_positive_initial_offset(float* %A) {
 ; CHECK-LABEL: test_positive_initial_offset:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r1, r0, #48
-; CHECK-NEXT: vld1.32 {d16, d17}, [r1]
-; CHECK-NEXT: add r1, r0, #32
-; CHECK-NEXT: add r0, r0, #64
-; CHECK-NEXT: vld1.32 {d18, d19}, [r1]
-; CHECK-NEXT: vadd.f32 q8, q9, q8
+; CHECK-NEXT: add r0, r0, #32
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
 ; CHECK-NEXT: bx lr
@@ -130,12 +124,10 @@
 define <4 x float> @test_negative_initial_offset(float* %A) {
 ; CHECK-LABEL: test_negative_initial_offset:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: sub r1, r0, #48
-; CHECK-NEXT: vld1.32 {d16, d17}, [r1]
-; CHECK-NEXT: sub r1, r0, #64
-; CHECK-NEXT: sub r0, r0, #32
-; CHECK-NEXT: vld1.32 {d18, d19}, [r1]
-; CHECK-NEXT: vadd.f32 q8, q9, q8
+; CHECK-NEXT: sub r0, r0, #64
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
 ; CHECK-NEXT: bx lr
@@ -159,12 +151,10 @@
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: movw r0, :lower16:global_float_array
 ; CHECK-NEXT: movt r0, :upper16:global_float_array
-; CHECK-NEXT: add r1, r0, #48
-; CHECK-NEXT: vld1.32 {d16, d17}, [r1]
-; CHECK-NEXT: add r1, r0, #32
-; CHECK-NEXT: add r0, r0, #64
-; CHECK-NEXT: vld1.32 {d18, d19}, [r1]
-; CHECK-NEXT: vadd.f32 q8, q9, q8
+; CHECK-NEXT: add r0, r0, #32
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
 ; CHECK-NEXT: bx lr
@@ -190,11 +180,10 @@
 ; CHECK-NEXT: mov r4, sp
 ; CHECK-NEXT: mov r0, r4
 ; CHECK-NEXT: bl external_function
-; CHECK-NEXT: orr r0, r4, #32
 ; CHECK-NEXT: vld1.32 {d16, d17}, [r4:128]!
-; CHECK-NEXT: vld1.64 {d18, d19}, [r4:128]
+; CHECK-NEXT: vld1.32 {d18, d19}, [r4:128]!
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
-; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT: vld1.32 {d18, d19}, [r4]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
 ; CHECK-NEXT: sub sp, r11, #8
 ; CHECK-NEXT: pop {r4, r10, r11, pc}
@@ -217,13 +206,11 @@
 define <2 x double> @test_double(double* %A) {
 ; CHECK-LABEL: test_double:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r1, r0, #80
-; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT: add r1, r0, #64
-; CHECK-NEXT: add r0, r0, #96
-; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: vadd.f64 d20, d19, d17
-; CHECK-NEXT: vadd.f64 d16, d18, d16
+; CHECK-NEXT: add r0, r0, #64
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f64 d20, d17, d19
+; CHECK-NEXT: vadd.f64 d16, d16, d18
 ; CHECK-NEXT: vld1.64 {d22, d23}, [r0]
 ; CHECK-NEXT: vadd.f64 d1, d20, d23
 ; CHECK-NEXT: vadd.f64 d0, d16, d22
@@ -245,10 +232,8 @@
 define void @test_various_instructions(float* %A) {
 ; CHECK-LABEL: test_various_instructions:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #16
-; CHECK-NEXT: mov r1, #32
-; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]
 ; CHECK-NEXT: bx lr
@@ -267,36 +252,27 @@
 define void @test_lsr_geps(float* %a, float* %b, i32 %n) {
 ; CHECK-LABEL: test_lsr_geps:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r11, lr}
-; CHECK-NEXT: push {r4, r5, r11, lr}
 ; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: blt .LBB10_3
-; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: mov r3, #0
-; CHECK-NEXT: mov r12, #48
+; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: .LBB10_1: @ %for.body.preheader
+; CHECK-NEXT: mov r12, #0
 ; CHECK-NEXT: .LBB10_2: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add lr, r0, r3
+; CHECK-NEXT: add r3, r0, r12
 ; CHECK-NEXT: subs r2, r2, #1
-; CHECK-NEXT: mov r4, lr
-; CHECK-NEXT: vld1.32 {d16, d17}, [r4], r12
-; CHECK-NEXT: vld1.32 {d18, d19}, [r4]
-; CHECK-NEXT: add r4, lr, #32
-; CHECK-NEXT: vld1.32 {d20, d21}, [r4]
-; CHECK-NEXT: add r4, lr, #16
-; CHECK-NEXT: vld1.32 {d22, d23}, [r4]
-; CHECK-NEXT: add r4, r1, r3
-; CHECK-NEXT: add r5, r4, #16
-; CHECK-NEXT: add r3, r3, #64
-; CHECK-NEXT: mov lr, r4
-; CHECK-NEXT: add r4, r4, #32
-; CHECK-NEXT: vst1.32 {d16, d17}, [lr], r12
-; CHECK-NEXT: vst1.32 {d22, d23}, [r5]
-; CHECK-NEXT: vst1.32 {d20, d21}, [r4]
-; CHECK-NEXT: vst1.32 {d18, d19}, [lr]
+; CHECK-NEXT: vld1.32 {d16, d17}, [r3]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r3]!
+; CHECK-NEXT: vld1.32 {d20, d21}, [r3]!
+; CHECK-NEXT: vld1.32 {d22, d23}, [r3]
+; CHECK-NEXT: add r3, r1, r12
+; CHECK-NEXT: add r12, r12, #64
+; CHECK-NEXT: vst1.32 {d16, d17}, [r3]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r3]!
+; CHECK-NEXT: vst1.32 {d20, d21}, [r3]!
+; CHECK-NEXT: vst1.32 {d22, d23}, [r3]
 ; CHECK-NEXT: bne .LBB10_2
-; CHECK-NEXT: .LBB10_3: @ %for.cond.cleanup
-; CHECK-NEXT: pop {r4, r5, r11, pc}
+; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT: bx lr
 entry:
   %cmp61 = icmp sgt i32 %n, 0
   br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll
--- a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll
+++ b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll
@@ -76,9 +76,9 @@
 ; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]]
 
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]]
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 
 ; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]]
@@ -86,6 +86,7 @@
 
 ; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]]
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 
 ; CHECK: aese.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QF]]
@@ -93,8 +94,6 @@
 ; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]]
 
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
-
 ; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]]
 }
@@ -170,9 +169,9 @@
 ; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]]
 
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 
 ; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]]
@@ -180,6 +179,7 @@
 
 ; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 
 ; CHECK: aesd.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QF]]
@@ -187,7 +187,6 @@
 ; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]]
 
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]]
 }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
--- a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -198,21 +198,13 @@
 ; @testNeon is an important example of the nead for ivchains.
 ;
-; Currently we have two extra add.w's that keep the store address
-; live past the next increment because ISEL is unfortunately undoing
-; the store chain. ISEL also fails to convert all but one of the stores to
-; post-increment addressing. However, the loads should use
-; post-increment addressing, no add's or add.w's beyond the three
-; mentioned. Most importantly, there should be no spills or reloads!
+; Loads and stores should use post-increment addressing, no add's or add.w's.
+; Most importantly, there should be no spills or reloads!
 ;
 ; A9: testNeon:
 ; A9: %.lr.ph
-; A9: add.w r
 ; A9-NOT: lsl.w
 ; A9-NOT: {{ldr|str|adds|add r}}
-; A9: vst1.8 {{.*}} [r{{[0-9]+}}], r{{[0-9]+}}
-; A9: add.w r
-; A9-NOT: {{ldr|str|adds|add r}}
 ; A9-NOT: add.w r
 ; A9: bne
 
 define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize {
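
Illustrative appendix (not part of the patch): the IR patterns exercised by the new tests correspond to ordinary NEON intrinsics code that performs several 128-bit accesses at constant offsets from one base pointer. A minimal C/C++ sketch, assuming <arm_neon.h> and a NEON (non-MVE) ARM target; the function name sum3 is hypothetical and mirrors @test in arm-post-indexing-opt-ir.ll:

  #include <arm_neon.h>

  // Three consecutive 128-bit loads off one base pointer. After the
  // ARMPostIndexingOpt rewrite, instruction selection can emit
  //   vld1.32 {...}, [r0]!
  // post-indexed loads instead of materializing separate address adds.
  float32x4_t sum3(const float *a) {
    float32x4_t x = vld1q_f32(a);      // offset 0
    float32x4_t y = vld1q_f32(a + 4);  // offset 16 bytes == access size
    float32x4_t z = vld1q_f32(a + 8);  // offset 32 bytes
    return vaddq_f32(vaddq_f32(x, y), z);
  }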