Index: include/llvm/CodeGen/Passes.h
===================================================================
--- include/llvm/CodeGen/Passes.h
+++ include/llvm/CodeGen/Passes.h
@@ -637,6 +637,11 @@
   /// createForwardControlFlowIntegrityPass - This pass adds control-flow
   /// integrity.
   ModulePass *createForwardControlFlowIntegrityPass();
+
+  /// InterleavedAccess Pass - This pass identifies and matches interleaved
+  /// memory accesses to target specific intrinsics.
+  ///
+  FunctionPass *createInterleavedAccessPass(const TargetMachine *TM);
 } // End llvm namespace
 
 /// Target machine pass initializer for passes with dependencies. Use with
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1597,6 +1597,31 @@
     return false;
   }
 
+  /// \brief Lower an interleaved load to target specific intrinsics. Return
+  /// true on success.
+  ///
+  /// \p LI is the vector load instruction.
+  /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
+  /// \p Indices is the corresponding indices for each shufflevector.
+  /// \p Factor is the interleave factor.
+  virtual bool lowerInterleavedLoad(LoadInst *LI,
+                                    ArrayRef<ShuffleVectorInst *> Shuffles,
+                                    ArrayRef<unsigned> Indices,
+                                    unsigned Factor) const {
+    return false;
+  }
+
+  /// \brief Lower an interleaved store to target specific intrinsics. Return
+  /// true on success.
+  ///
+  /// \p SI is the vector store instruction.
+  /// \p SVI is the shufflevector to RE-interleave the stored vector.
+  /// \p Factor is the interleave factor.
+  virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+                                     unsigned Factor) const {
+    return false;
+  }
+
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).
Index: lib/CodeGen/CMakeLists.txt
===================================================================
--- lib/CodeGen/CMakeLists.txt
+++ lib/CodeGen/CMakeLists.txt
@@ -30,6 +30,7 @@
   ImplicitNullChecks.cpp
   InlineSpiller.cpp
   InterferenceCache.cpp
+  InterleavedAccessPass.cpp
   IntrinsicLowering.cpp
   LLVMTargetMachine.cpp
   LatencyPriorityQueue.cpp
Index: lib/CodeGen/InterleavedAccessPass.cpp
===================================================================
--- /dev/null
+++ lib/CodeGen/InterleavedAccessPass.cpp
@@ -0,0 +1,284 @@
+//=----------------------- InterleavedAccessPass.cpp -----------------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Interleaved Access pass, which identifies
+// interleaved memory accesses and transforms them into an target specific
+// intrinsics.
+//
+// An interleaved load reads data from memory into several vectors, with
+// DE-interleaving the data on a factor. An interleaved store writes several
+// vectors to memory with RE-interleaving the data on a factor. The interleave
+// factor is equal to the number of vectors.
+//
+// As interleaved accesses are hard to be identified in CodeGen (mainly because
+// the VECTOR_SHUFFLE DAG node is quite different from the shufflevector IR),
+// we identify and transform them to intrinsics in this pass. So that we
+// can easily match them into target specific instructions later in CodeGen.
+//
+// E.g. An interleaved load (Factor = 2):
+//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+//        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+//        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+//
+// It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
+// intrinsic in ARM backend.
+//
+// E.g. An interleaved store (Factor = 2):
+//        %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>  ; Interleaved vec
+//        store <8 x i32> %i.vec, <8 x i32>* %ptr
+//
+// It could be transformed into a st2 intrinsic in AArch64 backend or a vst2
+// intrinsic in ARM backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "interleaved-access"
+
+static const unsigned MIN_FACTOR = 2;
+static const unsigned MAX_FACTOR = 4;
+
+namespace llvm {
+static void initializeInterleavedAccessPass(PassRegistry &);
+}
+
+namespace {
+
+class InterleavedAccess : public FunctionPass {
+
+public:
+  static char ID;
+  InterleavedAccess(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM), TLI(nullptr) {
+    initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
+  }
+
+  const char *getPassName() const override { return "Interleaved Access Pass"; }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  const TargetMachine *TM;
+  const TargetLowering *TLI;
+
+  /// \brief Transform an interleaved load into target specific intrinsics.
+  bool matchInterleavedLoad(LoadInst *LI,
+                            SmallVector<Instruction *, 32> &DeadInsts);
+
+  /// \brief Transform an interleaved store into target specific intrinsics.
+  bool matchInterleavedStore(StoreInst *SI,
+                             SmallVector<Instruction *, 32> &DeadInsts);
+};
+} // end anonymous namespace.
+
+char InterleavedAccess::ID = 0;
+INITIALIZE_TM_PASS(InterleavedAccess, "interleaved-access",
+    "Match interleaved memory accesses to target specific intrinsics",
+    false, false)
+
+FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
+  return new InterleavedAccess(TM);
+}
+
+/// \brief Check if the mask is a DE-interleave mask of the given factor
+/// \p Factor like:
+///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
+static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
+                                       unsigned &Index) {
+  // Check all potential start indices from 0 to (Factor - 1).
+  for (Index = 0; Index < Factor; Index++) {
+    unsigned i = 0;
+
+    // Check that elements are in ascending order by Factor.
+    for (; i < Mask.size(); i++)
+      if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
+        break;
+
+    if (i == Mask.size())
+      return true;
+  }
+
+  return false;
+}
+
+/// \brief Check if the mask is a DE-interleave mask for an interleaved load.
+///
+/// E.g. DE-interleave masks (Factor = 2) could be:
+///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
+///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
+static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
+                               unsigned &Index) {
+  if (Mask.size() < 2)
+    return false;
+
+  // Check potential Factors.
+  for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
+    if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
+      return true;
+
+  return false;
+}
+
+/// \brief Check if the given mask \p Mask is RE-interleaved mask of the given
+/// factor \p Factor.
+///
+/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
+static bool isReInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor) {
+  unsigned NumElts = Mask.size();
+  if (NumElts % Factor)
+    return false;
+
+  unsigned NumSubElts = NumElts / Factor;
+  if (!isPowerOf2_32(NumSubElts))
+    return false;
+
+  for (unsigned i = 0; i < NumSubElts; i++)
+    for (unsigned j = 0; j < Factor; j++)
+      if (Mask[i * Factor + j] >= 0 &&
+          static_cast<unsigned>(Mask[i * Factor + j]) != j * NumSubElts + i)
+        return false;
+
+  return true;
+}
+
+/// \brief Check if the mask is RE-interleave mask for an interleaved store.
+///
+/// E.g. The RE-interleave mask (Factor = 2) could be:
+///     <0, 4, 1, 5, 2, 6, 3, 7>
+static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
+  if (Mask.size() < 4)
+    return false;
+
+  // Check potential Factors.
+  for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
+    if (isReInterleaveMaskOfFactor(Mask, Factor))
+      return true;
+
+  return false;
+}
+
+bool InterleavedAccess::matchInterleavedLoad(
+    LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
+  if (!LI->isSimple())
+    return false;
+
+  SmallVector<ShuffleVectorInst *, 4> Shuffles;
+  // Check if all users of this load are shufflevectors.
+  for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
+    ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
+    if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
+      return false;
+
+    Shuffles.push_back(SVI);
+  }
+
+  if (Shuffles.empty())
+    return false;
+
+  unsigned Factor, Index;
+
+  // Check if the first shufflevector is DE-interleave shuffle.
+  if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index))
+    return false;
+
+  // Holds the corresponding index for each DE-interleave shuffle.
+  SmallVector<unsigned, 4> Indices;
+  Indices.push_back(Index);
+
+  Type *VecTy = Shuffles[0]->getType();
+
+  // Check if other shufflevectors are also DE-interleaved of the same type
+  // and factor as the first shufflevector.
+  for (unsigned i = 1; i < Shuffles.size(); i++) {
+    if (Shuffles[i]->getType() != VecTy)
+      return false;
+
+    if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
+                                    Index))
+      return false;
+
+    Indices.push_back(Index);
+  }
+
+  DEBUG(dbgs() << "CGP: Found an interleaved load: " << *LI << "\n");
+
+  // Try to create target specific intrinsics to replace the load and shuffles.
+  if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
+    return false;
+
+  DEBUG(dbgs() << "CGP: Matched the interleaved load successfully.\n");
+
+  for (auto SVI : Shuffles)
+    DeadInsts.push_back(SVI);
+
+  DeadInsts.push_back(LI);
+  return true;
+}
+
+bool InterleavedAccess::matchInterleavedStore(
+    StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
+  if (!SI->isSimple())
+    return false;
+
+  ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+  if (!SVI || !SVI->hasOneUse())
+    return false;
+
+  // Check if the shufflevector is RE-interleave shuffle.
+  unsigned Factor;
+  if (!isReInterleaveMask(SVI->getShuffleMask(), Factor))
+    return false;
+
+  DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
+
+  // Try to create target specific intrinsics to replace the store and shuffle.
+  if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
+    return false;
+
+  DEBUG(dbgs() << "IA: Matched the interleaved store successfully.\n");
+
+  // Already have a new target specific interleaved store. Erase the old store.
+  DeadInsts.push_back(SI);
+  DeadInsts.push_back(SVI);
+  return true;
+}
+
+bool InterleavedAccess::runOnFunction(Function &F) {
+  DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+
+  if (!TM)
+    return false;
+
+  TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+
+  // Holds dead instructions that will be erased later.
+  SmallVector<Instruction *, 32> DeadInsts;
+  bool Changed = false;
+
+  for (auto &I : inst_range(F)) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(&I))
+      Changed |= matchInterleavedLoad(LI, DeadInsts);
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+      Changed |= matchInterleavedStore(SI, DeadInsts);
+  }
+
+  for (auto I : DeadInsts)
+    I->eraseFromParent();
+
+  return Changed;
+}
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -310,6 +310,13 @@
                      unsigned &RequiredAligment) const override;
   bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
 
+  bool lowerInterleavedLoad(LoadInst *LI,
+                            ArrayRef<ShuffleVectorInst *> Shuffles,
+                            ArrayRef<unsigned> Indices,
+                            unsigned Factor) const override;
+  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+                             unsigned Factor) const override;
+
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6826,6 +6826,154 @@
   return NumBits == 32 || NumBits == 64;
 }
 
+/// \brief Lower an interleaved load to a ldN intrinsic.
+///
+// E.g. Lower an interleaved load (Factor = 2):
+//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+//        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+//        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+//      Into:
+//        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
+//        %v0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
+//        %v1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
+bool AArch64TargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  if (Factor < 2 || Factor > 4)
+    return false;
+
+  assert(!Shuffles.empty() && "Empty shufflevector input");
+  assert(Shuffles.size() == Indices.size() &&
+         "Unmatched number of shufflevectors and indices");
+
+  const DataLayout *DL = getDataLayout();
+
+  VectorType *VecTy = Shuffles[0]->getType();
+  unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
+
+  // Skip illegal vector types.
+  if (VecSize != 64 && VecSize != 128)
+    return false;
+
+  // A pointer vector can not be the return type of the ldN intrinsics. Need to
+  // load integer vectors first and then convert to pointer vectors.
+  Type *EltTy = VecTy->getVectorElementType();
+  if (EltTy->isPointerTy())
+    VecTy = VectorType::get(DL->getIntPtrType(EltTy),
+                            VecTy->getVectorNumElements());
+
+  Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
+  Type *Tys[2] = {VecTy, PtrTy};
+  static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
+                                            Intrinsic::aarch64_neon_ld3,
+                                            Intrinsic::aarch64_neon_ld4};
+  Function *LdNFunc =
+      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+
+  IRBuilder<> Builder(LI);
+  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+
+  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+
+  // Replace uses of each shufflevector with the corresponding vector loaded
+  // by ldN.
+  for (unsigned i = 0; i < Shuffles.size(); i++) {
+    ShuffleVectorInst *SVI = Shuffles[i];
+    unsigned Index = Indices[i];
+
+    Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+
+    // Convert the integer vector to pointer vector if the element is pointer.
+    if (EltTy->isPointerTy())
+      SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+
+    SVI->replaceAllUsesWith(SubVec);
+  }
+
+  return true;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+                                   unsigned NumElts) {
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < NumElts; i++)
+    Mask.push_back(Builder.getInt32(Start + i));
+
+  return ConstantVector::get(Mask);
+}
+
+/// \brief Lower an interleaved store to a stN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 2):
+///        %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
+///        store <8 x i32> %i.vec, <8 x i32>* %ptr
+///      Into:
+///        %v0 = shuffle %i.vec, undef, <0, 1, 2, 3>
+///        %v1 = shuffle %i.vec, undef, <4, 5, 6, 7>
+///        call void llvm.aarch64.neon.st2(%v0, %v1, %ptr)
+bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+                                                  ShuffleVectorInst *SVI,
+                                                  unsigned Factor) const {
+  if (Factor < 2 || Factor > 4)
+    return false;
+
+  VectorType *VecTy = SVI->getType();
+  assert(VecTy->getVectorNumElements() % Factor == 0 &&
+         "Invalid interleave factor");
+
+  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+  Type *EltTy = VecTy->getVectorElementType();
+  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+
+  const DataLayout *DL = getDataLayout();
+  unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
+
+  // Skip illegal vector types.
+  if (SubVecSize != 64 && SubVecSize != 128)
+    return false;
+
+  Value *Op0 = SVI->getOperand(0);
+  Value *Op1 = SVI->getOperand(1);
+  IRBuilder<> Builder(SI);
+
+  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+  // vectors to integer vectors.
+  if (EltTy->isPointerTy()) {
+    Type *IntTy = DL->getIntPtrType(EltTy);
+    unsigned NumOpElts =
+        dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
+
+    // Convert to the corresponding integer vector.
+    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+    SubVecTy = VectorType::get(IntTy, NumSubElts);
+  }
+
+  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
+  Type *Tys[2] = {SubVecTy, PtrTy};
+  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
+                                             Intrinsic::aarch64_neon_st3,
+                                             Intrinsic::aarch64_neon_st4};
+  Function *StNFunc =
+      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+  SmallVector<Value *, 5> Ops;
+
+  // Split the shufflevector operands into sub vectors for the new stN call.
+  for (unsigned i = 0; i < Factor; i++)
+    Ops.push_back(Builder.CreateShuffleVector(
+        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+
+  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
+  Builder.CreateCall(StNFunc, Ops);
+  return true;
+}
+
 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                        unsigned AlignCheck) {
   return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -67,6 +67,11 @@
                           " to make use of cmpxchg flow-based information"),
                  cl::init(true));
 
+static cl::opt<bool> AArch64InterleavedAccessOpt(
+    "aarch64-interleaved-access-opt",
+    cl::desc("Optimize interleaved memory accesses in the AArch64 backend"),
+    cl::init(false), cl::Hidden);
+
 static cl::opt<bool>
 EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
                         cl::desc("Run early if-conversion"),
@@ -225,6 +230,9 @@
 
   TargetPassConfig::addIRPasses();
 
+  if (TM->getOptLevel() != CodeGenOpt::None && AArch64InterleavedAccessOpt)
+    addPass(createInterleavedAccessPass(TM));
+
   if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
     // Call SeparateConstOffsetFromGEP pass to extract constants within indices
     // and lower a GEP with multiple indices to either arithmetic operations or
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -139,6 +139,11 @@
 
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
 
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Factor,
+                                      ArrayRef<unsigned> Indices,
+                                      unsigned Alignment,
+                                      unsigned AddressSpace);
   /// @}
 };
 
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -407,6 +407,25 @@
   return LT.first;
 }
 
+unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+    unsigned Alignment, unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+  if (Factor > 1 && Factor < 5) {
+    unsigned NumElts = VecTy->getVectorNumElements();
+    Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+    unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy);
+
+    // ldN/stN only support legal vector types of size 64 or 128 in bits.
+    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
+      return Factor;
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
+
 unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
   unsigned Cost = 0;
   for (auto *I : Tys) {
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -433,6 +433,13 @@
     Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
                            bool IsStore, bool IsLoad) const override;
 
+    bool lowerInterleavedLoad(LoadInst *LI,
+                              ArrayRef<ShuffleVectorInst *> Shuffles,
+                              ArrayRef<unsigned> Indices,
+                              unsigned Factor) const override;
+    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+                               unsigned Factor) const override;
+
     bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
     bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
     TargetLoweringBase::AtomicRMWExpansionKind
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -11357,6 +11357,161 @@
               Addr});
 }
 
+/// \brief Lower an interleaved load to a vldN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+///      Into:
+///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr)
+///        %v0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
+///        %v1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
+bool ARMTargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  if (Factor < 2 || Factor > 4)
+    return false;
+
+  assert(!Shuffles.empty() && "Empty shufflevector input");
+  assert(Shuffles.size() == Indices.size() &&
+         "Unmatched number of shufflevectors and indices");
+
+  const DataLayout *DL = getDataLayout();
+
+  VectorType *VecTy = Shuffles[0]->getType();
+  unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
+  bool EltIs64Bits = DL->getTypeAllocSizeInBits(VecTy->getScalarType()) == 64;
+
+  // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't
+  // support i64/f64 element).
+  if ((VecSize != 64 && VecSize != 128) || EltIs64Bits)
+    return false;
+
+  // A pointer vector can not be the return type of the ldN intrinsics. Need to
+  // load integer vectors first and then convert to pointer vectors.
+  Type *EltTy = VecTy->getVectorElementType();
+  if (EltTy->isPointerTy())
+    VecTy = VectorType::get(DL->getIntPtrType(EltTy),
+                            VecTy->getVectorNumElements());
+
+  static const Intrinsic::ID LoadInt[3] = {Intrinsic::arm_neon_vld2,
+                                           Intrinsic::arm_neon_vld3,
+                                           Intrinsic::arm_neon_vld4};
+
+  Function *VldnFunc =
+      Intrinsic::getDeclaration(LI->getModule(), LoadInt[Factor - 2], VecTy);
+
+  IRBuilder<> Builder(LI);
+  SmallVector<Value *, 2> Ops;
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+  Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
+  Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+  CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+
+  // Replace uses of each shufflevector with the corresponding vector loaded
+  // by ldN.
+  for (unsigned i = 0; i < Shuffles.size(); i++) {
+    ShuffleVectorInst *SV = Shuffles[i];
+    unsigned Index = Indices[i];
+
+    Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+
+    // Convert the integer vector to pointer vector if the element is pointer.
+    if (EltTy->isPointerTy())
+      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+
+    SV->replaceAllUsesWith(SubVec);
+  }
+
+  return true;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+                                   unsigned NumElts) {
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < NumElts; i++)
+    Mask.push_back(Builder.getInt32(Start + i));
+
+  return ConstantVector::get(Mask);
+}
+
+/// \brief Lower an interleaved store to a vstN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 2):
+///        %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
+///        store <8 x i32> %i.vec, <8 x i32>* %ptr
+///      Into:
+///        %v0 = shuffle %i.vec, undef, <0, 1, 2, 3>
+///        %v1 = shuffle %i.vec, undef, <4, 5, 6, 7>
+///        call void llvm.arm.neon.vst2(%v0, %v1, %ptr)
+bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+                                              ShuffleVectorInst *SVI,
+                                              unsigned Factor) const {
+  if (Factor < 2 || Factor > 4)
+    return false;
+
+  VectorType *VecTy = SVI->getType();
+  assert(VecTy->getVectorNumElements() % Factor == 0 &&
+         "Invalid interleave factor");
+
+  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+  Type *EltTy = VecTy->getVectorElementType();
+  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+
+  const DataLayout *DL = getDataLayout();
+  unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
+  bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+
+  // Skip illegal sub vector types and vector types of i64/f64 element (vstN
+  // doesn't support i64/f64 element).
+  if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits)
+    return false;
+
+  Value *Op0 = SVI->getOperand(0);
+  Value *Op1 = SVI->getOperand(1);
+  IRBuilder<> Builder(SI);
+
+  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+  // vectors to integer vectors.
+  if (EltTy->isPointerTy()) {
+    Type *IntTy = DL->getIntPtrType(EltTy);
+
+    // Convert to the corresponding integer vector.
+    Type *IntVecTy =
+        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
+    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+    SubVecTy = VectorType::get(IntTy, NumSubElts);
+  }
+
+  static Intrinsic::ID StoreInt[3] = {Intrinsic::arm_neon_vst2,
+                                      Intrinsic::arm_neon_vst3,
+                                      Intrinsic::arm_neon_vst4};
+  Function *VstNFunc = Intrinsic::getDeclaration(
+      SI->getModule(), StoreInt[Factor - 2], SubVecTy);
+
+  SmallVector<Value *, 6> Ops;
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+
+  // Split the shufflevector operands into sub vectors for the new vstN call.
+  for (unsigned i = 0; i < Factor; i++)
+    Ops.push_back(Builder.CreateShuffleVector(
+        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+
+  Ops.push_back(Builder.getInt32(SI->getAlignment()));
+  Builder.CreateCall(VstNFunc, Ops);
+  return true;
+}
+
 enum HABaseType {
   HA_UNKNOWN = 0,
   HA_FLOAT,
Index: lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- lib/Target/ARM/ARMTargetMachine.cpp
+++ lib/Target/ARM/ARMTargetMachine.cpp
@@ -38,6 +38,12 @@
                  cl::init(true));
 
 static cl::opt<bool>
+ARMInterleavedAccessOpt("arm-interleaved-access-opt", cl::Hidden,
+                        cl::desc("Optimize interleaved memory accesses"
+                                 " in the ARM backend"),
+                        cl::init(false));
+
+static cl::opt<bool>
 EnableARMLoadStoreOpt("arm-load-store-opt", cl::Hidden,
                       cl::desc("Enable ARM load/store optimization pass"),
                       cl::init(true));
@@ -333,6 +339,9 @@
     }));
 
   TargetPassConfig::addIRPasses();
+
+  if (TM->getOptLevel() != CodeGenOpt::None && ARMInterleavedAccessOpt)
+    addPass(createInterleavedAccessPass(TM));
 }
 
 bool ARMPassConfig::addPreISel() {
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -126,6 +126,11 @@
   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                            unsigned AddressSpace);
 
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Factor,
+                                      ArrayRef<unsigned> Indices,
+                                      unsigned Alignment,
+                                      unsigned AddressSpace);
   /// @}
 };
 
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -478,3 +478,27 @@
   }
   return LT.first;
 }
+
+unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                                unsigned Factor,
+                                                ArrayRef<unsigned> Indices,
+                                                unsigned Alignment,
+                                                unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+  // vldN/vstN doesn't support vector types of i64/f64 element.
+  bool EltIs64Bits = DL->getTypeAllocSizeInBits(VecTy->getScalarType()) == 64;
+
+  if (Factor > 1 && Factor < 5 && !EltIs64Bits) {
+    unsigned NumElts = VecTy->getVectorNumElements();
+    Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+    unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy);
+
+    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
+    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
+      return Factor;
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
Index: test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
@@ -0,0 +1,197 @@
+; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic -aarch64-interleaved-access-opt=true < %s | FileCheck %s
+
+; CHECK-LABEL: load_factor2:
+; CHECK: ld2 { v0.8b, v1.8b }, [x0]
+define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
+  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
+  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %add = add nsw <8 x i8> %strided.v0, %strided.v1
+  ret <8 x i8> %add
+}
+
+; CHECK-LABEL: load_delat3:
+; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+define <4 x i32> @load_delat3(i32* %ptr) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
+  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %add = add nsw <4 x i32> %strided.v2, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_factor4:
+; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+define <4 x i32> @load_factor4(i32* %ptr) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
+  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v2
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: store_factor2:
+; CHECK: st2 { v0.8b, v1.8b }, [x0]
+define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
+  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_factor3:
+; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
+define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_factor4:
+; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
+  ret void
+}
+
+; The following cases test that interleaved access of pointer vectors can be
+; matched to ldN/stN instruction.
+
+; CHECK-LABEL: load_ptrvec_factor2:
+; CHECK: ld2 { v0.2d, v1.2d }, [x0]
+define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
+  %base = bitcast i32** %ptr to <4 x i32*>*
+  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
+  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32*> %strided.v0
+}
+
+; CHECK-LABEL: load_ptrvec_factor3:
+; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
+define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
+  %base = bitcast i32** %ptr to <6 x i32*>*
+  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
+  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
+  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
+  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
+  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
+  ret void
+}
+
+; CHECK-LABEL: load_ptrvec_factor4:
+; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
+  %base = bitcast i32** %ptr to <8 x i32*>*
+  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
+  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
+  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
+  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
+  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor2:
+; CHECK: st2 { v0.2d, v1.2d }, [x0]
+define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
+  %base = bitcast i32** %ptr to <4 x i32*>*
+  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor3:
+; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0]
+define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
+  %base = bitcast i32** %ptr to <6 x i32*>*
+  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor4:
+; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
+  %base = bitcast i32* %ptr to <8 x i32*>*
+  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
+  ret void
+}
+
+; Following cases check that shuffle maskes with undef indices can be matched
+; into ldN/stN instruction.
+
+; CHECK-LABEL: load_undef_mask_factor2:
+; CHECK: ld2 { v0.4s, v1.4s }, [x0]
+define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
+  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_undef_mask_factor3:
+; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
+  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %add = add nsw <4 x i32> %strided.v2, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_undef_mask_factor4:
+; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
+  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v2
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: store_undef_mask_factor2:
+; CHECK: st2 { v0.4s, v1.4s }, [x0]
+define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_undef_mask_factor3:
+; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
+define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_undef_mask_factor4:
+; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
+  ret void
+}
Index: test/CodeGen/ARM/arm-interleaved-accesses.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-interleaved-accesses.ll
@@ -0,0 +1,204 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -arm-interleaved-access-opt=true < %s | FileCheck %s
+
+; CHECK-LABEL: load_factor2:
+; CHECK: vld2.8 {d16, d17}, [r0]
+define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
+  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
+  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %add = add nsw <8 x i8> %strided.v0, %strided.v1
+  ret <8 x i8> %add
+}
+
+; CHECK-LABEL: load_delat3:
+; CHECK: vld3.32 {d16, d17, d18}, [r0]
+define <2 x i32> @load_delat3(i32* %ptr) {
+  %base = bitcast i32* %ptr to <6 x i32>*
+  %wide.vec = load <6 x i32>, <6 x i32>* %base, align 4
+  %strided.v2 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+  %strided.v1 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+  %add = add nsw <2 x i32> %strided.v2, %strided.v1
+  ret <2 x i32> %add
+}
+
+; CHECK-LABEL: load_factor4:
+; CHECK: vld4.32 {d16, d18, d20, d22}, [r0]!
+; CHECK: vld4.32 {d17, d19, d21, d23}, [r0]
+define <4 x i32> @load_factor4(i32* %ptr) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
+  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v2
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: store_factor2:
+; CHECK: vst2.8 {d16, d17}, [r0]
+define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
+  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_factor3:
+; CHECK: vst3.32 {d16, d18, d20}, [r0]!
+; CHECK: vst3.32 {d17, d19, d21}, [r0]
+define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_factor4:
+; CHECK: vst4.32 {d16, d18, d20, d22}, [r0]!
+; CHECK: vst4.32 {d17, d19, d21, d23}, [r0]
+define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
+  ret void
+}
+
+; The following cases test that interleaved access of pointer vectors can be
+; matched to ldN/stN instruction.
+
+; CHECK-LABEL: load_ptrvec_factor2:
+; CHECK: vld2.32 {d16, d17}, [r0]
+define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
+  %base = bitcast i32** %ptr to <4 x i32*>*
+  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
+  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32*> %strided.v0
+}
+
+; CHECK-LABEL: load_ptrvec_factor3:
+; CHECK: vld3.32 {d16, d17, d18}, [r0]
+define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
+  %base = bitcast i32** %ptr to <6 x i32*>*
+  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
+  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
+  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
+  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
+  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
+  ret void
+}
+
+; CHECK-LABEL: load_ptrvec_factor4:
+; CHECK: vld4.32 {d16, d17, d18, d19}, [r0]
+define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
+  %base = bitcast i32** %ptr to <8 x i32*>*
+  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
+  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
+  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
+  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
+  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor2:
+; CHECK: vst2.32 {d16, d17}, [r0]
+define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
+  %base = bitcast i32** %ptr to <4 x i32*>*
+  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor3:
+; CHECK: vst3.32 {d16, d17, d18}, [r0]
+define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
+  %base = bitcast i32** %ptr to <6 x i32*>*
+  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor4:
+; CHECK: vst4.32 {d16, d17, d18, d19}, [r0]
+define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
+  %base = bitcast i32* %ptr to <8 x i32*>*
+  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
+  ret void
+}
+
+; Following cases check that shuffle maskes with undef indices can be matched
+; into ldN/stN instruction.
+
+; CHECK-LABEL: load_undef_mask_factor2:
+; CHECK: vld2.32 {d16, d17, d18, d19}, [r0]
+define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
+  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_undef_mask_factor3:
+; CHECK: vld3.32 {d16, d18, d20}, [r0]!
+; CHECK: vld3.32 {d17, d19, d21}, [r0]
+define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
+  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %add = add nsw <4 x i32> %strided.v2, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_undef_mask_factor4:
+; CHECK: vld4.32 {d16, d18, d20, d22}, [r0]!
+; CHECK: vld4.32 {d17, d19, d21, d23}, [r0]
+define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
+  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v2
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: store_undef_mask_factor2:
+; CHECK: vst2.32 {d16, d17, d18, d19}, [r0]
+define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_undef_mask_factor3:
+; CHECK: vst3.32 {d16, d18, d20}, [r0]!
+; CHECK: vst3.32 {d17, d19, d21}, [r0]
+define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_undef_mask_factor4:
+; CHECK: vst4.32 {d16, d18, d20, d22}, [r0]!
+; CHECK: vst4.32 {d17, d19, d21, d23}, [r0]
+define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
+  ret void
+}