Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -300,6 +300,10 @@ bool isLegalMaskedStore(Type *DataType, int Consecutive) const; bool isLegalMaskedLoad(Type *DataType, int Consecutive) const; + bool supportInterleaveAccess() const; + bool isLegalInterleaveType(Type *VecType) const; + bool isLegalInterleaveNum(unsigned NumVec) const; + /// \brief Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store /// of the specified type. @@ -432,6 +436,11 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of interleaved Load and Store instructions. + unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -529,6 +538,9 @@ int64_t Scale) = 0; virtual bool isLegalMaskedStore(Type *DataType, int Consecutive) = 0; virtual bool isLegalMaskedLoad(Type *DataType, int Consecutive) = 0; + virtual bool supportInterleaveAccess() = 0; + virtual bool isLegalInterleaveType(Type *VecType) = 0; + virtual bool isLegalInterleaveNum(unsigned NumVec) = 0; virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) = 0; @@ -569,6 +581,9 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -642,6 +657,15 @@ bool isLegalMaskedLoad(Type *DataType, int Consecutive) override { return Impl.isLegalMaskedLoad(DataType, Consecutive); } + bool supportInterleaveAccess() override { + return Impl.supportInterleaveAccess(); + } + bool isLegalInterleaveType(Type *VecType) override { + return Impl.isLegalInterleaveType(VecType); + } + bool isLegalInterleaveNum(unsigned NumVec) override { + return Impl.isLegalInterleaveNum(NumVec); + } int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) override { return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale); @@ -724,6 +748,11 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleaveMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -215,6 +215,12 @@ bool isLegalMaskedLoad(Type *DataType, int Consecutive) { return false; } + bool supportInterleaveAccess() { return false; } + + bool isLegalInterleaveType(Type *VecType) { return 
false; } + + bool isLegalInterleaveNum(unsigned NumVec) { return false; } + int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // Guess that all legal addressing mode are free. @@ -298,6 +304,12 @@ return 1; } + unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/IR/IRBuilder.h =================================================================== --- include/llvm/IR/IRBuilder.h +++ include/llvm/IR/IRBuilder.h @@ -435,6 +435,12 @@ CallInst *CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align, Value *Mask); + CallInst *CreateIndexLoad(Type *RetValTy, Value *Ptr, Constant *Indices, + unsigned Align, const Twine &Name = ""); + + CallInst *CreateIndexStore(Value *VecVal, Value *Ptr, Constant *Indices, + unsigned Align); + /// \brief Create an assume intrinsic call that allows the optimizer to /// assume that the provided condition will be true. CallInst *CreateAssumption(Value *Cond); Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -612,6 +612,21 @@ LLVMVectorSameWidth<0, llvm_i1_ty>], [IntrReadWriteArgMem]>; +//===--------------------- Index load/store Intrinsics --------------------===// +// +// The index load +def int_index_load : Intrinsic<[llvm_anyvector_ty], + [llvm_ptr_ty, + LLVMVectorSameWidth<0, llvm_i32_ty>, + llvm_i32_ty], + [IntrReadArgMem]>; +// The index store +def int_index_store : Intrinsic<[], + [llvm_anyvector_ty, llvm_ptr_ty, + LLVMVectorSameWidth<0, llvm_i32_ty>, + llvm_i32_ty], + [IntrReadWriteArgMem]>; + // Intrinsics to support bit sets. def int_bitset_test : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty], [IntrNoMem]>; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -514,6 +514,7 @@ if (Lp != AR->getLoop()) { DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; } // The address calculation must not wrap. Otherwise, a dependence could be @@ -761,21 +762,29 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + unsigned NumIterations = ForcedFactor * ForcedUnroll; + // The number of vectorized & interleaved iterations should not be less than 2 + if (NumIterations < 2) + NumIterations = 2; + + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when either of: + // (1) Distance < Stride * TypeByteSize + // (2) Distance >= TypeByteSize * NumIterations * Stride + // is true and + // (3) Distance <= MaxSafeDepDistBytes + // is true. 
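+  // For example, with TypeByteSize = 4, Stride = 2 and NumIterations = 4, a
+  // distance of 4 bytes satisfies (1) and a distance of 32 bytes satisfies
+  // (2), while a distance of 12 bytes is rejected by both and the dependence
+  // is treated as unsafe.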
+ if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIterations * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + if (Distance > Stride * TypeByteSize) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -111,6 +111,18 @@ return TTIImpl->isLegalMaskedLoad(DataType, Consecutive); } +bool TargetTransformInfo::supportInterleaveAccess() const { + return TTIImpl->supportInterleaveAccess(); +} + +bool TargetTransformInfo::isLegalInterleaveType(Type *VecType) const { + return TTIImpl->isLegalInterleaveType(VecType); +} + +bool TargetTransformInfo::isLegalInterleaveNum(unsigned NumVec) const { + return TTIImpl->isLegalInterleaveNum(NumVec); +} + int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, @@ -232,6 +244,14 @@ } unsigned +TargetTransformInfo::getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + return TTIImpl->getInterleaveMemoryOpCost(Opcode, Src, Alignment, + AddressSpace); +} + +unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { return TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys); Index: lib/IR/IRBuilder.cpp =================================================================== --- lib/IR/IRBuilder.cpp +++ lib/IR/IRBuilder.cpp @@ -230,6 +230,31 @@ return createCallHelper(TheFn, Ops, this, Name); } +CallInst *IRBuilderBase::CreateIndexLoad(Type *RetValTy, Value *Ptr, + Constant *Indices, unsigned Align, + const Twine &Name) { + assert(isa(RetValTy) && "expected a vector value"); + assert(isa(Ptr->getType()) && "expected a pointer value"); + + Value *Ops[] = {Ptr, Indices, getInt32(Align)}; + Module *M = BB->getParent()->getParent(); + Intrinsic::ID Id = Intrinsic::index_load; + Value *TheFn = Intrinsic::getDeclaration(M, Id, RetValTy); + return createCallHelper(TheFn, Ops, this, Name); +} + +CallInst *IRBuilderBase::CreateIndexStore(Value *VecVal, Value *Ptr, + Constant *Indices, unsigned Align) { + assert(isa(VecVal->getType()) && "expected a vector value"); + assert(isa(Ptr->getType()) && "expected a pointer value"); + + Value *Ops[] = {VecVal, Ptr, Indices, getInt32(Align)}; + Module *M = BB->getParent()->getParent(); + Intrinsic::ID Id = Intrinsic::index_store; + Value *TheFn = Intrinsic::getDeclaration(M, Id, VecVal->getType()); + return createCallHelper(TheFn, Ops, this); +} + CallInst *IRBuilderBase::CreateGCStatepoint(Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, Index: lib/Target/AArch64/AArch64.h =================================================================== --- lib/Target/AArch64/AArch64.h +++ lib/Target/AArch64/AArch64.h @@ -38,6 +38,7 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); +FunctionPass *createAArch64InterleaveAccessPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); Index: 
lib/Target/AArch64/AArch64InterleaveAccess.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64InterleaveAccess.cpp
@@ -0,0 +1,286 @@
+//=---------------------- AArch64InterleaveAccess.cpp ---------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64InterleaveAccess pass, which matches
+// interleave access intrinsics to the AArch64 ldN/stN intrinsics.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-interleave-access"
+
+namespace llvm {
+void initializeAArch64InterleaveAccessPass(PassRegistry &);
+}
+
+namespace {
+class AArch64InterleaveAccess : public FunctionPass {
+
+public:
+  static char ID;
+  AArch64InterleaveAccess() : FunctionPass(ID) {
+    initializeAArch64InterleaveAccessPass(*PassRegistry::getPassRegistry());
+  }
+
+  const char *getPassName() const override {
+    return "AArch64 Interleave Access";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  /// Iterate over the basic blocks of the function and match index
+  /// load/store intrinsics with interleaved indices to the AArch64 ldN/stN
+  /// intrinsics.
+  bool runOnFunction(Function &F) override;
+
+private:
+  TargetTransformInfo *TTI;
+  const DataLayout *DL;
+  Module *M;
+
+  bool castInterleaveIntrinsics(Function::iterator B);
+  bool isInterleaveIndices(Value *IdxOp, unsigned &NumVec, unsigned &NumElts);
+};
+} // end anonymous namespace.
+
+char AArch64InterleaveAccess::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64InterleaveAccess, "aarch64-interleave-access",
+                      "AArch64 Interleave Access Pass", false, false)
+INITIALIZE_PASS_END(AArch64InterleaveAccess, "aarch64-interleave-access",
+                    "AArch64 Interleave Access Pass", false, false)
+
+FunctionPass *llvm::createAArch64InterleaveAccessPass() {
+  return new AArch64InterleaveAccess();
+}
+
+static unsigned getIntrinsicID(unsigned Num, bool IsLoad) {
+  static unsigned LoadInt[3] = {Intrinsic::aarch64_neon_ld2,
+                                Intrinsic::aarch64_neon_ld3,
+                                Intrinsic::aarch64_neon_ld4};
+  static unsigned StoreInt[3] = {Intrinsic::aarch64_neon_st2,
+                                 Intrinsic::aarch64_neon_st3,
+                                 Intrinsic::aarch64_neon_st4};
+
+  if (IsLoad)
+    return LoadInt[Num - 2];
+  else
+    return StoreInt[Num - 2];
+}
+
+// Return true if the given indices list is interleaved; also return the
+// number of vectors and the number of vector elements.
+// E.g. <0, 2, 4, 6, 1, 3, 5, 7>. NumVec is 2 and NumElts is 4.
+//      <0, 3, ..., 9, 1, 4, ..., 10, 2, 5, ..., 11>. NumVec is 3 and NumElts
+//      is 4.
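+// An indices vector such as <0, 1, 2, 3>, in contrast, is rejected: its
+// second element implies an interleave stride of 1, which is not a legal
+// interleave number.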
+bool AArch64InterleaveAccess::isInterleaveIndices(Value *IdxOp, + unsigned &NumVec, + unsigned &NumElts) { + const Constant *CIdx = dyn_cast(IdxOp); + assert(CIdx && (isa(CIdx) || isa(CIdx)) && + "expect indices to be constant vector"); + + unsigned NumOp = CIdx->getType()->getVectorNumElements(); + if (NumOp <= 2) + return false; + + Constant *COp0 = CIdx->getAggregateElement(0U); + Constant *COp1 = CIdx->getAggregateElement(1); + auto *CI0 = dyn_cast(COp0); + auto *CI1 = dyn_cast(COp1); + if (!CI0 || !CI1 || !CI0->isZero()) + return false; + + // Interleave Stride. + NumVec = CI1->getZExtValue(); + if (!TTI->isLegalInterleaveNum(NumVec)) + return false; + + NumElts = NumOp / NumVec; + // The indices should match: 0, NumVec, 2*NumVec, ..., 1, NumVec + 1, ... + for (unsigned i = 0; i < NumVec; i++) { + for (unsigned j = 0; j < NumElts; j++) { + Constant *COp = CIdx->getAggregateElement(j + i * NumElts); + auto *CI = dyn_cast(COp); + if (!CI || CI->getZExtValue() != j * NumVec + i) + return false; + } + } + + return true; +} + +// Get a indices vector starts with sequential constant and ends with UNDEFs: +// Idx, Idx + 1, Idx + 2, ..., Idx + Num - 1, undef, undef, ..., undef. +static Constant *getSequentialIndices(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Indices; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Indices.push_back(Builder.getInt32(i)); + if (NumUndef == 0) + return ConstantVector::get(Indices); + + Constant *UndefVal = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Indices.push_back(UndefVal); + return ConstantVector::get(Indices); +} + +// Concatenate two vectors with the same element type. The first vector has +// not less elements as the second vector. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialIndices(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 to be the same vector type as V1 with undef elements. + Constant *ExtMask = + getSequentialIndices(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *UndefV = UndefValue::get(VecTy2); + Value *ExtV2 = Builder.CreateShuffleVector(V2, UndefV, ExtMask); + + Constant *Mask = getSequentialIndices(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV2, Mask); +} + +// Concatenate NumVec of vectors in VecList start at Idx +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &VecList, unsigned Idx, + unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return VecList[Idx]; + + unsigned NumPart = PowerOf2Floor(NumVec); + // When the number of vectors is not power of 2, split the list and make sure + // the first list has power of 2 elements. + // E.g. To concate 7 vectors, split into two lists with 4 and 3 elements. 
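+  // If the count is already a power of 2 (e.g. 4 vectors), halve NumPart so
+  // the list is split evenly (2 and 2).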
+ if (NumPart == NumVec) + NumPart /= 2; + Value *V0 = ConcateVectors(Builder, VecList, Idx, NumPart); + unsigned NextIdx = Idx + NumPart; + Value *V1 = ConcateVectors(Builder, VecList, NextIdx, NumVec - NumPart); + return ConcateTwoVectors(Builder, V0, V1); +} + +// Match index load/store intrinsic to the AArch64 ldN/stN intrinsic. +// E.g. %idxvec = call @index.load(i8 *%ptr, <0, 2, 4, 6, 1, 3, 5, 7>) +// To +// %ldN = call {<4 x i32>, <4 x i32>} @llvm.aarch64.ld2(%newptr) +// %V0 = extract {<4 x i32>, <4 x i32>} %ldN, 0 +// %V1 = extract {<4 x i32>, <4 x i32>} %ldN, 1 +// %idxvec = shufflevector %V0, %V1, <8 x i32> <0, 1, 2, 3, 4, 5, 6, 7> +bool AArch64InterleaveAccess::castInterleaveIntrinsics(Function::iterator BB) { + bool Changed = false; + IRBuilder<> Builder(BB); + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE;) { + CallInst *CI = dyn_cast(I++); + if (!CI || !isa(CI)) + continue; + + unsigned Intrinsic = CI->getCalledFunction()->getIntrinsicID(); + if (Intrinsic != Intrinsic::index_load && + Intrinsic != Intrinsic::index_store) + continue; + + bool IsLoad = (Intrinsic == Intrinsic::index_load); + unsigned PtrIdx = IsLoad ? 0 : 1; + Value *IdxOp = CI->getArgOperand(PtrIdx + 1); + unsigned NumVec, NumElts; + if (!isInterleaveIndices(IdxOp, NumVec, NumElts)) + continue; + + Value *Ptr = CI->getArgOperand(PtrIdx); + // Wide vector type of the index.load/index.store + VectorType *WideVecTy; + if (IsLoad) + WideVecTy = dyn_cast(CI->getType()); + else + WideVecTy = dyn_cast(CI->getArgOperand(0)->getType()); + // Type for each vector + VectorType *VecTy = VectorType::get(WideVecTy->getElementType(), NumElts); + if (!TTI->isLegalInterleaveType(VecTy)) + continue; + + Changed = true; + Intrinsic::ID Id = (Intrinsic::ID)(getIntrinsicID(NumVec, IsLoad)); + unsigned AddressSpace = + cast(Ptr->getType())->getPointerAddressSpace(); + Type *Tys[2] = {VecTy, VecTy->getPointerTo(AddressSpace)}; + Value *Callee = Intrinsic::getDeclaration(M, Id, Tys); + + Builder.SetInsertPoint(CI); + Value *NewPtr = + Builder.CreateBitCast(Ptr, VecTy->getPointerTo(AddressSpace)); + if (IsLoad) { + CallInst *CallI = Builder.CreateCall(Callee, NewPtr, "ldN"); + SmallVector Vectors; + for (unsigned i = 0; i < NumVec; i++) + Vectors.push_back(Builder.CreateExtractValue(CallI, i)); + + // Concatenate apart vectors from the new ldN intrinsic call to a wide + // vector. + Value *LargeVec = ConcateVectors(Builder, Vectors, 0, NumVec); + CI->replaceAllUsesWith(LargeVec); + CI->eraseFromParent(); + } else { + Value *IdxVec = CI->getArgOperand(0); + Value *UndefV = UndefValue::get(WideVecTy); + + // Split the wide vector of index store to apart vectors for the new + // stN intrinsic call. 
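+      // E.g. for st2 with <4 x i32> parts, the stored <8 x i32> value is
+      // split with the masks <0, 1, 2, 3> and <4, 5, 6, 7>.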
+ SmallVector Ops; + for (unsigned i = 0; i < NumVec; i++) { + Constant *Mask = getSequentialIndices(Builder, NumElts * i, NumElts, 0); + Ops.push_back(Builder.CreateShuffleVector(IdxVec, UndefV, Mask)); + } + Ops.push_back(NewPtr); + Builder.CreateCall(Callee, Ops); + CI->eraseFromParent(); + } + } + + return Changed; +} + +bool AArch64InterleaveAccess::runOnFunction(Function &F) { + DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n"); + TTI = &getAnalysis().getTTI(F); + M = F.getParent(); + DL = &M->getDataLayout(); + + bool Changed = false; + for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) + Changed |= castInterleaveIntrinsics(B); + return Changed; +} Index: lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetMachine.cpp +++ lib/Target/AArch64/AArch64TargetMachine.cpp @@ -214,6 +214,9 @@ // ourselves. addPass(createAtomicExpandPass(TM)); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64InterleaveAccessPass()); + // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. We can exploit existing control-flow in // ldrex/strex loops to simplify this, but it needs tidying up. Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -139,6 +139,11 @@ bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); + unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, unsigned AddressSpace); + bool supportInterleaveAccess(); + bool isLegalInterleaveType(Type *DataTy); + bool isLegalInterleaveNum(unsigned NumVec); /// @} }; Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -407,6 +407,28 @@ return LT.first; } +unsigned AArch64TTIImpl::getInterleaveMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, + unsigned AddressSpace) { + if (!isLegalInterleaveType(SrcTy)) + // To calculate scalar take the regular cost, without mask + return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); + return 1; +} + +bool AArch64TTIImpl::isLegalInterleaveNum(unsigned NumVec) { + return NumVec > 1 && NumVec < 5; +} + +bool AArch64TTIImpl::supportInterleaveAccess() { return true; } + +bool AArch64TTIImpl::isLegalInterleaveType(Type *VecType) { + VectorType *VecTy = dyn_cast(VecType); + if (!VecTy) + return false; + return isTypeLegal(VecTy); +} + unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { unsigned Cost = 0; for (auto *I : Tys) { Index: lib/Target/AArch64/CMakeLists.txt =================================================================== --- lib/Target/AArch64/CMakeLists.txt +++ lib/Target/AArch64/CMakeLists.txt @@ -32,6 +32,7 @@ AArch64ISelDAGToDAG.cpp AArch64ISelLowering.cpp AArch64InstrInfo.cpp + AArch64InterleaveAccess.cpp AArch64LoadStoreOptimizer.cpp AArch64MCInstLower.cpp AArch64PromoteConstant.cpp Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -351,6 +351,8 @@ /// broadcast them into a vector. 
VectorParts &getVectorValue(Value *V); + void vectorizeInterleaveAccesses(Instruction *Instr, Value *Ptr, Type *VecTy); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -568,6 +570,14 @@ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} + ~LoopVectorizationLegality() { + SmallSet DeleteSet; + for (auto &I : InterleaveAccessMap) + DeleteSet.insert(dyn_cast(I.second)); + for (auto *Ptr : DeleteSet) + delete Ptr; + } + /// This enum represents the kinds of reductions that we support. enum ReductionKind { RK_NoReduction, ///< Not a reduction. @@ -702,6 +712,49 @@ ConstantInt *StepValue; }; + // An interleave access list contains interleaved accesses. + // E.g. for (i = 0; i < N; i++) { + // even = A[i*2] + // odd = A[i*2 + 1] + // ... // operations on even/odd + // } + // The InterleaveAccessList consists of two access: load even, load odd. + struct InterleaveAccessList { + InterleaveAccessList(int Stride, bool IsLoad) + : Stride(Stride), IsLoad(IsLoad) {} + InterleaveAccessList() : Stride(0) {} + + unsigned getIndex(Instruction *Access) { + unsigned Num = AccessList.size(); + for (unsigned i = 0; i < Num; i++) + if (AccessList[i] == Access) + return i; + return Num; + } + + bool isInsertPos(Instruction *Access) { return Access == InsertPos; } + + int Stride; + bool IsLoad; + unsigned Align; + // The interleaved access instruction list. + SmallVector AccessList; + // To avoid breaking the dependences of each access, the newly creately + // interleave access should be insert at the position of: + // (1) The first load access. + // or + // (2) The last store access. + // + // E.g. %even = load i32 // Insert Position + // %add = add i32 %even // Use of %even + // %odd = load i32 + // + // store i32 %even + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position + Instruction *InsertPos; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. typedef DenseMap ReductionList; @@ -796,6 +849,15 @@ unsigned getNumPredStores() const { return NumPredStores; } + + bool isLegalInterleaveAccess(Instruction *I, Type *VecTy) { + return InterleaveAccessMap.count(I) && TTI->isLegalInterleaveType(VecTy); + } + + InterleaveAccessList *getInterleaveAccessList(Instruction *I) { + return InterleaveAccessMap[I]; + } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -844,6 +906,17 @@ /// invariant. void collectStridedAccess(Value *LoadOrStoreInst); + /// Collect constant strided memory access. + void collectConstStridedAccess(MapVector &Strides); + + /// Analyze interleave accesses. + void analyzeInterleaveAccesses(); + + /// For a queue of accesses with the same kind, stride and base pointer, + /// find interleaved accesses. + void collectInterleaveAccesses(SmallVector &StrideQueue, + int Stride); + /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. These are handled as LoopAccessReport rather than /// VectorizationReport because the << operator of VectorizationReport returns @@ -902,6 +975,9 @@ /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; + + /// Interleave memory data. 
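+  /// Each interleaved access maps to the InterleaveAccessList shared by its
+  /// whole group; the destructor therefore collects the lists into a set
+  /// before deleting them.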
+ DenseMap InterleaveAccessMap; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1737,6 +1813,174 @@ "reverse"); } +// Create the interleaved indices vector: +// 0, Num, Num*2, ..., 1, Num + 1, Num*2 + 1, ... +// E.g. For 2 interleaved vectors of <4 x i32> type, the indices should be: +// 0, 2, 4, 6, 1, 3, 5, 7 +static Constant *getInterleaveIndices(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Indices; + for (unsigned i = 0; i < NumVec; i++) + for (unsigned j = 0; j < VF; j++) + Indices.push_back(Builder.getInt32(j * NumVec + i)); + return ConstantVector::get(Indices); +} + +// Get a indices vector starts with sequential constant and ends with UNDEFs: +// Idx, Idx + 1, Idx + 2, ..., Idx + Num - 1, undef, undef, ..., undef. +static Constant *getSequentialIndices(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Indices; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Indices.push_back(Builder.getInt32(i)); + if (NumUndef == 0) + return ConstantVector::get(Indices); + + Constant *UndefVal = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Indices.push_back(UndefVal); + return ConstantVector::get(Indices); +} + +// Concatenate two vectors with the same element type. The first vector has +// not less elements as the second vector. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialIndices(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 to be the same vector type as V1 with undef elements. + Constant *ExtMask = + getSequentialIndices(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *UndefV = UndefValue::get(VecTy2); + Value *ExtV2 = Builder.CreateShuffleVector(V2, UndefV, ExtMask); + + Constant *Mask = getSequentialIndices(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV2, Mask); +} + +// Concatenate NumVec of vectors in VecList start at Idx +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &VecList, unsigned Idx, + unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return VecList[Idx]; + + unsigned NumPart = PowerOf2Floor(NumVec); + // When the number of vectors is not power of 2, split the list and make sure + // the first list has power of 2 elements. + // E.g. To concate 7 vectors, split into two lists with 4 and 3 elements. + if (NumPart == NumVec) + NumPart /= 2; + Value *V0 = ConcateVectors(Builder, VecList, Idx, NumPart); + unsigned NextIdx = Idx + NumPart; + Value *V1 = ConcateVectors(Builder, VecList, NextIdx, NumVec - NumPart); + return ConcateTwoVectors(Builder, V0, V1); +} + +// Create the vectorized instruction for interleave access. +// E.g. 
If VF is 4, transform: +// %S1 = load i32* %ptr1 +// %S2 = load i32* %ptr2 +// To +// %IdxV = call <8 x i32> @index.load(i8* %ptr, <0, 2, 4, 6, 1, 3, 5, 7>) +// %V1 = shufflevector <8 x i32> %IdxV, <8 x i32> undef, <0, 1, 2, 3> +// %V2 = shufflevector <8 x i32> %IdxV, <8 x i32> undef, <4, 5, 6, 7> +void InnerLoopVectorizer::vectorizeInterleaveAccesses(Instruction *Instr, + Value *Ptr, + Type *DataTy) { + VectorType *VecTy = dyn_cast(DataTy); + assert(VecTy && "expect a vector type"); + + LoopVectorizationLegality::InterleaveAccessList *IA = + Legal->getInterleaveAccessList(Instr); + assert(IA->InsertPos == Instr && "unexpected insert point"); + + unsigned Idx = IA->getIndex(Instr); + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + Type *I8PtrTy = Builder.getInt8PtrTy(AddressSpace); + Constant *Zero = Builder.getInt32(0); + + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + for (unsigned Part = 0; Part < UF; Part++) { + // Adjust pointer to the index 0 access. + Constant *Offset = Builder.getInt32(-static_cast(Idx)); + Value *NewPtr = Builder.CreateExtractElement(PtrParts[Part], Zero); + NewPtr = Builder.CreateGEP(NewPtr, Offset); + NewPtr = Builder.CreateBitCast(NewPtr, I8PtrTy); + NewPtrs.push_back(NewPtr); + } + + SmallVector &AccessList = IA->AccessList; + unsigned Align = IA->Align; + unsigned NumVec = AccessList.size(); + Type *EleTy = VecTy->getElementType(); + Type *WideVecTy = VectorType::get(EleTy, NumVec * VF); + bool IsElePtr = EleTy->isPointerTy(); + Type *IntPtrVecTy; + if (IsElePtr) { + const DataLayout &DL = Instr->getModule()->getDataLayout(); + Type *IntPtrTy = DL.getIntPtrType(EleTy); + IntPtrVecTy = VectorType::get(IntPtrTy, NumVec * VF); + } + + setDebugLocFromInst(Builder, Instr); + if (IA->IsLoad) { + for (unsigned Part = 0; Part < UF; Part++) { + // If the element is pointer, call index.load with an integer vector type + // and cast the result to pointer vector. + Type *RetVecTy = IsElePtr ? IntPtrVecTy : WideVecTy; + + // Call index.load to read data from memory, with deinterleaving. + Constant *Mask = getInterleaveIndices(Builder, VF, NumVec); + Value *IdxVec = Builder.CreateIndexLoad(RetVecTy, NewPtrs[Part], Mask, + Align, "interleave.load"); + propagateMetadata(dyn_cast(IdxVec), Instr); + if (IsElePtr) + IdxVec = Builder.CreateIntToPtr(IdxVec, WideVecTy); + + // Split the large index load vector into small vectors for apart load. + Value *UndefV = UndefValue::get(WideVecTy); + for (unsigned i = 0; i < NumVec; i++) { + VectorParts &Entry = WidenMap.get(AccessList[i]); + Constant *Indices = getSequentialIndices(Builder, VF * i, VF, 0); + Entry[Part] = Builder.CreateShuffleVector(IdxVec, UndefV, Indices); + } + } + return; + } + + for (unsigned Part = 0; Part < UF; Part++) { + SmallVector StoredVal; + for (unsigned i = 0; i < NumVec; i++) + StoredVal.push_back(getVectorValue(AccessList[i]->getOperand(0))[Part]); + // Concatenate the apart vectors into a large vector for index.store. + Value *IdxVec = ConcateVectors(Builder, StoredVal, 0, NumVec); + + // If the element is pointer, cast the pointer vector to integer pointer. + if (IsElePtr) + IdxVec = Builder.CreatePtrToInt(IdxVec, IntPtrVecTy); + + // Call index.store write data to memory, with reinterleaving. 
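+    // E.g. with VF = 4 and two interleaved stores, the indices are
+    // <0, 2, 4, 6, 1, 3, 5, 7>: lanes of the first part are written to the
+    // even offsets and lanes of the second part to the odd offsets.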
+ Constant *Mask = getInterleaveIndices(Builder, VF, NumVec); + CallInst *CI = Builder.CreateIndexStore(IdxVec, NewPtrs[Part], Mask, Align); + propagateMetadata(CI, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1764,6 +2008,15 @@ if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); + if (Legal->isLegalInterleaveAccess(Instr, DataTy)) { + LoopVectorizationLegality::InterleaveAccessList *IA = + Legal->getInterleaveAccessList(Instr); + // Skip the access that is not the insert position. + if (!IA->isInsertPos(Instr)) + return; + return vectorizeInterleaveAccesses(Instr, Ptr, DataTy); + } + // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -1859,7 +2112,7 @@ if (Legal->isMaskRequired(SI)) NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, Mask[Part]); - else + else NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); propagateMetadata(NewSI, SI); } @@ -3570,6 +3823,9 @@ // Collect all of the variables that remain uniform after vectorization. collectLoopUniforms(); + if (TTI->supportInterleaveAccess()) + analyzeInterleaveAccesses(); + DEBUG(dbgs() << "LV: We can vectorize this loop" << (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" : "") @@ -3581,6 +3837,313 @@ return true; } +static bool isInBoundsGep(Value *Ptr) { + if (GetElementPtrInst *GEP = dyn_cast(Ptr)) + return GEP->isInBounds(); + return false; +} + +/// \brief Check whether the access through \p Ptr has a constant stride. +static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const SCEV *&PtrScev) { + const Type *Ty = Ptr->getType(); + assert(Ty->isPointerTy() && "Unexpected non-ptr"); + const PointerType *PtrTy = cast(Ty); + + PtrScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(PtrScev); + // The pointer must be an AddRecExpr pointer and the accesss function must + // stride over the innermost loop. + if (!AR || Lp != AR->getLoop()) + return 0; + + // The address calculation must not wrap. Otherwise, a dependence could be + // inverted. + // An inbounds getelementptr that is a AddRec with a unit stride + // cannot wrap per definition. The unit stride requirement is checked later. + // An getelementptr without an inbounds attribute and unit stride would have + // to access the pointer value "0" which is undefined behavior in address + // space 0, therefore we can also vectorize this case. + bool IsInBoundsGEP = isInBoundsGep(Ptr); + bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); + bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; + if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) + return 0; + + // Check the step is constant. + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast(Step); + if (!C) + return 0; + + auto &DL = Lp->getHeader()->getModule()->getDataLayout(); + int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. + if (APStepVal.getBitWidth() > 64) + return 0; + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. 
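+  // E.g. accessing i32 elements as A[3*i] gives StepVal = 12 and Size = 4,
+  // i.e. a stride of 3.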
+ int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) + return 0; + + // If the SCEV could wrap but we have an inbounds gep with a unit stride we + // know we can't "wrap around the address space". In case of address space + // zero we know that this won't happen without triggering undefined behavior. + if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && + Stride != 1 && Stride != -1) + return 0; + + return Stride; +} + +static Value *getPointerOperand(Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return nullptr; +} + +void LoopVectorizationLegality::collectConstStridedAccess( + MapVector &StridedAccesses) { + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Value *Ptr = getPointerOperand(it); + if (!Ptr) + continue; + + const SCEV *PtrScev; + int Stride = isStridedPtr(SE, Ptr, TheLoop, PtrScev); + if (Stride < 2 && Stride > -2) + continue; + + DEBUG(dbgs() << "LV: Found a constant strided access\n"); + DEBUG(dbgs() << " pointer:" << *Ptr << "\n"); + DEBUG(dbgs() << " Stride:" << Stride << "\n"); + StridedAccesses[it] = Stride; + } + } +} + +static unsigned getAddressSpaceOperand(Instruction *I) { + if (LoadInst *L = dyn_cast(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast(I)) + return S->getPointerAddressSpace(); + return -1; +} + +static Value *getBasePointer(Instruction *I, const DataLayout &DL) { + Value *Ptr = getPointerOperand(I); + assert(Ptr && "Fail to get point from a memory instruction"); + return GetUnderlyingObject(Ptr, DL); +} + +static bool isConsecutiveAccess(Instruction *A, Instruction *B, + ScalarEvolution *SE, const DataLayout &DL) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); + Type *Ty = cast(PtrA->getType())->getElementType(); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); + + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + + APInt OffsetDelta = OffsetB - OffsetA; + + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; + + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; + + // Otherwise compute the distance with SCEV between the base pointers. 
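+  // E.g. two i32 loads from A[2*i] and A[2*i+1] have SCEVs that differ by
+  // exactly the 4-byte element size, so they are treated as consecutive.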
+ const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(BaseDelta); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} + +void LoopVectorizationLegality::collectInterleaveAccesses( + SmallVector &StrideQueue, int Stride) { + if (StrideQueue.size() < 2) { + StrideQueue.clear(); + return; + } + + SetVector Heads; + SetVector Tails; + SmallDenseMap ConsecutiveChain; + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (unsigned i = 0, e = StrideQueue.size(); i != e; i++) { + for (unsigned j = 0; j != e; j++) { + if (i == j) + continue; + if (isConsecutiveAccess(StrideQueue[i], StrideQueue[j], SE, DL)) { + Heads.insert(StrideQueue[i]); + Tails.insert(StrideQueue[j]); + ConsecutiveChain[StrideQueue[i]] = StrideQueue[j]; + } + } + } + + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + SmallSetVector Operands; + Instruction *I = *it; + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + Operands.insert(I); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + unsigned Size = Operands.size(); + unsigned NumVec = std::abs(Stride); + if (Size < NumVec) + continue; + + // If Size >= NumVec, there may be more than 1 interleave access. + for (unsigned Part = 0; Part < Size / NumVec; Part++) { + bool IsLoad = StrideQueue[0]->mayReadFromMemory(); + InterleaveAccessList *ID = new InterleaveAccessList(Stride, IsLoad); + unsigned MaxAlign = 1; + for (unsigned i = Part * NumVec; i < Part * NumVec + NumVec; i++) { + Instruction *Access = Operands[i]; + ID->AccessList.push_back(Access); + InterleaveAccessMap[Access] = ID; + + unsigned Align = IsLoad ? dyn_cast(Access)->getAlignment() + : dyn_cast(Access)->getAlignment(); + if (Align > MaxAlign) + MaxAlign = Align; + } + ID->Align = MaxAlign; + + if (IsLoad) { // Insert point is the first load access. + for (unsigned i = 0, e = StrideQueue.size(); i != e; i++) { + if (Operands.count(StrideQueue[i])) { + ID->InsertPos = StrideQueue[i]; + break; + } + } + } else { // Insert point is the last store access. + for (int i = StrideQueue.size() - 1, e = -1; i != e; i--) { + if (Operands.count(StrideQueue[i])) { + ID->InsertPos = StrideQueue[i]; + break; + } + } + } + + DEBUG(dbgs() << "LV: Found interleaved accesses with Stride " << Stride + << "\n"); + } + } + + StrideQueue.clear(); + return; +} + +void LoopVectorizationLegality::analyzeInterleaveAccesses() { + MapVector StridedAccesses; + collectConstStridedAccess(StridedAccesses); + + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + // Queue for the accesses of the same kind, Stride and Base pointer. + SmallVector StrideQueue; + bool TryMerge = false; + Value *CurBase = nullptr; + int CurStride; + bool IsLoad = false; + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + if (TryMerge) { + collectInterleaveAccesses(StrideQueue, CurStride); + CurBase = nullptr; + TryMerge = false; + } + + if (!it->mayReadOrWriteMemory()) + continue; + + // Break. Merge the queue. 
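+      // An access without a constant stride ends the current group; try to
+      // form interleave groups from the accesses queued so far.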
+ if (!StridedAccesses.count(it)) { + TryMerge = true; + continue; + } + + Value *Base = getBasePointer(it, DL); + int Stride = StridedAccesses[it]; + bool Read = it->mayReadFromMemory(); + + if (CurBase == nullptr) { + // Skip the illegal number of interleaved access. + if (!TTI->isLegalInterleaveNum(std::abs(Stride))) + continue; + + StrideQueue.push_back(it); + CurBase = Base; + CurStride = Stride; + IsLoad = Read; + continue; + } + + // This instructure doesn't fit the current queue. Try to merge current + // queue and insert it again. + if (CurBase != Base || CurStride != Stride || IsLoad != Read) { + TryMerge = true; + --it; + continue; + } + + StrideQueue.push_back(it); + } + + if (StrideQueue.size() > 1) + collectInterleaveAccesses(StrideQueue, CurStride); + } +} + static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -5103,7 +5666,9 @@ const DataLayout &DL = I->getModule()->getDataLayout(); unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; - if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { + bool IsInterleave = Legal->isLegalInterleaveAccess(I, VectorTy); + if ((!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) && + !IsInterleave) { bool IsComplexComputation = isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); unsigned Cost = 0; @@ -5129,11 +5694,16 @@ // Wide load/stores. unsigned Cost = TTI.getAddressComputationCost(VectorTy); - if (Legal->isMaskRequired(I)) + if (Legal->isMaskRequired(I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - else + } else if (IsInterleave) { + Alignment = Legal->getInterleaveAccessList(I)->Align; + Cost += TTI.getInterleaveMemoryOpCost(I->getOpcode(), VectorTy, Alignment, + AS); + } else { Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + } if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, Index: test/CodeGen/AArch64/interleaved-access-to-ldN-stN.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/interleaved-access-to-ldN-stN.ll @@ -0,0 +1,73 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnueabi" + + +; Make sure the intrinsic about 2 interleaved vectors can be matched +; CHECK-LABEL: test_ld2_st2: +; CHECK: ld2 +; CHECK: st2 + +define void @test_ld2_st2(i8* %ptr) { +entry: + %interleave.load = call <8 x i32> @llvm.index.load.v8i32(i8* %ptr, <8 x i32> , i32 4) + %0 = shufflevector <8 x i32> %interleave.load, <8 x i32> undef, <4 x i32> + %1 = shufflevector <8 x i32> %interleave.load, <8 x i32> undef, <4 x i32> + %2 = add nsw <4 x i32> %0, + %3 = add nsw <4 x i32> %1, + %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> + call void @llvm.index.store.v8i32(<8 x i32> %4, i8* %ptr, <8 x i32> , i32 4) + ret void +} + +; Make sure the intrinsic about 3 interleaved vectors can be matched +; CHECK-LABEL: test_ld3_st3: +; CHECK: ld3 +; CHECK: st3 + +define void @test_ld3_st3(i8* %ptr) { +entry: + %interleave.load = call <12 x i32> @llvm.index.load.v12i32(i8* %ptr, <12 x i32> , i32 4) + %0 = shufflevector <12 x i32> %interleave.load, <12 x i32> undef, <4 x i32> + %1 = shufflevector <12 x i32> %interleave.load, <12 x i32> undef, <4 x i32> + %2 = shufflevector <12 x i32> %interleave.load, <12 x i32> undef, <4 x i32> + %3 = add nsw <4 x i32> %0, + %4 = 
add nsw <4 x i32> %1, + %5 = add nsw <4 x i32> %2, + %6 = shufflevector <4 x i32> %3, <4 x i32> %4, <8 x i32> + %7 = shufflevector <4 x i32> %5, <4 x i32> undef, <8 x i32> + %8 = shufflevector <8 x i32> %6, <8 x i32> %7, <12 x i32> + call void @llvm.index.store.v12i32(<12 x i32> %8, i8* %ptr, <12 x i32> , i32 4) + ret void +} + +; Make sure the intrinsic about 3 interleaved vectors can be matched +; CHECK-LABEL: test_ld4_st4: +; CHECK: ld4 +; CHECK: st4 + +define void @test_ld4_st4(i8* %ptr) { +entry: + %interleave.load = call <16 x i32> @llvm.index.load.v16i32(i8* %ptr, <16 x i32> , i32 4) + %0 = shufflevector <16 x i32> %interleave.load, <16 x i32> undef, <4 x i32> + %1 = shufflevector <16 x i32> %interleave.load, <16 x i32> undef, <4 x i32> + %2 = shufflevector <16 x i32> %interleave.load, <16 x i32> undef, <4 x i32> + %3 = shufflevector <16 x i32> %interleave.load, <16 x i32> undef, <4 x i32> + %4 = add nsw <4 x i32> %0, + %5 = add nsw <4 x i32> %1, + %6 = add nsw <4 x i32> %2, + %7 = add nsw <4 x i32> %3, + %8 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> + %9 = shufflevector <4 x i32> %6, <4 x i32> %7, <8 x i32> + %10 = shufflevector <8 x i32> %8, <8 x i32> %9, <16 x i32> + call void @llvm.index.store.v16i32(<16 x i32> %10, i8* %ptr, <16 x i32> , i32 4) + ret void +} + +declare <8 x i32> @llvm.index.load.v8i32(i8*, <8 x i32>, i32) +declare void @llvm.index.store.v8i32(<8 x i32>, i8*, <8 x i32>, i32) +declare <12 x i32> @llvm.index.load.v12i32(i8*, <12 x i32>, i32) +declare void @llvm.index.store.v12i32(<12 x i32>, i8*, <12 x i32>, i32) +declare <16 x i32> @llvm.index.load.v16i32(i8*, <16 x i32>, i32) +declare void @llvm.index.store.v16i32(<16 x i32>, i8*, <16 x i32>, i32) Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[IL:.*]] = call <8 x i32> @llvm.index.load.v8i32(i8* {{.*}}, <8 x i32> +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: %[[IL1:.*]] = call <8 x i32> @llvm.index.load.v8i32(i8* {{.*}}, <8 x i32> +; CHECK: shufflevector <8 x i32> %[[IL1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[IL1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[IL:.*]] = call <4 x i32> @llvm.index.load.v4i32(i8* {{.*}}, <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[IL]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[IL]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; FORCE-VEC: 
add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/AArch64/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/interleaved-access.ll @@ -0,0 +1,358 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -runtime-memory-check-threshold=24 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 -runtime-memory-check-threshold=24 < %s | FileCheck %s --check-prefix=UNROLL + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnueabi" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; +; int test_struct_load2(struct ST2 *A, int n) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %[[IL:.*]] = call <8 x i32> @llvm.index.load.v8i32(i8* {{.*}}, <8 x i32> , i32 4) +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +; UNROLL-LABEL: @test_struct_load2( +; UNROLL: %[[IL0:.*]] = call <8 x i32> @llvm.index.load.v8i32 +; UNROLL: %[[IL1:.*]] = call <8 x i32> @llvm.index.load.v8i32 + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. 
+; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %[[IL:.*]] = call <8 x i32> @llvm.index.load.v8i32(i8* {{.*}}, <8 x i32> , i32 4) +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %[[VEC:.*]] = shufflevector <4 x i32> {{.*}}, <4 x i32> {{.*}}, <8 x i32> +; CHECK: call void @llvm.index.store.v8i32(<8 x i32> %[[VEC]], i8* {{.*}}, <8 x i32> , i32 4) + +; UNROLL-LABEL: @test_array_load2_store2( +; UNROLL: %[[IL0:.*]] = call <8 x i32> @llvm.index.load.v8i32 +; UNROLL: %[[IL1:.*]] = call <8 x i32> @llvm.index.load.v8i32 +; UNROLL: call void @llvm.index.store.v8i32 +; UNROLL: call void @llvm.index.store.v8i32 + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %[[IL:.*]] = call <12 x i32> @llvm.index.load.v12i32(i8* {{.*}}, <12 x i32> , i32 4) +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +; UNROLL-LABEL: @test_struct_load3( +; UNROLL: call <12 x i32> @llvm.index.load.v12i32 +; UNROLL: call <12 x i32> @llvm.index.load.v12i32 + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. 
+; int A[3072]; +; struct ST S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; T[i].x = X1 + 1; +; T[i].y = X2 + 2; +; T[i].z = X3 + 3; +; } +; } + +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK: %[[IL:.*]] = call <12 x i32> @llvm.index.load.v12i32(i8* {{.*}}, <12 x i32> , i32 4) +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> +; CHECK: shufflevector <8 x i32> {{.*}}, <12 x i32> +; CHECK: call void @llvm.index.store.v12i32(<12 x i32> {{.*}}, i8* {{.*}}, <12 x i32> , i32 4) + +; UNROLL-LABEL: @test_struct_array_load3_store3( +; UNROLL: call <12 x i32> @llvm.index.load.v12i32 +; UNROLL: call <12 x i32> @llvm.index.load.v12i32 +; UNROLL: call void @llvm.index.store.v12i32 +; UNROLL: call void @llvm.index.store.v12i32 + +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %[[IL:.*]] = call <16 x i32> @llvm.index.load.v16i32(i8* {{.*}}, <16 x i32> , i32 4) +; CHECK: shufflevector <16 x i32> %[[IL]], <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %[[IL]], <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %[[IL]], <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %[[IL]], <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +; UNROLL-LABEL: @test_struct_load4( +; UNROLL: call <16 x i32> @llvm.index.load.v16i32 +; UNROLL: call <16 x i32> @llvm.index.load.v16i32 + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: call void @llvm.index.store.v16i32(<16 x i32> {{.*}}, i8* {{.*}}, <16 x i32> , i32 4) + +; UNROLL-LABEL: @test_struct_store4( +; UNROLL: load <4 x i32>, <4 x i32> +; UNROLL: load <4 x i32>, <4 x i32> +; UNROLL: call void @llvm.index.store.v16i32 +; UNROLL: call void @llvm.index.store.v16i32 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], 
[1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +}
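As a usage illustration only (not part of the patch): a minimal sketch of driving the new IRBuilder helper directly, assuming a fixed VF of 4 and two interleaved i32 sequences. The helper name emitEvenOddLoad is made up for the example; it follows the CreateIndexLoad signature added above.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Build an <8 x i32> de-interleaving load through CreateIndexLoad: lanes 0-3
// read offsets 0, 2, 4, 6 and lanes 4-7 read offsets 1, 3, 5, 7.
static Value *emitEvenOddLoad(IRBuilder<> &Builder, Value *Ptr) {
  SmallVector<Constant *, 8> Idx;
  for (unsigned i = 0; i < 2; ++i)
    for (unsigned j = 0; j < 4; ++j)
      Idx.push_back(Builder.getInt32(j * 2 + i));
  Type *WideTy = VectorType::get(Builder.getInt32Ty(), 8);
  Value *I8Ptr = Builder.CreateBitCast(Ptr, Builder.getInt8PtrTy());
  return Builder.CreateIndexLoad(WideTy, I8Ptr, ConstantVector::get(Idx),
                                 /*Align=*/4, "interleave.load");
}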