Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,11 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of a member of the interleaved Load/Store. + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +587,10 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +749,12 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Stride, Alignment, + AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,20 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) { + VectorType *VecType = dyn_cast(VecTy); + assert(VecType && "Expect a vector type"); + + // Cost for the sequential memory access for a vector + unsigned Cost = 1; + // Estimate the interleave cost to be extracting all lanes or inserting all + // lanes. + Cost += VecType->getNumElements(); + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -530,6 +530,7 @@ if (Lp != AR->getLoop()) { DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; } // The address calculation must not wrap. Otherwise, a dependence could be @@ -777,21 +778,29 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. 
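For reference, the default TargetTransformInfoImpl implementation above charges one unit for the wide sequential access plus one lane insert/extract per element. A minimal standalone sketch of that formula (plain C++, hypothetical names, not the LLVM API):

#include <cassert>

// Default interleaved-access cost: one wide sequential memory operation plus
// one extract (for loads) or insert (for stores) per vector lane.
static unsigned defaultInterleavedMemoryOpCost(unsigned NumElements) {
  unsigned Cost = 1;    // the wide load/store itself
  Cost += NumElements;  // extracting or inserting every lane
  return Cost;
}

int main() {
  // An <8 x i32> wide vector accessed with stride 2: 1 + 8 = 9.
  assert(defaultInterleavedMemoryOpCost(8) == 9);
  return 0;
}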
- if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + unsigned NumIterations = ForcedFactor * ForcedUnroll; + // The number of vectorized & interleaved iterations should not be less than 2 + if (NumIterations < 2) + NumIterations = 2; + + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when either of: + // (1) Distance < Stride * TypeByteSize + // (2) Distance >= TypeByteSize * NumIterations * Stride + // is true and + // (3) Distance <= MaxSafeDepDistBytes + // is true. + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIterations * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + if (Distance >= TypeByteSize * NumIterations * Stride) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Stride, unsigned Alignment, + unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Stride, Alignment, + AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Target/AArch64/AArch64.h =================================================================== --- lib/Target/AArch64/AArch64.h +++ lib/Target/AArch64/AArch64.h @@ -38,6 +38,7 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); +FunctionPass *createAArch64ShuffleVectorAndMemAccessOptPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); Index: lib/Target/AArch64/AArch64ShuffleVectorAndMemAccessOpt.cpp =================================================================== --- /dev/null +++ lib/Target/AArch64/AArch64ShuffleVectorAndMemAccessOpt.cpp @@ -0,0 +1,487 @@ +//=---------------- AArch64ShuffleVectorAndMemAccessOpt.cpp ----------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64ShuffleVectorAndMemAccessOpt pass which +// matches: +// (1) A load and an interleaved shufflevector into a ldN intrinsic +// (2) A load and several strided shufflevectors into a ldN intrinsic +// (3) An interleaved shufflevector and a store into a stN intrinsic. 
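The new LoopAccessAnalysis rule above accepts a positive dependence distance when it either stays inside one stride-sized gap or is at least NumIterations strides away, and never exceeds the running maximum safe distance. A standalone sketch of that rule with concrete numbers (hypothetical helper, not the LoopAccessAnalysis API):

#include <cassert>
#include <cstdint>

// Safe positive dependence distance for strided accesses: either the distance
// stays inside one stride gap, or it is at least NumIterations strides away,
// and it never exceeds the current maximum safe distance.
static bool isSafePositiveDistance(uint64_t Distance, uint64_t Stride,
                                   uint64_t TypeByteSize,
                                   uint64_t NumIterations,
                                   uint64_t MaxSafeDepDistBytes) {
  bool InsideOneStride = Distance < Stride * TypeByteSize;
  bool FarEnough = Distance >= TypeByteSize * NumIterations * Stride;
  return (InsideOneStride || FarEnough) && Distance <= MaxSafeDepDistBytes;
}

int main() {
  // i32 accesses (4 bytes), stride 3, VF * UF = 4, generous maximum distance.
  // Distance 4 stays inside one 12-byte stride gap: safe.
  assert(isSafePositiveDistance(4, 3, 4, 4, 1ULL << 20));
  // Distance 24 is neither inside a gap nor >= 4 * 3 * 4 = 48 bytes: unsafe.
  assert(!isSafePositiveDistance(24, 3, 4, 4, 1ULL << 20));
  // Distance 48 is far enough for 4 iterations: safe.
  assert(isSafePositiveDistance(48, 3, 4, 4, 1ULL << 20));
  return 0;
}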
+//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/IR/Module.h" +#include "llvm/Analysis/TargetTransformInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-shuffle-access-opt" + +namespace llvm { +static void initializeAArch64ShuffleVectorAndMemAccessOptPass(PassRegistry &); +} + +namespace { +class AArch64ShuffleVectorAndMemAccessOpt : public FunctionPass { + +public: + static char ID; + AArch64ShuffleVectorAndMemAccessOpt() : FunctionPass(ID) { + initializeAArch64ShuffleVectorAndMemAccessOptPass( + *PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "AArch64 ShuffleVector and Memory Access Optimization Pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + + /// Iterate over the functions and promote the computation of interesting + // sext instructions. + bool runOnFunction(Function &F) override; + +private: + TargetTransformInfo *TTI; + const DataLayout *DL; + Module *M; + + // Check if the mask of the given shufflevecotor is like: + // <0, Stride, Stride*2, ..., Stride*(NumElts-1), 1, Stride+1, ...> + // E.g. For 2 interleaved vectors of <4 x i32> type, the mask is: + // <0, 2, 4, 6, 1, 3, 5, 7> + bool isInterleavedLoadMask(ShuffleVectorInst *SVI, unsigned &Stride, + unsigned &NumElts, VectorType *&VecTy); + // Match a load and a shufflevector with interleaved mask into a ldN + // intrinsic. E.G. Translate following IRs: + // %wide.vec = load <8 x i32>, <8 x i32>* %ptr + // %interleaved.vec = shufflevector <8 x i32> %wide.vec, undef, + // <0, 2, 4, 8, 1, 3, 5, 7> + // into: + // %ld2 = call {<4 x i32>, <4 x i32>} aarch64.neon.ld2(%ptr) + // %v0 = extractvalue %ld2, 0 + // %v1 = extractvalue %ld2, 1 + // %interleaved.vec = shufflevector %v0, %v1, <0, 1, 2, 3, 4, 5, 6, 7> + bool matchInterleavedLoad(ShuffleVectorInst *SVI, unsigned Stride, + unsigned NumElts, VectorType *VecTy); + + // Check if the mask of the given shufflevecotor is like: + // + // E.g. For 2 interleaved vectors of <4 x i32> type, the strided masks are: + // Index 0: <0, 2, 4, 6> + // Index 1: <1, 3, 5, 7> + bool isStridedLoadMask(ShuffleVectorInst *SVI, unsigned &Stride, + unsigned &NumElts, VectorType *&VecTy); + // Match a load and several shufflevectors with strided mask into a ldN + // intrinsic. E.G. Translate following IRs: + // %wide.vec = load <8 x i32>, <8 x i32>* %ptr + // %v0 = shufflevector %wide.vec, undef, <0, 2, 4, 8> + // %v1 = shufflevector %wide.vec, undef, <1, 3, 5, 7> + // into: + // %ld2 = call {<4 x i32>, <4 x i32>} @aarch64.neon.ld2(%ptr) + // %v0 = extractvalue %ld2, 0 + // %v1 = extractvalue %ld2, 1 + bool matchStridedLoad(ShuffleVectorInst *SVI, unsigned Stride, + unsigned NumElts, VectorType *VecTy); + + // Check if the mask of the given shufflevecotor is like: + // <0, NumElts, NumElts*2, ..., NumElts*(Stride-1), 1, NumElts+1, ...> + // E.g. For 2 interleaved vectors of <4 x i32> type, the mask is: + // <0, 4, 1, 5, 2, 6, 3, 7> + bool isInterleavedStoreMask(ShuffleVectorInst *SVI, unsigned &Stride, + unsigned &NumElts, VectorType *&VecTy); + // Match a shufflevector with interleaved mask and a store into a stN + // intrinsic. E.G. 
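The three mask shapes this pass recognizes can be generated directly from Stride and NumElts. A standalone sketch (plain C++, illustrative only):

#include <cstdio>
#include <vector>

// Mask for an interleaved load: <0, S, 2S, ..., (N-1)S, 1, S+1, ...>.
static std::vector<unsigned> interleavedLoadMask(unsigned Stride,
                                                 unsigned NumElts) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < Stride; i++)
    for (unsigned j = 0; j < NumElts; j++)
      Mask.push_back(j * Stride + i);
  return Mask;
}

// Strided mask with start index Idx: <Idx, Idx+S, ..., Idx+(N-1)S>.
static std::vector<unsigned> stridedMask(unsigned Idx, unsigned Stride,
                                         unsigned NumElts) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < NumElts; i++)
    Mask.push_back(Idx + i * Stride);
  return Mask;
}

// Mask for an interleaved store: <0, N, 2N, ..., (S-1)N, 1, N+1, ...>.
static std::vector<unsigned> interleavedStoreMask(unsigned Stride,
                                                  unsigned NumElts) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < NumElts; i++)
    for (unsigned j = 0; j < Stride; j++)
      Mask.push_back(j * NumElts + i);
  return Mask;
}

int main() {
  // Stride 2, 4 elements per member vector:
  //   interleaved load mask : 0 2 4 6 1 3 5 7
  //   strided mask (index 1): 1 3 5 7
  //   interleaved store mask: 0 4 1 5 2 6 3 7
  for (unsigned m : interleavedLoadMask(2, 4)) printf("%u ", m);
  printf("\n");
  for (unsigned m : stridedMask(1, 2, 4)) printf("%u ", m);
  printf("\n");
  for (unsigned m : interleavedStoreMask(2, 4)) printf("%u ", m);
  printf("\n");
  return 0;
}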
Translate following IRs: + // %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, + // <0, 4, 1, 5, 2, 6, 3, 7> + // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr + // into: + // call void @aarch64.neon.ld2(<4 x i32> %v0, <4 x i32> %v1, %ptr) + // + // Or translate following IRs: + // %interleaved.vec = shufflevector <8 x i32> %wide.vec, undef, + // <0, 4, 1, 5, 2, 6, 3, 7> + // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr + // into: + // %v0 = shufflevector <8 x i32> %wide,vec, undef, <0, 1, 2, 3> + // %v1 = shufflevector <8 x i32> %wide,vec, undef, <4, 5, 6, 7> + // call void @aarch64.neon.ld2(<4 x i32> %v0, <4 x i32> %v1, %ptr) + bool matchInterleavedStore(ShuffleVectorInst *SVI, unsigned Stride, + unsigned NumElts, VectorType *VecTy); +}; +} // end anonymous namespace. + +char AArch64ShuffleVectorAndMemAccessOpt::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64ShuffleVectorAndMemAccessOpt, DEBUG_TYPE, + "AArch64 ShuffleVector and Memory Access Opt Pass", false, + false) +INITIALIZE_PASS_END(AArch64ShuffleVectorAndMemAccessOpt, DEBUG_TYPE, + "AArch64 ShuffleVector and Memory Access Opt Pass", false, + false) + +FunctionPass *llvm::createAArch64ShuffleVectorAndMemAccessOptPass() { + return new AArch64ShuffleVectorAndMemAccessOpt(); +} + +// Get a ldN/stN intrinsic addroding to the Stride (which must be 2, 3 or 4). +static unsigned getLdNStNIntrinsicID(unsigned Stride, bool IsLoad) { + static unsigned LoadInt[3] = {Intrinsic::aarch64_neon_ld2, + Intrinsic::aarch64_neon_ld3, + Intrinsic::aarch64_neon_ld4}; + static unsigned StoreInt[3] = {Intrinsic::aarch64_neon_st2, + Intrinsic::aarch64_neon_st3, + Intrinsic::aarch64_neon_st4}; + + if (IsLoad) + return LoadInt[Stride - 2]; + else + return StoreInt[Stride - 2]; +} + +// Concatenate two vectors with the same type. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2, + unsigned NumElts) { + SmallVector Mask; + for (unsigned i = 0; i < NumElts; i++) + Mask.push_back(Builder.getInt32(i)); + + return Builder.CreateShuffleVector(V1, V2, ConstantVector::get(Mask)); +} + +// Concatenate vectors with the same type. +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &List) { + unsigned Size = List.size(); + assert(Size > 1 && Size < 5 && "Invalid number of vectors"); + + Value *V0 = List[0]; + Value *V1 = List[1]; + VectorType *VecTy = dyn_cast(V0->getType()); + unsigned NumElts = VecTy->getNumElements(); + if (Size == 2) + return ConcateTwoVectors(Builder, V0, V1, NumElts * 2); + + Value *V2 = List[2]; + Value *V3 = (Size == 3) ? UndefValue::get(VecTy) : List[3]; + unsigned TotalNumElts = (Size == 3) ? NumElts * 3 : NumElts * 4; + + Value *V0V1 = ConcateTwoVectors(Builder, V0, V1, NumElts * 2); + Value *V2V3 = ConcateTwoVectors(Builder, V2, V3, NumElts * 2); + return ConcateTwoVectors(Builder, V0V1, V2V3, TotalNumElts); +} + +// TODO: Could also consider about undef mask (i.e. 
the negative mask) +bool AArch64ShuffleVectorAndMemAccessOpt::isInterleavedLoadMask( + ShuffleVectorInst *SVI, unsigned &Stride, unsigned &NumElts, + VectorType *&VecTy) { + LoadInst *LI = dyn_cast(SVI->getOperand(0)); + if (!LI || !LI->isSimple() || LI->getType() != SVI->getType()) + return false; + + SmallVector Mask(SVI->getShuffleMask()); + unsigned NumTotalElts = Mask.size(); + if (NumTotalElts < 4 || Mask[0] != 0) + return false; + + Stride = static_cast(Mask[1]); + if (Stride < 2 || Stride > 4) + return false; + + if (NumTotalElts < Stride || NumTotalElts % Stride != 0) + return false; + + NumElts = NumTotalElts / Stride; + // Mask should be: 0, Stride, ..., (NumElts-1)*Stride, 1, Stride+1, ... + for (unsigned i = 0; i < Stride; i++) + for (unsigned j = 0; j < NumElts; j++) + if (static_cast(Mask[j + i * NumElts]) != j * Stride + i) + return false; + + VecTy = VectorType::get(SVI->getType()->getElementType(), NumElts); + return TTI->isTypeLegal(VecTy); +} + +// TODO: Could also consider about undef mask (i.e. the negative mask) +bool AArch64ShuffleVectorAndMemAccessOpt::isInterleavedStoreMask( + ShuffleVectorInst *SVI, unsigned &Stride, unsigned &NumElts, + VectorType *&VecTy) { + if (!SVI->hasOneUse()) + return false; + StoreInst *SI = dyn_cast(SVI->user_back()); + if (!SI || !SI->isSimple()) + return false; + + SmallVector Mask(SVI->getShuffleMask()); + unsigned NumTotalElts = Mask.size(); + if (NumTotalElts < 4 || Mask[0] != 0) + return false; + + NumElts = static_cast(Mask[1]); + if (NumElts < 2 || NumElts > 16) + return false; + + if (NumTotalElts < NumElts || NumTotalElts % NumElts != 0) + return false; + + Stride = NumTotalElts / NumElts; + // Mask should be: 0, NumElts, ..., (Stride-1)*NumElts, 1, NumElts+1, ... + for (unsigned i = 0; i < NumElts; i++) + for (unsigned j = 0; j < Stride; j++) + if (static_cast(Mask[j + i * Stride]) != j * NumElts + i) + return false; + + VecTy = VectorType::get(SVI->getType()->getElementType(), NumElts); + return TTI->isTypeLegal(VecTy); +} + +// TODO: Could also consider about undef mask (i.e. the negative mask) +bool AArch64ShuffleVectorAndMemAccessOpt::isStridedLoadMask( + ShuffleVectorInst *SVI, unsigned &Stride, unsigned &NumElts, + VectorType *&VecTy) { + LoadInst *LI = dyn_cast(SVI->getOperand(0)); + if (!LI || !LI->isSimple()) + return false; + + SmallVector Mask(SVI->getShuffleMask()); + NumElts = Mask.size(); + if (NumElts < 2) + return false; + + unsigned Index = static_cast(Mask[0]); + Stride = static_cast(Mask[1]) - Index; + if (Stride < 2 || Stride > 4 || Index >= Stride) + return false; + + VectorType *WideVecTy = dyn_cast(SVI->getOperand(0)->getType()); + unsigned TotalElts = WideVecTy->getVectorNumElements(); + if (TotalElts % Stride != 0 || TotalElts / Stride != NumElts) + return false; + + // The index should match: Idx, Idx+Stride, ..., Idx+(NumElts-1)*Stride + for (unsigned i = 1; i < NumElts; i++) + if (static_cast(Mask[i]) != Index + Stride * i) + return false; + + // Also need to check whether the vector type is legal. + VecTy = VectorType::get(WideVecTy->getElementType(), NumElts); + return TTI->isTypeLegal(VecTy); +} + +// Check if the given mask is strided mask and it is in the same group as the +// given Stride and NumElts. +// E.g. 
If Stride is 2 and NumElts is 4, the masks in the same group are: +// Index 0: <0, 2, 4, 6> +// Index 1: <1, 3, 5, 7> +static bool isStridedMaskInGroup(ArrayRef Mask, unsigned &Index, + unsigned Stride, unsigned NumElts) { + if (Mask.size() != NumElts) + return false; + + Index = static_cast(Mask[0]); + if (Index >= Stride) + return false; + + for (unsigned i = 1; i < NumElts; i++) + if (static_cast(Mask[i]) != Index + Stride * i) + return false; + return true; +} + +bool AArch64ShuffleVectorAndMemAccessOpt::matchStridedLoad( + ShuffleVectorInst *SVI, unsigned Stride, unsigned NumElts, + VectorType *VecTy) { + LoadInst *LI = dyn_cast(SVI->getOperand(0)); + + // Record the index in the interleaved load for each strided load. + SmallVector, 4> StridedList; + for (Value::user_iterator UI = LI->user_begin(), E = LI->user_end(); + UI != E;) { + ShuffleVectorInst *SV = dyn_cast(*UI++); + if (!SV) + continue; + unsigned Order; + if (isStridedMaskInGroup(SV->getShuffleMask(), Order, Stride, NumElts)) + StridedList.push_back(std::make_pair(SV, Order)); + } + + // Watch out for the pointer vector type, which can not be an argument of + // the intrinsics. Need to get integer vector first and then convert it to + // pointer vector. + Type *EltTy = VecTy->getVectorElementType(); + bool IsEltPtr = EltTy->isPointerTy(); + if (IsEltPtr) + VecTy = VectorType::get(DL->getIntPtrType(EltTy), NumElts); + + Type *NewPtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[2] = {VecTy, NewPtrTy}; + Intrinsic::ID IntID = (Intrinsic::ID)getLdNStNIntrinsicID(Stride, true); + Value *Callee = Intrinsic::getDeclaration(M, IntID, Tys); + + IRBuilder<> Builder(LI); + Value *NewPtr = Builder.CreateBitCast(LI->getPointerOperand(), NewPtrTy); + Value *CallI = Builder.CreateCall(Callee, NewPtr, "ldN"); + unsigned Size = StridedList.size(); + // Replace each strided shuffle with the result vector of ldN. + + DEBUG(dbgs() << "Replace:\n " << *LI << "\n"); + for (unsigned i = 0; i < Size; i++) { + ShuffleVectorInst *SV = StridedList[i].first; + unsigned Index = StridedList[i].second; + Value *V = Builder.CreateExtractValue(CallI, Index); + + // Convert the integer vector to pointer vector. + if (IsEltPtr) + V = Builder.CreateIntToPtr(V, SV->getType()); + SV->replaceAllUsesWith(V); + + DEBUG(dbgs() << " " << *SV << "\n"); + // Avoid analyzing this shufflevector twice. + if (SV != SVI) + SV->eraseFromParent(); + } + DEBUG(dbgs() << "By:\n " << *CallI << "\n"); + + return true; +} + +bool AArch64ShuffleVectorAndMemAccessOpt::matchInterleavedLoad( + ShuffleVectorInst *SVI, unsigned Stride, unsigned NumElts, + VectorType *VecTy) { + LoadInst *LI = dyn_cast(SVI->getOperand(0)); + + // Watch out for the pointer vector type, which can not be an argument of + // the intrinsics. Need to get integer vector first and then convert it to + // pointer vector. + Type *EltTy = VecTy->getVectorElementType(); + bool IsEltPtr = EltTy->isPointerTy(); + if (IsEltPtr) + VecTy = VectorType::get(DL->getIntPtrType(EltTy), NumElts); + + Type *NewPtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[2] = {VecTy, NewPtrTy}; + Intrinsic::ID IntID = (Intrinsic::ID)getLdNStNIntrinsicID(Stride, true); + Value *Callee = Intrinsic::getDeclaration(M, IntID, Tys); + IRBuilder<> Builder(LI); + Value *NewPtr = Builder.CreateBitCast(LI->getPointerOperand(), NewPtrTy); + + // Create a ldN call to do the interleaved load. 
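Folding several strided shuffles of the same wide load into one ldN relies on the group-membership check above: recover the start index of the mask and verify every later element. A standalone restatement for reference (std::vector in place of ArrayRef<int>):

#include <cassert>
#include <vector>

// Does Mask select every Stride-th element starting at some index < Stride?
// On success, Index is the position of this member inside the ldN result.
static bool stridedMaskInGroup(const std::vector<unsigned> &Mask,
                               unsigned Stride, unsigned NumElts,
                               unsigned &Index) {
  if (Mask.size() != NumElts)
    return false;
  Index = Mask[0];
  if (Index >= Stride)
    return false;
  for (unsigned i = 1; i < NumElts; i++)
    if (Mask[i] != Index + Stride * i)
      return false;
  return true;
}

int main() {
  unsigned Index;
  // <1, 3, 5, 7> is member 1 of a stride-2 group of 4-element vectors.
  assert(stridedMaskInGroup({1, 3, 5, 7}, 2, 4, Index) && Index == 1);
  // <0, 3, 6, 9> does not belong to a stride-2 group.
  assert(!stridedMaskInGroup({0, 3, 6, 9}, 2, 4, Index));
  return 0;
}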
+ CallInst *CallI = Builder.CreateCall(Callee, NewPtr, "ldN"); + DEBUG(dbgs() << "Replace:\n " << *LI << "\n " << *SVI << "\n"); + DEBUG(dbgs() << "By:\n " << *CallI << "\n"); + + SmallVector Vectors; + for (unsigned i = 0; i < Stride; i++) + Vectors.push_back(Builder.CreateExtractValue(CallI, i)); + + // Concatenate apart vectors from the new ldN intrinsic call to a wide + // vector. + Value *WideVec = ConcateVectors(Builder, Vectors); + + // Convert the integer vector to pointer vector. + if (IsEltPtr) + WideVec = Builder.CreateIntToPtr(WideVec, SVI->getType()); + + SVI->replaceAllUsesWith(WideVec); + return true; +} + +// Get a mask consists of two parts: sequential constants, several UNDEFs. +// I.E. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value. + if (NumUndef == 0) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +bool AArch64ShuffleVectorAndMemAccessOpt::matchInterleavedStore( + ShuffleVectorInst *SVI, unsigned Stride, unsigned NumElts, + VectorType *VecTy) { + Value *V0 = SVI->getOperand(0); + VectorType *SVOpTy = dyn_cast(V0->getType()); + + // Watch out for the pointer vector type, which can not be an argument of + // the intrinsics. Need to convert pointer vector to integer vector. + Type *EltTy = VecTy->getVectorElementType(); + bool IsEltPtr = EltTy->isPointerTy(); + if (IsEltPtr) + VecTy = VectorType::get(DL->getIntPtrType(EltTy), NumElts); + + StoreInst *SI = dyn_cast(SVI->user_back()); + Value *V1 = SVI->getOperand(1); + IRBuilder<> Builder(SI); + + // Convert pointer vector to integer vector. + if (IsEltPtr) { + Type *IntVecTy = VectorType::get(DL->getIntPtrType(EltTy), + SVOpTy->getVectorNumElements()); + V0 = Builder.CreatePtrToInt(V0, IntVecTy); + V1 = Builder.CreatePtrToInt(V1, IntVecTy); + } + + SmallVector Ops; + for (unsigned i = 0; i < Stride; i++) { + Constant *Mask = getSequentialMask(Builder, NumElts * i, NumElts, 0); + Ops.push_back(Builder.CreateShuffleVector(V0, V1, Mask)); + } + + Type *NewPtrTy = VecTy->getPointerTo(SI->getPointerAddressSpace()); + Value *NewPtr = Builder.CreateBitCast(SI->getPointerOperand(), NewPtrTy); + Ops.push_back(NewPtr); + Type *Tys[2] = {VecTy, NewPtrTy}; + Intrinsic::ID IntID = (Intrinsic::ID)getLdNStNIntrinsicID(Stride, false); + Value *Callee = Intrinsic::getDeclaration(M, IntID, Tys); + + // Create a stN call do the interleaved store. 
+ CallInst *CallI = Builder.CreateCall(Callee, Ops); + + DEBUG(dbgs() << "Replace:\n " << *SI << "\n " << *SVI << "\n"); + DEBUG(dbgs() << "By:\n " << *CallI << "\n"); + SI->eraseFromParent(); + return true; +} + +bool AArch64ShuffleVectorAndMemAccessOpt::runOnFunction(Function &F) { + DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n"); + TTI = &getAnalysis().getTTI(F); + M = F.getParent(); + DL = &M->getDataLayout(); + + bool Changed = false; + for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; I++) { + ShuffleVectorInst *SVI = dyn_cast(I); + if (!SVI) + continue; + + unsigned Stride, NumElts; + VectorType *VecTy; + if (isInterleavedLoadMask(SVI, Stride, NumElts, VecTy)) + Changed |= matchInterleavedLoad(SVI, Stride, NumElts, VecTy); + else if (isStridedLoadMask(SVI, Stride, NumElts, VecTy)) + Changed |= matchStridedLoad(SVI, Stride, NumElts, VecTy); + + if (isInterleavedStoreMask(SVI, Stride, NumElts, VecTy)) + Changed |= matchInterleavedStore(SVI, Stride, NumElts, VecTy); + } + } + return Changed; +} Index: lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetMachine.cpp +++ lib/Target/AArch64/AArch64TargetMachine.cpp @@ -67,6 +67,11 @@ " to make use of cmpxchg flow-based information"), cl::init(true)); +static cl::opt AArch64ShuffleAccessOpt( + "aarch64-shuffle-access-opt", + cl::desc("Optimize shufflevector and memory access in AArch64 backend"), + cl::init(true), cl::Hidden); + static cl::opt EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), @@ -225,6 +230,9 @@ if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass()); + if (TM->getOptLevel() != CodeGenOpt::None && AArch64ShuffleAccessOpt) + addPass(createAArch64ShuffleVectorAndMemAccessOptPass()); + TargetPassConfig::addIRPasses(); if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -139,6 +139,9 @@ bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace); /// @} }; Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -407,6 +407,33 @@ return LT.first; } +unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, + Type *VecTy, + unsigned Stride, + unsigned Alignment, + unsigned AddressSpace) { + VectorType *VecType = dyn_cast(VecTy); + assert(VecType && "Expect a vector type"); + + unsigned Cost = getMemoryOpCost(Opcode, VecType, Alignment, AddressSpace); + if (isTypeLegal(VecType) && Stride > 1 && Stride < 5) + return Cost; + + unsigned NumElts = VecType->getNumElements(); + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Unexpected opcode for interleaved memory access"); + + // Total cost for the memory access of a wide vector + bool IsLoad = (Opcode == Instruction::Load); + // Roughly estimate the shuffle cost to be extracting each lanes 
or inserting + // each lanes. + for (unsigned i = 0; i < NumElts; i++) + Cost += getVectorInstrCost(IsLoad ? Instruction::ExtractElement + : Instruction::InsertElement, + VecType, i); + return Cost; +} + unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { unsigned Cost = 0; for (auto *I : Tys) { Index: lib/Target/AArch64/CMakeLists.txt =================================================================== --- lib/Target/AArch64/CMakeLists.txt +++ lib/Target/AArch64/CMakeLists.txt @@ -38,6 +38,7 @@ AArch64PBQPRegAlloc.cpp AArch64RegisterInfo.cpp AArch64SelectionDAGInfo.cpp + AArch64ShuffleVectorAndMemAccessOpt.cpp AArch64StorePairSuppress.cpp AArch64Subtarget.cpp AArch64TargetMachine.cpp Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -134,6 +134,39 @@ "enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symblic stride memory access versioning")); +/// This enables vectorization on the interleaved memory accesses in code like +/// the following. +/// for (i = 0; i < N; i+=3) { +/// R = Pic[i]; // Load R color elements +/// G = Pic[i+1]; // Load G color elements +/// B = Pic[i+2]; // Load B color elements +/// ... // do something to R, G, B +/// } +/// +/// Will be translated to a wide vector load and several shufflevectors +/// %wide.vec = load <12 x i32>, <12 x i32>* %ptr ; load for R,G,B +/// %R.vec = shufflevector %wide.vec, undef, <0, 3, 6, 9> ; mask for R load +/// %G.vec = shufflevector %wide.vec, undef, <1, 4, 7, 10> ; mask for G load +/// %B.vec = shufflevector %wide.vec, undef, <2, 5, 8, 11> ; mask for B load +/// +/// for (i = 0; i < N; i+=3) { +/// ... do something to R, G, B +/// Pic[i] = R; // Store R color elements +/// Pic[i+1] = G; // Store G color elements +/// Pic[i+2] = B; // Store B color elements +/// } +/// +/// Will be translated to several shufflevectors and a wide vector store +/// %RG.vec = shufflevector %R.vec, %G.vec, <0, 1, 2, ..., 7> +/// %BU.vec = shufflevector %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +/// %interleave = shufflevector %RG.vec, %BU.vec, +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; mask for interleaved store +/// store <12 x i32> %interleave, <12 x i32>* %ptr ; write for R,G,B +static cl::opt + VectorizeInterleavedAccess("vectorize-interleaved-access", cl::init(true), + cl::Hidden, + cl::desc("Vectorize interleaved memory access")); + /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; @@ -351,6 +384,8 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + void vectorizeInterleavedAccess(Instruction *InsPos, Value *Ptr, Type *VecTy); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -568,6 +603,15 @@ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} + ~LoopVectorizationLegality() { + SmallSet DelSet; + // Collecting all the pointers to avoid delete a pointer twice, + for (auto &I : InterleavedAccessMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + /// This enum represents the kinds of inductions that we support. enum InductionKind { IK_NoInduction, ///< Not an induction variable. 
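On AArch64 the hook above charges only the memory operation when the member type is legal and the stride is 2, 3 or 4 (a single ldN/stN), and otherwise falls back to the wide access plus one lane operation per element. A minimal sketch of that decision (the cost parameters are stand-ins for getMemoryOpCost and getVectorInstrCost, not real API calls):

#include <cassert>

// AArch64-style interleaved access cost: a legal vector type with stride 2-4
// maps onto a single ldN/stN, so only the memory operation is charged; any
// other case pays the wide access plus one lane insert/extract per element.
static unsigned aarch64InterleavedCost(bool TypeIsLegal, unsigned Stride,
                                       unsigned NumElts, unsigned MemOpCost,
                                       unsigned LaneOpCost) {
  if (TypeIsLegal && Stride >= 2 && Stride <= 4)
    return MemOpCost;                       // one ldN/stN
  return MemOpCost + NumElts * LaneOpCost;  // wide access + per-lane shuffles
}

int main() {
  // Legal <4 x i32> with stride 3: just the ld3/st3 cost.
  assert(aarch64InterleavedCost(true, 3, 4, 1, 1) == 1);
  // Illegal type or out-of-range stride: 1 + 4 lane operations.
  assert(aarch64InterleavedCost(false, 3, 4, 1, 1) == 5);
  return 0;
}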
@@ -640,6 +684,54 @@ ConstantInt *StepValue; }; + // An interleaved access list contains a group of interleaved accesses. + // E.g. for (i = 0; i < N; i+=2) { + // even = A[i]; + // odd = A[i + 1]; + // ... // operations on even/odd + // } + // The List consists of two accesses: even load, odd load. + struct InterleavedAccess { + InterleavedAccess(int Stride, bool IsLoad) + : Stride(Stride), IsLoad(IsLoad) {} + InterleavedAccess() : Stride(0) {} + + unsigned getIndex(Instruction *Access) { + assert(Members.count(Access) && + "The interleaved access doesn't contain such member"); + + for (unsigned i = 0; i < Members.size(); i++) + if (Members[i] == Access) + return i; + return Members.size(); + } + + unsigned getNumMembers() { return Members.size(); } + + bool isInsertPos(Instruction *Access) { return Access == InsertPos; } + + int Stride; + bool IsLoad; + unsigned Align; + // The interleaved access list. + SmallSetVector Members; + + // To avoid breaking the dependence, the new vectorized interleaved access + // should be inserted at the position of either: + // (1) The first load access. + // or + // (2) The last store access. + // + // E.g. %even = load i32 // Insert Position (First load) + // %add = add i32 %even // Use of %even + // %odd = load i32 // Should not insert here + // + // store i32 %even // Should not insert here + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position (Last store) + Instruction *InsertPos; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. typedef DenseMap ReductionList; @@ -730,6 +822,16 @@ unsigned getNumPredStores() const { return NumPredStores; } + + bool belongsToInterleavedAccess(Instruction *I) { + return InterleavedAccessMap.count(I); + } + InterleavedAccess *getInterleavedAccess(Instruction *I) { + assert(belongsToInterleavedAccess(I) && + "The given access must belong to an interleaved access"); + return InterleavedAccessMap[I]; + } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -762,7 +864,21 @@ /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop /// invariant. - void collectStridedAccess(Value *LoadOrStoreInst); + void collectInvarStridedAccess(Value *LoadOrStoreInst); + + /// Analyze interleaved accesses. + void analyzeInterleavedAccess(); + + /// Collect constant strided memory accesses. + /// + /// Looks for accesses like "a[i * C]" where "C" is a constant (Except +/-1). + void + collectConstStridedAccess(MapVector &StridedAccesses); + + /// Find all interleaved accesses in a queue of loads/stores with the same + /// stride and base pointer. + void collectInterleavedAccess(SmallVector &Queue, + int Stride, bool IsLoad); /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. These are handled as LoopAccessReport rather than @@ -822,6 +938,10 @@ /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; + + /// A map contains the relationships between the members of an interleaved + /// access and the interleaved access. + DenseMap InterleavedAccessMap; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1657,6 +1777,196 @@ "reverse"); } +// Get the mask to interleaved a wide vector concatenated from several small +// vectors like: V0, V1, V2, ... 
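The InsertPos rule above (first load, last store) amounts to a small scan over the block. A standalone sketch, assuming group membership is already known:

#include <cassert>
#include <vector>

// Pick where the widened access may be emitted without breaking dependences:
// the first member for an interleaved load group, the last member for an
// interleaved store group.
static int pickInsertPos(const std::vector<bool> &IsGroupMember,
                         bool IsLoadGroup) {
  int Pos = -1;
  int e = (int)IsGroupMember.size();
  for (int i = 0; i != e; ++i) {
    if (!IsGroupMember[i])
      continue;
    if (IsLoadGroup)
      return i;  // first member: insert at the first load
    Pos = i;     // keep updating: the last store wins
  }
  return Pos;
}

int main() {
  // load A[i], add, load A[i+1]   -> widen at index 0 (first load)
  assert(pickInsertPos({true, false, true}, /*IsLoadGroup=*/true) == 0);
  // store A[i], def, store A[i+1] -> widen at index 2 (last store)
  assert(pickInsertPos({true, false, true}, /*IsLoadGroup=*/false) == 2);
  return 0;
}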
+// The mask is: +// +// which is equal to <0, VF, VF*2, ..., VF*NumVec-VF, 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors of <4 x i32> type, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedStoreMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting: +// +// This can be used to interleave a wide vector into several small vectors. +// E.g. To interleaved a wide vector of <8 x i32> type: +// %v0 = shufflevector <8 x i32> %wide.vec, undef, <0, 2, 4, 6> +// %v1 = shufflevector <8 x i32> %wide.vec, undef, <1, 3, 5, 7> +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consist of sequential constants, +// The second part consist of UNDEFs. +// I.E. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value. + if (NumUndef == 0) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 1st vector always +// has the same elements as the 2nd vector or more elements. +// If the 1st vector has more elements, extend the 2nd vector with UNDEFs. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 to with undef elements. + Constant *ExtMask = + getSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *Undef = UndefValue::get(VecTy2); + Value *ExtV = Builder.CreateShuffleVector(V2, Undef, ExtMask); + + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV, Mask); +} + +// Concatenate number of vectors in the give list starting from Idx +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &List, unsigned Idx, + unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return List[Idx]; + + unsigned PartNum = PowerOf2Floor(NumVec); + // When the number of vectors is not power of 2, split the list and make sure + // the first list has power of 2 elements. + // E.g. To concate 7 vectors, split into two lists with 4 and 3 elements. 
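The comment above describes splitting 7 vectors into 4 + 3; a plain-C++ sketch of that power-of-two splitting, with ordinary concatenation standing in for the shuffle pair:

#include <cassert>
#include <vector>

// Split a list of NumVec small vectors so that the first part has a
// power-of-two count (half when NumVec is already a power of two),
// concatenate each part recursively, then join the two results.
static std::vector<int> concatVectors(const std::vector<std::vector<int>> &List,
                                      unsigned Idx, unsigned NumVec) {
  assert(NumVec != 0 && "Number of vectors should not be zero");
  if (NumVec == 1)
    return List[Idx];

  unsigned PartNum = 1;
  while (PartNum * 2 <= NumVec)  // PowerOf2Floor(NumVec)
    PartNum *= 2;
  if (PartNum == NumVec)
    PartNum /= 2;

  std::vector<int> V0 = concatVectors(List, Idx, PartNum);
  std::vector<int> V1 = concatVectors(List, Idx + PartNum, NumVec - PartNum);
  V0.insert(V0.end(), V1.begin(), V1.end());  // stands in for the shuffle pair
  return V0;
}

int main() {
  // Three 2-element members {1,2}, {3,4}, {5,6} concatenate to 1..6,
  // split as 2 + 1, just like the 7 -> 4 + 3 example above.
  std::vector<std::vector<int>> List = {{1, 2}, {3, 4}, {5, 6}};
  std::vector<int> Wide = concatVectors(List, 0, 3);
  assert((Wide == std::vector<int>{1, 2, 3, 4, 5, 6}));
  return 0;
}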
+ if (PartNum == NumVec) + PartNum /= 2; + Value *V0 = ConcateVectors(Builder, List, Idx, PartNum); + unsigned NextIdx = Idx + PartNum; + Value *V1 = ConcateVectors(Builder, List, NextIdx, NumVec - PartNum); + return ConcateTwoVectors(Builder, V0, V1); +} + +// Create the vectorized instruction for interleaved access. +// E.g. Transform interleaved load with 2 members: +// %s1 = load i32, i32* %ptr1 +// %s2 = load i32, i32* %ptr2 +// To +// %wide.vec = load <8 x i32>, <8 x i32>* %ptr +// %v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6> +// %v2 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7> +// +// Transform interleaved store with 3 members: +// store i32 %s1, i32 %ptr1 +// store i32 %s2, i32 %ptr2 +// store i32 %s3, i32 %ptr3 +// To +// %v1_v2 = shufflevector <4 x i32> %v1, %v2, <0, 1, 2, 3, 4, 5, 6, 7> +// %v3_und = shufflevector <4 x i32> %v3, undef, <0, 1, 2, 3, 4, 5, 6, 7> +// %interleaved.vec = shufflevector %v1_v2, %v3_und, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +// store <12 x i32> %interleaved.vec, <12 x i32> %ptr +void InnerLoopVectorizer::vectorizeInterleavedAccess(Instruction *Instr, + Value *Ptr, Type *DataTy) { + VectorType *VecTy = dyn_cast(DataTy); + assert(VecTy && "expect a vector type"); + + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + assert(IA->InsertPos == Instr && "Must be the insert position"); + + unsigned Idx = IA->getIndex(Instr); + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + SmallSetVector &Members = IA->Members; + unsigned Align = IA->Align; + unsigned NumVec = Members.size(); + + Type *EleTy = VecTy->getElementType(); + Type *WideVecTy = VectorType::get(EleTy, NumVec * VF); + Type *WidePtrTy = WideVecTy->getPointerTo(AddressSpace); + + Constant *Zero = Builder.getInt32(0); + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + for (unsigned Part = 0; Part < UF; Part++) { + // Adjust pointer to the access of index 0. + Constant *Offset = Builder.getInt32(-static_cast(Idx)); + Value *NewPtr = Builder.CreateExtractElement(PtrParts[Part], Zero); + NewPtr = Builder.CreateGEP(NewPtr, Offset); + + // Bit cast to the new wide vector pointer type. + NewPtr = Builder.CreateBitCast(NewPtr, WidePtrTy); + NewPtrs.push_back(NewPtr); + } + + setDebugLocFromInst(Builder, Instr); + if (IA->IsLoad) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *WideLoad = + Builder.CreateAlignedLoad(NewPtrs[Part], Align, false, "wide.vec"); + + // Interleave and split the loaded wide vector into small vectors. + Value *Undef = UndefValue::get(WideVecTy); + for (unsigned i = 0; i < NumVec; i++) { + VectorParts &Entry = WidenMap.get(Members[i]); + Constant *StridedMask = getStridedMask(Builder, i, NumVec, VF); + Entry[Part] = Builder.CreateShuffleVector(WideLoad, Undef, StridedMask, + "strided.vec"); + } + + propagateMetadata(WideLoad, Instr); + } + return; + } + + for (unsigned Part = 0; Part < UF; Part++) { + SmallVector StoredVal; + for (unsigned i = 0; i < NumVec; i++) + StoredVal.push_back(getVectorValue(Members[i]->getOperand(0))[Part]); + // Concatenate the apart vectors into a wide vector. + Value *WideVec = ConcateVectors(Builder, StoredVal, 0, NumVec); + + Constant *IMask = getInterleavedStoreMask(Builder, VF, NumVec); + // Interleave the data in the wide vector. 
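The store path concatenates the members and applies the interleaved mask, and the strided masks used on the load path undo exactly that layout. A standalone round-trip check (plain C++ arrays in place of IR vectors):

#include <cassert>
#include <vector>

// Round trip of the two mask families used above: the interleaved store mask
// writes lane j*VF+i of the concatenated vector to position i*NumVec+j, and a
// strided mask <i, i+NumVec, ...> reads member i back out of the wide vector.
int main() {
  const unsigned VF = 4, NumVec = 3;
  std::vector<int> R = {10, 11, 12, 13}, G = {20, 21, 22, 23},
                   B = {30, 31, 32, 33};

  // Concatenate the members, then apply the interleaved store mask.
  std::vector<int> Wide;
  Wide.insert(Wide.end(), R.begin(), R.end());
  Wide.insert(Wide.end(), G.begin(), G.end());
  Wide.insert(Wide.end(), B.begin(), B.end());
  std::vector<int> Interleaved(VF * NumVec);
  unsigned Pos = 0;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVec; j++)
      Interleaved[Pos++] = Wide[j * VF + i];
  // Memory layout is now R0 G0 B0 R1 G1 B1 ...
  assert(Interleaved[0] == 10 && Interleaved[1] == 20 && Interleaved[2] == 30);

  // A strided mask recovers each member from the (re)loaded wide vector.
  std::vector<int> G2;
  for (unsigned i = 0; i < VF; i++)
    G2.push_back(Interleaved[1 + i * NumVec]);
  assert(G2 == G);
  return 0;
}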
+ Value *IVec = Builder.CreateShuffleVector( + WideVec, UndefValue::get(WideVecTy), IMask, "interleaved.vec"); + Instruction *WideStore = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Align, false); + propagateMetadata(WideStore, Instr); + } + return; +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1684,6 +1994,15 @@ if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); + if (Legal->belongsToInterleavedAccess(Instr)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + // Skip the access if it is not the insert position. + if (!IA->isInsertPos(Instr)) + return; + return vectorizeInterleavedAccess(Instr, Ptr, DataTy); + } + // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -1780,7 +2099,7 @@ if (Legal->isMaskRequired(SI)) NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, Mask[Part]); - else + else NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); propagateMetadata(NewSI, SI); } @@ -3414,6 +3733,344 @@ return true; } +static bool isInBoundsGep(Value *Ptr) { + if (GetElementPtrInst *GEP = dyn_cast(Ptr)) + return GEP->isInBounds(); + return false; +} + +/// \brief Check whether the access through \p Ptr has a constant stride. +static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const SCEV *&PtrScev) { + const Type *Ty = Ptr->getType(); + assert(Ty->isPointerTy() && "Unexpected non-ptr"); + const PointerType *PtrTy = cast(Ty); + + PtrScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(PtrScev); + // The pointer must be an AddRecExpr pointer and the accesss function must + // stride over the innermost loop. + if (!AR || Lp != AR->getLoop()) + return 0; + + // The address calculation must not wrap. Otherwise, a dependence could be + // inverted. + // An inbounds getelementptr that is a AddRec with a unit stride + // cannot wrap per definition. The unit stride requirement is checked later. + // An getelementptr without an inbounds attribute and unit stride would have + // to access the pointer value "0" which is undefined behavior in address + // space 0, therefore we can also vectorize this case. + bool IsInBoundsGEP = isInBoundsGep(Ptr); + bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); + bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; + if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) + return 0; + + // Check the step is constant. + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast(Step); + if (!C) + return 0; + + auto &DL = Lp->getHeader()->getModule()->getDataLayout(); + int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. + if (APStepVal.getBitWidth() > 64) + return 0; + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. + int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) + return 0; + + // If the SCEV could wrap but we have an inbounds gep with a unit stride we + // know we can't "wrap around the address space". In case of address space + // zero we know that this won't happen without triggering undefined behavior. 
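The stride computed above is simply the constant step of the pointer recurrence divided by the element size, and a step that is not a whole multiple of the element size is rejected. A standalone sketch with concrete numbers (hypothetical helper name):

#include <cassert>
#include <cstdint>

// Stride from the pointer recurrence: step in bytes over the element size.
// A non-multiple step means the access is not strided (returns 0).
static int64_t strideFromStep(int64_t StepBytes, int64_t ElementSize) {
  if (ElementSize == 0 || StepBytes % ElementSize != 0)
    return 0;
  return StepBytes / ElementSize;
}

int main() {
  // i32 elements advanced by 12 bytes each iteration: stride 3 (e.g. RGB).
  assert(strideFromStep(12, 4) == 3);
  // A reverse i32 access stepping by -4 bytes: stride -1.
  assert(strideFromStep(-4, 4) == -1);
  // A 6-byte step over i32 elements is not a whole stride.
  assert(strideFromStep(6, 4) == 0);
  return 0;
}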
+ if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && + Stride != 1 && Stride != -1) + return 0; + + return Stride; +} + +static unsigned getAddressSpaceOperand(Instruction *I) { + if (LoadInst *L = dyn_cast(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast(I)) + return S->getPointerAddressSpace(); + return -1; +} + +static Value *getPointerOperand(Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return nullptr; +} + +static Value *getBasePointer(Instruction *I, const DataLayout &DL) { + Value *Ptr = getPointerOperand(I); + assert(Ptr && "Fail to get point from a memory instruction"); + return GetUnderlyingObject(Ptr, DL); +} + +static bool isConsecutiveAccess(Instruction *A, Instruction *B, + ScalarEvolution *SE, const DataLayout &DL) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); + Type *Ty = cast(PtrA->getType())->getElementType(); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); + + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + + APInt OffsetDelta = OffsetB - OffsetA; + + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; + + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; + + // Otherwise compute the distance with SCEV between the base pointers. + const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(BaseDelta); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} + +void LoopVectorizationLegality::collectConstStridedAccess( + MapVector &StridedAccesses) { + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Value *Ptr = getPointerOperand(it); + if (!Ptr) + continue; + + const SCEV *PtrScev; + int Stride = isStridedPtr(SE, Ptr, TheLoop, PtrScev); + if (Stride < 2 && Stride > -2) + continue; + + DEBUG(dbgs() << "LV: Found a constant strided access\n"); + DEBUG(dbgs() << " pointer:" << *Ptr << "\n"); + DEBUG(dbgs() << " Stride:" << Stride << "\n"); + StridedAccesses[it] = Stride; + } + } +} + +void LoopVectorizationLegality::collectInterleavedAccess( + SmallVector &Queue, int Stride, bool IsLoad) { + if (Queue.size() < 2) { + Queue.clear(); + return; + } + + SetVector Heads; + SetVector Tails; + SmallDenseMap ConsecutiveChain; + + // Collect all the consecutive pairs. 
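For two accesses off the same stripped base pointer, the consecutiveness check above reduces to the offset delta equalling one element size. A minimal sketch of that fast path (hypothetical helper, the SCEV-based fallback is not modelled):

#include <cassert>
#include <cstdint>

// Accesses sharing a base pointer are consecutive when the second one starts
// exactly one element after the first.
static bool isConsecutiveSameBase(int64_t OffsetA, int64_t OffsetB,
                                  int64_t ElementSize) {
  return OffsetB - OffsetA == ElementSize;
}

int main() {
  // A[i] at byte offset 0 and A[i+1] at byte offset 4 for i32 elements.
  assert(isConsecutiveSameBase(0, 4, 4));
  // A[i] and A[i+2] are not consecutive.
  assert(!isConsecutiveSameBase(0, 8, 4));
  return 0;
}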
+ const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (unsigned i = 0, e = Queue.size(); i != e; i++) { + for (unsigned j = 0; j != e; j++) { + if (i == j) + continue; + if (isConsecutiveAccess(Queue[i], Queue[j], SE, DL)) { + Heads.insert(Queue[i]); + Tails.insert(Queue[j]); + ConsecutiveChain[Queue[i]] = Queue[j]; + } + } + } + + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + SmallSetVector ConsecutiveList; + Instruction *I = *it; + + // Collect a list with consecutive accesses. + while (Tails.count(I) || Heads.count(I)) { + ConsecutiveList.insert(I); + // Move to the next access in the chain. + I = ConsecutiveChain[I]; + } + + unsigned Size = ConsecutiveList.size(); + unsigned Num = std::abs(Stride); + if (Size < Num) + continue; + + // As every interleave access consists of the same number of members as the + // stride. There may be more than 1 interleaved access in this list. + for (unsigned Part = 0; Part < Size / Num; Part++) { + InterleavedAccess *ILAccess = new InterleavedAccess(Stride, IsLoad); + SmallSetVector &Members = ILAccess->Members; + unsigned MaxAlign = 1; + for (unsigned i = Part * Num; i < Part * Num + Num; i++) { + Instruction *Access = ConsecutiveList[i]; + // Insert each member. + Members.insert(Access); + // Record the relationship of the member and the interleaved access + InterleavedAccessMap[Access] = ILAccess; + + unsigned Align = IsLoad ? dyn_cast(Access)->getAlignment() + : dyn_cast(Access)->getAlignment(); + if (Align > MaxAlign) + MaxAlign = Align; + } + // The alignment of the interleaved access is the max member alignment. + ILAccess->Align = MaxAlign; + + // To avoid breaking the dependences, the new vectorized interleaved + // access has different insert positions for load and store. + if (IsLoad) { + for (unsigned i = 0, e = Queue.size(); i != e; i++) { + // Start from the first access in the queue. If an access is in the + // current interleaved access, it is the first access and set it to be + // the insert position. + if (Members.count(Queue[i])) { + ILAccess->InsertPos = Queue[i]; + break; + } + } + } else { + for (int i = Queue.size() - 1, e = -1; i != e; i--) { + // Start from the last access in the queue. If an access is in the + // current interleaved access, it is the last access and set it to be + // the insert position. + if (Members.count(Queue[i])) { + ILAccess->InsertPos = Queue[i]; + break; + } + } + } + + DEBUG(dbgs() << "LV: Found interleaved accesses with Stride " << Stride + << "\n"); + } + } + + return; +} + +void LoopVectorizationLegality::analyzeInterleavedAccess() { + MapVector StridedAccesses; + collectConstStridedAccess(StridedAccesses); + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + + // The following logic groups the constant strided accesses and find the + // interleaved accesses in two steps: + // (1) Groups loads/stores with the same stride and base pointers into a + // queue. + // (2) Calls another function to collect all interleaved accesses in that + // queue. + // TODO: It can't collect interleaved access if the loads/stores are mixed. + // E.g. for (i = 0; i < N; i+=2) { + // A[i] += 1; + // A[i+1] += 2; + // } + // It will fail to identify the interleaved accesses as the mixed order: + // load A[i], store A[i], load A[i+1], store A[i+1]. 
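The scan in analyzeInterleavedAccess keeps one queue per base pointer, stride and load/store kind, and flushes it whenever an access does not match, which is why the mixed order in the TODO above defeats grouping. A standalone sketch of that scan (simplified record type, not the real IR classes):

#include <cassert>
#include <string>
#include <vector>

struct MemAccess {
  std::string Base;
  int Stride;  // 0 means "not a constant strided access"
  bool IsLoad;
};

// Extend the current queue while base pointer, stride and access kind all
// match; any other memory access, or a mismatch, ends the queue. Returns the
// sizes of the queues that were formed.
static std::vector<unsigned> scanQueues(const std::vector<MemAccess> &Block) {
  std::vector<unsigned> QueueSizes;
  unsigned Cur = 0;
  const MemAccess *Key = nullptr;
  auto Flush = [&] {
    if (Cur)
      QueueSizes.push_back(Cur);
    Cur = 0;
    Key = nullptr;
  };
  for (const MemAccess &A : Block) {
    if (A.Stride == 0) {  // a non-strided memory access merges the queue
      Flush();
      continue;
    }
    if (Key && (Key->Base != A.Base || Key->Stride != A.Stride ||
                Key->IsLoad != A.IsLoad))
      Flush();
    if (!Key)
      Key = &A;
    ++Cur;
  }
  Flush();
  return QueueSizes;
}

int main() {
  // Mixed order: load A[i], store A[i], load A[i+1], store A[i+1]. Every kind
  // switch flushes, so no queue reaches the two members needed for a group.
  std::vector<MemAccess> Mixed = {{"A", 2, true}, {"A", 2, false},
                                  {"A", 2, true}, {"A", 2, false}};
  for (unsigned S : scanQueues(Mixed))
    assert(S < 2);

  // Grouped loads followed by grouped stores do queue up.
  std::vector<MemAccess> Grouped = {{"A", 2, true}, {"A", 2, true},
                                    {"A", 2, false}, {"A", 2, false}};
  assert(scanQueues(Grouped).size() == 2 && scanQueues(Grouped)[0] == 2);
  return 0;
}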
+ for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + // Queue for the loads/stores of the same stride and base pointer. + SmallVector AccessQueue; + bool TryMerge = false; + Value *CurBase = nullptr; + int CurStride; + bool IsLoad = false; + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + if (TryMerge) { + collectInterleavedAccess(AccessQueue, CurStride, IsLoad); + CurBase = nullptr; + TryMerge = false; + } + + if (!it->mayReadOrWriteMemory()) + continue; + + // A non-strided memory access may break the dependence. Merge the queue. + if (!StridedAccesses.count(it)) { + TryMerge = true; + continue; + } + + Value *Base = getBasePointer(it, DL); + int Stride = StridedAccesses[it]; + bool Read = it->mayReadFromMemory(); + + // Start a new queue + if (CurBase == nullptr) { + AccessQueue.clear(); + AccessQueue.push_back(it); + CurBase = Base; + CurStride = Stride; + IsLoad = Read; + continue; + } + + // This instructure doesn't fit the current queue. Try to merge current + // queue and insert it again. + // TODO: this may missing opportunities as it hasn't collected all + // loads/stores with the same base pointer and stride. + if (CurBase != Base || CurStride != Stride || IsLoad != Read) { + TryMerge = true; + --it; + continue; + } + + AccessQueue.push_back(it); + } + + // Collect the tail accesses. + if (CurBase != nullptr) { + collectInterleavedAccess(AccessQueue, CurStride, IsLoad); + CurBase = nullptr; + TryMerge = false; + } + } +} + static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -3601,12 +4258,12 @@ return false; } if (EnableMemAccessVersioning) - collectStridedAccess(ST); + collectInvarStridedAccess(ST); } if (EnableMemAccessVersioning) if (LoadInst *LI = dyn_cast(it)) - collectStridedAccess(LI); + collectInvarStridedAccess(LI); // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. @@ -3629,6 +4286,9 @@ } } + if (VectorizeInterleavedAccess) + analyzeInterleavedAccess(); + return true; } @@ -3744,7 +4404,7 @@ return Stride; } -void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { +void LoopVectorizationLegality::collectInvarStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) Ptr = LI->getPointerOperand(); @@ -3757,7 +4417,7 @@ if (!Stride) return; - DEBUG(dbgs() << "LV: Found a strided access that we can version"); + DEBUG(dbgs() << "LV: Found an invariable strided access that we can version"); DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); Strides[Ptr] = Stride; StrideSet.insert(Stride); @@ -4575,6 +5235,14 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved memory loads/stores + if (Legal->belongsToInterleavedAccess(I)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(I); + return TTI.getInterleavedMemoryOpCost(I->getOpcode(), VectorTy, + IA->getNumMembers(), IA->Align, AS); + } + // Scalarized loads/stores. 
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/CodeGen/AArch64/shuffle-access-opt.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/shuffle-access-opt.ll @@ -0,0 +1,175 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; CHECK-LABEL: test_load_stride2_1Mask: +; CHECK: ld2 { v0.4s, v1.4s }, [x0] +define <4 x i32> @test_load_stride2_1Mask(i32* %ptr) { + %base = bitcast i32* %ptr to <8 x i32>* + %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4 + %interleaved.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <8 x i32> + %strided.v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v0, %strided.v1 + ret <4 x i32> %add +} + +; CHECK-LABEL: test_load_stride2_2Masks: +; CHECK: ld2 { v0.4s, v1.4s }, [x0] +define <4 x i32> @test_load_stride2_2Masks(i32* %ptr) { + %base = bitcast i32* %ptr to <8 x i32>* + %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4 + %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v0, %strided.v1 + ret <4 x i32> %add +} + +; CHECK-LABEL: test_load_stride3_1Mask: +; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +define <4 x i32> @test_load_stride3_1Mask(i32* %ptr) { + %base = bitcast i32* %ptr to <12 x i32>* + %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 + %interleaved.vec = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <12 x i32> + %strided.v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> + %strided.v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v1, %strided.v2 + %add1 = add nsw <4 x i32> %add, %strided.v0 + ret <4 x i32> %add1 +} + +; CHECK-LABEL: test_load_stride3_2Masks: +; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +define <4 x i32> @test_load_stride3_2Masks(i32* %ptr) { + %base = bitcast i32* %ptr to <12 x i32>* + %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 + %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v2, %strided.v1 + ret <4 x i32> %add +} + +; CHECK-LABEL: test_load_stride4_1Mask: +; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +define <4 x i32> @test_load_stride4_1Mask(i32* %ptr) { + %base = bitcast i32* %ptr to <16 x i32>* + %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 + %interleaved.vec = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <16 x i32> + %strided.v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> + %strided.v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> + %strided.v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v0, %strided.v3 + %add1 = add nsw <4 x i32> %strided.v1, %strided.v2 + %add2 = add nsw <4 x i32> %add, %add1 + ret <4 x i32> %add2 +} + +; CHECK-LABEL: test_load_stride4_3Masks: +; 
CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +define <4 x i32> @test_load_stride4_3Masks(i32* %ptr) { + %base = bitcast i32* %ptr to <16 x i32>* + %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 + %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> + %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v0, %strided.v2 + ret <4 x i32> %add +} + +; CHECK-LABEL: test_store_stride2: +; CHECK: st2 { v0.4s, v1.4s }, [x0] +define void @test_store_stride2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4 + ret void +} + +; CHECK-LABEL: test_store_stride3: +; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0] +define void @test_store_stride3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { + %base = bitcast i32* %ptr to <12 x i32>* + %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> + store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4 + ret void +} + +; CHECK-LABEL: test_store_stride4: +; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +define void @test_store_stride4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { + %base = bitcast i32* %ptr to <16 x i32>* + %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> + store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4 + ret void +} + +; The following cases test that vectors of pointer elements can be well translated. 
+; CHECK-LABEL: test_load_stride2_ptrvec: +; CHECK: ld2 { v0.2d, v1.2d }, [x0] +define void @test_load_stride2_ptrvec(i32** %ptr, <4 x i32*>* %ptr1) { + %base = bitcast i32** %ptr to <4 x i32*>* + %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4 + %interleaved.vec = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <4 x i32> + store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr1 + ret void +} + +; CHECK-LABEL: test_load_stride3_ptrvec: +; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0] +define void @test_load_stride3_ptrvec(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) { + %base = bitcast i32** %ptr to <6 x i32*>* + %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4 + %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> + store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1 + %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> + store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2 + ret void +} + +; CHECK-LABEL: test_load_stride4_ptrvec: +; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +define void @test_load_stride4_ptrvec(i32** %ptr, <8 x i32*>* %ptr1) { + %base = bitcast i32** %ptr to <8 x i32*>* + %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4 + %interleaved.vec = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <8 x i32> + store <8 x i32*> %interleaved.vec, <8 x i32*>* %ptr1 + ret void +} + +; CHECK-LABEL: test_store_stride2_ptrvec: +; CHECK: st2 { v0.2d, v1.2d }, [x0] +define void @test_store_stride2_ptrvec(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) { + %base = bitcast i32** %ptr to <4 x i32*>* + %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> + store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4 + ret void +} + +; CHECK-LABEL: test_store_stride3_ptrvec: +; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0] +define void @test_store_stride3_ptrvec(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) { + %base = bitcast i32** %ptr to <6 x i32*>* + %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> + %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> + %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> + store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4 + ret void +} + +; CHECK-LABEL: test_store_stride4_ptrvec: +; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +define void @test_store_stride4_ptrvec(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) { + %base = bitcast i32* %ptr to <8 x i32*>* + %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> + %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> + %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> + store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4 + ret void +} + Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,5 +1,5 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 2>&1 | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=VF2 target datalayout 
= "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" @@ -20,12 +20,12 @@ ; CHECK: %index.next = add i64 %index, 8 ; CHECK: icmp eq i64 %index.next, 512 -; FORCE-VEC-LABEL: @ind_plus2( -; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>* -; FORCE-VEC: mul nsw <2 x i32> -; FORCE-VEC: add nsw <2 x i32> -; FORCE-VEC: %index.next = add i64 %index, 2 -; FORCE-VEC: icmp eq i64 %index.next, 512 +; VF2-LABEL: @ind_plus2( +; VF2: %wide.load = load <2 x i32>, <2 x i32>* +; VF2: mul nsw <2 x i32> +; VF2: add nsw <2 x i32> +; VF2: %index.next = add i64 %index, 2 +; VF2: icmp eq i64 %index.next, 512 define i32 @ind_plus2(i32* %A) { entry: br label %for.body @@ -64,12 +64,12 @@ ; CHECK: %index.next = add i64 %index, 8 ; CHECK: icmp eq i64 %index.next, 512 -; FORCE-VEC-LABEL: @ind_minus2( -; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>* -; FORCE-VEC: mul nsw <2 x i32> -; FORCE-VEC: add nsw <2 x i32> -; FORCE-VEC: %index.next = add i64 %index, 2 -; FORCE-VEC: icmp eq i64 %index.next, 512 +; VF2-LABEL: @ind_minus2( +; VF2: %wide.load = load <2 x i32>, <2 x i32>* +; VF2: mul nsw <2 x i32> +; VF2: add nsw <2 x i32> +; VF2: %index.next = add i64 %index, 2 +; VF2: icmp eq i64 %index.next, 512 define i32 @ind_minus2(i32* %A) { entry: br label %for.body @@ -102,30 +102,27 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 - -; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: mul nsw <2 x i32> -; FORCE-VEC: add nsw <2 x i32> -; FORCE-VEC: %index.next = add i64 %index, 2 -; FORCE-VEC: icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 + +; VF2-LABEL: @ptr_ind_plus2( +; VF2: %[[V:.*]] = load <4 x i32> +; VF2: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; VF2: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; VF2: mul nsw <2 x i32> +; VF2: add nsw <2 x i32> +; VF2: %index.next = add i64 %index, 2 +; VF2: icmp eq i64 %index.next, 1024 define i32 @ptr_ind_plus2(i32* %A) { entry: br label %for.body Index: test/Transforms/LoopVectorize/AArch64/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/interleaved-access.ll @@ -0,0 +1,398 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -runtime-memory-check-threshold=24 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 -runtime-memory-check-threshold=24 < %s | FileCheck %s --check-prefix=UF2 + +target datalayout = 
"e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; +; int test_struct_load2(struct ST2 *A, int n) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. 
+; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +; UF2-LABEL: @test_array_load2_store2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. 
+; int A[3072];
+; struct ST3 S[1024];
+; void test_struct_array_load3_store3() {
+;   int *ptr = A;
+;   for (int i = 0; i < 1024; i++) {
+;     int X1 = *ptr++;
+;     int X2 = *ptr++;
+;     int X3 = *ptr++;
+;     S[i].x = X1 + 1;
+;     S[i].y = X2 + 2;
+;     S[i].z = X3 + 3;
+;   }
+; }
+
+; CHECK-LABEL: @test_struct_array_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
+
+; UF2-LABEL: @test_struct_array_load3_store3(
+; UF2: load <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: load <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <8 x i32>
+; UF2: store <12 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <8 x i32>
+; UF2: store <12 x i32>
+
+@A = common global [3072 x i32] zeroinitializer, align 4
+@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
+
+define void @test_struct_array_load3_store3() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
+  %0 = load i32, i32* %ptr.016, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
+  %2 = load i32, i32* %incdec.ptr1, align 4
+  %add = add nsw i32 %0, 1
+  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %add3 = add nsw i32 %1, 2
+  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
+  store i32 %add3, i32* %y, align 4
+  %add6 = add nsw i32 %2, 3
+  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
+  store i32 %add6, i32* %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Make sure that the following case can be vectorized.
+; struct ST4 {
+;   int x;
+;   int y;
+;   int z;
+;   int w;
+; };
+; int test_struct_load4(struct ST4 *S) {
+;   int r = 0;
+;   for (int i = 0; i < 1024; i++) {
+;     r += S[i].x;
+;     r -= S[i].y;
+;     r += S[i].z;
+;     r -= S[i].w;
+;   }
+;   return r;
+; }
+
+; CHECK-LABEL: @test_struct_load4(
+; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+
+; UF2-LABEL: @test_struct_load4(
+; UF2: load <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: load <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+
+%struct.ST4 = type { i32, i32, i32, i32 }
+
+define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
+  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
+  %0 = load i32, i32* %x, align 4
+  %add = add nsw i32 %0, %r.022
+  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
+  %1 = load i32, i32* %y, align 4
+  %sub = sub i32 %add, %1
+  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
+  %2 = load i32, i32* %z, align 4
+  %add5 = add nsw i32 %sub, %2
+  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
+  %3 = load i32, i32* %w, align 4
+  %sub8 = sub i32 %add5, %3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 %sub8
+}
+
+
+; int B[1024];
+; struct ST4 T[1024];
+; void test_struct_store4() {
+;   int *ptr = B;
+;   for (int i = 0; i < 1024; i++) {
+;     int X = *ptr++;
+;     T[i].x = X + 1;
+;     T[i].y = X * 2;
+;     T[i].z = X + 3;
+;     T[i].w = X + 4;
+;   }
+; }
+
+; CHECK-LABEL: @test_struct_store4(
+; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4
+
+; UF2-LABEL: @test_struct_store4(
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <8 x i32>
+; UF2: store <16 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <8 x i32>
+; UF2: store <16 x i32>
+
+@B = common global [1024 x i32] zeroinitializer, align 4
+@T = common global [1024 x %struct.ST4] zeroinitializer, align 4
+
+define void @test_struct_store4() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1
+  %0 = load i32, i32* %ptr.017, align 4
+  %add = add nsw i32 %0, 1
+  %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %mul = shl nsw i32 %0, 1
+  %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1
+  store i32 %mul, i32* %y, align 4
+  %sub = add nsw i32 %0, 3
+  %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2
+  store i32 %sub, i32* %z, align 4
+  %add5 = add nsw i32 %0, 4
+  %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3
+  store i32 %add5, i32* %w, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
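For reference, the stride-2 pattern exercised by @test_array_load2_store2 above is the same access shape a programmer would otherwise write by hand with NEON structured load/store intrinsics. The sketch below is illustrative only; the wrapper function name and the use of <arm_neon.h> are ours, not part of the patch. It shows the ld2/st2 form that the new interleaved-access cost model and ldN/stN matching are aiming to recover automatically.

#include <arm_neon.h>

/* Illustrative sketch: hand-written NEON equivalent of the
   test_array_load2_store2 loop (CD[i] = A + C; CD[i+1] = B * D). */
void test_array_load2_store2_neon(int32_t *AB, int32_t *CD, int32_t C, int32_t D) {
  int32x4_t vC = vdupq_n_s32(C);
  int32x4_t vD = vdupq_n_s32(D);
  for (int i = 0; i < 1024; i += 8) {
    int32x4x2_t ab = vld2q_s32(&AB[i]);    /* ld2: deinterleave A and B lanes */
    int32x4x2_t cd;
    cd.val[0] = vaddq_s32(ab.val[0], vC);  /* even lanes: A + C */
    cd.val[1] = vmulq_s32(ab.val[1], vD);  /* odd lanes:  B * D */
    vst2q_s32(&CD[i], cd);                 /* st2: re-interleave and store */
  }
}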