Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -499,6 +499,11 @@ const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr = nullptr); +/// \brief Check the stride of the pointer and ensure that it does not wrap in +/// the address space. +int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const ValueToValueMap &StridesMap); + /// \brief This analysis provides dependence information for the memory accesses /// of a loop. /// Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,14 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of one member in the interleaved access. + /// \p Vecty is the wide vecotor type of the whole interleaved access. + /// \p SubTy is the vector type of the member. + /// \p Index is the index of current member. + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +590,10 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + Type *SubTy, unsigned Index, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +752,12 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, SubTy, Index, + Alignment, AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,12 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -522,6 +522,50 @@ return Cost; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Alignment, + unsigned AddressSpace) { + VectorType *VT = dyn_cast(VecTy); + VectorType *SubVT = dyn_cast(SubTy); + assert(VT && SubVT && "Expect vector types"); + + unsigned NumElts = VT->getNumElements(); + unsigned SubNumElts = 
SubVT->getNumElements(); + unsigned Stride = NumElts / SubNumElts; + assert(NumElts % SubNumElts == 0 && Stride > 1 && "Invalide vector types"); + assert(Index < Stride && "Invalid Index"); + + // The load/store cost is one part of the wide vector memory op cost. + unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + Cost = Cost / Stride; + + // Then need to calculate the cost for interleave operations. + if (Opcode == Instruction::Load) { + // E.g. Interleaved load for the vector %V1 of index 1: + // %V = load <4 x i32>, <4 x i32>* %ptr + // %V1 = shufflevector <4 x i32> %V, <4 x i32> undef, <1, 3> + // The cost is extract element 1 and element 3 from the wide vector, and + // insert them sequentially into the sub vector. + for (unsigned i = 0; i < SubNumElts; i++) { + Cost += getVectorInstrCost(Instruction::ExtractElement, VecTy, + Index + i * Stride); + Cost += getVectorInstrCost(Instruction::InsertElement, SubTy, i); + } + } else { + // E.g. Interleaved store for the vector %V1 of index 1: + // %IV = shufflevector <2 x i32> %V0, <2 x i32> %V1, <0, 2, 1, 3> + // store <4 x i32> %IV, <4 x i32> %ptr + // The cost is extract element 0 and element 1 from %V1, and insert them + // into lane 1 and lane 3 of the wide vector. + for (unsigned i = 0; i < SubNumElts; i++) { + Cost += getVectorInstrCost(Instruction::ExtractElement, SubTy, i); + Cost += getVectorInstrCost(Instruction::InsertElement, VecTy, + Index + i * Stride); + } + } + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; Index: include/llvm/Transforms/Utils/VectorUtils.h =================================================================== --- include/llvm/Transforms/Utils/VectorUtils.h +++ include/llvm/Transforms/Utils/VectorUtils.h @@ -17,6 +17,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/Analysis/ScalarEvolution.h" namespace llvm { @@ -200,6 +201,11 @@ return Intrinsic::not_intrinsic; } +Value *getPointerOperand(Value *I); + +bool isConsecutiveAccess(Value *A, Value *B, ScalarEvolution *SE, + const DataLayout &DL); + } // llvm namespace #endif Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -284,11 +284,6 @@ return AR->isAffine(); } -/// \brief Check the stride of the pointer and ensure that it does not wrap in -/// the address space. -static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap); - bool AccessAnalysis::canCheckPtrAtRT( LoopAccessInfo::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &StridesMap, @@ -504,8 +499,8 @@ } /// \brief Check whether the access through \p Ptr has a constant stride. -static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap) { +int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const ValueToValueMap &StridesMap) { const Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); @@ -770,28 +765,39 @@ return Dependence::Unknown; } - unsigned Distance = (unsigned) Val.getZExtValue(); - - // Bail out early if passed-in parameters make vectorization not feasible. - unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? 
- VectorizerParams::VectorizationFactor : 1); - unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? - VectorizerParams::VectorizationInterleave : 1); + unsigned ForcedFactor = VectorizerParams::VectorizationFactor; + unsigned ForcedUnroll = VectorizerParams::VectorizationInterleave; + // The number of iterations to be vectorized and unrolled. + // If one passed-in parameter is not forced, notice that the number is 0. Then + // we need to adjust it. + unsigned NumIter = ForcedFactor * ForcedUnroll; + if (!NumIter) { + NumIter = ForcedFactor ? ForcedFactor : ForcedUnroll; + // It could be 0 (if both parameters are not forced) or 1 (if the + // forced parameter is 1). Then we set it to be the minimum number: 2. + if (NumIter < 2) + NumIter = 2; + } - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + unsigned Distance = (unsigned)Val.getZExtValue(); + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when + // Distance <= MaxSafeDepDistBytes + // And either one below is true + // Distance < Stride * TypeByteSize + // Distance >= TypeByteSize * NumIter * Stride + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIter * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } - // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + // If Distance < Stride * TypeByteSize, it is always safe. just keep the + // current MaxSafeDepDistBytes. Otherwise, update the MaxSafeDepDistBytes. + if (Distance >= Stride * TypeByteSize) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, Type *SubTy, unsigned Index, + unsigned Alignment, unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, SubTy, Index, + Alignment, AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Utils/CMakeLists.txt =================================================================== --- lib/Transforms/Utils/CMakeLists.txt +++ lib/Transforms/Utils/CMakeLists.txt @@ -37,6 +37,7 @@ UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp + VectorUtils.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms Index: lib/Transforms/Utils/VectorUtils.cpp =================================================================== --- /dev/null +++ lib/Transforms/Utils/VectorUtils.cpp @@ -0,0 +1,76 @@ +//===--------------- VectorUtils.cpp - Vectorizer utilities ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines some vectorizer utilities. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/VectorUtils.h" + +namespace llvm { + +Value *getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return nullptr; +} + +static unsigned getAddressSpaceOperand(Value *I) { + if (LoadInst *L = dyn_cast(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast(I)) + return S->getPointerAddressSpace(); + return -1; +} + +bool isConsecutiveAccess(Value *A, Value *B, ScalarEvolution *SE, + const DataLayout &DL) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); + Type *Ty = cast(PtrA->getType())->getElementType(); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); + + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + + APInt OffsetDelta = OffsetB - OffsetA; + + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; + + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; + + // Otherwise compute the distance with SCEV between the base pointers. + const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(BaseDelta); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} + +} // End llvm namespace Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -134,6 +134,11 @@ "enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symblic stride memory access versioning")); +/// This enables vectorization on the interleaved memory accesses +static cl::opt VectorizeInterleavedAccess( + "vectorize-interleaved-access", cl::init(false), cl::Hidden, + cl::desc("Vectorize interleaved memory access in a loop")); + /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; @@ -351,6 +356,11 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// Vectorize the interleaved memory access. \p Instr is the member in insert + /// position. + void vectorizeInterleavedAccess(Instruction *Instr, Value *Ptr, + VectorType *VecTy); + /// Generate a shuffle sequence that will reverse the vector Vec. 
virtual Value *reverseVector(Value *Vec); @@ -568,6 +578,15 @@ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} + ~LoopVectorizationLegality() { + SmallSet DelSet; + // Collecting all the pointers to avoid delete a pointer twice. + for (auto &I : InterleavedAccessMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + /// This enum represents the kinds of inductions that we support. enum InductionKind { IK_NoInduction, ///< Not an induction variable. @@ -640,6 +659,44 @@ ConstantInt *StepValue; }; + // An interleaved access consists of a group of interleaved member accesses. + // E.g. for (i = 0; i < N; i+=2) { + // even = A[i]; // Member of index 0 + // odd = A[i + 1]; // Member of index 1 + // ... + // } + struct InterleavedAccess { + InterleavedAccess(int Stride, bool Reverse, bool IsLoad) + : Stride(Stride), Reverse(Reverse), IsLoad(IsLoad) {} + InterleavedAccess() : Stride(0) {} + + unsigned getIndex(Instruction *Access) { + assert(Members.count(Access) && + "The interleaved access doesn't contain such member"); + + for (unsigned i = 0; i < Members.size(); i++) + if (Members[i] == Access) + return i; + llvm_unreachable("Contains no such member"); + } + + SmallSetVector Members; + unsigned Stride; + bool Reverse; + bool IsLoad; + unsigned Align; + // To avoid breaking the dependence, the new vectorized interleaved access + // should be inserted at either the first load or the last store. + // E.g. %even = load i32 // Insert Position + // %add = add i32 %even // Use of %even + // %odd = load i32 // + // + // store i32 %even // + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position + Instruction *InsertPos; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. typedef DenseMap ReductionList; @@ -730,6 +787,16 @@ unsigned getNumPredStores() const { return NumPredStores; } + + bool belongsToInterleavedAccess(Instruction *I) { + return InterleavedAccessMap.count(I); + } + InterleavedAccess *getInterleavedAccess(Instruction *I) { + assert(belongsToInterleavedAccess(I) && + "Not a member of an interleaved access"); + return InterleavedAccessMap[I]; + } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -762,7 +829,21 @@ /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop /// invariant. - void collectStridedAccess(Value *LoadOrStoreInst); + void collectInvarStridedAccess(Value *LoadOrStoreInst); + + /// Analyze interleaved accesses. + void analyzeInterleavedAccess(); + + /// Collect constant strided memory access. + /// + /// Looks for accesses like "a[i * C]" where "C" is a constant (Except +/-1) + void + collectConstStridedAccess(MapVector &StridedAccesses); + + /// Find all interleaved accesses in \p Queue which consist of loads/stores + /// with the same stride \p Stride and base pointer. + void collectInterleavedAccess(SmallVector &Queue, + int Stride, bool IsLoad); /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. These are handled as LoopAccessReport rather than @@ -822,6 +903,9 @@ /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; + + /// Contains the relationships between the members and the interleaved access. 
+ DenseMap InterleavedAccessMap; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1657,6 +1741,225 @@ "reverse"); } +// Get the mask to interleave a wide vector concatenated from several small +// vectors like: V0, V1, V2, ... +// The mask is: +// +// Which is equal to +// <0, VF, VF*2, ..., VF*NumVec-VF, 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors, if VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedStoreMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index Start: +// +// This can be used to interleave a wide vector into several small vectors. +// E.g. To interleave a wide vector of <8 x i32> type into two vectors: +// %v0 = shufflevector <8 x i32> %wide.vec, undef, <0, 2, 4, 6> +// %v1 = shufflevector <8 x i32> %wide.vec, undef, <1, 3, 5, 7> +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consist of sequential constants, +// The second part consist of UNDEFs. +// I.E. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value. + if (NumUndef == 0) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 1st vector always +// has the same number of elements as the 2nd vector or more elements. +// If the 1st vector has more elements, extend the 2nd vector with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, + Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 with undef elements. 
+ Constant *ExtMask = + getSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *Undef = UndefValue::get(VecTy2); + Value *ExtV = Builder.CreateShuffleVector(V2, Undef, ExtMask); + + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV, Mask); +} + +// Concatenate number of vectors in the give list starting from Idx +static Value *ConcatenateVectors(IRBuilder<> &Builder, + SmallVector &List, + unsigned StartIdx, unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return List[StartIdx]; + + unsigned PartNum = PowerOf2Floor(NumVec); + // When the number of vectors is not power of 2, split the list and make sure + // the first list has power of 2 elements. + // E.g. To concate 7 vectors, split into two lists with 4 and 3 elements. + if (PartNum == NumVec) + PartNum /= 2; + Value *V0 = ConcatenateVectors(Builder, List, StartIdx, PartNum); + Value *V1 = + ConcatenateVectors(Builder, List, StartIdx + PartNum, NumVec - PartNum); + return ConcatenateTwoVectors(Builder, V0, V1); +} + +// E.g. Translate following interleaved loads (VF is 4): +// for (i = 0; i < N; i+=3) { +// R = Pic[i]; // interleaved load of index 0 +// G = Pic[i+1]; // interleaved load of index 1 +// B = Pic[i+2]; // interleaved load of index 2 +// ... // do something to R, G, B +// } +// +// To: +// %wide.vec = load <12 x i32>, <12 x i32>* %ptr ; read R,G,B +// %R.vec = shufflevector %wide.vec, undef, <0, 3, 6, 9> +// %G.vec = shufflevector %wide.vec, undef, <1, 4, 7, 10> +// %B.vec = shufflevector %wide.vec, undef, <2, 5, 8, 11> +// +// Or translate following interleaved stores (VF is 4): +// for (i = 0; i < N; i+=3) { +// ... do something to R, G, B +// Pic[i] = R; // interleaved store of index 0 +// Pic[i+1] = G; // interleaved store of index 1 +// Pic[i+2] = B; // interleaved store of index 2 +// } +// +// To: +// %R_G.vec = shufflevector %R.vec, %G.vec, <0, 1, 2, ..., 7> +// %B_U.vec = shufflevector %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %interleaved.vec = shufflevector %R_G.vec, %B_U.vec, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +// store <12 x i32> %interleaved.vec, <12 x i32>* %ptr ; write R,G,B +void InnerLoopVectorizer::vectorizeInterleavedAccess(Instruction *Instr, + Value *Ptr, + VectorType *VecTy) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + assert(IA && IA->InsertPos == Instr && "Unexpected instruction"); + unsigned Idx = IA->getIndex(Instr); + SmallSetVector &Members = IA->Members; + unsigned NumVec = IA->Stride; + bool Reverse = IA->Reverse; + unsigned Align = IA->Align; + // Set the default alignment if there is no alignment. + if (!Align) { + const DataLayout &DL = Instr->getModule()->getDataLayout(); + Align = DL.getABITypeAlignment(VecTy); + } + + // Create the wide vector types for the wide load/store. + Type *WideVecTy = VectorType::get(VecTy->getElementType(), NumVec * VF); + Type *WidePtrTy = + WideVecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + + VectorParts &PtrParts = getVectorValue(Ptr); + Constant *Zero = Builder.getInt32(0); + SmallVector NewPtrs; + setDebugLocFromInst(Builder, Ptr); + // Adjust the new pointers. + for (unsigned Part = 0; Part < UF; Part++) { + Value *NewPtr = Builder.CreateExtractElement(PtrParts[Part], Zero); + // Adjust the new pointer to point to the access of index 0. 
+ NewPtr = + Builder.CreateGEP(NewPtr, Builder.getInt32(-static_cast(Idx))); + + // If the address is reversed, then the wide load/store needs to start at + // the last access of index 0. + // E.g. For the reversed access of interleaved load with 2 members: + // {A[i], A[i+1]}, {A[i-2], A[i-1]}, {A[i-4], A[i-3]}, {A[i-6], A[i-5]} + // The new pointer is now pointed to A[i]. Adjust it to A[i-6]. + if (Reverse) + NewPtr = Builder.CreateGEP( + NewPtr, Builder.getInt32(-static_cast(VF * NumVec - NumVec))); + + // Bit cast to the new wide vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, WidePtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + // Vectorize interleaved loads. + if (IA->IsLoad) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *WideLoad = + Builder.CreateAlignedLoad(NewPtrs[Part], Align, "wide.vec"); + + // Interleave and split the loaded wide vector into small vectors. + Value *Undef = UndefValue::get(WideVecTy); + for (unsigned i = 0; i < NumVec; i++) { + Constant *StridedMask = getStridedMask(Builder, i, NumVec, VF); + Value *NewV = Builder.CreateShuffleVector(WideLoad, Undef, StridedMask, + "strided.vec"); + VectorParts &Entry = WidenMap.get(Members[i]); + Entry[Part] = Reverse ? reverseVector(NewV) : NewV; + } + + propagateMetadata(WideLoad, Instr); + } + return; + } + + // Vectorize interleaved stores. + for (unsigned Part = 0; Part < UF; Part++) { + SmallVector StoredVecs; + for (auto &I : Members) { + Value *StoredVec = getVectorValue(I->getOperand(0))[Part]; + if (Reverse) + StoredVec = reverseVector(StoredVec); + StoredVecs.push_back(StoredVec); + } + // Concatenate the apart vectors into a wide vector. + Value *WideVec = ConcatenateVectors(Builder, StoredVecs, 0, NumVec); + + Constant *IMask = getInterleavedStoreMask(Builder, VF, NumVec); + // Interleave the data in the wide vector. + Value *IVec = Builder.CreateShuffleVector( + WideVec, UndefValue::get(WideVecTy), IMask, "interleaved.vec"); + + Instruction *WideStore = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Align); + propagateMetadata(WideStore, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1684,6 +1987,15 @@ if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); + if (Legal->belongsToInterleavedAccess(Instr)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + // Skip the access if it is not the insert position. + if (IA->InsertPos != Instr) + return; + return vectorizeInterleavedAccess(Instr, Ptr, dyn_cast(DataTy)); + } + // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. 
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -3414,6 +3726,213 @@ return true; } +void LoopVectorizationLegality::collectConstStridedAccess( + MapVector &StridedAccesses) { + for (auto BB = TheLoop->block_begin(), BE = TheLoop->block_end(); BB != BE; + ++BB) { + if (blockNeedsPredication(*BB)) + continue; + + for (auto I = (*BB)->begin(), E = (*BB)->end(); I != E; I++) { + Value *Ptr = getPointerOperand(I); + if (!Ptr) + continue; + + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + if (Stride < 2 && Stride > -2) + continue; + + DEBUG(dbgs() << "LV: Found a constant strided access\n"); + DEBUG(dbgs() << " pointer:" << *Ptr << "\n"); + DEBUG(dbgs() << " Stride:" << Stride << "\n"); + StridedAccesses[I] = Stride; + } + } +} + +void LoopVectorizationLegality::collectInterleavedAccess( + SmallVector &Queue, int Stride, bool IsLoad) { + if (Queue.size() < 2) + return; + + SetVector Heads; + SetVector Tails; + SmallDenseMap ConsecutivePairs; + + // Collect all the consecutive pairs. + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (auto &H : Queue) { + for (auto &T : Queue) { + if (H == T || !isConsecutiveAccess(H, T, SE, DL)) + continue; + // If there is already Head or Tail, we have redundant accesses. This may + // result in incorrect results. Just return directly. + // E.g. Three writes on: + // A[i] (1) + // A[i] (2) + // A[i+1] (3) + // If we combine (1) and (3) together and generate code after (2): + // A[i] (2) + // A[i]+A[i+1] (1)(2) + // Then we get incorrect result. + if (Heads.count(H) || Tails.count(T)) + return; + + Heads.insert(H); + Tails.insert(T); + ConsecutivePairs[H] = T; + } + } + + for (auto &I : Heads) { + if (Tails.count(I)) + continue; + + SmallSetVector ConsecutiveChain; + // Collect a chain with consecutive accesses. + auto Inst = I; + while (Tails.count(Inst) || Heads.count(Inst)) { + ConsecutiveChain.insert(Inst); + // Move to the next access in the chain. + Inst = ConsecutivePairs[Inst]; + } + + unsigned ChainLen = ConsecutiveChain.size(); + // The length of the interleaved access list is the absolute value of stride + unsigned IALen = std::abs(Stride); + if (ChainLen < IALen) + continue; + + // There may be more than 1 interleaved access list in this chain. + for (unsigned Part = 0; Part < ChainLen / IALen; Part++) { + // If the Stride is negative, it is reversed. + bool Reverse = Stride < 0; + InterleavedAccess *IA = new InterleavedAccess(IALen, Reverse, IsLoad); + DEBUG(dbgs() << "LV: Found an interleaved accesses with Stride " + << IALen << "\n"); + + SmallSetVector &Members = IA->Members; + unsigned MinAlign = 0; + for (unsigned i = 0; i < IALen; i++) { + Instruction *Access = ConsecutiveChain[Part * IALen + i]; + Members.insert(Access); + // Record the relationship of the member and the interleaved access + InterleavedAccessMap[Access] = IA; + + unsigned Align = IsLoad ? dyn_cast(Access)->getAlignment() + : dyn_cast(Access)->getAlignment(); + if (!MinAlign) + MinAlign = Align; + else if (Align && Align < MinAlign) + MinAlign = Align; + } + // The alignment of the interleaved access is the min alignment. + IA->Align = MinAlign; + + // To avoid breaking the dependences, the new vectorized interleaved + // access has different insert positions for load and store. + if (IsLoad) { + for (auto &II : Queue) { + // Find the first load and set it to be the insert position. 
+ if (Members.count(II)) { + IA->InsertPos = II; + break; + } + } + } else { + for (auto II = Queue.rbegin(), E = Queue.rend(); II != E; ++II) { + // Find the last store and set it to be the insert position. + if (Members.count(*II)) { + IA->InsertPos = *II; + break; + } + } + } + } // end of handling consecutive list. + } // end of handling Heads +} + +static Value *getBasePointer(Instruction *I, const DataLayout &DL) { + Value *Ptr = getPointerOperand(I); + assert(Ptr && "Fail to get point from a memory instruction"); + return GetUnderlyingObject(Ptr, DL); +} + +void LoopVectorizationLegality::analyzeInterleavedAccess() { + MapVector StridedAccesses; + collectConstStridedAccess(StridedAccesses); + + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + // The following code groups the constant strided accesses and find the + // interleaved accesses in two steps: + // (1) Groups loads/stores with the same stride and base pointer into a + // queue. + // (2) Calls another function to collect all interleaved accesses in that + // queue. + // TODO: It can't collect interleaved access if the loads/stores are mixed. + // E.g. for (i = 0; i < N; i+=2) { + // A[i] += 1; + // A[i+1] += 2; + // } + // It could fail to identify the interleaved accesses as the mixed order: + // load A[i], store A[i], load A[i+1], store A[i+1]. + for (auto BB = TheLoop->block_begin(), BE = TheLoop->block_end(); BB != BE; + ++BB) { + if (blockNeedsPredication(*BB)) + continue; + + // Queue for the loads/stores of the same stride and base pointer. + SmallVector Queue; + Value *CurBase = nullptr; + unsigned CurStride = 0; + bool CurLoad = false; + for (auto I = (*BB)->begin(), E = (*BB)->end(); I != E; ++I) { + if (!I->mayReadOrWriteMemory()) + continue; + + Value *Base = nullptr; + unsigned Stride = 0; + bool IsLoad = false; + if (StridedAccesses.count(I)) { + Base = getBasePointer(I, DL); + Stride = StridedAccesses[I]; + IsLoad = I->mayReadFromMemory(); + // If there is no current queue, start a new queue with this access. + if (!CurBase) { + Queue.push_back(I); + CurBase = Base; + CurStride = Stride; + CurLoad = IsLoad; + continue; + } + + // Add to the current queue if this access matches the stride, base, + // and memory operation type. + if (CurBase == Base && CurStride == Stride && CurLoad == IsLoad) { + Queue.push_back(I); + continue; + } + } + + // If this access can not be added into the queue, it may break the + // dependence. So try to analyze and clear current queue. + collectInterleavedAccess(Queue, CurStride, CurLoad); + Queue.clear(); + CurBase = Base; + // Continue if this access is not strided access. + if (!CurBase) + continue; + + Queue.push_back(I); + CurStride = Stride; + CurLoad = IsLoad; + } // End of handling BB + + // Collect the tailed accesses. + collectInterleavedAccess(Queue, CurStride, CurLoad); + } // End of handling TheLoop +} + static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -3601,12 +4120,12 @@ return false; } if (EnableMemAccessVersioning) - collectStridedAccess(ST); + collectInvarStridedAccess(ST); } if (EnableMemAccessVersioning) if (LoadInst *LI = dyn_cast(it)) - collectStridedAccess(LI); + collectInvarStridedAccess(LI); // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. 
@@ -3629,6 +4148,9 @@ } } + if (VectorizeInterleavedAccess) + analyzeInterleavedAccess(); + return true; } @@ -3744,7 +4266,7 @@ return Stride; } -void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { +void LoopVectorizationLegality::collectInvarStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) Ptr = LI->getPointerOperand(); @@ -3757,7 +4279,7 @@ if (!Stride) return; - DEBUG(dbgs() << "LV: Found a strided access that we can version"); + DEBUG(dbgs() << "LV: Found an invariable strided access that we can version"); DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); Strides[Ptr] = Stride; StrideSet.insert(Stride); @@ -4575,6 +5097,21 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved memory loads/stores + if (Legal->belongsToInterleavedAccess(I)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(I); + Type *WideVecTy = + VectorType::get(VectorTy->getVectorElementType(), + VectorTy->getVectorNumElements() * IA->Stride); + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, VectorTy, IA->getIndex(I), IA->Align, AS); + if (IA->Reverse) + Cost += + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -382,9 +382,6 @@ } } - /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL); - /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -412,14 +409,6 @@ /// vectorized, or NULL. They may happen in cycles. Value *alreadyVectorized(ArrayRef VL) const; - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); - - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); - /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(Type *Ty); @@ -1130,8 +1119,8 @@ return; } const DataLayout &DL = F->getParent()->getDataLayout(); - if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { - if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], SE, DL)) { + if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], SE, DL)) { ++NumLoadsWantToChangeOrder; } BS.cancelScheduling(VL); @@ -1303,7 +1292,7 @@ const DataLayout &DL = F->getParent()->getDataLayout(); // Check if the stores are consecutive or of we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], SE, DL)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); @@ -1774,63 +1763,6 @@ return getGatherCost(VecTy); } -Value *BoUpSLP::getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast(I)) - return SI->getPointerOperand(); - return nullptr; -} - -unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { - if (LoadInst *L = dyn_cast(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast(I)) - return S->getPointerAddressSpace(); - return -1; -} - -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); - unsigned ASA = getAddressSpaceOperand(A); - unsigned ASB = getAddressSpaceOperand(B); - - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; - - // Make sure that A and B are different pointers of the same type. - if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) - return false; - - unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); - Type *Ty = cast(PtrA->getType())->getElementType(); - APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); - - APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); - - APInt OffsetDelta = OffsetB - OffsetA; - - // Check if they are based on the same pointer. That makes the offsets - // sufficient. - if (PtrA == PtrB) - return OffsetDelta == Size; - - // Compute the necessary base pointer delta to have the necessary final delta - // equal to the size. - APInt BaseDelta = Size - OffsetDelta; - - // Otherwise compute the distance with SCEV between the base pointers. - const SCEV *PtrSCEVA = SE->getSCEV(PtrA); - const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *C = SE->getConstant(BaseDelta); - const SCEV *X = SE->getAddExpr(PtrSCEVA, C); - return X == PtrSCEVB; -} - // Reorder commutative operations in alternate shuffle if the resulting vectors // are consecutive loads. This would allow us to vectorize the tree. 
// If we have something like- @@ -1858,10 +1790,10 @@ if (LoadInst *L1 = dyn_cast(Right[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); - if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + if (isConsecutiveAccess(L, L1, SE, DL) && VL1->isCommutative()) { std::swap(Left[j], Right[j]); continue; - } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + } else if (isConsecutiveAccess(L, L1, SE, DL) && VL2->isCommutative()) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -1872,10 +1804,10 @@ if (LoadInst *L1 = dyn_cast(Left[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); - if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + if (isConsecutiveAccess(L, L1, SE, DL) && VL1->isCommutative()) { std::swap(Left[j], Right[j]); continue; - } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + } else if (isConsecutiveAccess(L, L1, SE, DL) && VL2->isCommutative()) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -2005,7 +1937,7 @@ for (unsigned j = 0; j < VL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - if (isConsecutiveAccess(L, L1, DL)) { + if (isConsecutiveAccess(L, L1, SE, DL)) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -2013,7 +1945,7 @@ } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - if (isConsecutiveAccess(L, L1, DL)) { + if (isConsecutiveAccess(L, L1, SE, DL)) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -3248,7 +3180,7 @@ if (i == j) continue; const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); - if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) { + if (isConsecutiveAccess(Stores[i], Stores[j], SE, DL)) { Tails.insert(Stores[j]); Heads.insert(Stores[i]); ConsecutiveChain[Stores[i]] = Stores[j]; Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,5 +1,5 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -vectorize-interleaved-access=true -loop-vectorize 2>&1 | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -vectorize-interleaved-access=true -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; 
CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/AArch64/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/interleaved-access.ll @@ -0,0 +1,497 @@ +; RUN: opt -S -loop-vectorize -vectorize-interleaved-access=true -instcombine -force-vector-width=4 -force-vector-interleave=1 -runtime-memory-check-threshold=24 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -vectorize-interleaved-access=true -instcombine -force-vector-width=4 -force-vector-interleave=2 -runtime-memory-check-threshold=24 < %s | FileCheck %s --check-prefix=UF2 + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; int test_struct_load2(struct ST2 *A) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. 
+; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +; UF2-LABEL: @test_array_load2_store2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. 
+; int A[3072]; +; struct ST S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; T[i].x = X1 + 1; +; T[i].y = X2 + 2; +; T[i].z = X3 + 3; +; } +; } + +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> +; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4 + +; UF2-LABEL: @test_struct_array_load3_store3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <12 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <12 x i32> + +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +; UF2-LABEL: @test_struct_load4( +; UF2: load <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: load <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +; UF2-LABEL: @test_struct_store4( +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <16 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <16 x i32> + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x 
i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; This case checks that reversed interleaved loads can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; int test_reversed_load2(struct ST2 *A) { +; int add = 0; +; int sub = 0; +; for (int i = 1023; i >= 0; i--) { +; add += A[i].x; +; sub -= A[i].y; +; } +; return add*sub; +; } + +; CHECK-LABEL: @test_reversed_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> + +define i32 @test_reversed_load2(%struct.ST2* nocapture readonly %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %mul = mul nsw i32 %sub4, %add1 + ret i32 %mul + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %sub.015 = phi i32 [ 0, %entry ], [ %sub4, %for.body ] + %add.014 = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add1 = add nsw i32 %0, %add.014 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub4 = sub nsw i32 %sub.015, %1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; This case checks that reversed interleaved loads/stores can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: 
shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +}
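
The default BasicTTIImpl::getInterleavedMemoryOpCost added by this patch charges each member a share of the wide memory operation plus one extract/insert pair per lane of the member vector. Below is a minimal standalone sketch of that arithmetic for reference (illustrative only, not part of the patch; it assumes unit cost for each extractelement/insertelement, and the helper name is made up):

// Mirrors the default cost model for one member of an interleaved access:
//   Cost = WideMemOpCost / Stride + SubNumElts * (extract + insert)
// where Stride = NumElts / SubNumElts and each extract/insert is assumed to
// cost 1, as the default TTI implementation would report.
#include <cassert>
#include <cstdio>

static unsigned defaultInterleavedMemberCost(unsigned WideMemOpCost,
                                             unsigned NumElts,
                                             unsigned SubNumElts) {
  assert(SubNumElts && NumElts % SubNumElts == 0 && "invalid vector shapes");
  unsigned Stride = NumElts / SubNumElts;
  assert(Stride > 1 && "not an interleaved access");
  unsigned Cost = WideMemOpCost / Stride; // share of the wide load/store
  Cost += 2 * SubNumElts;                 // one extract + one insert per lane
  return Cost;
}

int main() {
  // E.g. VF = 4 with two interleaved members: wide <8 x i32>, member <4 x i32>,
  // and a wide memory op assumed to cost 2.
  std::printf("cost per member = %u\n", defaultInterleavedMemberCost(2, 8, 4));
  return 0;
}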