Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,11 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of a member of the interleaved Load/Store. + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +587,10 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +749,12 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Stride, Alignment, + AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,20 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) { + VectorType *VecType = dyn_cast(VecTy); + assert(VecType && "Expect a vector type"); + + // Cost for the sequential memory access for a vector + unsigned Cost = 1; + // Estimate the interleave cost to be extracting all lanes or inserting all + // lanes. + Cost += VecType->getNumElements(); + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -530,6 +530,7 @@ if (Lp != AR->getLoop()) { DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; } // The address calculation must not wrap. Otherwise, a dependence could be @@ -777,21 +778,29 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. 
- if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + unsigned NumIterations = ForcedFactor * ForcedUnroll; + // The number of vectorized & interleaved iterations should not be less than 2 + if (NumIterations < 2) + NumIterations = 2; + + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when either of: + // (1) Distance < Stride * TypeByteSize + // (2) Distance >= TypeByteSize * NumIterations * Stride + // is true and + // (3) Distance <= MaxSafeDepDistBytes + // is true. + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIterations * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + if (Distance >= TypeByteSize * NumIterations * Stride) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Stride, unsigned Alignment, + unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Stride, Alignment, + AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -134,6 +134,37 @@ "enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symblic stride memory access versioning")); +/// This enables vectorization on the interleaved memory accesses in code like +/// the following. +/// for (i = 0; i < N; i+=3) { +/// R = Pic[i]; // Load R color elements +/// G = Pic[i+1]; // Load G color elements +/// B = Pic[i+2]; // Load B color elements +/// ... // do something to R, G, B +/// } +/// Will be translated to a wide vector load and several shufflevectors +/// %wide.vec = load <12 x i32>, <12 x i32>* %ptr ; load for R,G,B +/// %R.vec = shufflevector %wide.vec, undef, <0, 3, 6, 9> ; mask for R +/// %G.vec = shufflevector %wide.vec, undef, <1, 4, 7, 10> ; mask for G +/// %B.vec = shufflevector %wide.vec, undef, <2, 5, 8, 11> ; mask for B +/// +/// for (i = 0; i < N; i+=3) { +/// ... 
do something to R, G, B +/// Pic[i] = R; // Store R color elements +/// Pic[i+1] = G; // Store G color elements +/// Pic[i+2] = B; // Store B color elements +/// } +/// Will be translated to several shufflevectors and a wide vector store +/// %RG.vec = shufflevector %R.vec, %G.vec, <0, 1, 2, ..., 7> +/// %BU.vec = shufflevector %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +/// %interleave = shufflevector %RG.vec, %BU.vec, +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; mask for interleaved store +/// store <12 x i32> %interleave, <12 x i32>* %ptr ; write for R,G,B +static cl::opt + VectorizeInterleavedAccess("vectorize-interleaved-access", cl::init(true), + cl::Hidden, + cl::desc("Vectorize interleaved memory access")); + /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; @@ -351,6 +382,8 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + void vectorizeInterleavedAccess(Instruction *InsPos, Value *Ptr, Type *VecTy); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -568,6 +601,15 @@ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} + ~LoopVectorizationLegality() { + SmallSet DelSet; + // Collecting all the pointers to avoid delete a pointer twice, + for (auto &I : InterleavedAccessMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + /// This enum represents the kinds of inductions that we support. enum InductionKind { IK_NoInduction, ///< Not an induction variable. @@ -640,6 +682,56 @@ ConstantInt *StepValue; }; + // An interleaved access list contains a group of interleaved accesses. + // E.g. for (i = 0; i < N; i+=2) { + // even = A[i]; + // odd = A[i + 1]; + // ... // operations on even/odd + // } + // The List consists of two accesses: even load, odd load. + struct InterleavedAccess { + InterleavedAccess(int Stride, bool IsLoad) + : Stride(Stride), IsLoad(IsLoad) {} + InterleavedAccess() : Stride(0) {} + + unsigned getIndex(Instruction *Access) { + assert(Members.count(Access) && + "The interleaved access doesn't contain such member"); + + for (unsigned i = 0; i < Members.size(); i++) + if (Members[i] == Access) + return i; + return Members.size(); + } + + unsigned getNumMembers() { return Members.size(); } + + bool isInsertPos(Instruction *Access) { return Access == InsertPos; } + + bool isReverse() { return Stride < 0; } + + int Stride; + bool IsLoad; + unsigned Align; + // The interleaved access list. + SmallSetVector Members; + + // To avoid breaking the dependence, the new vectorized interleaved access + // should be inserted at the position of either: + // (1) The first load access. + // or + // (2) The last store access. + // + // E.g. %even = load i32 // Insert Position (First load) + // %add = add i32 %even // Use of %even + // %odd = load i32 // Should not insert here + // + // store i32 %even // Should not insert here + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position (Last store) + Instruction *InsertPos; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. 
typedef DenseMap ReductionList; @@ -730,6 +822,16 @@ unsigned getNumPredStores() const { return NumPredStores; } + + bool belongsToInterleavedAccess(Instruction *I) { + return InterleavedAccessMap.count(I); + } + InterleavedAccess *getInterleavedAccess(Instruction *I) { + assert(belongsToInterleavedAccess(I) && + "The given access doesn't belong to any interleaved access"); + return InterleavedAccessMap[I]; + } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -762,7 +864,21 @@ /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop /// invariant. - void collectStridedAccess(Value *LoadOrStoreInst); + void collectInvarStridedAccess(Value *LoadOrStoreInst); + + /// Analyze interleaved accesses. + void analyzeInterleavedAccess(); + + /// Collect constant strided memory accesses. + /// + /// Looks for accesses like "a[i * C]" where "C" is a constant (except +/-1). + void + collectConstStridedAccess(MapVector &StridedAccesses); + + /// Find all interleaved accesses in a queue of loads/stores with the same + /// stride and base pointer. + void collectInterleavedAccess(SmallVector &Queue, + int Stride, bool IsLoad); /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. These are handled as LoopAccessReport rather than @@ -822,6 +938,10 @@ /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; + + /// A map containing the relationship between the members of an interleaved + /// access and the interleaved access itself. + DenseMap InterleavedAccessMap; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1657,6 +1777,211 @@ "reverse"); } +// Get the mask to interleave a wide vector concatenated from several small +// vectors like: V0, V1, V2, ... +// The mask is <0, VF, VF*2, ..., VF*NumVec-VF, 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors and VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedStoreMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from Start with step Stride: +// <Start, Start + Stride, Start + 2*Stride, ...> +// This can be used to deinterleave a wide vector into several small vectors. +// E.g. To deinterleave a wide vector of <8 x i32> type: +// %v0 = shufflevector <8 x i32> %wide.vec, undef, <0, 2, 4, 6> +// %v1 = shufflevector <8 x i32> %wide.vec, undef, <1, 3, 5, 7> +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: the first part consists of sequential constants +// starting from Idx, the second part consists of UNDEFs, +// i.e. <Idx, Idx+1, ..., Idx+NumSeq-1, undef, ..., undef>. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value.
+ if (NumUndef == 0) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 1st vector always +// has at least as many elements as the 2nd vector. +// If the 1st vector has more elements, extend the 2nd vector with UNDEFs. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2) { + VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); + VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 with undef elements. + Constant *ExtMask = + getSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *Undef = UndefValue::get(VecTy2); + Value *ExtV = Builder.CreateShuffleVector(V2, Undef, ExtMask); + + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV, Mask); +} + +// Concatenate a number of vectors in the given list, starting from Idx. +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &List, unsigned Idx, + unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return List[Idx]; + + unsigned PartNum = PowerOf2Floor(NumVec); + // When the number of vectors is not a power of 2, split the list and make sure + // the first list has a power-of-2 number of elements. + // E.g. To concatenate 7 vectors, split them into two lists with 4 and 3 elements. + if (PartNum == NumVec) + PartNum /= 2; + Value *V0 = ConcateVectors(Builder, List, Idx, PartNum); + Value *V1 = ConcateVectors(Builder, List, Idx + PartNum, NumVec - PartNum); + return ConcateTwoVectors(Builder, V0, V1); +} + +// Create the vectorized instruction for interleaved access. +// E.g.
Transform interleaved load with 2 members: +// %s1 = load i32, i32* %ptr1 +// %s2 = load i32, i32* %ptr2 +// To +// %wide.vec = load <8 x i32>, <8 x i32>* %ptr +// %v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6> +// %v2 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7> +// +// Transform interleaved store with 3 members: +// store i32 %s1, i32 %ptr1 +// store i32 %s2, i32 %ptr2 +// store i32 %s3, i32 %ptr3 +// To +// %v1_v2 = shufflevector <4 x i32> %v1, %v2, <0, 1, 2, 3, 4, 5, 6, 7> +// %v3_und = shufflevector <4 x i32> %v3, undef, <0, 1, 2, 3, 4, 5, 6, 7> +// %interleaved.vec = shufflevector %v1_v2, %v3_und, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +// store <12 x i32> %interleaved.vec, <12 x i32> %ptr +void InnerLoopVectorizer::vectorizeInterleavedAccess(Instruction *Instr, + Value *Ptr, Type *DataTy) { + VectorType *VecTy = dyn_cast(DataTy); + assert(VecTy && "expect a vector type"); + + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + assert(IA->InsertPos == Instr && "Must be the insert position"); + + unsigned Idx = IA->getIndex(Instr); + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + SmallSetVector &Members = IA->Members; + unsigned Align = IA->Align; + if (!Align) { + const DataLayout &DL = Instr->getModule()->getDataLayout(); + Align = DL.getABITypeAlignment(DataTy); + } + unsigned NumVec = IA->getNumMembers(); + Type *WideVecTy = VectorType::get(VecTy->getElementType(), NumVec * VF); + Type *WidePtrTy = WideVecTy->getPointerTo(AddressSpace); + + Constant *Zero = Builder.getInt32(0); + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + bool Reverse = IA->isReverse(); + for (unsigned Part = 0; Part < UF; Part++) { + Value *NewPtr = Builder.CreateExtractElement(PtrParts[Part], Zero); + // Adjust pointer to the access of index 0. + NewPtr = + Builder.CreateGEP(NewPtr, Builder.getInt32(-static_cast(Idx))); + + // If the address is reversed, then the wide load/store needs to start at + // the last access of index 0. + // E.g. For the reversed access of interleaved load for 2 members: + // {A[i], A[i+1]}, {A[i-2], A[i-1]}, {A[i-4], A[i-3]}, {A[i-6], A[i-5]} + // The NewPtr is pointed to A[i]. Need to adjust to A[i-6]. + if (Reverse) + NewPtr = Builder.CreateGEP( + NewPtr, Builder.getInt32(-static_cast(VF * NumVec - NumVec))); + + // Bit cast to the new wide vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, WidePtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + if (IA->IsLoad) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *WideLoad = + Builder.CreateAlignedLoad(NewPtrs[Part], Align, "wide.vec"); + + // Interleave and split the loaded wide vector into small vectors. + Value *Undef = UndefValue::get(WideVecTy); + for (unsigned i = 0; i < NumVec; i++) { + VectorParts &Entry = WidenMap.get(Members[i]); + Constant *StridedMask = getStridedMask(Builder, i, NumVec, VF); + Value *NewV = Builder.CreateShuffleVector(WideLoad, Undef, StridedMask, + "strided.vec"); + Entry[Part] = Reverse ? 
reverseVector(NewV) : NewV; + } + + propagateMetadata(WideLoad, Instr); + } + return; + } + + for (unsigned Part = 0; Part < UF; Part++) { + SmallVector StoredVecs; + for (unsigned i = 0; i < NumVec; i++) { + Value *StoredVec = getVectorValue(Members[i]->getOperand(0))[Part]; + if (Reverse) + StoredVec = reverseVector(StoredVec); + StoredVecs.push_back(StoredVec); + } + // Concatenate the apart vectors into a wide vector. + Value *WideVec = ConcateVectors(Builder, StoredVecs, 0, NumVec); + + Constant *IMask = getInterleavedStoreMask(Builder, VF, NumVec); + // Interleave the data in the wide vector. + Value *IVec = Builder.CreateShuffleVector( + WideVec, UndefValue::get(WideVecTy), IMask, "interleaved.vec"); + + Instruction *WideStore = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Align); + propagateMetadata(WideStore, Instr); + } + return; +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1684,6 +2009,15 @@ if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); + if (Legal->belongsToInterleavedAccess(Instr)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + // Skip the access if it is not the insert position. + if (!IA->isInsertPos(Instr)) + return; + return vectorizeInterleavedAccess(Instr, Ptr, DataTy); + } + // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -3414,6 +3748,344 @@ return true; } +static bool isInBoundsGep(Value *Ptr) { + if (GetElementPtrInst *GEP = dyn_cast(Ptr)) + return GEP->isInBounds(); + return false; +} + +/// \brief Check whether the access through \p Ptr has a constant stride. +static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const SCEV *&PtrScev) { + const Type *Ty = Ptr->getType(); + assert(Ty->isPointerTy() && "Unexpected non-ptr"); + const PointerType *PtrTy = cast(Ty); + + PtrScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(PtrScev); + // The pointer must be an AddRecExpr pointer and the accesss function must + // stride over the innermost loop. + if (!AR || Lp != AR->getLoop()) + return 0; + + // The address calculation must not wrap. Otherwise, a dependence could be + // inverted. + // An inbounds getelementptr that is a AddRec with a unit stride + // cannot wrap per definition. The unit stride requirement is checked later. + // An getelementptr without an inbounds attribute and unit stride would have + // to access the pointer value "0" which is undefined behavior in address + // space 0, therefore we can also vectorize this case. + bool IsInBoundsGEP = isInBoundsGep(Ptr); + bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); + bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; + if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) + return 0; + + // Check the step is constant. + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast(Step); + if (!C) + return 0; + + auto &DL = Lp->getHeader()->getModule()->getDataLayout(); + int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. + if (APStepVal.getBitWidth() > 64) + return 0; + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. 
+ int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) + return 0; + + // If the SCEV could wrap but we have an inbounds gep with a unit stride we + // know we can't "wrap around the address space". In case of address space + // zero we know that this won't happen without triggering undefined behavior. + if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && + Stride != 1 && Stride != -1) + return 0; + + return Stride; +} + +static unsigned getAddressSpaceOperand(Instruction *I) { + if (LoadInst *L = dyn_cast<LoadInst>(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast<StoreInst>(I)) + return S->getPointerAddressSpace(); + return -1; +} + +static Value *getPointerOperand(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->getPointerOperand(); + return nullptr; +} + +static Value *getBasePointer(Instruction *I, const DataLayout &DL) { + Value *Ptr = getPointerOperand(I); + assert(Ptr && "Failed to get a pointer from a memory instruction"); + return GetUnderlyingObject(Ptr, DL); +} + +static bool isConsecutiveAccess(Instruction *A, Instruction *B, + ScalarEvolution *SE, const DataLayout &DL) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); + Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); + + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + + APInt OffsetDelta = OffsetB - OffsetA; + + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; + + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; + + // Otherwise compute the distance with SCEV between the base pointers.
+ const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(BaseDelta); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} + +void LoopVectorizationLegality::collectConstStridedAccess( + MapVector &StridedAccesses) { + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Value *Ptr = getPointerOperand(it); + if (!Ptr) + continue; + + const SCEV *PtrScev; + int Stride = isStridedPtr(SE, Ptr, TheLoop, PtrScev); + if (Stride < 2 && Stride > -2) + continue; + + DEBUG(dbgs() << "LV: Found a constant strided access\n"); + DEBUG(dbgs() << " pointer:" << *Ptr << "\n"); + DEBUG(dbgs() << " Stride:" << Stride << "\n"); + StridedAccesses[it] = Stride; + } + } +} + +void LoopVectorizationLegality::collectInterleavedAccess( + SmallVector &Queue, int Stride, bool IsLoad) { + if (Queue.size() < 2) { + Queue.clear(); + return; + } + + SetVector Heads; + SetVector Tails; + SmallDenseMap ConsecutiveChain; + + // Collect all the consecutive pairs. + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (unsigned i = 0, e = Queue.size(); i != e; i++) { + for (unsigned j = 0; j != e; j++) { + if (i == j) + continue; + if (isConsecutiveAccess(Queue[i], Queue[j], SE, DL)) { + Heads.insert(Queue[i]); + Tails.insert(Queue[j]); + ConsecutiveChain[Queue[i]] = Queue[j]; + } + } + } + + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + SmallSetVector ConsecutiveList; + Instruction *I = *it; + + // Collect a list with consecutive accesses. + while (Tails.count(I) || Heads.count(I)) { + ConsecutiveList.insert(I); + // Move to the next access in the chain. + I = ConsecutiveChain[I]; + } + + unsigned Size = ConsecutiveList.size(); + unsigned Num = std::abs(Stride); + if (Size < Num) + continue; + + // As every interleaved access consists of as many members as the stride, + // there may be more than 1 interleaved access in this list. + for (unsigned Part = 0; Part < Size / Num; Part++) { + InterleavedAccess *IA = new InterleavedAccess(Stride, IsLoad); + SmallSetVector &Members = IA->Members; + unsigned MaxAlign = 0; + for (unsigned i = Part * Num; i < Part * Num + Num; i++) { + Instruction *Access = ConsecutiveList[i]; + // Insert each member. + Members.insert(Access); + // Record the relationship between the member and the interleaved access. + InterleavedAccessMap[Access] = IA; + + unsigned Align = IsLoad ? dyn_cast<LoadInst>(Access)->getAlignment() + : dyn_cast<StoreInst>(Access)->getAlignment(); + if (Align > MaxAlign) + MaxAlign = Align; + } + // The alignment of the interleaved access is the max member alignment. + IA->Align = MaxAlign; + + // To avoid breaking the dependences, the new vectorized interleaved + // access has different insert positions for load and store. + if (IsLoad) { + for (unsigned i = 0, e = Queue.size(); i != e; i++) { + // Start from the first access in the queue. If an access is in the + // current interleaved access, it is the first access, so set it to be + // the insert position. + if (Members.count(Queue[i])) { + IA->InsertPos = Queue[i]; + break; + } + } + } else { + for (int i = Queue.size() - 1, e = -1; i != e; i--) { + // Start from the last access in the queue.
If an access is in the + // current interleaved access, it is the last access, so set it to be + // the insert position. + if (Members.count(Queue[i])) { + IA->InsertPos = Queue[i]; + break; + } + } + } + + DEBUG(dbgs() << "LV: Found an interleaved access with Stride " << Stride + << "\n"); + } + } + + return; +} + +void LoopVectorizationLegality::analyzeInterleavedAccess() { + MapVector StridedAccesses; + collectConstStridedAccess(StridedAccesses); + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + + // The following logic groups the constant strided accesses and finds the + // interleaved accesses in two steps: + // (1) Groups loads/stores with the same stride and base pointers into a + // queue. + // (2) Calls another function to collect all interleaved accesses in that + // queue. + // TODO: It can't collect interleaved accesses if the loads/stores are mixed. + // E.g. for (i = 0; i < N; i+=2) { + // A[i] += 1; + // A[i+1] += 2; + // } + // It will fail to identify the interleaved accesses due to the mixed order: + // load A[i], store A[i], load A[i+1], store A[i+1]. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + // Queue for the loads/stores of the same stride and base pointer. + SmallVector AccessQueue; + bool TryMerge = false; + Value *CurBase = nullptr; + int CurStride; + bool IsLoad = false; + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + if (TryMerge) { + collectInterleavedAccess(AccessQueue, CurStride, IsLoad); + CurBase = nullptr; + TryMerge = false; + } + + if (!it->mayReadOrWriteMemory()) + continue; + + // A non-strided memory access may break the dependence. Merge the queue. + if (!StridedAccesses.count(it)) { + TryMerge = true; + continue; + } + + Value *Base = getBasePointer(it, DL); + int Stride = StridedAccesses[it]; + bool Read = it->mayReadFromMemory(); + + // Start a new queue. + if (CurBase == nullptr) { + AccessQueue.clear(); + AccessQueue.push_back(it); + CurBase = Base; + CurStride = Stride; + IsLoad = Read; + continue; + } + + // This instruction doesn't fit the current queue. Try to merge the current + // queue and insert it again. + // TODO: this may miss opportunities as it hasn't collected all + // loads/stores with the same base pointer and stride. + if (CurBase != Base || CurStride != Stride || IsLoad != Read) { + TryMerge = true; + --it; + continue; + } + + AccessQueue.push_back(it); + } + + // Collect the tail accesses. + if (CurBase != nullptr) { + collectInterleavedAccess(AccessQueue, CurStride, IsLoad); + CurBase = nullptr; + TryMerge = false; + } + } +} + static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -3601,12 +4273,12 @@ return false; } if (EnableMemAccessVersioning) - collectStridedAccess(ST); + collectInvarStridedAccess(ST); } if (EnableMemAccessVersioning) if (LoadInst *LI = dyn_cast<LoadInst>(it)) - collectStridedAccess(LI); + collectInvarStridedAccess(LI); // Reduction instructions are allowed to have exit users. // All other instructions must not have external users.
@@ -3629,6 +4301,9 @@ } } + if (VectorizeInterleavedAccess) + analyzeInterleavedAccess(); + return true; } @@ -3744,7 +4419,7 @@ return Stride; } -void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { +void LoopVectorizationLegality::collectInvarStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) Ptr = LI->getPointerOperand(); @@ -3757,7 +4432,7 @@ if (!Stride) return; - DEBUG(dbgs() << "LV: Found a strided access that we can version"); + DEBUG(dbgs() << "LV: Found an invariable strided access that we can version"); DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); Strides[Ptr] = Stride; StrideSet.insert(Stride); @@ -4575,6 +5250,18 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved memory loads/stores + if (Legal->belongsToInterleavedAccess(I)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(I); + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), VectorTy, IA->getNumMembers(), IA->Align, AS); + if (IA->isReverse()) + Cost += + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/AArch64/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/interleaved-access.ll @@ -0,0 +1,497 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -runtime-memory-check-threshold=24 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 
-runtime-memory-check-threshold=24 < %s | FileCheck %s --check-prefix=UF2 + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; int test_struct_load2(struct ST2 *A) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. 
+; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +; UF2-LABEL: @test_array_load2_store2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. 
+; int A[3072]; +; struct ST S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; T[i].x = X1 + 1; +; T[i].y = X2 + 2; +; T[i].z = X3 + 3; +; } +; } + +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> +; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4 + +; UF2-LABEL: @test_struct_array_load3_store3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <12 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <12 x i32> + +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +; UF2-LABEL: @test_struct_load4( +; UF2: load <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: load <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +; UF2-LABEL: @test_struct_store4( +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <16 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <16 x i32> + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x 
i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; This case checks that reversed interleaved loads can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; int test_reversed_load2(struct ST2 *A) { +; int add = 0; +; int sub = 0; +; for (int i = 1023; i >= 0; i--) { +; add += A[i].x; +; sub -= A[i].y; +; } +; return add*sub; +; } + +; CHECK-LABEL: @test_reversed_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> + +define i32 @test_reversed_load2(%struct.ST2* nocapture readonly %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %mul = mul nsw i32 %sub4, %add1 + ret i32 %mul + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %sub.015 = phi i32 [ 0, %entry ], [ %sub4, %for.body ] + %add.014 = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add1 = add nsw i32 %0, %add.014 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub4 = sub nsw i32 %sub.015, %1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; This case checks that reversed interleaved loads/stores can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: 
shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +}
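
The shuffle masks built by getInterleavedStoreMask and getStridedMask in the LoopVectorize.cpp hunk above carry the core index math of this patch. As a quick reference, here is a small standalone C++ sketch, not part of the patch and with no LLVM dependency (the helper names interleavedStoreMask, stridedMask and printMask are illustrative only), that mirrors those loops and prints the resulting masks for the VF = 4, two-member case used in the comments and tests:

#include <cstdio>
#include <vector>

// Mirrors getInterleavedStoreMask: interleave NumVec concatenated vectors of
// VF elements each, so that lane i of every member precedes lane i+1.
static std::vector<unsigned> interleavedStoreMask(unsigned VF, unsigned NumVec) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVec; j++)
      Mask.push_back(j * VF + i);
  return Mask;
}

// Mirrors getStridedMask: extract the member starting at Start with step
// Stride from a wide interleaved vector (the de-interleaving shuffle).
static std::vector<unsigned> stridedMask(unsigned Start, unsigned Stride,
                                         unsigned VF) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < VF; i++)
    Mask.push_back(Start + i * Stride);
  return Mask;
}

static void printMask(const char *Name, const std::vector<unsigned> &Mask) {
  std::printf("%s: <", Name);
  for (unsigned i = 0; i < Mask.size(); i++)
    std::printf("%s%u", i ? ", " : "", Mask[i]);
  std::printf(">\n");
}

int main() {
  // Two members, VF = 4: store mask is <0, 4, 1, 5, 2, 6, 3, 7>.
  printMask("interleaved store mask (VF=4, NumVec=2)", interleavedStoreMask(4, 2));
  // Member 0 and member 1 of a stride-2 wide load: <0, 2, 4, 6> and <1, 3, 5, 7>.
  printMask("strided mask (Start=0, Stride=2, VF=4)", stridedMask(0, 2, 4));
  printMask("strided mask (Start=1, Stride=2, VF=4)", stridedMask(1, 2, 4));
  return 0;
}

For a fixed VF and member count, the store (interleave) mask and the concatenation of the per-member strided (de-interleave) masks are inverse permutations of each other, which is why an interleaved load followed by an interleaved store of the same group leaves the memory layout unchanged.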