Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -237,6 +237,10 @@
   SmallVector<Instruction *, 4>
   getInstructionsForAccess(Value *Ptr, bool isWrite) const;
 
+  /// \brief Check whether the data dependence could prevent store-load
+  /// forwarding.
+  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+
 private:
   ScalarEvolution *SE;
   const Loop *InnermostLoop;
@@ -286,10 +290,220 @@
   Dependence::DepType isDependent(const MemAccessInfo &A, unsigned AIdx,
                                   const MemAccessInfo &B, unsigned BIdx,
                                   const ValueToValueMap &Strides);
+};
 
-  /// \brief Check whether the data dependence could prevent store-load
-  /// forwarding.
-  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+/// \brief The group of interleaved loads/stores sharing the same stride and
+/// close to each other.
+///
+/// Each member in this group has an index starting from 0, and the largest
+/// index is always less than Delta (the interleave factor), which is the
+/// absolute value of the access stride.
+///
+/// E.g. Interleaved load group of Delta 4 (with a gap: index 2):
+///   for (unsigned i = 0; i < 1024; i+=4) {
+///     a = A[i];     // Member of index 0
+///     b = A[i+1];   // Member of index 1
+///     d = A[i+3];   // Member of index 3
+///     ...
+///   }
+///
+/// Interleaved store group of Delta 4:
+///   for (unsigned i = 0; i < 1024; i+=4) {
+///     ...
+///     A[i]   = a;   // Member of index 0
+///     A[i+1] = b;   // Member of index 1
+///     A[i+2] = c;   // Member of index 2
+///     A[i+3] = d;   // Member of index 3
+///   }
+///
+/// Note: an interleaved load group may have gaps (missing members), but an
+/// interleaved store group doesn't allow gaps.
+class InterleaveGroup {
+public:
+  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
+      : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(nullptr) {
+    assert(Align && "The alignment should be non-zero");
+
+    Delta = std::abs(Stride);
+    assert(Delta > 1 && "Invalid interleave factor");
+
+    Reverse = Stride < 0;
+    Members[0] = Instr;
+  }
+
+  bool isReverse() const { return Reverse; }
+  unsigned getDelta() const { return Delta; }
+  unsigned getAlign() const { return Align; }
+  unsigned getNumMembers() const { return Members.size(); }
+
+  /// \brief Try to insert a new member \p Instr with index \p Index and
+  /// alignment \p NewAlign.
+  ///
+  /// \returns false if the instruction doesn't belong to the group.
+  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
+    assert(NewAlign && "The new member's alignment should be non-zero");
+
+    int Key = Index + SmallestKey;
+
+    // Skip if there is already a member with the same index.
+    if (Members.count(Key))
+      return false;
+
+    if (Key > LargestKey) {
+      // The largest index is always less than the Delta.
+      if (Index >= Delta)
+        return false;
+
+      LargestKey = Key;
+    } else if (Key < SmallestKey) {
+      // The index range of the group (LargestKey - SmallestKey) must also
+      // stay less than the Delta.
+      if (LargestKey - Key >= Delta)
+        return false;
+
+      SmallestKey = Key;
+    }
+
+    // It's always safe to select the minimum alignment.
+    Align = std::min(Align, NewAlign);
+    Members[Key] = Instr;
+    return true;
+  }
+
+  /// \brief Get the member with the given index \p Index.
+  ///
+  /// \returns nullptr if the group doesn't contain such a member.
+  Instruction *getMember(unsigned Index) const {
+    int Key = SmallestKey + Index;
+    if (!Members.count(Key))
+      return nullptr;
+
+    return Members.find(Key)->second;
+  }
+
+  /// \brief Get the index for the given member. Unlike the key in the member
+  /// map, the index starts from 0.
+  unsigned getIndex(Instruction *Instr) const {
+    for (auto I : Members)
+      if (I.second == Instr)
+        return I.first - SmallestKey;
+
+    llvm_unreachable("InterleaveGroup contains no such member");
+  }
+
+  Instruction *getInsertPos() const { return InsertPos; }
+  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
+
+  /// \brief Print the group.
+  void print(raw_ostream &OS, unsigned Depth) const {
+    OS.indent(Depth) << "Interleave Group of Delta " << Delta << ":\n";
+
+    for (int i = 0; i < Delta; i++) {
+      Instruction *Member = getMember(i);
+      if (Member)
+        OS.indent(Depth + 2) << "Index " << i << ":" << *Member << "\n";
+    }
+  }
+
+private:
+  int Delta; // Interleave Factor.
+  bool Reverse;
+  unsigned Align;
+  DenseMap<int, Instruction *> Members;
+  int SmallestKey;
+  int LargestKey;
+
+  // To avoid breaking dependences, an interleaved access should be inserted
+  // at either the first load or the last store in program order.
+  // E.g. %even = load i32               // Insert Position
+  //      %add = add i32 %even           // Use of %even
+  //      %odd = load i32
+  //
+  //      store i32 %even
+  //      %odd = add i32                 // Def of %odd
+  //      store i32 %odd                 // Insert Position
+  Instruction *InsertPos;
+};
+
+/// \brief Drive the analysis of interleaved memory accesses in the loop.
+///
+/// Call this class to analyze interleaved accesses only when the memory
+/// dependence check says we can vectorize the loop. Otherwise there is no
+/// point in the analysis, as vectorizing interleaved accesses would be unsafe.
+///
+/// The analysis collects the interleave groups and records the
+/// member-to-group relationships in a map.
+class InterleavedAccessInfo {
+public:
+  InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT,
+                        MemoryDepChecker *DepChecker)
+      : SE(SE), TheLoop(L), DT(DT), DepChecker(DepChecker) {}
+
+  ~InterleavedAccessInfo() {
+    SmallSet DelSet;
+    // Collect all the group pointers first so that each group is released
+    // only once.
+    for (auto &I : InterleaveGroupMap)
+      DelSet.insert(I.second);
+    for (auto *Ptr : DelSet)
+      delete Ptr;
+  }
+
+  /// \brief Analyze the interleaved accesses. Substitute symbolic strides
+  /// using \p Strides.
+  void analyzeInterleaving(const ValueToValueMap &Strides);
+
+  /// \brief Check if \p Instr belongs to any interleave group.
+  bool isAccessInterleaved(Instruction *Instr) const {
+    return InterleaveGroupMap.count(Instr);
+  }
+
+  /// \brief Get the interleave group that \p Instr belongs to.
+  ///
+  /// \returns nullptr if \p Instr doesn't belong to any group.
+  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
+    if (InterleaveGroupMap.count(Instr))
+      return InterleaveGroupMap.find(Instr)->second;
+    return nullptr;
+  }
+
+  /// \brief Print the interleave groups.
+  void print(raw_ostream &OS, unsigned Depth) const;
+
+private:
+  ScalarEvolution *SE;
+  Loop *TheLoop;
+  DominatorTree *DT;
+  MemoryDepChecker *DepChecker;
+
+  /// Contains the relationships between the members and the interleave group.
+  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
+
+  /// \brief Group the interleaved loads/stores. Record the write-write pairs
+  /// whose dependences may be broken by vectorization.
+ void groupInterleavedAccesses( + ArrayRef InstrList, + SmallVector, 2> &WritePairs, + const ValueToValueMap &Strides); + + /// \brief Create a new interleave group with the given instruction \p Instr, + /// stride \p Stride and alignment \p Align. + /// + /// \returns the newly created interleave group. + InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, + unsigned Align) { + assert(!InterleaveGroupMap.count(Instr) && + "Already in an interleaved access group"); + InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); + return InterleaveGroupMap[Instr]; + } + + /// \brief Release the group and remove all the relationships. + void releaseGroup(InterleaveGroup *Group) { + for (unsigned i = 0; i < Group->getDelta(); i++) + if (Instruction *Member = Group->getMember(i)) + InterleaveGroupMap.erase(Member); + + delete Group; + } }; /// \brief Drive the analysis of memory accesses in the loop @@ -306,6 +520,9 @@ /// generates run-time checks to prove independence. This is done by /// AccessAnalysis::canCheckPtrAtRT and the checks are maintained by the /// RuntimePointerCheck class. +/// +/// If the memory accesses can be vectorized, it will analyze the interleaved +/// access information, which is delegated to the InterleavedAccessInfo class. class LoopAccessInfo { public: /// This struct holds information about the memory runtime legality check that @@ -427,6 +644,16 @@ return DepChecker.getInstructionsForAccess(Ptr, isWrite); } + /// \brief Check if \p Instr belongs to any interleave group. + bool isAccessInterleaved(Instruction *Instr) const { + return InterleaveInfo.isAccessInterleaved(Instr); + } + + /// \brief Get the interleave group that \p Instr belongs to. + const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) const { + return InterleaveInfo.getInterleaveGroup(Instr); + } + /// \brief Print the information about the memory accesses in the loop. void print(raw_ostream &OS, unsigned Depth = 0) const; @@ -460,6 +687,10 @@ /// loop-independent and loop-carried dependences between memory accesses. MemoryDepChecker DepChecker; + /// \brief The interleaved access information contains groups of interleaved + /// accesses with the same stride and close to each other. + InterleavedAccessInfo InterleaveInfo; + /// \brief Number of memchecks required to prove independence of otherwise /// may-alias pointers unsigned NumComparisons; Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,18 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of the interleaved memory operation. + /// \p Opcode is the memory operation code + /// \p VecTy is the vector type of the interleaved access. + /// \p Delta is the interleave factor + /// \p Indices is the indices for interleaved load members (as interleaved + /// loads allow gaps) + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. 
/// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +594,11 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +757,14 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Delta, Indices, + Alignment, AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,14 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -522,6 +522,75 @@ return Cost; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) { + VectorType *VT = dyn_cast(VecTy); + assert(VT && "Expect vector types"); + + unsigned NumElts = VT->getNumElements(); + assert(Delta >= 2 && !(NumElts % Delta) && "Invalid Delta"); + + unsigned NumSubElts = NumElts / Delta; + VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts); + + // Firstly, the cost of load/store operation. + unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + + // Then plus the cost of interleave operation. + if (Opcode == Instruction::Load) { + // The interleave cost is similar to extract elements of the sub vectors + // from the wide vector, and insert them into sub vectors. + // + // E.g. Interleaved load of Delta 3 to 2 sub vectors (%v0, %v2): + // %vec = load <12 x i32>, <12 x i32> %ptr + // %v0 = shuffle %vec, undef, <0, 3, 6, 9> ; Index 0 + // %v2 = shuffle %vec, undef, <2, 5, 8, 11> ; Index 2 + // The cost is estimated as extract elements at 0, 2, 3, 5, 6, 8, 9, 11 + // from the <12 x i32> vector and insert them into two <4 x i32> vectors. 
+ + assert(Indices.size() <= Delta && + "Interleaved memory op has too many members"); + for (unsigned Index : Indices) { + assert(Index < Delta && "Invalid index for interleaved memory op"); + for (unsigned i = 0; i < NumSubElts; i++) + Cost += + getVectorInstrCost(Instruction::ExtractElement, VT, Index + i); + } + + unsigned InsSubCost = 0; + for (unsigned i = 0; i < NumSubElts; i++) + InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i); + + Cost += Indices.size() * InsSubCost; + } else { + // The interleave cost is extract each element from sub vectors, and + // insert them into the wide vector. + // + // E.g. Interleaved store with Delta 3 (For vector %v0, %v1, %v2): + // %v0_v1 = shuffle %v0, %v1, <0, 1, 2, 3, 4, 5, 6, 7> + // %v2_u = shuffle %v2, undef, <0, 1, 2, 3, u, u, u, u> + // %interleaved.vec = shuffle %v0_v1, %v2_u, + // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> + // store <12 x i32> %interleaved.vec, <12 x i32>* %ptr + // The cost is estimated as extract all elements from the 3 <4 x i32> + // vectors (Total 3 * 4 elements) and insert into the <12 x i32> vector. + + unsigned ExtSubCost = 0; + for (unsigned i = 0; i < NumSubElts; i++) + ExtSubCost += getVectorInstrCost(Instruction::ExtractElement, SubVT, i); + + Cost += Delta * ExtSubCost; + + for (unsigned i = 0; i < NumElts; i++) + Cost += getVectorInstrCost(Instruction::InsertElement, VT, i); + } + + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -41,6 +41,10 @@ VectorizerParams::VectorizationInterleave)); unsigned VectorizerParams::VectorizationInterleave; +static cl::opt EnableInterleaving( + "enable-interleaving", cl::init(false), cl::Hidden, + cl::desc("Enable analyzing interleaved accesses in a loop")); + static cl::opt RuntimeMemoryCheckThreshold( "runtime-memory-check-threshold", cl::Hidden, cl::desc("When performing memory disambiguation checks at runtime do not " @@ -783,21 +787,32 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); + // The number of iterations to be vectorized and unrolled. + unsigned NumIter = std::max(ForcedFactor * ForcedUnroll, 2U); + + unsigned Stride = std::abs(StrideAPtr); + // Safe when either one below is true + // Distance < Stride * TypeByteSize + // Distance >= TypeByteSize * NumIter * Stride + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIter * Stride)) { + DEBUG(dbgs() << "LAA: Failure because of positive distance " + << Val.getSExtValue() << '\n'); + return Dependence::Backward; + } - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { - DEBUG(dbgs() << "LAA: Failure because of Positive distance " - << Val.getSExtValue() << '\n'); + // Safe when positive distance is not greater than the max safe distance. 
+ if (Distance > MaxSafeDepDistBytes) { + DEBUG(dbgs() << "LAA: Failure because positive distance " + << Val.getSExtValue() << " is greater than max safe distance " + << MaxSafeDepDistBytes << "\n"); return Dependence::Backward; } - // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + // If Distance < Stride * TypeByteSize, it is always safe. just keep the + // current MaxSafeDepDistBytes. Otherwise, update the MaxSafeDepDistBytes. + if (Distance >= Stride * TypeByteSize) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && @@ -900,6 +915,241 @@ OS.indent(Depth + 2) << *Instrs[Destination] << "\n"; } +struct StrideDescriptor { + StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, unsigned Align) + : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} + + int Stride; + const SCEV *Scev; + unsigned Size; + unsigned Align; +}; + +void InterleavedAccessInfo::groupInterleavedAccesses( + ArrayRef InstrList, + SmallVector, 2> &WritePairs, + const ValueToValueMap &Strides) { + // Holds all the stride accesses. + SmallVector, 16> StrideAccess; + + auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (auto I : InstrList) { + LoadInst *LI = dyn_cast(I); + StoreInst *SI = dyn_cast(I); + assert((LI || SI) && "Invalid access instruction"); + + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + + // Only analyze non-unit stride accesses. + if (std::abs(Stride) < 2) + continue; + + const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + PointerType *PtrTy = dyn_cast(Ptr->getType()); + unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); + + // An alignment of 0 means target ABI alignment. + unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(PtrTy->getElementType()); + + StrideAccess.push_back( + std::make_pair(I, StrideDescriptor(Stride, Scev, Size, Align))); + } + + if (!StrideAccess.size()) + return; + + // Search the load-load/write-write pair B-A in bottom-up order and try to + // insert B into the interleave group of A according to 3 rules: + // 1. A and B have the same stride. + // 2. A and B have the same memory object size. + // 3. The distance of B to the leader of the group is less than the Delta. + // + // The bottom-up order can avoid breaking the WAW dependences between two + // pointers with the same base. + // E.g. A[i] = a; (1) + // A[i] = b; (2) + // A[i+1] = c (3) + // We form the group (2)+(3), so that (1) has to form groups with access + // above (1), which guarantees that (1) is always above (2). + for (auto I = StrideAccess.rbegin(), E = StrideAccess.rend(); I != E; ++I) { + Instruction *A = I->first; + StrideDescriptor DesA = I->second; + + InterleaveGroup *Group = getInterleaveGroup(A); + if (!Group) { + DEBUG(dbgs() << "LAA: Creating an interleave group with:" << *A << '\n'); + Group = createInterleaveGroup(A, DesA.Stride, DesA.Align); + } + + for (auto II = std::next(I); II != E; ++II) { + Instruction *B = II->first; + StrideDescriptor DesB = II->second; + + if (A->mayReadFromMemory() != B->mayReadFromMemory()) + continue; + + // Check the rule 1 and 2. + if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size) + continue; + + // Calculate the distance and prepare for the rule 3. 
+ const SCEVConstant *DistToA = + dyn_cast(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + if (!DistToA) + continue; + + int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); + // Read/write the same location. + if (!DistanceToA) + continue; + + int Size = static_cast(DesA.Size); + if (DistanceToA % Size) { + // Record the write-write pairs that could be broken after vectorizaton. + // + // E.g. char *Tmp = (char *)A; // A is int* + // int *B = (int *) (++Tmp); + // for(i = 0; i < n; i+=2) { + // A[i] = a; // (1) + // B[i] = b; // (2) + // A[i+1] = c; // (3) + // } + // The combine of (1) and (3) will be inserted below (2). + if (A->mayWriteToMemory()) + // Reuse the store-load forwarding check for whether there is overlap. + if (DepChecker->couldPreventStoreLoadForward(std::abs(DistanceToA), + Size)) + WritePairs.push_back(std::make_pair(B, A)); + continue; + } + + // Skip if B is already in a Group. + if (getInterleaveGroup(B)) + continue; + + // The index of B is the index of A plus the related index to A. + int IndexB = Group->getIndex(A) + DistanceToA / Size; + + // Try to insert B into the group. + if (Group->insertMember(B, IndexB, DesB.Align)) { + DEBUG(dbgs() << "LAA: Inserted:" << *B << '\n' + << " into the interleave group with" << *A << '\n'); + InterleaveGroupMap[B] = Group; + } + } + } +} + +void InterleavedAccessInfo::analyzeInterleaving( + const ValueToValueMap &Strides) { + // Holds load/store instructions in program order. + SmallVector InstrList; + + for (auto BB = TheLoop->block_begin(), BE = TheLoop->block_end(); BB != BE; + ++BB) { + bool IsPred = LoopAccessInfo::blockNeedsPredication(*BB, TheLoop, DT); + + for (auto I = (*BB)->begin(), E = (*BB)->end(); I != E; ++I) { + if (!isa(I) && !isa(I)) + continue; + // FIXME: As currently we can't handle predicated access, return directly. + if (IsPred) + return; + + InstrList.push_back(I); + } + } + + if (!InstrList.size()) + return; + + // Holds the write-write pairs that could be broken by the vectorization. + SmallVector, 2> WritePairs; + + DEBUG(dbgs() << "LAA: Analyzing interleaved accesses...\n"); + groupInterleavedAccesses(InstrList, WritePairs, Strides); + + // Filter eligible groups and set the insert position. We only keep the load + // group that has small gaps (less than half Delta) and the fully interleaved + // store group (has no gap). + for (auto I : InstrList) { + InterleaveGroup *Group = getInterleaveGroup(I); + if (!Group) + continue; + + if (I->mayReadFromMemory()) { + if (Group->getNumMembers() >= Group->getDelta() / 2) { + // Choose the first load in program order. + if (!Group->getInsertPos()) + Group->setInsertPos(I); + continue; + } + } else if (Group->getNumMembers() == Group->getDelta()) { + // Choose the last store in program order. + Group->setInsertPos(I); + continue; + } + + releaseGroup(Group); + } + + // Make sure the dependence of each write-write pair is still safe. + for (auto Pair : WritePairs) { + Instruction *A = Pair.first; + Instruction *B = Pair.second; + auto GroupA = getInterleaveGroup(A); + auto GroupB = getInterleaveGroup(B); + + if (!GroupA && !GroupB) + continue; + + Instruction *PosA = GroupA ? GroupA->getInsertPos() : A; + Instruction *PosB = GroupB ? GroupB->getInsertPos() : B; + + // Continue if neither A nor B will be moved to another position. 
+ if (PosA == A && PosB == B) + continue; + + bool IsSafe = true; + + for (auto I = InstrList.rbegin(), E = InstrList.rend(); I != E; ++I) { + if (*I == PosB) + break; + // If the position of A is after the position of B, it is unsafe. + if (*I == PosA) { + IsSafe = false; + break; + } + } + + if (IsSafe) + continue; + + if (PosA != A) + releaseGroup(GroupA); + if (PosB != B) + releaseGroup(GroupB); + } +} + +void InterleavedAccessInfo::print(raw_ostream &OS, unsigned Depth) const { + OS.indent(Depth) << "Interleaved access groups:\n"; + + SmallSet Seen; + for (auto I : InterleaveGroupMap) { + if (Seen.count(I.second)) + continue; + + // Print the group. + I.second->print(OS, Depth + 2); + OS << "\n"; + Seen.insert(I.second); + } +} + bool LoopAccessInfo::canAnalyzeLoop() { // We need to have a loop header. DEBUG(dbgs() << "LAA: Found a loop: " << @@ -1319,12 +1569,15 @@ const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, const ValueToValueMap &Strides) - : DepChecker(SE, L), NumComparisons(0), TheLoop(L), SE(SE), DL(DL), - TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), - MaxSafeDepDistBytes(-1U), CanVecMem(false), - StoreToLoopInvariantAddress(false) { + : DepChecker(SE, L), InterleaveInfo(SE, L, DT, &DepChecker), + NumComparisons(0), TheLoop(L), SE(SE), DL(DL), TLI(TLI), AA(AA), DT(DT), + LI(LI), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1U), + CanVecMem(false), StoreToLoopInvariantAddress(false) { if (canAnalyzeLoop()) analyzeLoop(Strides); + + if (CanVecMem && EnableInterleaving) + InterleaveInfo.analyzeInterleaving(Strides); } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { @@ -1354,6 +1607,9 @@ OS.indent(Depth) << "Store to invariant address was " << (StoreToLoopInvariantAddress ? "" : "not ") << "found in loop.\n"; + + InterleaveInfo.print(OS, Depth); + OS << "\n"; } const LoopAccessInfo & Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Delta, ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Delta, Indices, + Alignment, AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -34,6 +34,10 @@ // Variable uniformity checks are inspired by: // Karrenberg, R. and Hack, S. Whole Function Vectorization. // +// The interleaved access vectorization is based on the paper: +// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved +// Data for SIMD +// // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // @@ -351,6 +355,9 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + /// Generate a shuffle sequence that will reverse the vector Vec. 
virtual Value *reverseVector(Value *Vec); @@ -693,8 +700,16 @@ return LAI->getRuntimePointerCheck(); } - const LoopAccessInfo *getLAI() const { - return LAI; + const LoopAccessInfo *getLAI() const { return LAI; } + + /// \brief Check if \p Instr belongs to any interleaved access group. + bool isAccessInterleaved(Instruction *Instr) { + return LAI->isAccessInterleaved(Instr); + } + + /// \brief Get the interleaved access group that \p I belongs to. + const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { + return LAI->getInterleavedAccessGroup(Instr); } unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } @@ -1657,6 +1672,256 @@ "reverse"); } +// Get a mask to interleave \p NumVec vectors into a wide vector. +// I.E <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors, if VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index \p Start. +// I.E. +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consist of sequential integers +// starting from 0, The second part consist of UNDEFs. +// I.E. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, + unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = 0; i < NumInt; i++) + Mask.push_back(Builder.getInt32(i)); + + // Return directly if there is no undef value. + if (!NumUndef) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 2nd vector should +// not have more elements than the 1st vector. If the 2nd vector has less +// elements, extend it with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, + Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); + + if (NumElts1 > NumElts2) { + // Extend with UNDEFs. + Constant *ExtMask = + getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); + V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); + } + + Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); +} + +// Concatenate vectors in the given list. All vectors have the same type. 
+static Value *ConcatenateVectors(IRBuilder<> &Builder, + ArrayRef InputList) { + unsigned NumVec = InputList.size(); + assert(NumVec > 1 && "Should be at least two vectors"); + + SmallVector VecList; + SmallVector TmpList; + VecList.append(InputList.begin(), InputList.end()); + do { + for (unsigned i = 0; i < NumVec / 2; i++) + TmpList.push_back( + ConcatenateTwoVectors(Builder, VecList[2 * i], VecList[2 * i + 1])); + + // Push the last vector if the total number of vectors is odd. + if (NumVec % 2 != 0) + TmpList.push_back(VecList[NumVec - 1]); + + VecList.clear(); + VecList.append(TmpList.begin(), TmpList.end()); + NumVec = VecList.size(); + TmpList.clear(); + } while (NumVec > 1); + + return VecList[0]; +} + +// Try to vectorize the interleave group that \p Instr belongs to. +// +// E.g. Translate following interleaved load group (Delta is 3): +// for (i = 0; i < N; i+=3) { +// R = Pic[i]; // Member of index 0 +// G = Pic[i+1]; // Member of index 1 +// B = Pic[i+2]; // Member of index 2 +// ... // do something to R, G, B +// } +// To: +// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B +// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements +// +// Or translate following interleaved store group (Delta is 3): +// for (i = 0; i < N; i+=3) { +// ... do something to R, G, B +// Pic[i] = R; // Member of index 0 +// Pic[i+1] = G; // Member of index 1 +// Pic[i+2] = B; // Member of index 2 +// } +// To: +// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> +// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %interleaved.vec = shuffle %R_G.vec, %B_U.vec, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements +// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { + const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); + assert(Group && "Fail to get an interleaved access group."); + + // Skip if current instruction is not the insert position. + if (Instr != Group->getInsertPos()) + return; + + LoadInst *LI = dyn_cast(Instr); + StoreInst *SI = dyn_cast(Instr); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + unsigned Delta = Group->getDelta(); + Type *VecTy = VectorType::get(ScalarTy, Delta * VF); + Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + + // Prepare for the new pointers. + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + unsigned Index = Group->getIndex(Instr); + for (unsigned Part = 0; Part < UF; Part++) { + // Notice current instruction could be any index. Need to adjust the address + // to the member of index 0. + // + // E.g. a = A[i+1]; // Member of index 1 (Current instruction) + // b = A[i]; // Member of index 0 + // Current pointer is pointed to A[i+1], adjust it to A[i]. + // + // E.g. A[i+1] = a; // Member of index 1 + // A[i] = b; // Member of index 0 + // A[i+2] = c; // Member of index 2 (Current instruction) + // Current pointer is pointed to A[i+2], adjust it to A[i]. 
+ Value *NewPtr = + Builder.CreateExtractElement(PtrParts[Part], Builder.getInt32(0)); + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); + + // If the address is reverse, then the vector load/store needs to start at + // the last access of index 0. + // E.g. For the reversed access of interleaved load with 2 members: + // {A[i], A[i+1]}, {A[i-2], A[i-1]}, {A[i-4], A[i-3]}, {A[i-6], A[i-5]} + // The new pointer is now pointed to A[i]. Adjust it to A[i-6]. + if (Group->isReverse()) + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-(Delta * (VF - 1)))); + + // Cast to the vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + Value *UndefVec = UndefValue::get(VecTy); + + // Vectorize the interleaved load group. + if (LI) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *CallI = Builder.CreateAlignedLoad( + NewPtrs[Part], Group->getAlign(), "wide.vec"); + + for (unsigned i = 0; i < Delta; i++) { + Instruction *Member = Group->getMember(i); + + // Skip the gaps in the group. + if (!Member) + continue; + + Constant *StrideMask = getStridedMask(Builder, i, Delta, VF); + Value *StridedVec = Builder.CreateShuffleVector( + CallI, UndefVec, StrideMask, "strided.vec"); + + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); + } + + VectorParts &Entry = WidenMap.get(Member); + Entry[Part] = + Group->isReverse() ? reverseVector(StridedVec) : StridedVec; + } + + propagateMetadata(CallI, Instr); + } + return; + } + + // The sub vector type for current instruction. + VectorType *SubVT = VectorType::get(ScalarTy, VF); + + // Vectorize the interleaved store group. + for (unsigned Part = 0; Part < UF; Part++) { + // Collect the stored vector from each member. + SmallVector StoredVecs; + for (unsigned i = 0; i < Delta; i++) { + // Interleaved store group doesn't allow a gap, so each index has a member + Instruction *Member = Group->getMember(i); + assert(Member && "Fail to get a member from an interleaved store group"); + + Value *StoredVec = + getVectorValue(dyn_cast(Member)->getValueOperand())[Part]; + if (Group->isReverse()) + StoredVec = reverseVector(StoredVec); + + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != SubVT) + StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); + + StoredVecs.push_back(StoredVec); + } + + // Concatenate all vectors into a wide vector. + Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + + // Interleave the elements in the vector. + Constant *IMask = getInterleavedMask(Builder, VF, Delta); + Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, + "interleaved.vec"); + + Instruction *CallI = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlign()); + propagateMetadata(CallI, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1664,6 +1929,10 @@ assert((LI || SI) && "Invalid Load/Store instruction"); + // Try to vectorize the interleave group if this access is interleaved. + if (Legal->isAccessInterleaved(Instr)) + return vectorizeInterleaveGroup(Instr); + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? 
LI->getPointerOperand() : SI->getPointerOperand(); @@ -4575,6 +4844,41 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved access + if (Legal->isAccessInterleaved(I)) { + auto Group = Legal->getInterleavedAccessGroup(I); + assert(Group && "Fail to get an interleaved access group."); + + // Only calculate the cost once. + if (Group->getInsertPos() != I) + return 0; + + unsigned Delta = Group->getDelta(); + Type *WideVecTy = + VectorType::get(VectorTy->getVectorElementType(), + VectorTy->getVectorNumElements() * Delta); + + SmallVector Indices; + if (LI) { + // Collect the gaps in interleaved loads. No need to do this for + // interleaved stores, which only support fully interleave without gap. + for (unsigned i = 0; i < Delta; i++) + if (Group->getMember(i)) + Indices.push_back(i); + } + + // Calculate the cost of the whole interleaved group. + unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, + Group->getDelta(), Indices, + Group->getAlign(), AS); + + if (Group->isReverse()) + Cost += + Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/Analysis/LoopAccessAnalysis/analyze-interleaving.ll =================================================================== --- /dev/null +++ test/Analysis/LoopAccessAnalysis/analyze-interleaving.ll @@ -0,0 +1,544 @@ +; RUN: opt -S -loop-accesses -analyze -enable-interleaving=true -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check an interleaved load group of Delta 2 and an interleaved store group of +; Delta 2. + +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +%struct.ST2 = type { i32, i32 } +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check an interleaved load group of Delta 3 and an interleaved store group of +; Delta 3. 
+ +; int A[3072]; +; struct ST3 S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; S[i].x = X1 + 1; +; S[i].y = X2 + 2; +; S[i].z = X3 + 3; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 3: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK-NEXT: Index 2: +; CHECK: Interleave Group of Delta 3: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK-NEXT: Index 2: + +%struct.ST3 = type { i32, i32, i32 } +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check an interleaved load group of Delta 4. 
+ +; struct ST4{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST4 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 4: +; CHECK-NEXT: Index 0: %0 = load i32, i32* %x, align 4 +; CHECK-NEXT: Index 1: %1 = load i32, i32* %y, align 4 +; CHECK-NEXT: Index 2: %2 = load i32, i32* %z, align 4 +; CHECK-NEXT: Index 3: %3 = load i32, i32* %w, align 4 + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + +; Check on an interleaved store group of Delta 4. + +; int B[1024]; +; struct ST4 T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 4: +; CHECK-NEXT: Index 0: store i32 %add, i32* %x, align 4 +; CHECK-NEXT: Index 1: store i32 %mul, i32* %y, align 4 +; CHECK-NEXT: Index 2: store i32 %sub, i32* %z, align 4 +; CHECK-NEXT: Index 3: store i32 %add5, i32* %w, align 4 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 
1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check a reverse interleaved load group of Delta 4 and a reverse interleaved +; store group of Delta 4. + +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reverse_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +define void @test_reverse_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check an interleaved load group of Delta 2 with 1 gap (without the load on +; odd element). + +; void even_load(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NOT: Index 1 + +define void @even_load(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check an interleaved store group of Delta 2 with 1 gap. We should not identify +; such case as a group. 
+ +; void even_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; A[i] = B[i] + B[i+1]; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NOT: Index 0: store + +define void @even_store(i32* nocapture %A, i32* nocapture readonly %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add3, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check access groups identified from mixed loads and stores. + +; void mixed_load_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +define void @mixed_load_store(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %2, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %4, %3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check access groups identified from members with different kinds of type. 
+ +; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 + %1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { "unsafe-fp-math"="true" } + +; Check two interleaved store groups which will break write after write +; dependence. We should not identify them as interleaved groups. + +; void waw_dep(int *A) { +; char *Tmp = (char *)A; +; int *B = (int *) (++Tmp); +; for(int i = 0; i < 1024; i+=2) { +; A[i] = i; // (1) +; B[i] = i; // (2) +; A[i+1] = i + 1; // (3) +; B[i+1] = i + 1; // (4) +; } +; } +; +; The combine of (1)+(3) or (2)+(4) will break the dependence. + +; CHECK: Interleaved access groups: +; CHECK-NOT: Interleave Group: + +define void @waw_dep(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %incdec.ptr = getelementptr inbounds i8, i8* %0, i64 1 + %1 = bitcast i8* %incdec.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = trunc i64 %indvars.iv to i32 + store i32 %2, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + store i32 %2, i32* %arrayidx2, align 4 + %3 = or i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %3 + %4 = trunc i64 %3 to i32 + store i32 %4, i32* %arrayidx5, align 4 + %arrayidx9 = getelementptr inbounds i32, i32* %1, i64 %3 + store i32 %4, i32* %arrayidx9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check two interleaved store groups which won't break write after write +; dependence. + +; void no_waw_dep(int *A) { +; char *Tmp = (char *)A; +; int *B = (int *) (++Tmp); +; for(int i = 0; i < 1024; i+=2) { +; A[i] = i; // (1) +; A[i+1] = i + 1; // (2) +; B[i] = i; // (3) +; B[i+1] = i + 1; // (4) +; } +; } +; +; The combine of (1)+(2) or (3)+(4) won't break the dependence. 
+ +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: store i32 %2 +; CHECK-NEXT: Index 1: store i32 %4 +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: store i32 %2 +; CHECK-NEXT: Index 1: store i32 %4 + +define void @no_waw_dep(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %incdec.ptr = getelementptr inbounds i8, i8* %0, i64 1 + %1 = bitcast i8* %incdec.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = trunc i64 %indvars.iv to i32 + store i32 %2, i32* %arrayidx, align 4 + %3 = or i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %3 + %4 = trunc i64 %3 to i32 + store i32 %4, i32* %arrayidx3, align 4 + %arrayidx5 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + store i32 %2, i32* %arrayidx5, align 4 + %arrayidx9 = getelementptr inbounds i32, i32* %1, i64 %3 + store i32 %4, i32* %arrayidx9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,5 +1,5 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaving=true | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -enable-interleaving=true | FileCheck %s --check-prefix=FORCE-VEC target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; 
FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/interleaved-accesses.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -0,0 +1,427 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaving=true -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check vectorization on an interleaved load group of Delta 2 and an interleaved +; store group of Delta 2. + +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +%struct.ST2 = type { i32, i32 } +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of Delta 3 and an interleaved +; store group of Delta 3. 
+
+; int A[3072];
+; struct ST3 S[1024];
+; void test_struct_array_load3_store3() {
+; int *ptr = A;
+; for (int i = 0; i < 1024; i++) {
+; int X1 = *ptr++;
+; int X2 = *ptr++;
+; int X3 = *ptr++;
+; S[i].x = X1 + 1;
+; S[i].y = X2 + 2;
+; S[i].z = X3 + 3;
+; }
+; }
+
+; CHECK-LABEL: @test_struct_array_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
+
+%struct.ST3 = type { i32, i32, i32 }
+@A = common global [3072 x i32] zeroinitializer, align 4
+@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
+
+define void @test_struct_array_load3_store3() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
+  %0 = load i32, i32* %ptr.016, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
+  %2 = load i32, i32* %incdec.ptr1, align 4
+  %add = add nsw i32 %0, 1
+  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %add3 = add nsw i32 %1, 2
+  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
+  store i32 %add3, i32* %y, align 4
+  %add6 = add nsw i32 %2, 3
+  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
+  store i32 %add6, i32* %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Check vectorization on an interleaved load group of Delta 4.
+ +; struct ST4{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST4 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + +; Check vectorization on an interleaved store group of Delta 4. 
+ +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on a reverse interleaved load group of Delta 2 and +; a reverse interleaved store group of Delta 2. 
+ +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on an interleaved load group of Delta 2 with 1 gap +; (missing the load of odd elements). + +; void even_load(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK-LABEL: @even_load( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shl nsw <4 x i32> %strided.vec, +define void @even_load(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on interleaved access groups identified from mixed +; loads/stores. 
+; void mixed_load_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + +; CHECK-LABEL: @mixed_load_store( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec +define void @mixed_load_store(i32* nocapture readonly %A, i32* nocapture %B) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %2, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %4, %3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on interleaved access groups with members having different +; kinds of type. + +; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + +; CHECK-LABEL: @int_float_struct( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float> +; CHECK: add nsw <4 x i32> +; CHECK: fadd fast <4 x float> + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 + %1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { "unsafe-fp-math"="true" }
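For reference, a minimal sketch (not part of the patch) of the IR shape the factor-2 checks above expect: the vectorizer replaces the scalar group members with one wide load, de-interleaving shufflevectors, and a re-interleaving shufflevector feeding one wide store. The value names %ptr, %r0, %r1 and %ptr2 are illustrative placeholders; only the wide memory operations and the shuffle masks matter.

  %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
  ; De-interleave: extract the even and odd elements (members 0 and 1 of the group).
  %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %strided.vec1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; ...widened arithmetic on %strided.vec0 and %strided.vec1 producing %r0 and %r1...
  ; Re-interleave the two results and emit a single wide store.
  %interleaved.vec = shufflevector <4 x i32> %r0, <4 x i32> %r1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr2, align 4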