Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -237,6 +237,10 @@
   SmallVector<Instruction *, 4> getInstructionsForAccess(Value *Ptr, bool isWrite) const;
 
+  /// \brief Check whether the data dependence could prevent store-load
+  /// forwarding.
+  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+
 private:
   ScalarEvolution *SE;
   const Loop *InnermostLoop;
@@ -286,10 +290,226 @@
   Dependence::DepType isDependent(const MemAccessInfo &A, unsigned AIdx,
                                   const MemAccessInfo &B, unsigned BIdx,
                                   const ValueToValueMap &Strides);
+};
 
-  /// \brief Check whether the data dependence could prevent store-load
-  /// forwarding.
-  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+/// \brief The group of interleaved loads/stores sharing the same stride and
+/// close to each other.
+///
+/// Each member in this group has an index starting from 0, and the largest
+/// index is always less than Delta (the interleave factor), which is the
+/// absolute value of the access stride.
+///
+/// E.g. Interleaved load group of Delta 4 (with a gap at index 2):
+///   for (unsigned i = 0; i < 1024; i+=4) {
+///     a = A[i];     // Member of index 0
+///     b = A[i+1];   // Member of index 1
+///     d = A[i+3];   // Member of index 3
+///     ...
+///   }
+///
+/// Interleaved store group of Delta 4:
+///   for (unsigned i = 0; i < 1024; i+=4) {
+///     ...
+///     A[i]   = a;   // Member of index 0
+///     A[i+1] = b;   // Member of index 1
+///     A[i+2] = c;   // Member of index 2
+///     A[i+3] = d;   // Member of index 3
+///   }
+///
+/// Note: the interleaved load group could have gaps (missing members), but
+/// the interleaved store group doesn't allow gaps.
+class InterleaveGroup {
+public:
+  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
+      : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(nullptr) {
+    assert(Align && "The alignment should be non-zero");
+
+    Delta = std::abs(Stride);
+    assert(Delta > 1 && "Invalid interleave factor");
+
+    Reverse = Stride < 0;
+    Members[0] = Instr;
+  }
+
+  InterleaveGroup()
+      : Delta(0), Reverse(false), Align(0), SmallestKey(0), LargestKey(0),
+        InsertPos(nullptr) {}
+
+  bool isReverse() const { return Reverse; }
+  unsigned getDelta() const { return Delta; }
+  unsigned getAlign() const { return Align; }
+  unsigned getNumMembers() const { return Members.size(); }
+
+  /// \brief Try to insert a new member \p Instr with index \p Index and
+  /// alignment \p NewAlign.
+  ///
+  /// \returns false if the instruction doesn't belong to the group.
+  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
+    assert(NewAlign && "The new member's alignment should be non-zero");
+
+    int Key = Index + SmallestKey;
+
+    // Skip if there is already a member with the same index.
+    if (Members.count(Key))
+      return false;
+
+    if (Key > LargestKey) {
+      // The largest index is always less than the Delta.
+      if (Index >= Delta)
+        return false;
+
+      LargestKey = Key;
+    } else if (Key < SmallestKey) {
+      // The distance between the largest and the smallest index must stay
+      // less than the Delta.
+      if (LargestKey - Key >= Delta)
+        return false;
+
+      SmallestKey = Key;
+    }
+
+    // It's always safe to select the minimum alignment.
+    Align = std::min(Align, NewAlign);
+    Members[Key] = Instr;
+    return true;
+  }
+
+  /// \brief Get the member with the given index \p Index.
+  ///
+  /// \returns nullptr if the group doesn't contain such a member.
+  Instruction *getMember(unsigned Index) const {
+    int Key = SmallestKey + Index;
+    if (!Members.count(Key))
+      return nullptr;
+
+    return Members.find(Key)->second;
+  }
+
+  /// \brief Get the index for the given member. Unlike the key in the member
+  /// map, the index starts from 0.
+  unsigned getIndex(Instruction *Instr) const {
+    for (auto I : Members)
+      if (I.second == Instr)
+        return I.first - SmallestKey;
+
+    llvm_unreachable("InterleaveGroup contains no such member");
+  }
+
+  Instruction *getInsertPos() const { return InsertPos; }
+  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
+
+  /// \brief Print the group.
+  void print(raw_ostream &OS, unsigned Depth) const {
+    OS.indent(Depth) << "Interleave Group of Delta " << Delta << ":\n";
+
+    for (int i = 0; i < Delta; i++) {
+      Instruction *Member = getMember(i);
+      if (Member)
+        OS.indent(Depth + 2) << "Index " << i << ":" << *Member << "\n";
+    }
+  }
+
+private:
+  int Delta; // Interleave Factor.
+  bool Reverse;
+  unsigned Align;
+  DenseMap<int, Instruction *> Members;
+  int SmallestKey;
+  int LargestKey;
+
+  // To avoid breaking dependences, an interleaved access should be inserted
+  // at either the first load or the last store in program order.
+  // E.g. %even = load i32             // Insert Position
+  //      %add = add i32 %even         // Use of %even
+  //      %odd = load i32
+  //
+  //      store i32 %even
+  //      %odd = add i32               // Def of %odd
+  //      store i32 %odd               // Insert Position
+  Instruction *InsertPos;
+};
+
+/// \brief Drive the analysis of interleaved memory accesses in the loop.
+///
+/// Use this class to analyze interleaved accesses only when the memory
+/// dependence check has shown that the loop can be vectorized; otherwise the
+/// analysis is meaningless, as vectorizing the interleaved accesses would not
+/// be safe.
+///
+/// The analysis collects interleave groups and records the relationships
+/// between the members and their groups in a map.
+class InterleavedAccessInfo {
+public:
+  InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT,
+                        MemoryDepChecker *DepChecker)
+      : SE(SE), TheLoop(L), DT(DT), DepChecker(DepChecker) {}
+
+  ~InterleavedAccessInfo() {
+    SmallSet<InterleaveGroup *, 4> DelSet;
+    // Collect all the group pointers first to avoid deleting a pointer twice.
+    for (auto &I : InterleaveGroupMap)
+      DelSet.insert(I.second);
+    for (auto *Ptr : DelSet)
+      delete Ptr;
+  }
+
+  /// \brief Analyze the interleaved accesses. Substitute symbolic strides
+  /// using \p Strides.
+  void analyzeInterleaving(const ValueToValueMap &Strides);
+
+  /// \brief Check if \p Instr belongs to any interleave group.
+  bool isAccessInterleaved(Instruction *Instr) const {
+    return InterleaveGroupMap.count(Instr);
+  }
+
+  /// \brief Get the interleave group that \p Instr belongs to.
+  ///
+  /// \returns nullptr if \p Instr does not belong to any group.
+  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
+    if (InterleaveGroupMap.count(Instr))
+      return InterleaveGroupMap.find(Instr)->second;
+    return nullptr;
+  }
+
+  /// \brief Print the interleave groups.
+  void print(raw_ostream &OS, unsigned Depth) const;
+
+private:
+  ScalarEvolution *SE;
+  Loop *TheLoop;
+  DominatorTree *DT;
+  MemoryDepChecker *DepChecker;
+
+  /// Contains the relationships between the members and the interleave group.
+  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
+
+  /// \brief Group the interleaved loads/stores. Record the write-write pairs
+  /// whose dependences may be broken by the vectorization.
+  void groupInterleavedAccesses(
+      ArrayRef<Instruction *> InstrList,
+      SmallVector<std::pair<Instruction *, Instruction *>, 2> &WritePairs,
+      const ValueToValueMap &Strides);
+
+  /// \brief Create a new interleave group with the given instruction \p Instr,
+  /// stride \p Stride and alignment \p Align.
+  ///
+  /// \returns the newly created interleave group.
+  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
+                                         unsigned Align) {
+    assert(!InterleaveGroupMap.count(Instr) &&
+           "Already in an interleaved access group");
+    InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
+    return InterleaveGroupMap[Instr];
+  }
+
+  /// \brief Release the group and remove all the relationships.
+  void releaseGroup(InterleaveGroup *Group) {
+    for (unsigned i = 0; i < Group->getDelta(); i++) {
+      Instruction *Member = Group->getMember(i);
+      if (Member)
+        InterleaveGroupMap.erase(Member);
+    }
+
+    delete Group;
+  }
 };
 
 /// \brief Drive the analysis of memory accesses in the loop
@@ -306,6 +526,9 @@
 /// generates run-time checks to prove independence. This is done by
 /// AccessAnalysis::canCheckPtrAtRT and the checks are maintained by the
 /// RuntimePointerCheck class.
+///
+/// If the memory accesses can be vectorized, it will analyze the interleaved
+/// access information, which is delegated to the InterleavedAccessInfo class.
 class LoopAccessInfo {
 public:
   /// This struct holds information about the memory runtime legality check that
@@ -427,6 +650,16 @@
     return DepChecker.getInstructionsForAccess(Ptr, isWrite);
   }
 
+  /// \brief Check if \p Instr belongs to any interleave group.
+  bool isAccessInterleaved(Instruction *Instr) const {
+    return InterleaveInfo.isAccessInterleaved(Instr);
+  }
+
+  /// \brief Get the interleave group that \p Instr belongs to.
+  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) const {
+    return InterleaveInfo.getInterleaveGroup(Instr);
+  }
+
   /// \brief Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
@@ -460,6 +693,10 @@
   /// loop-independent and loop-carried dependences between memory accesses.
   MemoryDepChecker DepChecker;
 
+  /// \brief The interleaved access information contains groups of interleaved
+  /// accesses with the same stride and close to each other.
+  InterleavedAccessInfo InterleaveInfo;
+
   /// \brief Number of memchecks required to prove independence of otherwise
   /// may-alias pointers
   unsigned NumComparisons;
Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -444,6 +444,15 @@
   unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                  unsigned AddressSpace) const;
 
+  /// \return The cost of the interleaved memory operation.
+  /// \p Opcode is the opcode of the wide load or store.
+  /// \p VecTy is the wide vector type of the whole interleaved access.
+  /// \p Delta is the interleave factor (the number of members in the group).
+  /// \p Alignment and \p AddressSpace describe the wide memory operation.
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Delta, unsigned Alignment,
+                                      unsigned AddressSpace) const;
+
   /// \brief Calculate the cost of performing a vector reduction.
 ///
 /// This is the cost of reducing the vector value of type \p Ty to a scalar
@@ -582,6 +591,10 @@
   virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                          unsigned Alignment,
                                          unsigned AddressSpace) = 0;
+  virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                              unsigned Delta,
+                                              unsigned Alignment,
+                                              unsigned AddressSpace) = 0;
   virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
                                     bool IsPairwiseForm) = 0;
   virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
@@ -740,6 +753,12 @@
                              unsigned AddressSpace) override {
     return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
   }
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Delta, unsigned Alignment,
+                                      unsigned AddressSpace) override {
+    return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Delta, Alignment,
+                                           AddressSpace);
+  }
   unsigned getReductionCost(unsigned Opcode, Type *Ty,
                             bool IsPairwiseForm) override {
     return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -300,6 +300,12 @@
     return 1;
   }
 
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Delta, unsigned Alignment,
+                                      unsigned AddressSpace) {
+    return Delta;
+  }
+
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                  ArrayRef<Type *> Tys) {
     return 1;
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -522,6 +522,51 @@
     return Cost;
   }
 
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Delta, unsigned Alignment,
+                                      unsigned AddressSpace) {
+    VectorType *VT = dyn_cast<VectorType>(VecTy);
+    assert(VT && "Expect vector types");
+
+    unsigned NumElts = VT->getNumElements();
+    assert(Delta >= 2 && !(NumElts % Delta) && "Invalid Delta");
+
+    // The memory op cost is the cost of the load/store on the whole wide
+    // vector.
+    unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
+
+    unsigned NumSubElts = NumElts / Delta;
+    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
+
+    // Then we need to calculate the cost of the interleave operations.
+    if (Opcode == Instruction::Load) {
+      // The cost is to extract all elements from the wide vector and insert
+      // them into the sub vectors.
+
+      for (unsigned i = 0; i < NumElts; i++)
+        Cost += getVectorInstrCost(Instruction::ExtractElement, VT, i);
+
+      unsigned InsSubCost = 0;
+      for (unsigned i = 0; i < NumSubElts; i++)
+        InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i);
+
+      Cost += Delta * InsSubCost;
+    } else {
+      // The cost is to extract each element from the sub vectors and insert
+      // them into the wide vector.
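+      //
+      // Illustrative arithmetic with assumed unit extract/insert costs (not a
+      // claim about any particular target): for a <8 x i32> store group with
+      // Delta 2, the two <4 x i32> sub vectors contribute
+      // Delta * NumSubElts = 8 extracts and building the wide vector takes
+      // NumElts = 8 inserts, all added on top of the wide store's
+      // getMemoryOpCost.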
+ + unsigned ExtSubCost = 0; + for (unsigned i = 0; i < NumSubElts; i++) + ExtSubCost = getVectorInstrCost(Instruction::ExtractElement, SubVT, i); + Cost += Delta * ExtSubCost; + + for (unsigned i = 0; i < NumElts; i++) + Cost += getVectorInstrCost(Instruction::InsertElement, VT, i); + } + + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -41,6 +41,10 @@ VectorizerParams::VectorizationInterleave)); unsigned VectorizerParams::VectorizationInterleave; +static cl::opt EnableInterleaving( + "enable-interleaving", cl::init(false), cl::Hidden, + cl::desc("Enable analyzing interleaved accesses in a loop")); + static cl::opt RuntimeMemoryCheckThreshold( "runtime-memory-check-threshold", cl::Hidden, cl::desc("When performing memory disambiguation checks at runtime do not " @@ -783,21 +787,32 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); + // The number of iterations to be vectorized and unrolled. + unsigned NumIter = std::max(ForcedFactor * ForcedUnroll, 2U); + + unsigned Stride = std::abs(StrideAPtr); + // Safe when either one below is true + // Distance < Stride * TypeByteSize + // Distance >= TypeByteSize * NumIter * Stride + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIter * Stride)) { + DEBUG(dbgs() << "LAA: Failure because of positive distance " + << Val.getSExtValue() << '\n'); + return Dependence::Backward; + } - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { - DEBUG(dbgs() << "LAA: Failure because of Positive distance " - << Val.getSExtValue() << '\n'); + // Safe when positive distance is not greater than the max safe distance. + if (Distance > MaxSafeDepDistBytes) { + DEBUG(dbgs() << "LAA: Failure because positive distance " + << Val.getSExtValue() << " is greater than max safe distance " + << MaxSafeDepDistBytes << "\n"); return Dependence::Backward; } - // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + // If Distance < Stride * TypeByteSize, it is always safe. just keep the + // current MaxSafeDepDistBytes. Otherwise, update the MaxSafeDepDistBytes. + if (Distance >= Stride * TypeByteSize) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && @@ -900,6 +915,246 @@ OS.indent(Depth + 2) << *Instrs[Destination] << "\n"; } +struct StrideDescriptor { + StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, unsigned Align) + : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} + + StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {} + + int Stride; + const SCEV *Scev; + unsigned Size; + unsigned Align; +}; + +void InterleavedAccessInfo::groupInterleavedAccesses( + ArrayRef InstrList, + SmallVector, 2> &WritePairs, + const ValueToValueMap &Strides) { + // Holds all the stride accesses. 
+ SmallVector, 32> StrideAccess; + + auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (auto I : InstrList) { + // // FIXME: To handle predicted loads/stores. + // if (blockNeedsPredication(I->getParent(), TheLoop, DT)) + // continue; + + LoadInst *LI = dyn_cast(I); + StoreInst *SI = dyn_cast(I); + assert((LI || SI) && "Invalid access instructon"); + + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + + // Only analyze non-unit stride accesses. + if (std::abs(Stride) < 2) + continue; + + const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + PointerType *PtrTy = dyn_cast(Ptr->getType()); + unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); + + // An alignment of 0 means target ABI alignment. + unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(PtrTy->getElementType()); + + StrideAccess.push_back( + std::make_pair(I, StrideDescriptor(Stride, Scev, Size, Align))); + } + + if (!StrideAccess.size()) + return; + + // Search the load-load/write-write pair B-A in bottom-up order and try to + // insert B into the interleave group of A according to 3 rules: + // 1. A and B have the same stride. + // 2. A and B have the same memory object size. + // 3. The distance of B to the leader of the group is less than the Delta. + // + // The bottom-up order can avoid breaking the WAW dependences betweem two + // pointers with the same base. + // E.g. A[i] = a; (1) + // A[i] = b; (2) + // A[i+1] = c (3) + // We form the group (2)+(3), so that (1) has to form groups with access + // above (1), which guarantees that (1) is always above (2). + for (auto I = StrideAccess.rbegin(), E = StrideAccess.rend(); I != E; ++I) { + Instruction *A = I->first; + StrideDescriptor DesA = I->second; + + InterleaveGroup *Group = getInterleaveGroup(A); + if (!Group) { + DEBUG(dbgs() << "LAA: Creating an interleave group with:" << *A << '\n'); + Group = createInterleaveGroup(A, DesA.Stride, DesA.Align); + } + + for (auto II = std::next(I); II != E; ++II) { + Instruction *B = II->first; + StrideDescriptor DesB = II->second; + + if (A->mayReadFromMemory() != B->mayReadFromMemory()) + continue; + + // Check the rule 1 and 2. + if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size) + continue; + + // Calculate the distance and prepare for the rule 3. + const SCEVConstant *DistToA = dyn_cast(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + if (!DistToA) + continue; + + int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); + // Read/write the same location. + if (!DistanceToA) + continue; + + int Size = static_cast(DesA.Size); + if (DistanceToA % Size) { + // Record the write-write pairs that could be broken after vectorizaton. + // + // E.g. char *Tmp = (char *)A; // A is int* + // int *B = (int *) (++Tmp); + // for(i = 0; i < n; i+=2) { + // A[i] = a; // (1) + // B[i] = b; // (2) + // A[i+1] = c; // (3) + // } + // The combine of (1) and (3) will be inserted below (2). + if (A->mayWriteToMemory()) + // Reuse the store-load forwarding check for whether there is overlap. + if (DepChecker->couldPreventStoreLoadForward(std::abs(DistanceToA), + Size)) + WritePairs.push_back(std::make_pair(B, A)); + continue; + } + + // Skip if B is already in a Group. + if (getInterleaveGroup(B)) + continue; + + // The index of B is the index of A plus the related index to A. + int IndexB = Group->getIndex(A) + DistanceToA / Size; + + // Try to insert B into the group. 
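+      // Worked example with assumed values (purely illustrative): if A is the
+      // member of index 1 in its group, DesA.Size is 4 and B accesses memory
+      // 8 bytes above A, then DistanceToA / Size is 2 and IndexB becomes 3,
+      // so B is offered to the group as the member of index 3.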
+ if (Group->insertMember(B, IndexB, DesB.Align)) { + DEBUG(dbgs() << "LAA: Inserted a member:" << *B << '\n' + << " into the interleave group with" << *A << '\n'); + InterleaveGroupMap[B] = Group; + } + } + } +} + +void InterleavedAccessInfo::analyzeInterleaving( + const ValueToValueMap &Strides) { + // Holds load/store instructions in program order. + SmallVector InstrList; + + for (auto BB = TheLoop->block_begin(), BE = TheLoop->block_end(); BB != BE; + ++BB) { + bool IsPred = LoopAccessInfo::blockNeedsPredication(*BB, TheLoop, DT); + + for (auto I = (*BB)->begin(), E = (*BB)->end(); I != E; ++I) { + if (!isa(I) && !isa(I)) + continue; + // FIXME: As currently we can't handle predicated access, return directly. + if (IsPred) + return; + + InstrList.push_back(I); + } + } + + if (!InstrList.size()) + return; + + // Holds the write-write pairs that could be broken by the vectorization. + SmallVector, 2> WritePairs; + + DEBUG(dbgs() << "LAA: Analyzing interleaved accesses...\n"); + groupInterleavedAccesses(InstrList, WritePairs, Strides); + + // Filter eligible groups and set the insert position. We only keep the load + // group that has small gaps (less than half Delta) and the fully interleaved + // store group (has no gap). + for (auto I : InstrList) { + InterleaveGroup *Group = getInterleaveGroup(I); + if (!Group) + continue; + + if (I->mayReadFromMemory()) { + if (Group->getNumMembers() >= Group->getDelta() / 2) { + // Choose the first member in program order. + if (!Group->getInsertPos()) + Group->setInsertPos(I); + continue; + } + } else if (Group->getNumMembers() == Group->getDelta()) { + // Choose the last member in program order. + Group->setInsertPos(I); + continue; + } + + releaseGroup(Group); + } + + // Make sure the dependence of each write-write pair is still safe. + for (auto Pair : WritePairs) { + Instruction *A = Pair.first; + Instruction *B = Pair.second; + auto GroupA = getInterleaveGroup(A); + auto GroupB = getInterleaveGroup(B); + + if (!GroupA && !GroupB) + continue; + + Instruction *PosA = GroupA ? GroupA->getInsertPos() : A; + Instruction *PosB = GroupB ? GroupB->getInsertPos() : B; + + // No need to check if both A and B won't be moved to other position. + if (PosA == A && PosB == B) + continue; + + bool IsSafe = true; + + for (auto I = InstrList.rbegin(), E = InstrList.rend(); I != E; ++I) { + if (*I == PosB) + break; + // If the position of A is after the position of B, it is unsafe. + if (*I == PosA) { + IsSafe = false; + break; + } + } + + if (IsSafe) + continue; + + if (PosA != A) + releaseGroup(GroupA); + if (PosB != B) + releaseGroup(GroupB); + } +} + +void InterleavedAccessInfo::print(raw_ostream &OS, unsigned Depth) const { + OS.indent(Depth) << "Interleaved access groups:\n"; + + SmallSet Seen; + for (auto I : InterleaveGroupMap) { + if (Seen.count(I.second)) + continue; + + // Print the group. + I.second->print(OS, Depth + 2); + OS << "\n"; + Seen.insert(I.second); + } +} + bool LoopAccessInfo::canAnalyzeLoop() { // We need to have a loop header. DEBUG(dbgs() << "LAA: Found a loop: " << @@ -1199,7 +1454,7 @@ } bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, - DominatorTree *DT) { + DominatorTree *DT) { assert(TheLoop->contains(BB) && "Unknown block used"); // Blocks that do not dominate the latch need predication. 
@@ -1319,12 +1574,15 @@ const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, const ValueToValueMap &Strides) - : DepChecker(SE, L), NumComparisons(0), TheLoop(L), SE(SE), DL(DL), - TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), - MaxSafeDepDistBytes(-1U), CanVecMem(false), - StoreToLoopInvariantAddress(false) { + : DepChecker(SE, L), InterleaveInfo(SE, L, DT, &DepChecker), + NumComparisons(0), TheLoop(L), SE(SE), DL(DL), TLI(TLI), AA(AA), DT(DT), + LI(LI), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1U), + CanVecMem(false), StoreToLoopInvariantAddress(false) { if (canAnalyzeLoop()) analyzeLoop(Strides); + + if (CanVecMem && EnableInterleaving) + InterleaveInfo.analyzeInterleaving(Strides); } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { @@ -1354,6 +1612,9 @@ OS.indent(Depth) << "Store to invariant address was " << (StoreToLoopInvariantAddress ? "" : "not ") << "found in loop.\n"; + + InterleaveInfo.print(OS, Depth); + OS << "\n"; } const LoopAccessInfo & Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Delta, unsigned Alignment, + unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Delta, Alignment, + AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -34,6 +34,10 @@ // Variable uniformity checks are inspired by: // Karrenberg, R. and Hack, S. Whole Function Vectorization. // +// The interleaved access vectorization is based on the paper: +// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved +// Data for SIMD +// // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // @@ -351,6 +355,9 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -693,8 +700,16 @@ return LAI->getRuntimePointerCheck(); } - const LoopAccessInfo *getLAI() const { - return LAI; + const LoopAccessInfo *getLAI() const { return LAI; } + + /// \brief Check if \p Instr belongs to any interleave group. + bool isAccessInterleaved(Instruction *Instr) { + return LAI->isAccessInterleaved(Instr); + } + + /// \brief Get the interleaved access group that \p I belongs to. + const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { + return LAI->getInterleavedAccessGroup(Instr); } unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } @@ -1657,6 +1672,256 @@ "reverse"); } +// Get a mask to interleave a wide vector concatenated from \p NumVec small +// vectors. +// I.E <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> +// E.g. 
For 2 interleaved vectors, if VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index \p Start. +// I.E. +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consist of sequential integers +// starting from 0, The second part consist of UNDEFs. +// I.E. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, + unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = 0; i < NumInt; i++) + Mask.push_back(Builder.getInt32(i)); + + // Return directly if there is no undef value. + if (!NumUndef) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 2st vector should +// not have more elements than the 1st vector. If the 1st vector has more +// elements, extend the 2nd vector with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, + Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); + + if (NumElts1 > NumElts2) { + // Extend with undefs to have the same number of elements as the V1. + Constant *ExtMask = + getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); + V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); + } + + Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); +} + +// Concatenate vectors in the given list. All vectors have the same type. +static Value *ConcatenateVectors(IRBuilder<> &Builder, ArrayRef InputList) { + unsigned NumVec = InputList.size(); + assert(NumVec > 1 && "Should be at least two vectors"); + + SmallVector VecList; + SmallVector TmpList; + VecList.append(InputList.begin(), InputList.end()); + do { + for (unsigned i = 0; i < NumVec / 2; i++) + TmpList.push_back( + ConcatenateTwoVectors(Builder, VecList[2 * i], VecList[2 * i + 1])); + + // Push the last vector if the total number of vectors is odd. + if (NumVec % 2 != 0) + TmpList.push_back(VecList[NumVec - 1]); + + VecList.clear(); + VecList.append(TmpList.begin(), TmpList.end()); + NumVec = VecList.size(); + TmpList.clear(); + } while (NumVec > 1); + + return VecList[0]; +} + +// Try to vectorize the interleave group that \p Instr belongs to. +// +// E.g. Translate following interleaved load group (Delta is 3): +// for (i = 0; i < N; i+=3) { +// R = Pic[i]; // Member of index 0 +// G = Pic[i+1]; // Member of index 1 +// B = Pic[i+2]; // Member of index 2 +// ... 
// do something to R, G, B +// } +// To: +// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B +// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements +// +// Or translate following interleaved store group (Delta is 3): +// for (i = 0; i < N; i+=3) { +// ... do something to R, G, B +// Pic[i] = R; // Member of index 0 +// Pic[i+1] = G; // Member of index 1 +// Pic[i+2] = B; // Member of index 2 +// } +// To: +// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> +// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %interleaved.vec = shuffle %R_G.vec, %B_U.vec, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements +// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { + const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); + assert(Group && "Fail to get an interleaved access group."); + + // Skip if current instruction is not the insert position. + if (Instr != Group->getInsertPos()) + return; + + LoadInst *LI = dyn_cast(Instr); + StoreInst *SI = dyn_cast(Instr); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + unsigned Delta = Group->getDelta(); + Type *VecTy = VectorType::get(ScalarTy, Delta * VF); + Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + + // Prepare for the new pointers. + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + unsigned Index = Group->getIndex(Instr); + for (unsigned Part = 0; Part < UF; Part++) { + // Notice current instruction could be any index. Need to adjust the address + // to the member of index 0. + // + // E.g. a = A[i+1]; // Member of index 1 (Current instruction) + // b = A[i]; // Member of index 0 + // Current pointer is pointed to A[i+1], adjust it to A[i]. + // + // E.g. A[i+1] = a; // Member of index 1 + // A[i] = b; // Member of index 0 + // A[i+2] = c; // Member of index 2 (Current instruction) + // Current pointer is pointed to A[i+2], adjust it to A[i]. + Value *NewPtr = + Builder.CreateExtractElement(PtrParts[Part], Builder.getInt32(0)); + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); + + // If the address is reversed, then the wide load/store needs to start at + // the last access of index 0. + // E.g. For the reversed access of interleaved load with 2 members: + // {A[i], A[i+1]}, {A[i-2], A[i-1]}, {A[i-4], A[i-3]}, {A[i-6], A[i-5]} + // The new pointer is now pointed to A[i]. Adjust it to A[i-6]. + if (Group->isReverse()) + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-(Delta * (VF - 1)))); + + // Cast to the vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + Value *UndefVec = UndefValue::get(VecTy); + + // Vectorize the interleaved load group. + if (LI) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *CallI = Builder.CreateAlignedLoad( + NewPtrs[Part], Group->getAlign(), "wide.vec"); + + for (unsigned i = 0; i < Delta; i++) { + Instruction *Member = Group->getMember(i); + + // Skip the gaps in the group. 
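+        // A gap means no scalar instruction uses this index, so no strided
+        // shuffle is emitted for it and the corresponding lanes of the wide
+        // load are simply left unused.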
+ if (!Member) + continue; + + Constant *StrideMask = getStridedMask(Builder, i, Delta, VF); + Value *StridedVec = Builder.CreateShuffleVector( + CallI, UndefVec, StrideMask, "strided.vec"); + + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); + } + + VectorParts &Entry = WidenMap.get(Member); + Entry[Part] = + Group->isReverse() ? reverseVector(StridedVec) : StridedVec; + } + + propagateMetadata(CallI, Instr); + } + return; + } + + // The sub vector type for current instruction. + VectorType *SubVT = VectorType::get(ScalarTy, VF); + + // Vectorize the interleaved store group. + for (unsigned Part = 0; Part < UF; Part++) { + // Collect the stored vector from each member. + SmallVector StoredVecs; + for (unsigned i = 0; i < Delta; i++) { + // Interleaved store group doesn't allow a gap, so each index has a member + Instruction *Member = Group->getMember(i); + assert(Member && "Fail to get a member from an interleaved store group"); + + Value *StoredVec = + getVectorValue(dyn_cast(Member)->getValueOperand())[Part]; + if (Group->isReverse()) + StoredVec = reverseVector(StoredVec); + + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != SubVT) + StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); + + StoredVecs.push_back(StoredVec); + } + + // Concatenate all vectors into a wide vector. + Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + + // Interleave the elements in the wide vector. + Constant *IMask = getInterleavedMask(Builder, VF, Delta); + Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, + "interleaved.vec"); + + Instruction *CallI = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlign()); + propagateMetadata(CallI, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1664,6 +1929,10 @@ assert((LI || SI) && "Invalid Load/Store instruction"); + // Try to vectorize the interleave group if this access is interleaved. + if (Legal->isAccessInterleaved(Instr)) + return vectorizeInterleaveGroup(Instr); + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); @@ -4575,6 +4844,31 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved access + if (Legal->isAccessInterleaved(I)) { + auto Group = Legal->getInterleavedAccessGroup(I); + assert(Group && "Fail to get an interleaved access group."); + + // Only calculate the cost once. + if (Group->getInsertPos() != I) + return 0; + + unsigned Delta = Group->getDelta(); + Type *WideVecTy = + VectorType::get(VectorTy->getVectorElementType(), + VectorTy->getVectorNumElements() * Delta); + + // Calculate the cost of the whole interleaved group. + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, Group->getDelta(), Delta, AS); + + if (Group->isReverse()) + Cost += + Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + // Scalarized loads/stores. 
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/Analysis/LoopAccessAnalysis/analyze-interleaving.ll =================================================================== --- /dev/null +++ test/Analysis/LoopAccessAnalysis/analyze-interleaving.ll @@ -0,0 +1,544 @@ +; RUN: opt -S -loop-accesses -analyze -enable-interleaving=true -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check an interleaved load group of Delta 2 and an interleaved store group of +; Delta 2. + +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +%struct.ST2 = type { i32, i32 } +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check an interleaved load group of Delta 3 and an interleaved store group of +; Delta 3. 
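+; (In the test below, the three loads through the incremented pointer form the
+; load group, and the three stores to S[i].x, S[i].y and S[i].z form the store
+; group; both have stride 3.)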
+ +; int A[3072]; +; struct ST3 S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; S[i].x = X1 + 1; +; S[i].y = X2 + 2; +; S[i].z = X3 + 3; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 3: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK-NEXT: Index 2: +; CHECK: Interleave Group of Delta 3: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK-NEXT: Index 2: + +%struct.ST3 = type { i32, i32, i32 } +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check an interleaved load group of Delta 4. 
+ +; struct ST4{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST4 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 4: +; CHECK-NEXT: Index 0: %0 = load i32, i32* %x, align 4 +; CHECK-NEXT: Index 1: %1 = load i32, i32* %y, align 4 +; CHECK-NEXT: Index 2: %2 = load i32, i32* %z, align 4 +; CHECK-NEXT: Index 3: %3 = load i32, i32* %w, align 4 + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + +; Check on an interleaved store group of Delta 4. + +; int B[1024]; +; struct ST4 T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 4: +; CHECK-NEXT: Index 0: store i32 %add, i32* %x, align 4 +; CHECK-NEXT: Index 1: store i32 %mul, i32* %y, align 4 +; CHECK-NEXT: Index 2: store i32 %sub, i32* %z, align 4 +; CHECK-NEXT: Index 3: store i32 %add5, i32* %w, align 4 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 
1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check a reverse interleaved load group of Delta 4 and a reverse interleaved +; store group of Delta 4. + +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reverse_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +define void @test_reverse_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check an interleaved load group of Delta 2 with 1 gap (without the load on +; odd element). + +; void even_load(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NOT: Index 1 + +define void @even_load(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check an interleaved store group of Delta 2 with 1 gap. We should not identify +; such case as a group. 
+ +; void even_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; A[i] = B[i] + B[i+1]; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NOT: Index 0: store + +define void @even_store(i32* nocapture %A, i32* nocapture readonly %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add3, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check access groups identified from mixed loads and stores. + +; void mixed_load_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +define void @mixed_load_store(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %2, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %4, %3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check access groups identified from members with different kinds of type. 
+ +; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 + %1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { "unsafe-fp-math"="true" } + +; Check two interleaved store groups which will break write after write +; dependence. We should not identify them as interleaved groups. + +; void waw_dep(int *A) { +; char *Tmp = (char *)A; +; int *B = (int *) (++Tmp); +; for(int i = 0; i < 1024; i+=2) { +; A[i] = i; // (1) +; B[i] = i; // (2) +; A[i+1] = i + 1; // (3) +; B[i+1] = i + 1; // (4) +; } +; } +; +; The combine of (1)+(3) or (2)+(4) will break the dependence. + +; CHECK: Interleaved access groups: +; CHECK-NOT: Interleave Group: + +define void @waw_dep(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %incdec.ptr = getelementptr inbounds i8, i8* %0, i64 1 + %1 = bitcast i8* %incdec.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = trunc i64 %indvars.iv to i32 + store i32 %2, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + store i32 %2, i32* %arrayidx2, align 4 + %3 = or i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %3 + %4 = trunc i64 %3 to i32 + store i32 %4, i32* %arrayidx5, align 4 + %arrayidx9 = getelementptr inbounds i32, i32* %1, i64 %3 + store i32 %4, i32* %arrayidx9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check two interleaved store groups which won't break write after write +; dependence. + +; void no_waw_dep(int *A) { +; char *Tmp = (char *)A; +; int *B = (int *) (++Tmp); +; for(int i = 0; i < 1024; i+=2) { +; A[i] = i; // (1) +; A[i+1] = i + 1; // (2) +; B[i] = i; // (3) +; B[i+1] = i + 1; // (4) +; } +; } +; +; The combine of (1)+(2) or (3)+(4) won't break the dependence. 
+ +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: store i32 %2 +; CHECK-NEXT: Index 1: store i32 %4 +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: store i32 %2 +; CHECK-NEXT: Index 1: store i32 %4 + +define void @no_waw_dep(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %incdec.ptr = getelementptr inbounds i8, i8* %0, i64 1 + %1 = bitcast i8* %incdec.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = trunc i64 %indvars.iv to i32 + store i32 %2, i32* %arrayidx, align 4 + %3 = or i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %3 + %4 = trunc i64 %3 to i32 + store i32 %4, i32* %arrayidx3, align 4 + %arrayidx5 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + store i32 %2, i32* %arrayidx5, align 4 + %arrayidx9 = getelementptr inbounds i32, i32* %1, i64 %3 + store i32 %4, i32* %arrayidx9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,5 +1,5 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaving=true | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -enable-interleaving=true | FileCheck %s --check-prefix=FORCE-VEC target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; 
FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/interleaved-accesses.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -0,0 +1,427 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaving=true -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check vectorization on an interleaved load group of Delta 2 and an interleaved +; store group of Delta 2. + +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +%struct.ST2 = type { i32, i32 } +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of Delta 3 and an interleaved +; store group of Delta 3. 
+
+; int A[3072];
+; struct ST3 S[1024];
+; void test_struct_array_load3_store3() {
+;   int *ptr = A;
+;   for (int i = 0; i < 1024; i++) {
+;     int X1 = *ptr++;
+;     int X2 = *ptr++;
+;     int X3 = *ptr++;
+;     S[i].x = X1 + 1;
+;     S[i].y = X2 + 2;
+;     S[i].z = X3 + 3;
+;   }
+; }
+
+; CHECK-LABEL: @test_struct_array_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
+
+%struct.ST3 = type { i32, i32, i32 }
+@A = common global [3072 x i32] zeroinitializer, align 4
+@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
+
+define void @test_struct_array_load3_store3() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
+  %0 = load i32, i32* %ptr.016, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
+  %2 = load i32, i32* %incdec.ptr1, align 4
+  %add = add nsw i32 %0, 1
+  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %add3 = add nsw i32 %1, 2
+  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
+  store i32 %add3, i32* %y, align 4
+  %add6 = add nsw i32 %2, 3
+  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
+  store i32 %add6, i32* %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Check vectorization on an interleaved load group of Delta 4.
+ +; struct ST4{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST4 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + +; Check vectorization on an interleaved store group of Delta 4. 
+ +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on a reverse interleaved load group of Delta 2 and +; a reverse interleaved store group of Delta 2. 
+ +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on an interleaved load group of Delta 2 with 1 gap +; (missing the load of odd elements). + +; void even_load(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK-LABEL: @even_load( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shl nsw <4 x i32> %strided.vec, +define void @even_load(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on interleaved access groups identified from mixed +; loads/stores. 
+; void mixed_load_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + +; CHECK-LABEL: @mixed_load_store( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec +define void @mixed_load_store(i32* nocapture readonly %A, i32* nocapture %B) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %2, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %4, %3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on interleaved access groups with members having different +; kinds of type. + +; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + +; CHECK-LABEL: @int_float_struct( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float> +; CHECK: add nsw <4 x i32> +; CHECK: fadd fast <4 x float> + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 + %1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { "unsafe-fp-math"="true" }
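+
+; For reference, the shuffle-based code that the CHECK lines in this file
+; expect has the following general shape for a group of Delta 2 with VF = 4.
+; This is only an illustrative sketch: %ptr, %even, %odd, %r0 and %r1 are
+; placeholder names, not names produced by the vectorizer.
+;
+; A load group becomes one wide load plus one strided shufflevector per member:
+;   %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
+;   %even = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+;   %odd = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+;
+; A store group becomes one shufflevector that interleaves the member vectors
+; plus one wide store:
+;   %interleaved.vec = shufflevector <4 x i32> %r0, <4 x i32> %r1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+;   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+;
+; For larger factors the member vectors are first concatenated and then
+; permuted, which is what the intermediate <8 x i32> shufflevectors in the
+; Delta 3 and Delta 4 checks above correspond to.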