Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -368,11 +368,126 @@ SmallVector AliasSetId; }; + struct InterleavedAccess { + InterleavedAccess(Instruction *Leader, int MBStride, unsigned Align) + : Align(Align), InsertPos(nullptr) { + // The Stride of an interleaved access is the absolute value of the member stride. + Stride = std::abs(MBStride); + Reverse = MBStride < 0; + Members.push_back(std::make_pair(Leader, 0)); // Member of index 0 + } + + InterleavedAccess() + : Stride(0), Reverse(false), Align(0), InsertPos(nullptr) {} + + bool isReverse() const { return Reverse; } + unsigned getStride() const { return Stride; } + unsigned getAlign() const { return Align; } + + // A fully interleaved access has all the members from index 0 to (Stride - 1). + // Returns the number of missing members. + unsigned getNumGaps() const { + unsigned NumMembers = Members.size(); + assert(NumMembers <= Stride && "Unexpected number of members"); + return Stride - NumMembers; + } + + // Returns true if a new member is successfully inserted. \p Index is relative + // to the leader (the member with index 0), so it could be negative. + // Insertion fails if it breaks either of the two rules below: + // 1. The max index is always less than the stride. + // 2. There is already a member with the same index. + bool insertMember(Instruction *Inst, int Index, unsigned Alignment) { + unsigned AbsIdx = std::abs(Index); + if (Index < 0) { + unsigned LIndex = Members.back().second; + // Breaks rule 1. Doesn't belong to this group. + if (LIndex + AbsIdx >= Stride) + return false; + + for (auto &I : Members) + I.second += AbsIdx; + + Members.insert(Members.begin(), std::make_pair(Inst, 0)); + Align = std::min(Align, Alignment); + return true; + } + + // Breaks rule 1. Doesn't belong to this group. + if (AbsIdx >= Stride) + return false; + + auto I = Members.begin(); + for (auto E = Members.end(); I != E; ++I) { + if (I->second > AbsIdx) + break; + + // Breaks rule 2. There is already a member with the same index. + if (I->second == AbsIdx) + return false; + } + + Members.insert(I, std::make_pair(Inst, AbsIdx)); + Align = std::min(Align, Alignment); + return true; + } + + void removeMember(Instruction *Inst) { + for (auto I = Members.begin(), E = Members.end(); I != E; ++I) { + if (I->first != Inst) + continue; + + Members.erase(I); + return; + } + } + + const ArrayRef<std::pair<Instruction *, unsigned>> getMembers() const { + return Members; + } + + unsigned getIndex(Instruction *Inst) const { + for (auto I : Members) + if (I.first == Inst) + return I.second; + llvm_unreachable("Contains no such member"); + } + + Instruction *getInsertPos() const { return InsertPos; } + void setInsertPos(Instruction *Inst) { InsertPos = Inst; } + + private: + unsigned Stride; + bool Reverse; + unsigned Align; + SmallVector<std::pair<Instruction *, unsigned>, 4> Members; + + // To avoid breaking dependences, an interleaved access should be inserted + // at either the first load or the last store in program order. + // E.g.
%even = load i32 // Insert Position + // %add = add i32 %even // Use of %even + // %odd = load i32 + // + // store i32 %even + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position + Instruction *InsertPos; + }; + LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL, const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, const ValueToValueMap &Strides); + ~LoopAccessInfo() { + SmallSet DelSet; + // Collect all the pointers first to avoid deleting a pointer twice. + for (auto &I : InterleavedAccessMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + /// Return true we can analyze the memory accesses in the loop and there are /// no memory dependence cycles. bool canVectorizeMemory() const { return CanVecMem; } @@ -440,10 +555,40 @@ return StoreToLoopInvariantAddress; } + bool isInterleaved(Instruction *I) const { + return InterleavedAccessMap.count(I); + } + + /// \brief Return the interleaved access the given instruction \p I belongs to. + /// Return nullptr if \p I doesn't belong to any interleaved access. + const LoopAccessInfo::InterleavedAccess * + getInterleavedAccess(Instruction *I) const { + if (!InterleavedAccessMap.count(I)) + return nullptr; + return InterleavedAccessMap.find(I)->second; + } + private: /// \brief Analyze the loop. Substitute symbolic strides using Strides. void analyzeLoop(const ValueToValueMap &Strides); + /// \brief Analyze the loop dependences. \p Loads and \p Stores contain + /// loads/stores in program order. Substitute symbolic strides using Strides. + void analyzeDependences(ArrayRef Loads, ArrayRef Stores, + bool IsAnnotatedParallel, + const ValueToValueMap &Strides); + + /// \brief Analyze the interleaved accesses. The \p AccessList contains + /// memory accesses in program order. + void analyzeInterleaving(ArrayRef<Instruction *> AccessList, + const ValueToValueMap &Strides); + + /// \brief Group loads/stores into interleaved accesses according to their + /// address, stride, and memory object size. \p AccessList contains memory + /// accesses in program order. + void groupInterleavedAccess(ArrayRef<Instruction *> AccessList, + const ValueToValueMap &Strides); + /// \brief Check if the structure of the loop allows it to be analyzed by this /// pass. bool canAnalyzeLoop(); @@ -485,6 +630,9 @@ /// \brief The diagnostics report generated for the analysis. E.g. why we /// couldn't analyze the loop. Optional Report; + + /// Maps each member to the interleaved access it belongs to. + DenseMap<Instruction *, InterleavedAccess *> InterleavedAccessMap; }; Value *stripIntegerCast(Value *V); Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,16 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of one member in the interleaved access. + /// \p VecTy is the wide vector type of the whole interleaved access. + /// \p SubTy is the vector type of the member. + /// \p Index is the index of the current member in the interleaved access. + /// \p Gap is the number of missing members. + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Gap, + unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction.
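Note: to make the member-index bookkeeping of InterleavedAccess above concrete, here is a minimal standalone model of insertMember() that uses plain strings in place of Instruction pointers and ignores alignment; the access names and the stride-3 example are invented purely for illustration.

#include <cstdio>
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>

// Simplified stand-in for InterleavedAccess: named slots instead of
// Instruction pointers, no alignment tracking.
struct Group {
  unsigned Stride;
  std::vector<std::pair<std::string, unsigned>> Members; // (name, index)

  Group(const std::string &Leader, unsigned Stride) : Stride(Stride) {
    Members.push_back(std::make_pair(Leader, 0u)); // the leader has index 0
  }

  // Same two rules as insertMember(): the largest index must stay below the
  // stride, and no two members may share an index. A negative index shifts
  // all existing members up so that the new member becomes index 0.
  bool insert(const std::string &Name, int Index) {
    unsigned AbsIdx = std::abs(Index);
    if (Index < 0) {
      if (Members.back().second + AbsIdx >= Stride)
        return false; // rule 1
      for (auto &M : Members)
        M.second += AbsIdx;
      Members.insert(Members.begin(), std::make_pair(Name, 0u));
      return true;
    }
    if (AbsIdx >= Stride)
      return false; // rule 1
    auto I = Members.begin();
    for (auto E = Members.end(); I != E; ++I) {
      if (I->second > AbsIdx)
        break;
      if (I->second == AbsIdx)
        return false; // rule 2
    }
    Members.insert(I, std::make_pair(Name, AbsIdx));
    return true;
  }
};

int main() {
  Group G("A[i+1]", 3);                       // A[i+1] is found first and leads
  std::printf("%d\n", G.insert("A[i]", -1));  // 1: becomes index 0, A[i+1] -> 1
  std::printf("%d\n", G.insert("A[i+2]", 2)); // 1: index 2 completes the group
  std::printf("%d\n", G.insert("A[i+3]", 3)); // 0: breaks rule 1 (3 >= stride)
  std::printf("%d\n", G.insert("B[i]", 0));   // 0: breaks rule 2 (index 0 taken)
  for (auto &M : G.Members)
    std::printf("%s -> index %u\n", M.first.c_str(), M.second);
}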
/// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +592,10 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + Type *SubTy, unsigned Index, + unsigned Gap, unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +754,13 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Gap, + unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, SubTy, Index, Gap, + Alignment, AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,13 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Gap, + unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -522,6 +522,53 @@ return Cost; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Gap, + unsigned Alignment, + unsigned AddressSpace) { + VectorType *VT = dyn_cast(VecTy); + VectorType *SubVT = dyn_cast(SubTy); + assert(VT && SubVT && "Expect vector types"); + + unsigned NumElts = VT->getNumElements(); + unsigned SubNumElts = SubVT->getNumElements(); + unsigned Stride = NumElts / SubNumElts; + assert(NumElts % SubNumElts == 0 && Stride > 1 && "Invalide vector types"); + assert(Index < Stride && "Invalid Index"); + assert(Gap < Stride && "Invalid Gap"); + + // The memory op cost is equal to the cost of the whole wide vector divied + // by the actual number of members. + unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + Cost = Cost / (Stride - Gap); + + // Then need to calculate the cost for interleave operations. + if (Opcode == Instruction::Load) { + // E.g. Interleaved load for the vector %V1 of index 1: + // %V = load <4 x i32>, <4 x i32>* %ptr + // %V1 = shufflevector <4 x i32> %V, <4 x i32> undef, <1, 3> + // The cost is extract element 1 and element 3 from the wide vector, and + // insert them sequentially into the sub vector. + for (unsigned i = 0; i < SubNumElts; i++) { + Cost += getVectorInstrCost(Instruction::ExtractElement, VecTy, + Index + i * Stride); + Cost += getVectorInstrCost(Instruction::InsertElement, SubTy, i); + } + } else { + // E.g. 
Interleaved store for the vector %V1 of index 1: + // %IV = shufflevector <2 x i32> %V0, <2 x i32> %V1, <0, 2, 1, 3> + // store <4 x i32> %IV, <4 x i32> %ptr + // The cost is extract element 0 and element 1 from %V1, and insert them + // into lane 1 and lane 3 of the wide vector. + for (unsigned i = 0; i < SubNumElts; i++) { + Cost += getVectorInstrCost(Instruction::ExtractElement, SubTy, i); + Cost += getVectorInstrCost(Instruction::InsertElement, VecTy, + Index + i * Stride); + } + } + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -41,6 +41,10 @@ VectorizerParams::VectorizationInterleave)); unsigned VectorizerParams::VectorizationInterleave; +static cl::opt AnalyzeInterleavedAccess( + "analyze-interleaved-access", cl::init(false), cl::Hidden, + cl::desc("Enable analyzing interleaved access in loop")); + static cl::opt RuntimeMemoryCheckThreshold( "runtime-memory-check-threshold", cl::Hidden, cl::desc("When performing memory disambiguation checks at runtime do not " @@ -777,21 +781,28 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + // The number of iterations to be vectorized and unrolled. + unsigned NumIter = + std::max(ForcedFactor * ForcedUnroll, static_cast(2)); + + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when + // Distance <= MaxSafeDepDistBytes + // And either one below is true + // Distance < Stride * TypeByteSize + // Distance >= TypeByteSize * NumIter * Stride + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIter * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } - // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + // If Distance < Stride * TypeByteSize, it is always safe. just keep the + // current MaxSafeDepDistBytes. Otherwise, update the MaxSafeDepDistBytes. + if (Distance >= Stride * TypeByteSize) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && @@ -947,89 +958,184 @@ return true; } -void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { +struct StrideDescriptor { + StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, unsigned Align) + : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} - typedef SmallVector ValueVector; - typedef SmallPtrSet ValueSet; + StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {} - // Holds the Load and Store *instructions*. - ValueVector Loads; - ValueVector Stores; + int Stride; + const SCEV *Scev; + unsigned Size; + unsigned Align; +}; - // Holds all the different accesses in the loop. 
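Note: a quick standalone check of the revised positive-distance rule above; the helper name and the concrete stride/type-size/iteration numbers are made up for illustration only.

#include <cstdio>

// Mirrors the check above: a positive dependence distance is accepted if it
// does not exceed MaxSafeDepDistBytes and it is either below
// Stride * TypeByteSize or at least TypeByteSize * NumIter * Stride.
static bool isSafePositiveDistance(unsigned Distance, unsigned Stride,
                                   unsigned TypeByteSize, unsigned NumIter,
                                   unsigned MaxSafeDepDistBytes) {
  if (Distance > MaxSafeDepDistBytes)
    return false;
  return Distance < Stride * TypeByteSize ||
         Distance >= TypeByteSize * NumIter * Stride;
}

int main() {
  // i32 accesses (4 bytes), stride 2, vectorizing/unrolling 4 iterations:
  // the bounds are 2 * 4 = 8 and 4 * 4 * 2 = 32 bytes.
  std::printf("%d\n", isSafePositiveDistance(4, 2, 4, 4, ~0u));  // 1: 4 < 8
  std::printf("%d\n", isSafePositiveDistance(16, 2, 4, 4, ~0u)); // 0: between the bounds
  std::printf("%d\n", isSafePositiveDistance(32, 2, 4, 4, ~0u)); // 1: 32 >= 32
}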
- unsigned NumReads = 0; - unsigned NumReadWrites = 0; +void LoopAccessInfo::groupInterleavedAccess(ArrayRef<Instruction *> InstList, + const ValueToValueMap &Strides) { + // Collect and record all the strided accesses. + DenseMap<Instruction *, StrideDescriptor> StridedAccesses; + for (auto I : InstList) { + // FIXME: Handle predicated loads/stores. + if (blockNeedsPredication(I->getParent(), TheLoop, DT)) + continue; + + LoadInst *LI = dyn_cast<LoadInst>(I); + StoreInst *SI = dyn_cast<StoreInst>(I); + assert((LI || SI) && "Invalid access instruction"); + + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + // Only analyze non-unit stride accesses. + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + if (std::abs(Stride) < 2) + continue; + + const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); + + // An alignment of 0 means target ABI alignment. + unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(PtrTy->getElementType()); + + StridedAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align); + } - PtrRtCheck.Pointers.clear(); - PtrRtCheck.Need = false; + // Search for load-load/store-store pairs A-B in bottom-up order. Create a new + // interleaved access for A if A doesn't belong to any interleaved access. + // Try to insert B into the interleaved access of A according to 3 rules: + // 1. A and B have the same stride. + // 2. A and B have the same memory object size. + // 3. The distance from B to the leader of the interleaved access is less + // than the Stride. + // + // Even though interleaved access vectorization may change the program + // order, there is no need to worry about dependences: the dependence check + // has already verified that there is no overlap between pointers with + // different bases. For pointers sharing the same base, there may be three + // kinds of dependences: + // + // The RAW (Read After Write) dependence. + // E.g. A[i] = a; + // b = A[i]; + // This won't exist as it is a store-load forwarding conflict, which has + // already been checked and forbidden in the dependence check. + // + // The WAW (Write After Write) dependence: + // E.g. A[i] = a; (1) + // A[i] = b; (2) + // The bottom-up search order guarantees that (2) is always inserted into the + // interleaved store below the interleaved store that (1) belongs to. + // + // The WAR (Write After Read) dependence: + // E.g. a = A[i]; + // A[i] = b; + // The insert positions of the interleaved store and the interleaved load + // guarantee that the write is always after the read. + for (auto I = InstList.rbegin(), E = InstList.rend(); I != E; ++I) { + Instruction *A = *I; + if (!StridedAccesses.count(A)) + continue; + + StrideDescriptor DesA = StridedAccesses[A]; + + InterleavedAccess *IA; + if (isInterleaved(A)) { + IA = InterleavedAccessMap[A]; + } else { + // Create a new interleaved access with A. + IA = new InterleavedAccess(A, DesA.Stride, DesA.Align); + InterleavedAccessMap[A] = IA; + } - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + for (auto II = std::next(I); II != E; ++II) { + Instruction *B = *II; + if (A->mayReadFromMemory() != B->mayReadFromMemory()) + continue; - // For each block. - for (Loop::block_iterator bb = TheLoop->block_begin(), - be = TheLoop->block_end(); bb != be; ++bb) { + if (!StridedAccesses.count(B) || isInterleaved(B)) + continue; - // Scan the BB and collect legal loads and stores.
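Note: the index handed to insertMember is IndexA + DistanceToA / Size; a small standalone trace of the bottom-up search (invented access names, 4-byte elements) makes the arithmetic visible.

#include <cstdio>

struct Access {
  const char *Name;
  long long ByteOffset; // constant byte offset of the access within one iteration
};

int main() {
  // Program order, top to bottom; the bottom-up search picks the last access
  // as the first leader A and walks upwards over the candidates B.
  Access InstList[] = {
      {"load A[3*i+2]", 8}, {"load A[3*i+1]", 4}, {"load A[3*i]", 0}};
  const int Size = 4;            // memory object size in bytes
  const Access &A = InstList[2]; // leader, index 0
  int IndexA = 0;                // re-queried per pair in the real code, since a
                                 // negative insertion shifts existing indices

  for (int i = 1; i >= 0; --i) {
    const Access &B = InstList[i];
    long long DistanceToA = B.ByteOffset - A.ByteOffset; // SCEV distance B - A
    int IndexToA = static_cast<int>(DistanceToA / Size);
    std::printf("%s -> insertMember index %d\n", B.Name, IndexA + IndexToA);
  }
  // Prints: load A[3*i+1] -> insertMember index 1
  //         load A[3*i+2] -> insertMember index 2
}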
- for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { + StrideDescriptor DesB = StridedAccesses[B]; - // If this is a load, save it. If this instruction can read from memory - // but is not a load, then we quit. Notice that we don't handle function - // calls that read or write. - if (it->mayReadFromMemory()) { - // Many math library functions read the rounding mode. We will only - // vectorize a loop if it contains known function calls that don't set - // the flag. Therefore, it is safe to ignore this read from memory. - CallInst *Call = dyn_cast<CallInst>(it); - if (Call && getIntrinsicIDForCall(Call, TLI)) - continue; + // Check rules 1 and 2. + if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size) + continue; - // If the function has an explicit vectorized counterpart, we can safely - // assume that it can be vectorized. - if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && - TLI->isFunctionVectorizable(Call->getCalledFunction()->getName())) - continue; + // Calculate the distance and prepare for rule 3. + const SCEV *DistToA = SE->getMinusSCEV(DesB.Scev, DesA.Scev); + const SCEVConstant *C = dyn_cast<SCEVConstant>(DistToA); + if (!C) + continue; - LoadInst *Ld = dyn_cast<LoadInst>(it); - if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { - emitAnalysis(LoopAccessReport(Ld) - << "read with atomic ordering or volatile read"); - DEBUG(dbgs() << "LAA: Found a non-simple load.\n"); - CanVecMem = false; - return; - } - NumLoads++; - Loads.push_back(Ld); - DepChecker.addAccess(Ld); + int DistanceToA = C->getValue()->getValue().getZExtValue(); + // B reads/writes the same location as A. + if (!DistanceToA) continue; - } - // Save 'store' instructions. Abort if other instructions write to memory. - if (it->mayWriteToMemory()) { - StoreInst *St = dyn_cast<StoreInst>(it); - if (!St) { - emitAnalysis(LoopAccessReport(it) << - "instruction cannot be vectorized"); - CanVecMem = false; - return; - } - if (!St->isSimple() && !IsAnnotatedParallel) { - emitAnalysis(LoopAccessReport(St) - << "write with atomic ordering or volatile write"); - DEBUG(dbgs() << "LAA: Found a non-simple store.\n"); - CanVecMem = false; - return; - } - NumStores++; - Stores.push_back(St); - DepChecker.addAccess(St); - } - } // Next instr. - } // Next block. + int Size = static_cast<int>(DesA.Size); + assert(!(DistanceToA % Size) && "Distance must be a multiple of the size"); + + // The index of B is equal to the index of A plus the relative index. + int IndexToA = DistanceToA / Size; + int IndexA = static_cast<int>(IA->getIndex(A)); + + // Try to insert B into the group with A. + if (IA->insertMember(B, IndexA + IndexToA, DesB.Align)) + InterleavedAccessMap[B] = IA; + continue; + } + } +} + +void LoopAccessInfo::analyzeInterleaving(ArrayRef<Instruction *> InstList, + const ValueToValueMap &Strides) { + if (InstList.size() < 2) + return; + + groupInterleavedAccess(InstList, Strides); + + // Set insert positions and remove interleaved stores with gaps. + for (auto I : InstList) { + if (!isInterleaved(I)) + continue; + + InterleavedAccess *IA = InterleavedAccessMap[I]; + + // Set the insert position for an interleaved load. + if (I->mayReadFromMemory()) { + // Use the first member in program order, i.e. set it only while the + // current insert position is still null. + if (!IA->getInsertPos()) + IA->setInsertPos(I); + continue; + } + + // Set the insert position for a fully interleaved store. + if (!IA->getNumGaps()) { + // Finally, the insert position is the last member in program order. + IA->setInsertPos(I); + continue; + } - // Now we have two lists that hold the loads and the stores.
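Note: a compact standalone summary of the insert-position rules above and of the gap handling that continues below; the member lists are invented for illustration.

#include <cstdio>
#include <vector>

struct Member {
  const char *Name; // vector order is program order
  bool IsLoad;
};

// Mirrors analyzeInterleaving(): a load group is inserted at its first member
// in program order; a store group at its last member, and only if it has no
// gaps (every index 0..Stride-1 is present). A store group with gaps is
// dropped, since the wide store would also write the missing lanes.
static const char *pickInsertPos(const std::vector<Member> &M, unsigned Gaps) {
  if (M.front().IsLoad)
    return M.front().Name; // first load in program order
  if (Gaps == 0)
    return M.back().Name;  // last store in program order
  return nullptr;          // store group with gaps: discard
}

int main() {
  std::vector<Member> Loads = {{"load A[i]", true}, {"load A[i+1]", true}};
  std::vector<Member> Stores = {{"store B[i]", false}, {"store B[i+1]", false}};
  std::vector<Member> GappyStores = {{"store C[i]", false}}; // stride 2, index 1 missing

  std::printf("%s\n", pickInsertPos(Loads, 0));  // load A[i]
  std::printf("%s\n", pickInsertPos(Stores, 0)); // store B[i+1]
  const char *P = pickInsertPos(GappyStores, 1);
  std::printf("%s\n", P ? P : "group discarded"); // group discarded
}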
- // Next, we find the pointers that they use. + // Erase a interleaved store with gaps. + ArrayRef> Members = IA->getMembers(); + for (auto I : Members) + InterleavedAccessMap.erase(I.first); + delete IA; + } +} +void LoopAccessInfo::analyzeDependences(ArrayRef Loads, + ArrayRef Stores, + bool IsAnnotatedParallel, + const ValueToValueMap &Strides) { + // Holds all the different accesses in the loop. + unsigned NumReads = 0; + unsigned NumReadWrites = 0; + + PtrRtCheck.Pointers.clear(); + PtrRtCheck.Need = false; + + typedef SmallPtrSet ValueSet; // Check if we see any stores. If there are no stores, then we don't // care if the pointers are *restrict*. if (!Stores.size()) { @@ -1049,9 +1155,9 @@ // writes and between reads and writes, but not between reads and reads. ValueSet Seen; - ValueVector::iterator I, IE; - for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { - StoreInst *ST = cast(*I); + // Find the pointers that stores use. + for (auto I : Stores) { + StoreInst *ST = cast(I); Value* Ptr = ST->getPointerOperand(); // Check for store to loop invariant address. StoreToLoopInvariantAddress |= isUniform(Ptr); @@ -1072,15 +1178,15 @@ } if (IsAnnotatedParallel) { - DEBUG(dbgs() - << "LAA: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); + DEBUG(dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " + << "checks.\n"); CanVecMem = true; return; } - for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { - LoadInst *LD = cast(*I); + // Find the pointers that loads use. + for (auto I : Loads) { + LoadInst *LD = cast(I); Value* Ptr = LD->getPointerOperand(); // If we did *not* see this pointer before, insert it to the // read list. If we *did* see it before, then it is already in @@ -1192,6 +1298,89 @@ } } +void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { + + typedef SmallVector ValueVector; + + // Holds the Load and Store *instructions* in program order. + ValueVector Loads; + ValueVector Stores; + // Holds the load/store instructions in program order. + SmallVector InstList; + + const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + + // If this is a load, save it. If this instruction can read from memory + // but is not a load, then we quit. Notice that we don't handle function + // calls that read or write. + if (it->mayReadFromMemory()) { + // Many math library functions read the rounding mode. We will only + // vectorize a loop if it contains known function calls that don't set + // the flag. Therefore, it is safe to ignore this read from memory. + CallInst *Call = dyn_cast(it); + if (Call && getIntrinsicIDForCall(Call, TLI)) + continue; + + // If the function has an explicit vectorized counterpart, we can safely + // assume that it can be vectorized. 
+ if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && + TLI->isFunctionVectorizable(Call->getCalledFunction()->getName())) + continue; + + LoadInst *Ld = dyn_cast(it); + if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { + emitAnalysis(LoopAccessReport(Ld) + << "read with atomic ordering or volatile read"); + DEBUG(dbgs() << "LAA: Found a non-simple load.\n"); + CanVecMem = false; + return; + } + NumLoads++; + Loads.push_back(Ld); + InstList.push_back(Ld); + DepChecker.addAccess(Ld); + continue; + } + + // Save 'store' instructions. Abort if other instructions write to memory. + if (it->mayWriteToMemory()) { + StoreInst *St = dyn_cast(it); + if (!St) { + emitAnalysis(LoopAccessReport(it) + << "instruction cannot be vectorized"); + CanVecMem = false; + return; + } + if (!St->isSimple() && !IsAnnotatedParallel) { + emitAnalysis(LoopAccessReport(St) + << "write with atomic ordering or volatile write"); + DEBUG(dbgs() << "LAA: Found a non-simple store.\n"); + CanVecMem = false; + return; + } + NumStores++; + Stores.push_back(St); + InstList.push_back(St); + DepChecker.addAccess(St); + } + } // Next instr. + } // Next block. + + analyzeDependences(Loads, Stores, IsAnnotatedParallel, Strides); + + if (CanVecMem && AnalyzeInterleavedAccess) + analyzeInterleaving(InstList, Strides); +} + bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT) { assert(TheLoop->contains(BB) && "Unknown block used"); Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, Type *SubTy, unsigned Index, unsigned Gap, + unsigned Alignment, unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, SubTy, Index, Gap, + Alignment, AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -34,6 +34,10 @@ // Variable uniformity checks are inspired by: // Karrenberg, R. and Hack, S. Whole Function Vectorization. // +// The interleaved access vectorization is based on the paper: +// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved +// Data for SIMD +// // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // @@ -351,6 +355,9 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// Try to vectorize the interleaved access that \p Instr belongs to. + void vectorizeInterleavedAccess(Instruction *Instr); + /// Generate a shuffle sequence that will reverse the vector Vec. 
virtual Value *reverseVector(Value *Vec); @@ -693,8 +700,13 @@ return LAI->getRuntimePointerCheck(); } - const LoopAccessInfo *getLAI() const { - return LAI; + const LoopAccessInfo *getLAI() const { return LAI; } + + bool isInterleaved(Instruction *I) { return LAI->isInterleaved(I); } + + const LoopAccessInfo::InterleavedAccess * + getInterleavedAccess(Instruction *I) { + return LAI->getInterleavedAccess(I); } unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } @@ -1657,6 +1669,252 @@ "reverse"); } +// Get a mask to interleave a wide vector concatenated from \p NumVec small +// vectors. +// I.E <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors, if VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedStoreMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index \p Start. +// I.E. +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consist of sequential constants, +// The second part consist of UNDEFs. +// I.E. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value. + if (!NumUndef) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 1st vector always +// has the same number of elements as the 2nd vector or more elements. +// If the 1st vector has more elements, extend the 2nd vector with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, + Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); + + if (NumElts1 > NumElts2) { + // Extend V2 with undef elements. + Constant *ExtMask = + getSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2); + V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); + } + + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); +} + +// Concatenate vectors in the given list. All vectors have the same type. 
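Note: the three mask helpers above can be sanity-checked in isolation; this standalone sketch recomputes them with plain integers (an undef lane printed as 'u') and reproduces the mask constants that appear in the tests further down.

#include <cstdio>
#include <vector>

// Same index patterns as getInterleavedStoreMask, getStridedMask and
// getSequentialMask above; -1 stands for an undef lane.
static std::vector<int> interleavedStoreMask(unsigned VF, unsigned NumVec) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVec; j++)
      Mask.push_back(static_cast<int>(j * VF + i));
  return Mask;
}

static std::vector<int> stridedMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; i++)
    Mask.push_back(static_cast<int>(Start + i * Stride));
  return Mask;
}

static std::vector<int> sequentialMask(unsigned Idx, unsigned NumSeq, unsigned NumUndef) {
  std::vector<int> Mask;
  for (unsigned i = Idx; i < Idx + NumSeq; i++)
    Mask.push_back(static_cast<int>(i));
  for (unsigned i = 0; i < NumUndef; i++)
    Mask.push_back(-1);
  return Mask;
}

static void print(const char *Name, const std::vector<int> &Mask) {
  std::printf("%s: <", Name);
  for (size_t i = 0; i < Mask.size(); i++) {
    if (Mask[i] < 0)
      std::printf("u");
    else
      std::printf("%d", Mask[i]);
    if (i + 1 < Mask.size())
      std::printf(", ");
  }
  std::printf(">\n");
}

int main() {
  // VF = 4 and three members (e.g. the R, G, B example further down).
  print("interleaved store", interleavedStoreMask(4, 3)); // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
  print("strided, index 1", stridedMask(1, 3, 4));        // <1, 4, 7, 10>
  print("sequential + undef", sequentialMask(0, 4, 4));   // <0, 1, 2, 3, u, u, u, u>
}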
+static Value *ConcatenateVectors(IRBuilder<> &Builder, ArrayRef List) { + unsigned NumVec = List.size(); + assert(NumVec > 1 && "Should be at least two vectors"); + + SmallVector Vecs; + SmallVector TmpList; + Vecs.append(List.begin(), List.end()); + do { + for (unsigned i = 0; i < NumVec / 2; i++) + TmpList.push_back( + ConcatenateTwoVectors(Builder, Vecs[2 * i], Vecs[2 * i + 1])); + + // Push the last vector if the total number of vectors is odd. + if (NumVec % 2 != 0) + TmpList.push_back(Vecs[NumVec - 1]); + + Vecs.clear(); + Vecs.append(TmpList.begin(), TmpList.end()); + NumVec = Vecs.size(); + TmpList.clear(); + } while (NumVec > 1); + + return Vecs[0]; +} + +// Try to vectorize the interleaved access that \p Instr belongs to. +// +// E.g. Translate following loads (VF is 4): +// for (i = 0; i < N; i+=3) { +// R = Pic[i]; +// G = Pic[i+1]; +// B = Pic[i+2]; +// ... // do something to R, G, B +// } +// To: +// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B +// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements +// +// Or translate following stores (VF is 4): +// for (i = 0; i < N; i+=3) { +// ... do something to R, G, B +// Pic[i] = R; +// Pic[i+1] = G; +// Pic[i+2] = B; +// } +// To: +// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> +// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %interleaved.vec = shuffle %R_G.vec, %B_U.vec, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements +// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B +void InnerLoopVectorizer::vectorizeInterleavedAccess(Instruction *Instr) { + auto *IA = Legal->getInterleavedAccess(Instr); + assert(IA && "Doesn't belong to any interleaved access"); + + // Skip if current instruction is not the insert position. + if (Instr != IA->getInsertPos()) + return; + + LoadInst *LI = dyn_cast(Instr); + StoreInst *SI = dyn_cast(Instr); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + unsigned Stride = IA->getStride(); + Type *VecTy = VectorType::get(ScalarTy, Stride * VF); + Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + + // Prepare for the new pointers. + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + unsigned Idx = IA->getIndex(Instr); + for (unsigned Part = 0; Part < UF; Part++) { + // Notice current instruction could be any index. Need to adjust the address + // to the member of index 0. + // + // E.g. a = A[i+1]; // Member of index 1 (Current instruction) + // b = A[i]; // Member of index 0 + // Current pointer is pointed to A[i+1], adjust it to A[i]. + // + // E.g. A[i+1] = a; // Member of index 1 + // A[i] = b; // Member of index 0 + // A[i+2] = c; // Member of index 2 (Current instruction) + // Current pointer is pointed to A[i+2], adjust it to A[i]. + Value *NewPtr = + Builder.CreateExtractElement(PtrParts[Part], Builder.getInt32(0)); + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Idx)); + + // If the address is reversed, then the wide load/store needs to start at + // the last access of index 0. + // E.g. 
For the reversed access of interleaved load with 2 members: + // {A[i], A[i+1]}, {A[i-2], A[i-1]}, {A[i-4], A[i-3]}, {A[i-6], A[i-5]} + // The new pointer is now pointed to A[i]. Adjust it to A[i-6]. + if (IA->isReverse()) + NewPtr = + Builder.CreateGEP(NewPtr, Builder.getInt32(-(Stride * (VF - 1)))); + + // Cast to the new wide vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + const ArrayRef> Members = IA->getMembers(); + Value *UndefVec = UndefValue::get(VecTy); + + // Vectorize the interleaved load. + if (LI) { + for (unsigned Part = 0; Part < UF; Part++) { + // Load a wide vector contains elements for all members. + Instruction *WideLoad = + Builder.CreateAlignedLoad(NewPtrs[Part], IA->getAlign(), "wide.vec"); + + for (auto Member : Members) { + // Extract the wide vector for current member according to the index. + Constant *StridedMask = + getStridedMask(Builder, Member.second, Stride, VF); + Value *StrideVec = Builder.CreateShuffleVector( + WideLoad, UndefVec, StridedMask, "strided.vec"); + + Instruction *MBInst = Member.first; + + // If this member has different type, cast the result type. + if (MBInst->getType() != Instr->getType()) { + VectorType *OtherVTy = VectorType::get(MBInst->getType(), VF); + StrideVec = Builder.CreateBitOrPointerCast(StrideVec, OtherVTy); + } + + VectorParts &Entry = WidenMap.get(MBInst); + Entry[Part] = IA->isReverse() ? reverseVector(StrideVec) : StrideVec; + } + + propagateMetadata(WideLoad, Instr); + } + return; + } + + // The narrow vector type for current instruction. + VectorType *MBVecTy = VectorType::get(ScalarTy, VF); + + // Vectorize the interleaved store. + for (unsigned Part = 0; Part < UF; Part++) { + // Collect the stored vector for each member. + SmallVector StoredVecs; + for (auto I : Members) { + Value *StoredVec = + getVectorValue(dyn_cast(I.first)->getValueOperand())[Part]; + + if (IA->isReverse()) + StoredVec = reverseVector(StoredVec); + + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != MBVecTy) + StoredVec = Builder.CreateBitOrPointerCast(StoredVec, MBVecTy); + + StoredVecs.push_back(StoredVec); + } + // Concatenate all vectors into a wide vector. + Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + + // Interleave the elements in the wide vector. + Constant *IMask = getInterleavedStoreMask(Builder, VF, Stride); + Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, + "interleaved.vec"); + + // Store the interleaved wide vector. + Instruction *WideStore = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], IA->getAlign()); + propagateMetadata(WideStore, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1664,6 +1922,10 @@ assert((LI || SI) && "Invalid Load/Store instruction"); + // Try to vectorize the interleaved access. + if (Legal->isInterleaved(Instr)) + return vectorizeInterleavedAccess(Instr); + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? 
LI->getPointerOperand() : SI->getPointerOperand(); @@ -4575,6 +4837,24 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved load/store + if (Legal->isInterleaved(I)) { + auto IA = Legal->getInterleavedAccess(I); + Type *WideVecTy = + VectorType::get(VectorTy->getVectorElementType(), + VectorTy->getVectorNumElements() * IA->getStride()); + + // Calculate the cost of the interleaved access for this member. + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, VectorTy, IA->getIndex(I), + IA->getNumGaps(), IA->getAlign(), AS); + + if (IA->isReverse()) + Cost += + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,8 +1,7 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -analyze-interleaved-access=true 2>&1 | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -analyze-interleaved-access=true | FileCheck %s --check-prefix=FORCE-VEC target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-gnueabi" ; Test integer induction variable of step 2: ; for (int i = 0; i < 1024; i+=2) { @@ -102,26 +101,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/interleaved-access.ll @@ -0,0 +1,602 @@ +; RUN: opt -S 
-loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -analyze-interleaved-access=true -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; int test_struct_load2(struct ST2 *A) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. +; int A[3072]; +; struct ST S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; T[i].x = X1 + 1; +; T[i].y = X2 + 2; +; T[i].z = X3 + 3; +; } +; } + +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> +; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4 + +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add 
nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. +; struct ST3{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), 
%entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; This case checks that reversed interleaved loads can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; int test_reversed_load2(struct ST2 *A) { +; int add = 0; +; int sub = 0; +; for (int i = 1023; i >= 0; i--) { +; add += A[i].x; +; sub -= A[i].y; +; } +; return add*sub; +; } + +; CHECK-LABEL: @test_reversed_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> + +define i32 @test_reversed_load2(%struct.ST2* nocapture readonly %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %mul = mul nsw i32 %sub4, %add1 + ret i32 %mul + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %sub.015 = phi i32 [ 0, %entry ], [ %sub4, %for.body ] + %add.014 = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add1 = add nsw i32 %0, %add.014 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub4 = sub nsw i32 %sub.015, %1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; This case checks that reversed interleaved loads/stores can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> 
+; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check the load of odd elements of A can be vectorized as interleaved access. +; void odd_load(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK-LABEL: @odd_load( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shl nsw <4 x i32> %strided.vec, +define void @odd_load(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check the store of odd elements of A can not be vectorized as interleaved +; access. 
BTW, the interleaved load of B still can be vectorized as interleaved +; access +; void odd_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; A[i] = B[i] + B[i+1]; +; } + +; CHECK-LABEL: @odd_store( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK-NOT: store <8 x i32> +; CHECK: extractelement <4 x i32> +; CHECK: store i32 +; CHECK: extractelement <4 x i32> +; CHECK: store i32 +; CHECK: extractelement <4 x i32> +; CHECK: store i32 +; CHECK: extractelement <4 x i32> +; CHECK: store i32 +define void @odd_store(i32* nocapture %A, i32* nocapture readonly %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add3, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check that mixed interleaved loads/stores can be vectorized. +; void mixed_load_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + +; CHECK-LABEL: @mixed_load_store( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec +define void @mixed_load_store(i32* nocapture readonly %A, i32* nocapture %B) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %2, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %4, %3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check that interleaved loads of different types can be vectorized. As the int +; value is index 0 of interleaved access. The interleaved load uses a integer +; vector pointer. 
+; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + +; CHECK-LABEL: @int_float_struct( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float> +; CHECK: add nsw <4 x i32> +; CHECK: fadd fast <4 x float> + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 + %1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { "unsafe-fp-math"="true" }
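Note: as a cross-check of what the shuffles in these tests compute, here is a tiny standalone model of one vectorized iteration of a stride-2 load/store group (VF = 4); the array contents and the +1 / *2 operations are invented for illustration.

#include <cstdio>

int main() {
  // A[] holds 4 consecutive {x, y} pairs, i.e. the <8 x i32> %wide.vec.
  int A[8] = {10, 11, 20, 21, 30, 31, 40, 41};

  // De-interleave: the strided shuffles <0, 2, 4, 6> and <1, 3, 5, 7>.
  int X[4], Y[4];
  for (int i = 0; i < 4; i++) {
    X[i] = A[2 * i];     // lane i of the index-0 member
    Y[i] = A[2 * i + 1]; // lane i of the index-1 member
  }

  // Re-interleave for the store: the mask <0, 4, 1, 5, 2, 6, 3, 7> applied to
  // the concatenation of the two member vectors.
  int B[8];
  for (int i = 0; i < 4; i++) {
    B[2 * i] = X[i] + 1;     // e.g. B[i].x = a
    B[2 * i + 1] = Y[i] * 2; // e.g. B[i].y = b
  }

  for (int i = 0; i < 8; i++)
    std::printf("%d ", B[i]);
  std::printf("\n"); // 11 22 21 42 31 62 41 82
}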