Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -264,6 +264,14 @@
   SmallVector<Instruction *, 4>
   getInstructionsForAccess(Value *Ptr, bool isWrite) const;
 
+  /// \brief Check the dependence for two accesses with the same stride \p
+  /// Stride. \p Distance is the positive distance between the accesses and \p
+  /// TypeByteSize is the type size in bytes of the accesses.
+  ///
+  /// \returns true if the accesses are independent.
+  static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride,
+                                            unsigned TypeByteSize);
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and
   /// applies dynamic knowledge to simplify SCEV expressions and convert them
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1113,13 +1113,9 @@
   return false;
 }
 
-/// \brief Check the dependence for two accesses with the same stride \p Stride.
-/// \p Distance is the positive distance and \p TypeByteSize is type size in
-/// bytes.
-///
-/// \returns true if they are independent.
-static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride,
-                                          unsigned TypeByteSize) {
+bool MemoryDepChecker::areStridedAccessesIndependent(unsigned Distance,
+                                                     unsigned Stride,
+                                                     unsigned TypeByteSize) {
   assert(Stride > 1 && "The stride must be greater than 1");
   assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
   assert(Distance > 0 && "The distance must be non-zero");
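(Editorial aside, not part of the patch: the sketch below mirrors the divisibility test the existing static helper is assumed to implement, so the newly promoted MemoryDepChecker::areStridedAccessesIndependent can be read on concrete numbers. Two accesses with a common element stride are independent when their distance, measured in elements, is not a multiple of that stride.)

// Standalone illustration; the helper name is hypothetical and the logic is
// an assumption about the existing implementation, not a quote of it.
#include <cassert>

static bool stridedAccessesIndependent(unsigned Distance, unsigned Stride,
                                       unsigned TypeByteSize) {
  assert(Stride > 1 && TypeByteSize > 0 && Distance > 0);
  // A distance that is not a whole number of elements proves nothing here.
  if (Distance % TypeByteSize)
    return false;
  unsigned ScaledDist = Distance / TypeByteSize;
  // The two access streams are disjoint iff they fall into different residue
  // classes modulo the stride.
  return ScaledDist % Stride != 0;
}

int main() {
  // for (i = 0; i < n; i += 4) A[i + 3] = A[i];   // i32 elements
  // Scaled distance 3 is not a multiple of stride 4: independent.
  assert(stridedAccessesIndependent(/*Distance=*/12, /*Stride=*/4,
                                    /*TypeByteSize=*/4));
  // for (i = 0; i < n; i += 4) A[i + 4] = A[i];
  // Scaled distance 4 is a multiple of stride 4: the streams overlap.
  assert(!stridedAccessesIndependent(16, 4, 4));
  return 0;
}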
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -833,8 +833,8 @@
 class InterleavedAccessInfo {
 public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
-                        DominatorTree *DT)
-      : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
+                        DominatorTree *DT, LoopInfo *LI)
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), RequiresScalarEpilogue(false) {}
 
   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -875,6 +875,12 @@
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 
+  /// \brief Returns true if \p Stride is allowed in an interleaved group.
+  static bool isStrideAllowed(int Stride) {
+    unsigned Factor = std::abs(Stride);
+    return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
+  }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -883,6 +889,7 @@
   PredicatedScalarEvolution &PSE;
   Loop *TheLoop;
   DominatorTree *DT;
+  LoopInfo *LI;
 
   /// True if the loop may contain non-reversed interleaved groups with
   /// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -931,6 +938,12 @@
   void collectConstStridedAccesses(
       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
       const ValueToValueMap &Strides);
+
+  /// \brief Returns true if memory accesses \p A and \p B can be reordered. We
+  /// use the provided StrideDescriptors and MemoryDepChecker to determine if
+  /// any dependences would be violated.
+  bool canReorderMemAccs(Instruction *A, Instruction *B, StrideDescriptor &DesA,
+                         StrideDescriptor &DesB);
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1261,13 +1274,14 @@
                             DominatorTree *DT, TargetLibraryInfo *TLI,
                             AliasAnalysis *AA, Function *F,
                             const TargetTransformInfo *TTI,
-                            LoopAccessAnalysis *LAA,
+                            LoopAccessAnalysis *LAA, LoopInfo *LI,
                             LoopVectorizationRequirements *R,
                             LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
-        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+        InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+        Hints(H) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1873,7 +1887,7 @@
 
     // Check if it is legal to vectorize the loop.
     LoopVectorizationRequirements Requirements;
-    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, LI,
                                   &Requirements, &Hints);
     if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
@@ -4991,7 +5005,17 @@
 
   // Holds load/store instructions in program order.
   SmallVector<Instruction *, 16> AccessList;
-  for (auto *BB : TheLoop->getBlocks()) {
+  // Since it's desired that the load/store instructions be maintained in
+  // "program order" for the interleaved access analysis, we have to visit the
+  // blocks in the loop in reverse postorder (i.e., in a topological order).
+  // Such an ordering will ensure that any load/store that may be executed
+  // before a second load/store will precede the second load/store in the
+  // AccessList.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+  for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO(); I != E;
+       ++I) {
+    BasicBlock *BB = *I;
     bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 
     for (auto &I : *BB) {
@@ -5016,13 +5040,6 @@
 
       Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
       int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
-      // The factor of the corresponding interleave group.
-      unsigned Factor = std::abs(Stride);
-
-      // Ignore the access if the factor is too small or too large.
-      if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
-        continue;
-
       const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
       PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
       unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
@@ -5036,26 +5053,98 @@
   }
 }
 
-// Analyze interleaved accesses and collect them into interleave groups.
+bool InterleavedAccessInfo::canReorderMemAccs(Instruction *A, Instruction *B,
+                                              StrideDescriptor &DesA,
+                                              StrideDescriptor &DesB) {
+
+  // Lambda expression for getting the pointer operand of the memory accesses.
+  auto getPointerOperand = [&](Instruction *I) -> Value * {
+    if (auto *Load = dyn_cast<LoadInst>(I))
+      return Load->getPointerOperand();
+    if (auto *Store = dyn_cast<StoreInst>(I))
+      return Store->getPointerOperand();
+    return nullptr;
+  };
+
+  // If neither instruction writes to memory, they can safely be reordered.
+  if (!A->mayWriteToMemory() && !B->mayWriteToMemory())
+    return true;
+
+  // If the distance is non-constant, the vectorized loop should be guarded
+  // with run-time pointer aliasing checks. The checks should ensure that the
+  // memory locations A and B access throughout the loop are disjoint. Because
+  // the memory must be disjoint if the checks succeed, it's safe to reorder.
+  auto *Distance =
+      dyn_cast<SCEVConstant>(PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
+  if (!Distance)
+    return true;
+
+  // LAA allows non-positive dependence distances for vectorization. However,
+  // since we may reorder the instructions, the direction of the dependence
+  // doesn't really matter, so we take the absolute value. If the distance is
+  // zero, the instructions can't be reordered.
+  auto NonNegDistance = std::abs(Distance->getAPInt().getSExtValue());
+  if (NonNegDistance == 0)
+    return false;
+
+  // At this point, we know there is a non-zero distance between the accesses.
+  // If they are strided, we can try to prove them independent. Otherwise, they
+  // are dependent. First, we get the interleave factor of the accesses. If
+  // they aren't interleaved or they have different strides, don't reorder.
+  auto FactorA = std::abs(DesA.Stride);
+  auto FactorB = std::abs(DesB.Stride);
+  if (FactorA != FactorB || FactorA <= 1)
+    return false;
+
+  // Get the types of the pointer operands. If they're not the same, don't
+  // reorder.
+  auto *PtrTyA = cast<PointerType>(getPointerOperand(A)->getType());
+  auto *PtrTyB = cast<PointerType>(getPointerOperand(B)->getType());
+  if (PtrTyA != PtrTyB)
+    return false;
+
+  // We need the module's data layout to determine the size of the type the
+  // accesses read from and write to.
+  auto &DL = A->getModule()->getDataLayout();
+
+  // Finally, try to prove independence. If the strided accesses are
+  // independent, they can be reordered.
+  return MemoryDepChecker::areStridedAccessesIndependent(
+      NonNegDistance, FactorA, DL.getTypeAllocSize(PtrTyA->getElementType()));
+}
+
+// Analyze interleaved accesses and collect them into interleaved load and
+// store groups.
+//
+// When generating code for an interleaved load group, we effectively hoist all
+// loads in the group to the location of the first load in program order. When
+// generating code for an interleaved store group, we sink all stores to the
+// location of the last store. This code motion can change the order of load
+// and store instructions and may break dependences. The inserted run-time
+// pointer aliasing checks ensure the absence of some dependences. However, we
+// need to explicitly check that we won't violate the dependences allowed for
+// vectorization.
 //
-// Notice that the vectorization on interleaved groups will change instruction
-// orders and may break dependences. But the memory dependence check guarantees
-// that there is no overlap between two pointers of different strides, element
-// sizes or underlying bases.
+// A vectorizable loop can legally contain dependences at constant non-positive
+// distances and at constant positive distances if they are greater than or
+// equal to the threshold used to compute the vectorization factor. Dependences
+// with non-constant distances require aliasing checks, so we can assume these
+// dependences are not present in the vectorized loop.
 //
-// For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
-// dependences.
+// The code generation strategy mentioned above ensures that we won't violate
+// any write-after-read (WAR) dependences.
 //
-// E.g.  The RAW dependence:  A[i] = a;
-//                            b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// E.g., for the WAR dependence:  a = A[i];   // (1)
+//                                A[i] = b;   // (2)
 //
-// E.g.  The WAR dependence:  a = A[i];  // (1)
-//                            A[i] = b;  // (2)
-// The store group of (2) is always inserted at or below (2), and the load group
-// of (1) is always inserted at or above (1). The dependence is safe.
+// The store group of (2) is always inserted at or below (2), and the load
+// group of (1) is always inserted at or above (1). Thus, the instructions will
+// never be reordered. All other dependences are checked to ensure the
+// correctness of the instruction reordering.
+//
+// The algorithm visits all memory accesses in the loop in bottom-up program
+// order. Program order is established by traversing the blocks in the loop in
+// reverse postorder when collecting the accesses.
 void InterleavedAccessInfo::analyzeInterleaving(
     const ValueToValueMap &Strides) {
   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
@@ -5077,34 +5166,51 @@
   //  1. A and B have the same stride.
   //  2. A and B have the same memory object size.
   //  3. B belongs to the group according to the distance.
-  //
-  // The bottom-up order can avoid breaking the Write-After-Write dependences
-  // between two pointers of the same base.
-  // E.g.  A[i]   = a;   (1)
-  //       A[i]   = b;   (2)
-  //       A[i+1] = c    (3)
-  // We form the group (2)+(3) in front, so (1) has to form groups with accesses
-  // above (1), which guarantees that (1) is always above (2).
   for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
        ++I) {
     Instruction *A = I->first;
     StrideDescriptor DesA = I->second;
 
-    InterleaveGroup *Group = getInterleaveGroup(A);
-    if (!Group) {
-      DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
-      Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+    // Initialize a group for A if it has an allowable stride. Even if we don't
+    // create a group for A, we continue with the bottom-up algorithm to ensure
+    // we don't break any of A's dependences.
+    InterleaveGroup *Group = nullptr;
+    if (isStrideAllowed(DesA.Stride)) {
+      Group = getInterleaveGroup(A);
+      if (!Group) {
+        DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
+        Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+      }
+      if (A->mayWriteToMemory())
+        StoreGroups.insert(Group);
+      else
+        LoadGroups.insert(Group);
     }
 
-    if (A->mayWriteToMemory())
-      StoreGroups.insert(Group);
-    else
-      LoadGroups.insert(Group);
-
     for (auto II = std::next(I); II != E; ++II) {
       Instruction *B = II->first;
       StrideDescriptor DesB = II->second;
 
+      // If instructions A and B cannot be reordered because they're in a
+      // dependence relation, we can't add additional instructions to A's group
+      // since this may cause illegal code motion. If B is already in a group,
+      // release it. We check that B is a write because our code motion won't
+      // ever violate WAR dependences.
+      if (B->mayWriteToMemory() && !canReorderMemAccs(A, B, DesA, DesB)) {
+        if (isInterleaved(B)) {
+          InterleaveGroup *StoreGroup = getInterleaveGroup(B);
+          StoreGroups.remove(StoreGroup);
+          releaseGroup(StoreGroup);
+        }
+        break;
+      }
+
+      // At this point, we've checked for code motion that would break
+      // dependences. If A or B has a stride that's not allowed, there's
+      // nothing left to do.
+      if (!isStrideAllowed(DesA.Stride) || !isStrideAllowed(DesB.Stride))
+        continue;
+
       // Ignore if B is already in a group or B is a different memory operation.
       if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
         continue;
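(Editorial aside, not part of the patch: stripped of the IR and SCEV plumbing, canReorderMemAccs reduces to the decision chain sketched below. The struct, the function names, and the std::optional stand-in for a dependence distance that may or may not fold to a constant are hypothetical; comparing element sizes replaces the pointer-type comparison used by the real code.)

#include <cstdlib>
#include <optional>

struct AccessDesc {
  bool Writes;       // does the instruction write to memory?
  int Stride;        // access stride in elements (signed)
  unsigned ElemSize; // size of the accessed element in bytes
};

// Independent iff the element distance is not a multiple of the common stride
// (see the earlier sketch).
static bool stridedAccessesIndependent(unsigned Distance, unsigned Stride,
                                       unsigned ElemSize) {
  return Distance % ElemSize == 0 && (Distance / ElemSize) % Stride != 0;
}

static bool canReorder(const AccessDesc &A, const AccessDesc &B,
                       std::optional<long> ByteDistance) {
  if (!A.Writes && !B.Writes)
    return true;              // two reads always commute
  if (!ByteDistance)
    return true;              // non-constant distance: run-time alias checks
                              // keep the locations disjoint
  long Dist = std::abs(*ByteDistance);
  if (Dist == 0)
    return false;             // same location: order must be preserved
  unsigned FactorA = std::abs(A.Stride), FactorB = std::abs(B.Stride);
  if (FactorA != FactorB || FactorA <= 1 || A.ElemSize != B.ElemSize)
    return false;             // mixed strides or element types: give up
  return stridedAccessesIndependent(Dist, FactorA, A.ElemSize);
}

// Example: a stride-2 i32 store and a stride-2 i32 load 4 bytes apart
// (different fields of the same struct) may be reordered; the same two
// accesses 0 bytes apart may not.
//   canReorder({true, 2, 4}, {false, 2, 4}, 4) -> true
//   canReorder({true, 2, 4}, {false, 2, 4}, 0) -> false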
Index: test/Transforms/LoopVectorize/interleaved-accesses.ll
===================================================================
--- test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -555,4 +555,108 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+; Check vectorization of interleaved access groups in the presence of
+; dependences. When generating code, interleaved loads are inserted at the
+; location of the first load in a group, and interleaved stores are inserted
+; at the location of the last store. However, dependent accesses must still
+; execute in their original program order, so we prevent group formation for
+; dependent accesses.
+;
+; In the following tests, the stores are scalarized because the required code
+; motion would break dependences. The remaining interleaved load groups have
+; gaps, so we must execute at least one iteration of the scalar epilogue loop.
+
+; void PR27626_0(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.end, label %scalar.ph
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i.x, align 8
+  %0 = load i32, i32* %p_i.x, align 8
+  store i32 %0, i32 *%p_i.y, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; void PR27626_1(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i + 1].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.end, label %scalar.ph
+
+define void @PR27626_1(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_plus_1 = add nuw nsw i64 %i, 1
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i_plus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i_plus_1.x, align 8
+  %0 = load i32, i32* %p_i.x, align 8
+  store i32 %0, i32 *%p_i.y, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }
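(Editorial aside, not part of the patch: plugging the numbers from the two tests into the independence test sketched earlier shows why the store groups are released while the load groups survive.)

#include <cassert>

int main() {
  // PR27626_0: the store to p[i].x and the load of p[i].x are 0 bytes apart,
  // so that pair may never be reordered; the store group is released and the
  // stores are scalarized, which is what the CHECK lines above expect.
  //
  // The load of p[i].x and the store to p[i].y are 4 bytes apart with stride 2
  // and i32 elements. Scaled distance 4 / 4 = 1 is not a multiple of 2, so
  // these two access streams never touch the same element.
  assert((4 / 4) % 2 != 0);

  // PR27626_1: the store to p[i + 1].x and the load of p[i].x are 8 bytes
  // apart (the same field of adjacent pairs). Scaled distance 8 / 4 = 2 is a
  // multiple of 2, so independence cannot be proven and the stores again stay
  // scalar.
  assert((8 / 4) % 2 == 0);
  return 0;
}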