Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -827,8 +827,8 @@
 class InterleavedAccessInfo {
 public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
-                        DominatorTree *DT)
-      : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
+                        DominatorTree *DT, LoopInfo *LI)
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), RequiresScalarEpilogue(false) {}
 
   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -869,6 +869,7 @@
   PredicatedScalarEvolution &PSE;
   Loop *TheLoop;
   DominatorTree *DT;
+  LoopInfo *LI;
 
   /// True if the loop may contain non-reversed interleaved groups with
   /// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -917,6 +918,21 @@
   void collectConstStridedAccesses(
       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
       const ValueToValueMap &Strides);
+
+  /// \brief Returns true if there is a loop-independent read-after-write
+  /// dependence between instructions \p R and \p W.
+  ///
+  /// Such a dependence exists if the dependence distance between the
+  /// instructions is zero and the write may be executed before the read. It
+  /// should be enough to consider just zero-distance dependences (and not
+  /// worry about the effects of vectorization) since store groups with gaps
+  /// will be removed.
+  bool isLoopIndependentRAW(Instruction *R, Instruction *W, int Distance) {
+    if (!Distance && R->mayReadFromMemory() && W->mayWriteToMemory() &&
+        !DT->dominates(R, W))
+      return true;
+    return false;
+  }
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
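
For straight-line code, the dominance query in isLoopIndependentRAW reduces to
a program-order comparison: within a single basic block, W dominates R exactly
when W appears before R. The sketch below is a self-contained model of the
predicate under that simplification; the Access struct and the free function
are illustrative stand-ins, not LLVM APIs.

    #include <cstdio>

    // Hypothetical stand-in for an instruction, carrying only the fields the
    // predicate needs. Pos is the position within a single basic block, where
    // "W dominates R" degenerates to "W appears before R".
    struct Access {
      bool Reads, Writes;
      int Pos;
    };

    // Mirrors the predicate above for straight-line code: a zero-distance
    // dependence is a loop-independent RAW only if R may read, W may write,
    // and W is not already known to execute before R.
    bool isLoopIndependentRAW(const Access &R, const Access &W, int Distance) {
      return Distance == 0 && R.Reads && W.Writes && !(R.Pos < W.Pos);
    }

    int main() {
      Access Store = {false, true, 0}; // A[i] = a;
      Access Load = {true, false, 1};  // b = A[i];
      // The store precedes the load, so sinking the store below the load (to
      // a store group's insert position) would break the dependence.
      printf("%d\n", isLoopIndependentRAW(Load, Store, 0)); // prints 1
    }

Compiled and run, this flags the store/load pair from the RAW example in the
comments further down in this patch.
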
@@ -1247,13 +1263,14 @@
                              DominatorTree *DT, TargetLibraryInfo *TLI,
                              AliasAnalysis *AA, Function *F,
                              const TargetTransformInfo *TTI,
-                             LoopAccessAnalysis *LAA,
+                             LoopAccessAnalysis *LAA, LoopInfo *LI,
                              LoopVectorizationRequirements *R,
                              LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
-        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+        InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+        Hints(H) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1854,7 +1871,7 @@
 
     // Check if it is legal to vectorize the loop.
     LoopVectorizationRequirements Requirements;
-    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, LI,
                                   &Requirements, &Hints);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
@@ -4974,7 +4991,17 @@
   // Holds load/store instructions in program order.
   SmallVector<Instruction *, 16> AccessList;
 
-  for (auto *BB : TheLoop->getBlocks()) {
+  // Since it's desired that the load/store instructions be maintained in
+  // "program order" for the interleaved access analysis, we have to visit the
+  // blocks in the loop in reverse postorder (i.e., in a topological order).
+  // Such an ordering will ensure that any load/store that may be executed
+  // before a second load/store will precede the second load/store in the
+  // AccessList.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+  for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO(); I != E;
+       ++I) {
+    BasicBlock *BB = *I;
     bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 
     for (auto &I : *BB) {
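
The claim that reverse postorder gives the needed "program order" rests on RPO
being a topological order of the loop body once the back edge is ignored: a
DFS appends a block to the postorder only after all of its successors, so the
reversed sequence lists every block before any of its successors. A minimal
stand-alone demonstration (plain C++ on a toy CFG, not LLVM's LoopBlocksDFS)
for a diamond-shaped body:

    #include <cstdio>
    #include <vector>

    // DFS postorder: a node is appended only after all of its unvisited
    // successors, so the reverse lists every node before its successors.
    void postorder(int N, const std::vector<std::vector<int>> &Succ,
                   std::vector<bool> &Seen, std::vector<int> &PO) {
      Seen[N] = true;
      for (int S : Succ[N])
        if (!Seen[S])
          postorder(S, Succ, Seen, PO);
      PO.push_back(N);
    }

    int main() {
      // Diamond A -> {B, C} -> D, as in an if-then-else loop body.
      std::vector<std::vector<int>> Succ = {{1, 2}, {3}, {3}, {}};
      std::vector<bool> Seen(4, false);
      std::vector<int> PO;
      postorder(0, Succ, Seen, PO);
      printf("RPO:");
      for (auto I = PO.rbegin(), E = PO.rend(); I != E; ++I)
        printf(" %c", 'A' + *I);
      printf("\n"); // prints "RPO: A C B D"; "A B C D" is equally valid
    }

Either valid RPO places A first and D last, so an access in D can never land
in AccessList ahead of an access in A, B, or C that may execute before it.
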
@@ -5027,18 +5054,19 @@
 // sizes or underlying bases.
 //
 // For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
-// dependences.
-//
-// E.g. The RAW dependence: A[i] = a;
-//                          b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// need to worry about Write-After-Read dependences. However, Read-After-Write
+// dependences must be considered.
 //
 // E.g. The WAR dependence:  a = A[i];  // (1)
 //                           A[i] = b;  // (2)
 // The store group of (2) is always inserted at or below (2), and the load
 // group of (1) is always inserted at or above (1). The dependence is safe.
+//
+// E.g. The RAW dependence:  A[i] = a;  // (1)
+//                           b = A[i];  // (2)
+// We have to ensure that the insert position of the store group of (1) is
+// not below the insert position of the load group of (2). If this condition
+// does not hold, (1) and (2) would be reordered, breaking the dependence.
 void InterleavedAccessInfo::analyzeInterleaving(
     const ValueToValueMap &Strides) {
   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
@@ -5054,6 +5082,10 @@
   SmallSetVector<InterleaveGroup *, 4> StoreGroups;
   // Holds all interleaved load groups temporarily.
   SmallSetVector<InterleaveGroup *, 4> LoadGroups;
+  // Holds store instructions whose interleaved groups must be removed from
+  // consideration due to the possibility of breaking read-after-write
+  // dependences.
+  SmallPtrSet<Instruction *, 4> StoresToRemove;
 
   // Search the load-load/write-write pair B-A in bottom-up order and try to
   // insert B into the interleave group of A according to 3 rules:
@@ -5084,14 +5116,14 @@
     else
       LoadGroups.insert(Group);
 
+    // Holds store instructions that may be involved in a broken
+    // read-after-write dependence.
+    SmallVector<Instruction *, 4> LoopIndependentRAWStores;
+
     for (auto II = std::next(I); II != E; ++II) {
       Instruction *B = II->first;
       StrideDescriptor DesB = II->second;
 
-      // Ignore if B is already in a group or B is a different memory operation.
-      if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
-        continue;
-
       // Check the rule 1 and 2.
       if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
         continue;
@@ -5104,11 +5136,55 @@
 
       int DistanceToA = DistToA->getAPInt().getSExtValue();
 
+      // Check for a loop-independent read-after-write dependence from B (a
+      // possible store) to A (a possible load).
+      //
+      // If a dependence may exist, and the store is already in an interleaved
+      // group, the store will be "moved" to the insert position of the group.
+      // If the insert position may follow the load (it should, given the
+      // bottom-up order in which we are visiting the instructions), the
+      // read-after-write dependence would be broken. Thus, mark the store for
+      // removal, and prevent further loads from being added to the group. If
+      // additional loads were added to the group, the current load would
+      // "move" to a new location, breaking the dependence.
+      //
+      // If a dependence may exist, and the store is not already in an
+      // interleaved group, this is not necessarily a problem. We don't yet
+      // know if the load or store will be "moved" to a new insert position.
+      // Save the store for later consideration. If we try to add another load
+      // to the group, the dependence would then be broken.
+      if (isLoopIndependentRAW(A, B, DistanceToA)) {
+        if (isInterleaved(B))
+          if (!DT->dominates(getInterleaveGroup(B)->getInsertPos(), A)) {
+            StoresToRemove.insert(B);
+            break;
+          }
+        LoopIndependentRAWStores.push_back(B);
+      }
+
+      // Ignore if B is already in a group or B is a different memory operation.
+      if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
+        continue;
+
       // Skip if the distance is not multiple of size as they are not in the
       // same group.
      if (DistanceToA % static_cast<int>(DesA.Size))
        continue;
 
+      // Ensure that "moving" A (a load) to the insert position of B (a load)
+      // will not violate a loop-independent read-after-write dependence.
+      //
+      // The store instructions involved in any read-after-write dependences
+      // have already been recorded. If a dependence would be violated, mark
+      // the store for removal, and prevent further loads from being added to
+      // the group by breaking.
+      if (A->mayReadFromMemory() && !LoopIndependentRAWStores.empty()) {
+        for (Instruction *Store : LoopIndependentRAWStores)
+          if (!DT->dominates(Store, B))
+            StoresToRemove.insert(Store);
+        break;
+      }
+
       // The index of B is the index of A plus the related index to A.
       int IndexB =
           Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);
@@ -5126,6 +5202,17 @@
     }   // Iteration on instruction B
   }     // Iteration on instruction A
 
+  // Remove interleaved store groups having members involved in a potentially
+  // broken read-after-write dependence. We simply remove the entire group
+  // instead of the offending members since store groups with gaps will be
+  // removed anyway.
+  for (Instruction *Store : StoresToRemove)
+    if (isInterleaved(Store)) {
+      InterleaveGroup *StoreGroup = getInterleaveGroup(Store);
+      StoreGroups.remove(StoreGroup);
+      releaseGroup(StoreGroup);
+    }
+
   // Remove interleaved store groups with gaps.
   for (InterleaveGroup *Group : StoreGroups)
     if (Group->getNumMembers() != Group->getFactor())
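
The regression test added below exercises exactly this hazard: in each
iteration, p[i].x = p[i].y creates a zero-distance RAW on p[i].x with the load
that follows it. If both loads were placed in one group (executing at the
first load's position) and both stores in another (executing at the last
store's position), the second load would read a stale p[i].x. The following
stand-alone sketch (hand-written plain C++ mirroring the test's loop, not
vectorizer output) contrasts the scalar semantics with what such a broken
interleaving would compute:

    #include <cstdio>

    struct Pair { long x, y; };

    int main() {
      // Scalar semantics of the loop in the test below: after each
      // iteration, both fields equal the old value of y.
      Pair Scalar[2] = {{1, 2}, {3, 4}};
      for (auto &P : Scalar) {
        P.x = P.y;
        P.y = P.x;
      }

      // A broken interleaving hoists BOTH loads to the first load's insert
      // position, above BOTH stores, so the second load reads the stale x
      // instead of the value just stored to it.
      Pair Broken[2] = {{1, 2}, {3, 4}};
      for (auto &P : Broken) {
        long LoadY = P.y; // first load
        long LoadX = P.x; // second load, wrongly hoisted above the store
        P.x = LoadY;      // first store
        P.y = LoadX;      // second store: stale x instead of LoadY
      }

      printf("scalar: {%ld,%ld}  broken: {%ld,%ld}\n", Scalar[0].x,
             Scalar[0].y, Broken[0].x, Broken[0].y);
      // Prints: scalar: {2,2}  broken: {2,1}
    }
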
Index: test/Transforms/LoopVectorize/interleaved-accesses.ll
===================================================================
--- test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -555,4 +555,76 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+; Check vectorization of interleaved access groups in the presence of a
+; loop-independent read-after-write dependence. When generating code,
+; interleaved loads are inserted at the location of the first load, and
+; interleaved stores are inserted at the location of the last store. This
+; load/store reordering can break RAW dependences.
+;
+; In this test, we create two separate load groups with gaps to avoid breaking
+; the dependence. We do the same for the stores. Because the interleaved store
+; groups have gaps, they are scalarized. Because the interleaved load groups
+; have gaps, we must execute at least one iteration of the scalar epilogue
+; loop.
+;
+; struct pair {
+;   int x;
+;   int y;
+; };
+;
+; void PR27626(struct pair *p, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = p[i].y;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i64>, <8 x i64>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i64> %[[L1]], i32 0
+; CHECK: store i64 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i64> %[[L1]], i32 2
+; CHECK: store i64 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i64> %[[L1]], i32 4
+; CHECK: store i64 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i64> %[[L1]], i32 6
+; CHECK: store i64 %[[X4]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i64>, <8 x i64>* {{.*}}
+; CHECK: %[[X5:.+]] = extractelement <8 x i64> %[[L2]], i32 0
+; CHECK: store i64 %[[X5]], {{.*}}
+; CHECK: %[[X6:.+]] = extractelement <8 x i64> %[[L2]], i32 2
+; CHECK: store i64 %[[X6]], {{.*}}
+; CHECK: %[[X7:.+]] = extractelement <8 x i64> %[[L2]], i32 4
+; CHECK: store i64 %[[X7]], {{.*}}
+; CHECK: %[[X8:.+]] = extractelement <8 x i64> %[[L2]], i32 6
+; CHECK: store i64 %[[X8]], {{.*}}
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.end, label %scalar.ph
+
+define void @PR27626(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  store i64 %0, i64* %p.0, align 8
+  %1 = load i64, i64* %p.0, align 8
+  store i64 %1, i64* %p.1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }