Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11128,6 +11128,28 @@
   return DAG.getNode(ISD::BUILD_VECTOR, SL, Ty, BuildVector);
 }
 
+enum NodeIdFlags { // Values stored in SDNode node ids by the marking walk.
+  Unmarked = -1, // Written to every DAG node before markDependence runs.
+  Marked = 1     // Written by markDependence to each node it reaches.
+};
+
+static void markDependence(SDNode* N) {
+  if (!N)
+    return;
+  // Tag N and every transitive user of N as Marked. An explicit worklist
+  // replaces the recursion so a long use chain cannot exhaust the stack.
+  SmallVector<SDNode *, 16> Worklist(1, N);
+  N->setNodeId(Marked);
+  while (!Worklist.empty()) {
+    SDNode *Cur = Worklist.pop_back_val();
+    for (auto UI = Cur->use_begin(), E = Cur->use_end(); UI != E; ++UI)
+      if (*UI && UI->getNodeId() != Marked) {
+        UI->setNodeId(Marked); // Tag when queued so no node is queued twice.
+        Worklist.push_back(*UI);
+      }
+  }
+}
+
 bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
                   SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
                   unsigned NumStores, bool IsConstantSrc, bool UseVector) {
@@ -11154,6 +11176,41 @@
   LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
   SDLoc DL(StoreNodes[0].MemNode);
 
+  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+                                                  : DAG.getSubtarget().useAA();
+
+  // Check if merging the store candidates would cause a loop.
+  if (UseAA){
+    // Any pair of candidates here may cause a problem, but as all
+    // stores are on parallel chains, in the correct case none of
+    // the nodes will be predecessors of any other. Check in parallel.
+    SmallPtrSet <const SDNode *, 16> Visited;
+    SmallVector <const SDNode *, 8> Worklist;
+    // search ops of store candidates
+    for(unsigned i=0; i<NumStores;++i)
+      Worklist.push_back(StoreNodes[i].MemNode);
+    (DAG.getEntryNode().getNode())->
+        hasPredecessorHelper(StoreNodes[0].MemNode, Visited, Worklist);
+    for(unsigned i=0; i<NumStores;++i){
+      if (Visited.count(StoreNodes[i].MemNode))
+          return false;
+    }
+  }
+  else { // store chain case
+    // All stores appear on a chain, so checking the predecessors of all nodes will fail trivially,
+    // but as store candidates only produce a single value, it is sufficient to ensure the transitive
+    // closure of users of the last store lacks any of the candidates.
+    for (SDNode &Node : DAG.allnodes())
+      Node.setNodeId(Unmarked);
+    markDependence(LatestOp);
+    for (unsigned i=0; i < NumStores; ++i) {
+      if (i == LatestNodeUsed)
+        continue;
+      if (StoreNodes[i].MemNode->getNodeId() != Unmarked)
+        return false;
+    }
+  }
+
   SDValue StoredVal;
   if (UseVector) {
     bool IsVec = MemVT.isVector();
Index: test/CodeGen/AArch64/vector_merge_dep_check.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/vector_merge_dep_check.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -mcpu=cortex-a53 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android"
+
+%"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248" = type { float, float }
+
+; Function Attrs: noinline norecurse nounwind ssp uwtable -- the loop body issues three i64 stores to adjacent slots (base, base-1, base+1).
+define void @_ZN5Eigen8internal13gemm_pack_rhsINSt3__17complexIfEElLi2ELi0ELb0ELb1EEclEPS4_PKS4_lllll(<2 x i64>* %argA, <2 x i64>* %argB) #0 align 2 {
+entry:
+  br i1 undef, label %polly.loop_header134, label %polly.cond184
+
+polly.cond184:                                    ; preds = %entry
+  ret void
+
+polly.loop_header134:                             ; preds = %polly.loop_header134, %entry
+  %lsr.iv37 = phi %"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248"* [ %scevgep38, %polly.loop_header134 ], [ undef, %entry ]
+  %lsr.iv3739 = bitcast %"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248"* %lsr.iv37 to i64*
+  %_p_vec_full = load <2 x i64>, <2 x i64>* %argA, align 4, !alias.scope !1, !noalias !3, !llvm.mem.parallel_loop_access !9
+  %0 = extractelement <2 x i64> %_p_vec_full, i32 1
+  store i64 %0, i64* %lsr.iv3739, align 8, !alias.scope !4, !noalias !10, !llvm.mem.parallel_loop_access !9 ; element 1 of the %argA load -> base slot
+  %_p_vec_full155 = load <2 x i64>, <2 x i64>* %argB, align 4, !alias.scope !1, !noalias !3, !llvm.mem.parallel_loop_access !9
+  %1 = extractelement <2 x i64> %_p_vec_full155, i32 0
+  %scevgep41 = getelementptr i64, i64* %lsr.iv3739, i64 -1
+  store i64 %1, i64* %scevgep41, align 8, !alias.scope !4, !noalias !10, !llvm.mem.parallel_loop_access !9 ; element 0 of the %argB load -> base - 1 (i64 units)
+  %2 = extractelement <2 x i64> %_p_vec_full155, i32 1
+  %scevgep40 = getelementptr i64, i64* %lsr.iv3739, i64 1
+  store i64 %2, i64* %scevgep40, align 8, !alias.scope !4, !noalias !10, !llvm.mem.parallel_loop_access !9 ; element 1 of the %argB load -> base + 1 (i64 units)
+  %scevgep38 = getelementptr %"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248", %"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248"* %lsr.iv37, i64 4
+  br label %polly.loop_header134
+}
+
+; CHECK: ret
+
+attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a53" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"Snapdragon LLVM ARM Compiler 3.8.0 (based on LLVM 3.8.0)"}
+!1 = distinct !{!1, !2, !"polly.alias.scope.rhs"}
+!2 = distinct !{!2, !"polly.alias.scope.domain"}
+!3 = !{!4, !5, !6, !7, !8}
+!4 = distinct !{!4, !2, !"polly.alias.scope.blockB"}
+!5 = distinct !{!5, !2, !"polly.alias.scope.add28.lcssa.reg2mem"}
+!6 = distinct !{!6, !2, !"polly.alias.scope.count.0.lcssa.reg2mem"}
+!7 = distinct !{!7, !2, !"polly.alias.scope.mul"}
+!8 = distinct !{!8, !2, !"polly.alias.scope.add28.us.lcssa.reg2mem"}
+!9 = distinct !{!9}
+!10 = !{!1, !5, !6, !7, !8}