Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11128,6 +11128,48 @@
   return DAG.getNode(ISD::BUILD_VECTOR, SL, Ty, BuildVector);
 }
 
+/// Node-id values written by markDependence() and tested by the store-merge
+/// dependence check.
+enum NodeIdFlags {
+  Unmarked = -1,
+  Marked = 1
+};
+
+/// Set the node id of \p N, and of every node that transitively uses \p N,
+/// to \c Marked.
+///
+/// Users whose id is already \c Marked are skipped and not expanded again, so
+/// each node is expanded at most once per call. A null \p N is a no-op.
+///
+/// The walk uses an explicit worklist instead of recursion so that a long use
+/// chain cannot exhaust the call stack.
+///
+/// NOTE(review): this overwrites any node id previously stored on the visited
+/// nodes and nothing here restores it -- confirm no other client of NodeId
+/// depends on those values at this point.
+static void markDependence(SDNode *N) {
+  if (!N)
+    return;
+
+  SmallVector<SDNode *, 16> Worklist;
+  N->setNodeId(Marked);
+  Worklist.push_back(N);
+
+  while (!Worklist.empty()) {
+    SDNode *Cur = Worklist.pop_back_val();
+    for (auto UI = Cur->use_begin(), E = Cur->use_end(); UI != E; ++UI) {
+      SDNode *User = *UI;
+      // Skip users that are already marked; they are not expanded again.
+      if (!User || User->getNodeId() == Marked)
+        continue;
+
+      // Mark before queuing so a node is never pushed twice.
+      User->setNodeId(Marked);
+      Worklist.push_back(User);
+    }
+  }
+}
+
 bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
     SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
     unsigned NumStores, bool IsConstantSrc, bool UseVector) {
@@ -11154,6 +11196,41 @@
   LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
   SDLoc DL(StoreNodes[0].MemNode);
 
+  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+                                                  : DAG.getSubtarget().useAA();
+
+  // Check whether merging the store candidates would create a cycle.
+  if (UseAA){
+    // Any pair of candidates here may cause a problem, but since all
+    // stores are on parallel chains, in the correct case none of
+    // the nodes will be predecessors of any other.
Check in parallel + SmallPtrSet Visited; + SmallVector Worklist; + // search ops of store candidates + for(unsigned i=0; i + hasPredecessorHelper(StoreNodes[0].MemNode, Visited, Worklist); + for(unsigned i=0; igetNodeId() != Unmarked) + return false; + } + } + SDValue StoredVal; if (UseVector) { bool IsVec = MemVT.isVector(); Index: test/CodeGen/AArch64/vector_merge_dep_check.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/vector_merge_dep_check.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s | FileCheck %s +; RUN: llc -mcpu=cortex-a53 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android" + +%"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248" = type { float, float } + +; Function Attrs: noinline norecurse nounwind ssp uwtable +define void @_ZN5Eigen8internal13gemm_pack_rhsINSt3__17complexIfEElLi2ELi0ELb0ELb1EEclEPS4_PKS4_lllll(<2 x i64>* %argA, <2 x i64>* %argB) #0 align 2 { +entry: + br i1 undef, label %polly.loop_header134, label %polly.cond184 + +polly.cond184: ; preds = %entry + ret void + +polly.loop_header134: ; preds = %polly.loop_header134, %entry + %lsr.iv37 = phi %"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248"* [ %scevgep38, %polly.loop_header134 ], [ undef, %entry ] + %lsr.iv3739 = bitcast %"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248"* %lsr.iv37 to i64* + %_p_vec_full = load <2 x i64>, <2 x i64>* %argA, align 4, !alias.scope !1, !noalias !3, !llvm.mem.parallel_loop_access !9 + %0 = extractelement <2 x i64> %_p_vec_full, i32 1 + store i64 %0, i64* %lsr.iv3739, align 8, !alias.scope !4, !noalias !10, !llvm.mem.parallel_loop_access !9 + %_p_vec_full155 = load <2 x i64>, <2 x i64>* %argB, align 4, !alias.scope !1, !noalias !3, !llvm.mem.parallel_loop_access !9 + %1 = extractelement <2 x i64> %_p_vec_full155, i32 0 + %scevgep41 = getelementptr i64, i64* %lsr.iv3739, i64 -1 + store i64 %1, i64* 
%scevgep41, align 8, !alias.scope !4, !noalias !10, !llvm.mem.parallel_loop_access !9 + %2 = extractelement <2 x i64> %_p_vec_full155, i32 1 + %scevgep40 = getelementptr i64, i64* %lsr.iv3739, i64 1 + store i64 %2, i64* %scevgep40, align 8, !alias.scope !4, !noalias !10, !llvm.mem.parallel_loop_access !9 + %scevgep38 = getelementptr %"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248", %"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248"* %lsr.iv37, i64 4 + br label %polly.loop_header134 +} + +; CHECK: ret + +attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a53" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = !{!"Snapdragon LLVM ARM Compiler 3.8.0 (based on LLVM 3.8.0)"} +!1 = distinct !{!1, !2, !"polly.alias.scope.rhs"} +!2 = distinct !{!2, !"polly.alias.scope.domain"} +!3 = !{!4, !5, !6, !7, !8} +!4 = distinct !{!4, !2, !"polly.alias.scope.blockB"} +!5 = distinct !{!5, !2, !"polly.alias.scope.add28.lcssa.reg2mem"} +!6 = distinct !{!6, !2, !"polly.alias.scope.count.0.lcssa.reg2mem"} +!7 = distinct !{!7, !2, !"polly.alias.scope.mul"} +!8 = distinct !{!8, !2, !"polly.alias.scope.add28.us.lcssa.reg2mem"} +!9 = distinct !{!9} +!10 = !{!1, !5, !6, !7, !8}