Index: include/llvm/ADT/SCCIterator.h =================================================================== --- include/llvm/ADT/SCCIterator.h +++ include/llvm/ADT/SCCIterator.h @@ -18,6 +18,12 @@ /// To visit S1 *before* S2, use the scc_iterator on the Inverse graph. (NOTE: /// This requires some simple wrappers and is not supported yet.) /// +/// An scc_iterator provides the SCCs reachable from a single entry node. In +/// order to obtain the SCCs of a graph having multiple entry nodes, invoke +/// scc_iterator on each entry node providing it an (optional) DontRevisitNodes +/// argument. This argument holds nodes that have been visited and should not +/// be revisited, and is augmented by scc_iterator. This follows Tarjan's +/// original O(N+E) total time algorithm. //===----------------------------------------------------------------------===// #ifndef LLVM_ADT_SCCITERATOR_H @@ -26,6 +32,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/iterator.h" +#include #include namespace llvm { @@ -79,6 +86,8 @@ /// node, the next child to visit, and the minimum uplink value of all child std::vector VisitStack; + std::set *DontRevisitNodes; + /// A single "visit" within the non-recursive DFS traversal. void DFSVisitOne(NodeType *N); @@ -88,7 +97,8 @@ /// Compute the next SCC using the DFS traversal. void GetNextSCC(); - scc_iterator(NodeType *entryN) : visitNum(0) { + scc_iterator(NodeType *entryN, std::set *DRN = nullptr) + : visitNum(0), DontRevisitNodes(DRN) { DFSVisitOne(entryN); GetNextSCC(); } @@ -97,8 +107,9 @@ scc_iterator() {} public: - static scc_iterator begin(const GraphT &G) { - return scc_iterator(GT::getEntryNode(G)); + static scc_iterator begin(const GraphT &G, + std::set *DontRevisitNodes = nullptr) { + return scc_iterator(GT::getEntryNode(G), DontRevisitNodes); } static scc_iterator end(const GraphT &) { return scc_iterator(); } @@ -135,11 +146,19 @@ assert(nodeVisitNumbers.count(Old) && "Old not in scc_iterator?"); nodeVisitNumbers[New] = nodeVisitNumbers[Old]; nodeVisitNumbers.erase(Old); + if (DontRevisitNodes && DontRevisitNodes->count(Old)) { + DontRevisitNodes->erase(Old); + DontRevisitNodes->insert(New); + } } }; template void scc_iterator::DFSVisitOne(NodeType *N) { + if (DontRevisitNodes && DontRevisitNodes->count(N)) + return; + if (DontRevisitNodes) + DontRevisitNodes->insert(N); ++visitNum; nodeVisitNumbers[N] = visitNum; SCCNodeStack.push_back(N); @@ -221,8 +240,11 @@ } /// \brief Construct the begin iterator for a deduced graph type T. -template scc_iterator scc_begin(const T &G) { - return scc_iterator::begin(G); +template +scc_iterator scc_begin( + const T &G, + std::set::NodeType *> *DontRevisitNodes = nullptr) { + return scc_iterator::begin(G, DontRevisitNodes); } /// \brief Construct the end iterator for a deduced graph type T. @@ -231,13 +253,17 @@ } /// \brief Construct the begin iterator for a deduced graph type T's Inverse. -template scc_iterator > scc_begin(const Inverse &G) { - return scc_iterator >::begin(G); +template +scc_iterator> scc_begin( + const Inverse &G, + std::set>::NodeType *> *DontRevisitNodes = + nullptr) { + return scc_iterator>::begin(G, DontRevisitNodes); } /// \brief Construct the end iterator for a deduced graph type T's Inverse. template scc_iterator > scc_end(const Inverse &G) { - return scc_iterator >::end(G); + return scc_iterator>::end(G); } } // End llvm namespace Index: include/llvm/Support/DataFlow.h =================================================================== --- include/llvm/Support/DataFlow.h +++ include/llvm/Support/DataFlow.h @@ -25,35 +25,35 @@ template <> struct GraphTraits { typedef const Value NodeType; - typedef Value::const_use_iterator ChildIteratorType; + typedef Value::const_user_iterator ChildIteratorType; static NodeType *getEntryNode(const Value *G) { return G; } static inline ChildIteratorType child_begin(NodeType *N) { - return N->use_begin(); + return N->user_begin(); } static inline ChildIteratorType child_end(NodeType *N) { - return N->use_end(); + return N->user_end(); } }; template <> struct GraphTraits { typedef Value NodeType; - typedef Value::use_iterator ChildIteratorType; + typedef Value::user_iterator ChildIteratorType; static NodeType *getEntryNode(Value *G) { return G; } static inline ChildIteratorType child_begin(NodeType *N) { - return N->use_begin(); + return N->user_begin(); } static inline ChildIteratorType child_end(NodeType *N) { - return N->use_end(); + return N->user_end(); } }; Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -18,6 +18,7 @@ #include "llvm/Transforms/Scalar/SLPVectorizer.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" @@ -39,6 +40,7 @@ #include "llvm/IR/Verifier.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/DataFlow.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize.h" @@ -389,6 +391,11 @@ /// \returns number of elements in vector if isomorphism exists, 0 otherwise. unsigned canMapToVector(Type *T, const DataLayout &DL) const; + /// Scan use-def dependence graph to find and mark scalar instructions that + /// belong to the tree and lie on dependence cycles. Returns true if such new + /// instructions were found. + bool markCyclicInstructions(); + private: struct TreeEntry; @@ -490,6 +497,12 @@ /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; + /// A list of scalars that we found lying on a dependence cycle. + ValueSet OnDependenceCycle; + + /// A list of scalars that we already visited looking for cycles. + std::set VisitedInstructions; + /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser (Value *S, llvm::User *U, int L) : @@ -967,6 +980,30 @@ } +bool BoUpSLP::markCyclicInstructions() { + bool InstructionsMarked = false; + DEBUG(dbgs() << "SLP: Marking cyclic instructions.\n"); + if (VectorizableTree.empty()) + return InstructionsMarked; + ArrayRef VL = VectorizableTree[0].Scalars; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Inverse VLi = cast(VL[i]); + DEBUG(dbgs() << "SLP: checking instruction (" << *VL[i] << ").\n"); + for (scc_iterator> I = scc_begin(VLi, &VisitedInstructions); + !I.isAtEnd(); ++I) { + const std::vector &ValueSCC = *I; + for (auto Val : ValueSCC) + DEBUG(dbgs() << "SLP: in scc (" << *Val << ").\n"); + if (ValueSCC.size() > 1) { + OnDependenceCycle.insert(ValueSCC.begin(), ValueSCC.end()); + InstructionsMarked = true; + } + } + } + return InstructionsMarked; +} + + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { bool SameTy = allConstant(VL) || getSameType(VL); (void)SameTy; bool isAltShuffle = false; @@ -1854,6 +1891,15 @@ for (TreeEntry &TE : VectorizableTree) { int C = getEntryCost(&TE); + if (C < 0 && !OnDependenceCycle.empty()) { + for (int Lane = 0, LE = TE.Scalars.size(); Lane != LE; ++Lane) + if (OnDependenceCycle.count(TE.Scalars[Lane])) { + DEBUG(dbgs() << "SLP: Instruction found to lie on cycle.\n"); + C = 1000; + break; + } + } + DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); Cost += C; @@ -3613,6 +3659,12 @@ int Cost = R.getTreeCost(); DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + + if (Cost < CostThreshold && R.markCyclicInstructions()) { + Cost = R.getTreeCost(); + DEBUG(dbgs() << "SLP: Cost considering cycles=" << Cost << "\n"); + } + if (Cost < CostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); R.vectorizeTree(); Index: test/Transforms/SLPVectorizer/X86/pr25108.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr25108.ll +++ test/Transforms/SLPVectorizer/X86/pr25108.ll @@ -0,0 +1,154 @@ +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Check that we account for cyclic dependences and avoid vectorizing this. +; CHECK-LABEL: compute +; CHECK-NOT: x float> +; Function Attrs: norecurse nounwind uwtable +define void @compute(float* nocapture %v) #0 { +entry: + %arrayidx18 = getelementptr inbounds float, float* %v, i64 6 + %arrayidx.phi.trans.insert = getelementptr inbounds float, float* %v, i64 1 + %.pre = load float, float* %arrayidx.phi.trans.insert, align 4 + %.pre6 = load float, float* %v, align 4 + %arrayidx10.phi.trans.insert = getelementptr inbounds float, float* %v, i64 2 + %.pre7 = load float, float* %arrayidx10.phi.trans.insert, align 4 + %arrayidx10.1.phi.trans.insert = getelementptr inbounds float, float* %v, i64 3 + %.pre8 = load float, float* %arrayidx10.1.phi.trans.insert, align 4 + %arrayidx10.2.phi.trans.insert = getelementptr inbounds float, float* %v, i64 4 + %.pre9 = load float, float* %arrayidx10.2.phi.trans.insert, align 4 + %arrayidx10.3.phi.trans.insert = getelementptr inbounds float, float* %v, i64 5 + %.pre10 = load float, float* %arrayidx10.3.phi.trans.insert, align 4 + %.pre11 = load float, float* %arrayidx18, align 4 + %arrayidx10.5.phi.trans.insert = getelementptr inbounds float, float* %v, i64 7 + %.pre12 = load float, float* %arrayidx10.5.phi.trans.insert, align 4 + br label %for.cond2.preheader + +for.cond2.preheader: ; preds = %for.cond2.preheader, %entry + %0 = phi float [ %.pre12, %entry ], [ %mul27, %for.cond2.preheader ] + %1 = phi float [ %.pre11, %entry ], [ %mul27.1, %for.cond2.preheader ] + %2 = phi float [ %.pre10, %entry ], [ %mul27.2, %for.cond2.preheader ] + %3 = phi float [ %.pre9, %entry ], [ %mul27.3, %for.cond2.preheader ] + %4 = phi float [ %.pre8, %entry ], [ %mul27.4, %for.cond2.preheader ] + %5 = phi float [ %.pre7, %entry ], [ %mul27.5, %for.cond2.preheader ] + %6 = phi float [ %.pre6, %entry ], [ %mul, %for.cond2.preheader ] + %7 = phi float [ %.pre, %entry ], [ %mul.1, %for.cond2.preheader ] + %i.03 = phi i32 [ 0, %entry ], [ %inc31, %for.cond2.preheader ] + %add7 = fadd float %7, %6 + %mul = fmul float %add7, %5 + %add7.1 = fadd float %5, %7 + %mul.1 = fmul float %add7.1, %4 + %add7.2 = fadd float %4, %5 + %mul.2 = fmul float %add7.2, %3 + %add7.3 = fadd float %3, %4 + %mul.3 = fmul float %add7.3, %2 + %add7.4 = fadd float %2, %3 + %mul.4 = fmul float %add7.4, %1 + %add7.5 = fadd float %1, %2 + %mul.5 = fmul float %add7.5, %0 + %add21 = fadd float %1, %0 + %mul27 = fmul float %add21, %mul.5 + %add21.1 = fadd float %mul.5, %1 + %mul27.1 = fmul float %add21.1, %mul.4 + %add21.2 = fadd float %mul.4, %mul.5 + %mul27.2 = fmul float %add21.2, %mul.3 + %add21.3 = fadd float %mul.3, %mul.4 + %mul27.3 = fmul float %add21.3, %mul.2 + %add21.4 = fadd float %mul.2, %mul.3 + %mul27.4 = fmul float %add21.4, %mul.1 + %add21.5 = fadd float %mul.1, %mul.2 + %mul27.5 = fmul float %add21.5, %mul + %inc31 = add nuw nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc31, 10000 + br i1 %exitcond, label %for.end32, label %for.cond2.preheader + +for.end32: ; preds = %for.cond2.preheader + store float %mul, float* %v, align 4 + store float %mul.1, float* %arrayidx.phi.trans.insert, align 4 + store float %mul27, float* %arrayidx10.5.phi.trans.insert, align 4 + store float %mul27.1, float* %arrayidx18, align 4 + store float %mul27.2, float* %arrayidx10.3.phi.trans.insert, align 4 + store float %mul27.3, float* %arrayidx10.2.phi.trans.insert, align 4 + store float %mul27.4, float* %arrayidx10.1.phi.trans.insert, align 4 + store float %mul27.5, float* %arrayidx10.phi.trans.insert, align 4 + ret void +} + +; CHECK-LABEL: computed +; CHECK-NOT: x double> +; Function Attrs: norecurse nounwind uwtable +define void @computed(double* nocapture %v) #0 { +entry: + %arrayidx18 = getelementptr inbounds double, double* %v, i64 6 + %arrayidx.phi.trans.insert = getelementptr inbounds double, double* %v, i64 1 + %.pre = load double, double* %arrayidx.phi.trans.insert, align 8 + %.pre6 = load double, double* %v, align 8 + %arrayidx10.phi.trans.insert = getelementptr inbounds double, double* %v, i64 2 + %.pre7 = load double, double* %arrayidx10.phi.trans.insert, align 8 + %arrayidx10.1.phi.trans.insert = getelementptr inbounds double, double* %v, i64 3 + %.pre8 = load double, double* %arrayidx10.1.phi.trans.insert, align 8 + %arrayidx10.2.phi.trans.insert = getelementptr inbounds double, double* %v, i64 4 + %.pre9 = load double, double* %arrayidx10.2.phi.trans.insert, align 8 + %arrayidx10.3.phi.trans.insert = getelementptr inbounds double, double* %v, i64 5 + %.pre10 = load double, double* %arrayidx10.3.phi.trans.insert, align 8 + %.pre11 = load double, double* %arrayidx18, align 8 + %arrayidx10.5.phi.trans.insert = getelementptr inbounds double, double* %v, i64 7 + %.pre12 = load double, double* %arrayidx10.5.phi.trans.insert, align 8 + br label %for.cond2.preheader + +for.cond2.preheader: ; preds = %for.cond2.preheader, %entry + %0 = phi double [ %.pre12, %entry ], [ %mul27, %for.cond2.preheader ] + %1 = phi double [ %.pre11, %entry ], [ %mul27.1, %for.cond2.preheader ] + %2 = phi double [ %.pre10, %entry ], [ %mul27.2, %for.cond2.preheader ] + %3 = phi double [ %.pre9, %entry ], [ %mul27.3, %for.cond2.preheader ] + %4 = phi double [ %.pre8, %entry ], [ %mul27.4, %for.cond2.preheader ] + %5 = phi double [ %.pre7, %entry ], [ %mul27.5, %for.cond2.preheader ] + %6 = phi double [ %.pre6, %entry ], [ %mul, %for.cond2.preheader ] + %7 = phi double [ %.pre, %entry ], [ %mul.1, %for.cond2.preheader ] + %i.03 = phi i32 [ 0, %entry ], [ %inc31, %for.cond2.preheader ] + %add7 = fadd double %7, %6 + %mul = fmul double %add7, %5 + %add7.1 = fadd double %5, %7 + %mul.1 = fmul double %add7.1, %4 + %add7.2 = fadd double %4, %5 + %mul.2 = fmul double %add7.2, %3 + %add7.3 = fadd double %3, %4 + %mul.3 = fmul double %add7.3, %2 + %add7.4 = fadd double %2, %3 + %mul.4 = fmul double %add7.4, %1 + %add7.5 = fadd double %1, %2 + %mul.5 = fmul double %add7.5, %0 + %add21 = fadd double %1, %0 + %mul27 = fmul double %add21, %mul.5 + %add21.1 = fadd double %mul.5, %1 + %mul27.1 = fmul double %add21.1, %mul.4 + %add21.2 = fadd double %mul.4, %mul.5 + %mul27.2 = fmul double %add21.2, %mul.3 + %add21.3 = fadd double %mul.3, %mul.4 + %mul27.3 = fmul double %add21.3, %mul.2 + %add21.4 = fadd double %mul.2, %mul.3 + %mul27.4 = fmul double %add21.4, %mul.1 + %add21.5 = fadd double %mul.1, %mul.2 + %mul27.5 = fmul double %add21.5, %mul + %inc31 = add nuw nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc31, 10000 + br i1 %exitcond, label %for.end32, label %for.cond2.preheader + +for.end32: ; preds = %for.cond2.preheader + store double %mul, double* %v, align 8 + store double %mul.1, double* %arrayidx.phi.trans.insert, align 8 + store double %mul27, double* %arrayidx10.5.phi.trans.insert, align 8 + store double %mul27.1, double* %arrayidx18, align 8 + store double %mul27.2, double* %arrayidx10.3.phi.trans.insert, align 8 + store double %mul27.3, double* %arrayidx10.2.phi.trans.insert, align 8 + store double %mul27.4, double* %arrayidx10.1.phi.trans.insert, align 8 + store double %mul27.5, double* %arrayidx10.phi.trans.insert, align 8 + ret void +} + +attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="slm" "target-features"="+aes,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"PIC Level", i32 2}