diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h
@@ -95,7 +95,7 @@
   /// Whether the iteration graph is sorted in admissible topoOrder.
   /// Sets outerParNest on success with sparse output
-  bool isAdmissibleTopoOrder();
+  bool isAdmissibleTopoOrder(const std::vector<LoopId> &topSort);
 
   //
   // Topological delegate and sort methods.
   //
@@ -103,10 +103,11 @@
   LoopOrd topSortSize() const { return topSort.size(); }
   LoopId topSortAt(LoopOrd n) const { return topSort.at(n); }
-  void topSortPushBack(LoopId i) { topSort.push_back(i); }
-  void topSortClear(size_t capacity = 0) {
-    topSort.clear();
-    topSort.reserve(capacity);
+  void topSortSet(const std::vector<LoopId> &value) {
+    assert(isAdmissibleTopoOrder(value));
+    if (hasSparseOutput())
+      outerParNest = computeTopoOrderNest(value);
+    topSort = value;
   }
 
   ArrayRef<LoopId> getTopSortSlice(LoopOrd n, LoopOrd m) const;
@@ -173,11 +174,15 @@
   // (cf., `getLoopVar` and `topSortAt`).
   std::vector<LoopId> topSort;
 
+  // Computes `outerParNest` for a given ordering. This is a separate function
+  // because the nest must also be computed when checking whether an ordering
+  // is admissible.
+  LoopOrd computeTopoOrderNest(const std::vector<LoopId> &topSort);
+
   // Sparse tensor as output. Implemented either through direct injective
   // insertion in lexicographic index order or through access pattern
   // expansion in the innermost loop nest (`expValues` through `expCount`).
   OpOperand *sparseOut;
-  // The count of outer non-filter loops, as defined by `isAdmissibleTopoOrder`.
+  // The count of outer non-filter loops, as defined by `computeTopoOrderNest`.
   LoopOrd outerParNest;
   Value insChain;
   Value expValues;
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp
@@ -183,16 +183,12 @@
   return isMaterializing(lhs->get());
 }
 
-bool CodegenEnv::isAdmissibleTopoOrder() {
-  if (!hasSparseOutput())
-    return true;
-
-  OpOperand *lhs = linalgOp.getDpsInitOperand(0);
+LoopOrd CodegenEnv::computeTopoOrderNest(const std::vector<LoopId> &topSort) {
   // Accept "truly dynamic" if the output tensor materializes uninitialized
   // into the computation and insertions occur in lexicographic index order.
   LoopOrd nest = 0;
   const auto iteratorTypes = linalgOp.getIteratorTypesArray();
-  assert(topSortSize() == latticeMerger.getNumLoops());
+  assert(topSort.size() == latticeMerger.getNumLoops());
   for (const LoopId i : topSort) {
     if (!latticeMerger.isFilterLoop(i)) {
       // We only count non-filter loops as filter loops should be considered
@@ -202,11 +198,22 @@
       nest++;
     }
   }
+
+  return nest;
+}
+
+bool CodegenEnv::isAdmissibleTopoOrder(const std::vector<LoopId> &topSort) {
+  if (!hasSparseOutput())
+    return true;
+
+  LoopOrd nest = computeTopoOrderNest(topSort);
+
+  OpOperand *lhs = linalgOp.getDpsInitOperand(0);
+
   // Determine admissible dynamic insertion situations:
   // (1) fully injective, since there are no reductions,
   // (2) admissible 1-d expansion in innermost dimension.
   if (static_cast<int64_t>(nest) >= linalgOp.getRank(lhs) - 1) {
-    outerParNest = nest;
     return true;
   }
   return false;
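(Editorial note, not part of the patch.) The admissibility rule encoded by `computeTopoOrderNest` and `isAdmissibleTopoOrder` above can be read as: walk the loop order from the outside in, count the non-filter parallel loops until the first reduction, and accept the order for a sparse output when that count is at least the output rank minus one. The standalone sketch below illustrates that rule on a toy iterator enum; `IterKind` and `outerParallelNest` are illustrative names only, and the sketch assumes no filter loops are present.

// Standalone sketch of the admissibility rule, assuming no filter loops.
#include <cassert>
#include <cstdint>
#include <vector>

enum class IterKind { Parallel, Reduction };

// Counts the outer parallel loops before the first reduction, mirroring the
// nest computed by computeTopoOrderNest for the no-filter-loop case.
static unsigned outerParallelNest(const std::vector<IterKind> &order) {
  unsigned nest = 0;
  for (IterKind k : order) {
    if (k == IterKind::Reduction)
      break;
    nest++;
  }
  return nest;
}

int main() {
  // A rank-2 sparse output with loops (i, j, k) where k is a reduction:
  // (i, j, k) is admissible (nest = 2 >= 2 - 1), but (k, i, j) is not.
  std::vector<IterKind> good = {IterKind::Parallel, IterKind::Parallel,
                                IterKind::Reduction};
  std::vector<IterKind> bad = {IterKind::Reduction, IterKind::Parallel,
                               IterKind::Parallel};
  int64_t outputRank = 2;
  assert(static_cast<int64_t>(outerParallelNest(good)) >= outputRank - 1);
  assert(!(static_cast<int64_t>(outerParallelNest(bad)) >= outputRank - 1));
  return 0;
}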
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -45,38 +45,6 @@
 
 namespace {
 
-/// Iteration graph sorting.
-enum class SortMask : unsigned {
-  // The individual mask bits.
-  kIncludeDenseOutput = 0x1, // b001
-  kIncludeDenseInput = 0x2,  // b010
-  kIncludeUndef = 0x4,       // b100
-  // The subsets of mask bits.
-  kIncludeAll = 0x7,   // b111
-  kIncludeDense = 0x3, // b011
-  kSparseOnly = 0x0,   // b000
-};
-
-inline static bool includesAny(SortMask mask1, SortMask mask2) {
-  return static_cast<unsigned>(mask1) & static_cast<unsigned>(mask2);
-}
-
-inline static bool includesDenseInput(SortMask mask) {
-  return includesAny(mask, SortMask::kIncludeDenseInput);
-}
-
-inline static bool includesDenseOutput(SortMask mask) {
-  return includesAny(mask, SortMask::kIncludeDenseOutput);
-}
-
-inline static bool includesDense(SortMask mask) {
-  return includesAny(mask, SortMask::kIncludeDense);
-}
-
-inline static bool includesUndef(SortMask mask) {
-  return includesAny(mask, SortMask::kIncludeUndef);
-}
-
 /// A helper class that visits an affine expression and tries to find an
 /// AffineDimExpr to which the corresponding iterator from a GenericOp matches
 /// the desired iterator type.
@@ -121,8 +89,117 @@
   SmallVector<AffineDimExpr> dims;
 };
 
+/// Allows constructing a topological sort incrementally. Loosely inspired by
+/// `llvm::ScheduleDAGTopologicalSort`.
+class IncrementalTopoSort {
+public:
+  // Initializes the sorting with the given order. As the sorting is 'stable',
+  // the ordering between nodes is kept unless a newly added edge requires it
+  // to change. The graph does not contain any edges at this point.
+  IncrementalTopoSort(SmallVector<LoopId> &initialOrder);
+
+  // Returns whether adding an edge from `a` to `b` would create a cycle and
+  // invalidate the sort.
+  bool createsCycle(LoopId a, LoopId b);
+
+  std::vector<LoopId> &getCurrent() { return index2Loop; }
+
+  // Adds a directed edge from `a` to `b` to the graph and updates the sorting.
+  void addEdge(LoopId a, LoopId b);
+
+private:
+  bool DFS(LoopId a, LoopOrd upperBound, llvm::SmallBitVector &visited);
+  void allocate(LoopId l, LoopOrd index) {
+    loop2Index[l] = index;
+    index2Loop[index] = l;
+  }
+
+  LoopOrd numLoops;
+  std::vector<LoopId> index2Loop;
+  SmallVector<LoopOrd> loop2Index;
+
+  SmallVector<SmallVector<LoopId>> succ;
+};
+
 } // namespace
 
+IncrementalTopoSort::IncrementalTopoSort(SmallVector<LoopId> &initialOrder)
+    : numLoops(initialOrder.size()) {
+  index2Loop.resize(numLoops);
+  loop2Index.resize(numLoops);
+
+  for (LoopOrd i = 0; i < numLoops; i++) {
+    allocate(initialOrder[i], i);
+  }
+
+  succ.resize(numLoops);
+}
+
+bool IncrementalTopoSort::createsCycle(LoopId a, LoopId b) {
+  LoopOrd lowerBound = loop2Index[b];
+  LoopOrd upperBound = loop2Index[a];
+
+  if (upperBound < lowerBound)
+    return false;
+
+  llvm::SmallBitVector visited(numLoops);
+  return DFS(b, upperBound, visited);
+}
+
+void IncrementalTopoSort::addEdge(LoopId a, LoopId b) {
+  LoopOrd lowerBound = loop2Index[b];
+  LoopOrd upperBound = loop2Index[a];
+
+  // Only reorder if the constraint is not already satisfied.
+  if (lowerBound < upperBound) {
+    llvm::SmallBitVector visited(numLoops);
+
+    // Find the nodes that need to be moved after `a`.
+    bool createsCycle = DFS(b, upperBound, visited);
+    assert(!createsCycle);
+
+    SmallVector<LoopId> moveToBack;
+
+    for (LoopOrd i = lowerBound; i <= upperBound; ++i) {
+      LoopId l = index2Loop[i];
+      if (visited.test(l)) {
+        moveToBack.push_back(l);
+      } else {
+        allocate(l, i - moveToBack.size());
+      }
+    }
+
+    for (unsigned j = 0; j < moveToBack.size(); ++j) {
+      LoopId l = moveToBack[j];
+      allocate(l, upperBound + 1 - moveToBack.size() + j);
+    }
+  }
+
+  succ[a].push_back(b);
+}
+
+bool IncrementalTopoSort::DFS(LoopId a, LoopOrd upperBound,
+                              llvm::SmallBitVector &visited) {
+  SmallVector<LoopId> queue = {a};
+
+  while (!queue.empty()) {
+    LoopId u = queue.pop_back_val();
+    visited.set(u);
+
+    for (LoopId v : succ[u]) {
+      if (v == index2Loop[upperBound])
+        return true;
+
+      // Ignore nodes that come after upper bound as there cannot be any back
+      // edges.
+      if (!visited.test(v) && loop2Index[v] < upperBound)
+        queue.push_back(v);
+    }
+  }
+
+  return false;
+}
+
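(Editorial note, not part of the patch.) To make the `addEdge` bookkeeping above easier to follow in isolation, here is a minimal standalone sketch of the same reorder-on-insertion idea, specialized to plain `unsigned` node ids and standard containers; `MiniTopoSort` and every name in it are illustrative only and do not exist in MLIR.

// Minimal incremental topological sort in the spirit of IncrementalTopoSort.
#include <cassert>
#include <cstddef>
#include <vector>

class MiniTopoSort {
public:
  explicit MiniTopoSort(std::vector<unsigned> initialOrder)
      : order(std::move(initialOrder)), pos(order.size()), succ(order.size()) {
    for (size_t i = 0; i < order.size(); ++i)
      pos[order[i]] = i;
  }

  // True iff adding the edge a -> b (a before b) would create a cycle, i.e.
  // a is already reachable from b.
  bool createsCycle(unsigned a, unsigned b) const {
    if (pos[a] < pos[b])
      return false;
    std::vector<bool> visited(order.size(), false);
    return reaches(b, pos[a], visited);
  }

  // Adds a -> b and restores topological order by moving the nodes reachable
  // from b (within the affected window) to just after a.
  void addEdge(unsigned a, unsigned b) {
    size_t lo = pos[b], hi = pos[a];
    if (lo < hi) {
      std::vector<bool> visited(order.size(), false);
      bool cycle = reaches(b, hi, visited);
      assert(!cycle && "edge would create a cycle");
      (void)cycle;
      std::vector<unsigned> moved;
      for (size_t i = lo; i <= hi; ++i) {
        unsigned n = order[i];
        if (visited[n])
          moved.push_back(n);
        else
          place(n, i - moved.size());
      }
      for (size_t j = 0; j < moved.size(); ++j)
        place(moved[j], hi + 1 - moved.size() + j);
    }
    succ[a].push_back(b);
  }

  const std::vector<unsigned> &current() const { return order; }

private:
  // DFS from n over successor edges, restricted to positions < bound; returns
  // true iff the node sitting at `bound` is reached.
  bool reaches(unsigned n, size_t bound, std::vector<bool> &visited) const {
    std::vector<unsigned> stack = {n};
    while (!stack.empty()) {
      unsigned u = stack.back();
      stack.pop_back();
      visited[u] = true;
      for (unsigned v : succ[u]) {
        if (v == order[bound])
          return true;
        if (!visited[v] && pos[v] < bound)
          stack.push_back(v);
      }
    }
    return false;
  }

  void place(unsigned n, size_t i) {
    order[i] = n;
    pos[n] = i;
  }

  std::vector<unsigned> order;             // index -> node
  std::vector<size_t> pos;                 // node -> index
  std::vector<std::vector<unsigned>> succ; // adjacency (a -> its successors)
};

int main() {
  // Start with order 0, 1, 2, 3 and require 3 before 1: only node 1 moves,
  // so the relative order of everything else is preserved.
  MiniTopoSort sort({0, 1, 2, 3});
  assert(!sort.createsCycle(3, 1));
  sort.addEdge(3, 1); // order becomes 0, 2, 3, 1
  assert(sort.current() == std::vector<unsigned>({0, 2, 3, 1}));
  assert(sort.createsCycle(1, 3)); // 1 -> 3 would now close a cycle
  return 0;
}

As in the class above, only the nodes reachable from `b` inside the affected window are moved, which is what keeps the sort stable for unrelated nodes.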
 //===----------------------------------------------------------------------===//
 // Sparse compiler analysis methods.
 //===----------------------------------------------------------------------===//
@@ -450,70 +527,11 @@
   return annotated;
 }
 
-/// A helper to compute a topological sort. O(n^2) time complexity
-/// as we use adj matrix for the graph.
-/// The sorted result will put the first Reduction iterator to the
-/// latest possible `LoopOrd`.
-///
-/// The `inDegree` is indexed by `LoopId`, and the `adjM` is indexed by
-/// `(LoopId,LoopId)`.
-static bool topSortOptimal(CodegenEnv &env,
-                           ArrayRef<utils::IteratorType> iteratorTypes,
-                           std::vector<unsigned> &inDegree,
-                           std::vector<std::vector<bool>> &adjM) {
-  std::vector<LoopId> redIt;    // reduce iterator with 0 degree
-  std::vector<LoopId> parIt;    // parallel iterator with 0 degree
-  std::vector<LoopId> filterIt; // filter loop with 0 degree
-  const LoopId numLoops = env.merger().getNumLoops();
-  for (LoopId i = 0; i < numLoops; i++) {
-    if (inDegree[i] == 0) {
-      if (env.merger().isFilterLoop(i))
-        filterIt.push_back(i);
-      else if (linalg::isReductionIterator(iteratorTypes[i]))
-        redIt.push_back(i);
-      else
-        parIt.push_back(i);
-    }
-  }
+namespace {
+enum ConstraintKind { Sparse, DenseInput, DenseOutput, Undef };
 
-  while (!redIt.empty() || !parIt.empty() || !filterIt.empty()) {
-    // We always choose in order of filter loop -> parallel loop -> reduction
-    // loop because
-    // 1. Putting reduction loop early might make the loop sequence
-    // inadmissible.
-    // 2. Filter loops should be put as early as possible for better
-    // performance, since only one (if any) iteration will carry the
-    // computation. E.g., for (1 to N)
-    //                      for (1 to M)
-    //                        for (1 to K)
-    //                          if (xxx)
-    //                            O(X) computation  => O(NMK+NMX) time complexity
-    //
-    // By putting the filter loop one level up, we got
-    //
-    //      for (1 to N)
-    //        for (1 to K)
-    //          if (xxx)
-    //            for (1 to M)
-    //              O(X) computation  => O(NK+NMX) time complexity
-    auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt);
-    auto src = it.back();
-    env.topSortPushBack(src);
-    it.pop_back();
-    // Update in-degree, and push 0-degree node into worklist.
-    for (LoopId dst = 0; dst < numLoops; dst++) {
-      if (adjM[src][dst] && --inDegree[dst] == 0) {
-        if (env.merger().isFilterLoop(dst))
-          filterIt.push_back(dst);
-        else if (linalg::isReductionIterator(iteratorTypes[dst]))
-          redIt.push_back(dst);
-        else
-          parIt.push_back(dst);
-      }
-    }
-  }
-  return env.topSortSize() == numLoops;
-}
+using OrderingConstraint = std::pair<LoopId, LoopId>;
+} // namespace
 
 /// Helper method to add all constraints from the indices in one affine
 /// expression before all indices in the other affine expression. For
@@ -525,18 +543,15 @@
 ///
 /// The `inDegree` is indexed by `LoopId`, and the `adjM` is indexed by
 /// `(LoopId,LoopId)`.
-static void addAffineOrderings(std::vector<std::vector<bool>> &adjM,
-                               std::vector<unsigned> &inDegree, AffineExpr a,
-                               AffineExpr b, std::optional<LoopId> fidx,
+static void addAffineOrderings(SmallVector<OrderingConstraint> &constraints,
+                               AffineExpr a, AffineExpr b,
+                               std::optional<LoopId> fidx,
                                std::optional<LoopId> tidx) {
   if (!a && !b) {
     // Recursion leaf.
     assert(fidx && tidx);
     const LoopId f = *fidx, t = *tidx;
-    if (!adjM[f][t]) {
-      adjM[f][t] = true;
-      inDegree[t]++;
-    }
+    constraints.push_back({f, t});
     return;
   }
   // Picks an affine expression and expand (recurse into) it.
@@ -546,20 +561,20 @@
     const std::optional<LoopId> idx{
         toExpand.cast<AffineDimExpr>().getPosition()};
     if (toExpand == a)
-      addAffineOrderings(adjM, inDegree, AffineExpr(), b, idx, tidx);
+      addAffineOrderings(constraints, AffineExpr(), b, idx, tidx);
     else // toExpand == b
-      addAffineOrderings(adjM, inDegree, a, AffineExpr(), fidx, idx);
+      addAffineOrderings(constraints, a, AffineExpr(), fidx, idx);
     break;
   }
   case AffineExprKind::Add:
   case AffineExprKind::Mul: {
     auto binOp = toExpand.cast<AffineBinaryOpExpr>();
     if (toExpand == a) {
-      addAffineOrderings(adjM, inDegree, binOp.getLHS(), b, fidx, tidx);
-      addAffineOrderings(adjM, inDegree, binOp.getRHS(), b, fidx, tidx);
+      addAffineOrderings(constraints, binOp.getLHS(), b, fidx, tidx);
+      addAffineOrderings(constraints, binOp.getRHS(), b, fidx, tidx);
    } else {
-      addAffineOrderings(adjM, inDegree, a, binOp.getLHS(), fidx, tidx);
-      addAffineOrderings(adjM, inDegree, a, binOp.getRHS(), fidx, tidx);
+      addAffineOrderings(constraints, a, binOp.getLHS(), fidx, tidx);
+      addAffineOrderings(constraints, a, binOp.getRHS(), fidx, tidx);
    }
    break;
  }
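(Editorial note, not part of the patch.) The recursion in `addAffineOrderings` now simply emits `(from, to)` pairs, so for an affine index such as `d0 + d1` on the outer ("from") side and `d2` on the inner ("to") side it produces the constraints `(d0, d2)` and `(d1, d2)`. The standalone sketch below reproduces that pair-collection behaviour on a toy expression type; `Expr`, `collectDims`, and `addOrderings` are illustrative stand-ins and not MLIR APIs.

// Toy reproduction of the pair-collection behaviour of addAffineOrderings.
#include <cassert>
#include <utility>
#include <vector>

struct Expr {            // either a dimension (leaf) or an addition
  bool isDim;
  unsigned dim;          // valid when isDim
  const Expr *lhs, *rhs; // valid when !isDim
};

using Constraint = std::pair<unsigned, unsigned>;

static void collectDims(const Expr &e, std::vector<unsigned> &dims) {
  if (e.isDim) {
    dims.push_back(e.dim);
    return;
  }
  collectDims(*e.lhs, dims);
  collectDims(*e.rhs, dims);
}

// Emits one (f, t) constraint per pair of dimensions in `from` x `to`.
static void addOrderings(const Expr &from, const Expr &to,
                         std::vector<Constraint> &constraints) {
  std::vector<unsigned> f, t;
  collectDims(from, f);
  collectDims(to, t);
  for (unsigned i : f)
    for (unsigned j : t)
      constraints.push_back({i, j});
}

int main() {
  Expr d0{true, 0, nullptr, nullptr}, d1{true, 1, nullptr, nullptr},
      d2{true, 2, nullptr, nullptr};
  Expr sum{false, 0, &d0, &d1}; // d0 + d1
  std::vector<Constraint> constraints;
  addOrderings(sum, d2, constraints);
  assert((constraints == std::vector<Constraint>{{0, 2}, {1, 2}}));
  return 0;
}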
@@ -602,10 +617,9 @@
   }
 }
 
-static void addFilterLoopBasedConstraints(CodegenEnv &env, OpOperand &t,
-                                          OpOperand *skip, SortMask mask,
-                                          std::vector<std::vector<bool>> &adjM,
-                                          std::vector<unsigned> &inDegree) {
+static void
+addFilterLoopBasedConstraints(CodegenEnv &env, OpOperand &t, OpOperand *skip,
+                              SmallVector<OrderingConstraint> &constraints) {
   // Get map, encoding, and tensor-identifier.
   const auto map = env.op().getMatchingIndexingMap(&t);
   const auto enc = getSparseTensorEncoding(t.get().getType());
@@ -626,7 +640,7 @@
     if (tldx && env.merger().isFilterLoop(*tldx)) {
       assert(!ta.isa<AffineDimExpr>() &&
              !isDenseDLT(enc.getDimLevelType()[lvl]));
-      addAffineOrderings(adjM, inDegree, ta, AffineExpr(), std::nullopt, tldx);
+      addAffineOrderings(constraints, ta, AffineExpr(), std::nullopt, tldx);
       // Now that the ordering of affine expression is captured by filter
       // loop idx, we only need to ensure the affine ordering against filter
       // loop. Thus, we reset the affine express to nil here to mark it as
@@ -651,31 +665,28 @@
       // requiring d0 < d2, d1 < d2, d0 < d3, d1 < d3.
       // We also relax the affine constraint when use slice-based algorithm
      // as there is no filter loop for affine index on sparse dimension.
-      // TODO: do we really need the condition?
-      if (!includesDense(mask))
-        tryRelaxAffineConstraints(env.op(), fldx, fa, tldx, ta);
+      tryRelaxAffineConstraints(env.op(), fldx, fa, tldx, ta);
 
       // (d0 + d1) < (d2 + d3), or
       // filter_loop_d-1 < (d2 + d3), or
       // (d0 + d1) < filter_loop_d, or
       // filter_loop_d-1 < filter_loop_d depending on whether fa/ta is reset
       // above.
-      addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx);
+      addAffineOrderings(constraints, fa, ta, fldx, tldx);
     }
   }
 }
 
-static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t,
-                                     OpOperand *skip, SortMask mask,
-                                     std::vector<std::vector<bool>> &adjM,
-                                     std::vector<unsigned> &inDegree) {
+static void
+addSliceBasedConstraints(CodegenEnv &env, OpOperand &t, OpOperand *skip,
+                         SmallVector<OrderingConstraint> &constraints) {
   // Get map and encoding.
   const auto map = env.op().getMatchingIndexingMap(&t);
   const auto enc = getSparseTensorEncoding(t.get().getType());
 
   // No special treatment for simple indices.
   if (getNumNonTrivialIdxExpOnSparseLvls(map, t.get()) == 0)
-    return addFilterLoopBasedConstraints(env, t, skip, mask, adjM, inDegree);
+    return addFilterLoopBasedConstraints(env, t, skip, constraints);
 
   // Skip tensor during cycle resolution, though order between filter loop
   // and dependent loops need to be guaranteed unconditionally.
@@ -705,10 +716,7 @@
         const LoopId tldx = env.makeLoopId(texp.getPosition());
 
         // d_x > d_y
-        if (!adjM[fldx][tldx]) {
-          adjM[fldx][tldx] = true;
-          inDegree[tldx]++;
-        }
+        constraints.push_back({fldx, tldx});
 
         AffineDimCollector fCollector;
         fCollector.walkPostOrder(fa);
@@ -720,19 +728,15 @@
           const LoopId f = env.makeLoopId(fd.getPosition());
           if (f == fldx)
             continue;
-          if (!adjM[f][fldx]) {
-            adjM[f][fldx] = true;
-            inDegree[fldx]++;
-          }
+
+          constraints.push_back({f, fldx});
         }
         for (auto td : tCollector.dims) {
           const LoopId t = env.makeLoopId(td.getPosition());
           if (t == tldx)
             continue;
-          if (!adjM[t][tldx]) {
-            adjM[t][tldx] = true;
-            inDegree[tldx]++;
-          }
+
+          constraints.push_back({t, tldx});
         }
         // Since we only support affine addition, the order between two dim
         // expression does not really matters.
@@ -752,71 +756,192 @@
           const LoopId t = env.makeLoopId(td.getPosition());
           if (t == tldx) // skip d_y
             continue;
-          if (!adjM[f][t]) {
-            adjM[f][t] = true;
-            inDegree[t]++;
-          }
+          constraints.push_back({f, t});
        }
      }
    }
  }
 
-/// Computes a topologically sorted iteration graph for the linalg operation.
-/// Ensures all tensors are visited in natural index order. This is
-/// essential for sparse storage formats since these only support access
-/// along fixed dimensions. Even for dense storage formats, however, the natural
-/// index order yields innermost unit-stride access with better spatial
-/// locality.
-static bool computeIterationGraph(CodegenEnv &env, SortMask mask,
-                                  OpOperand *skip, bool idxReducBased = false) {
-  // Set up an n x n from/to adjacency matrix of the iteration graph
-  // for the implicit loop indices i_0 .. i_n-1.
+static void collectOrderingConstraints(
+    CodegenEnv &env, OpOperand *skip, bool idxReducBased,
+    std::unordered_map<ConstraintKind, SmallVector<OrderingConstraint>>
+        &constraints) {
   const unsigned numLoops = env.merger().getNumLoops();
-  std::vector<std::vector<bool>> adjM(numLoops,
-                                      std::vector<bool>(numLoops, false));
-  std::vector<unsigned> inDegree(numLoops, 0); // in-degree of each node.
   const auto iteratorTypes = env.op().getIteratorTypesArray();
   // Iterate over the indexing maps of every tensor in the tensor expression.
   for (OpOperand &t : env.op()->getOpOperands()) {
     // Get map and encoding.
     const auto enc = getSparseTensorEncoding(t.get().getType());
-    // Skips dense inputs/outputs when not requested.
-    const bool isDenseInput = !enc && env.op().isDpsInput(&t);
-    const bool isDenseOutput = !enc && !isDenseInput;
-    if ((isDenseInput && !includesDenseInput(mask)) ||
-        (isDenseOutput && !includesDenseOutput(mask)))
-      continue;
 
     // Push unrelated loops into sparse iteration space, so these
     // will be skipped more often.
     // TODO: Do we really need this?
-    if (includesUndef(mask)) {
-      const TensorId tid = env.makeTensorId(t.getOperandNumber());
-      for (LoopId i = 0; i < numLoops; i++) {
-        const auto dltI = env.dlt(tid, i);
-        if (isCompressedDLT(dltI) || isCompressedWithHiDLT(dltI) ||
-            isSingletonDLT(dltI)) {
-          for (LoopId j = 0; j < numLoops; j++)
-            if (isUndefDLT(env.dlt(tid, j))) {
-              adjM[i][j] = true;
-              inDegree[j]++;
-            }
-        } else {
-          assert(isDenseDLT(dltI) || isUndefDLT(dltI));
+    const TensorId tid = env.makeTensorId(t.getOperandNumber());
+    for (LoopId i = 0; i < numLoops; i++) {
+
+      const auto dltI = env.dlt(tid, i);
+      if (isCompressedDLT(dltI) || isCompressedWithHiDLT(dltI) ||
+          isSingletonDLT(dltI)) {
+        for (LoopId j = 0; j < numLoops; j++) {
+          if (!env.merger().isFilterLoop(j))
+            constraints[ConstraintKind::Undef].push_back({i, j});
         }
+      } else {
+        assert(isDenseDLT(dltI) || isUndefDLT(dltI));
       }
     }
-    // Push unrelated loops into sparse iteration space, so these
-    // will be skipped more often.
+
+    ConstraintKind constraintKind = ConstraintKind::Sparse;
+
+    const bool isDenseInput = !enc && env.op().isDpsInput(&t);
+    const bool isDenseOutput = !enc && !isDenseInput;
+
+    if (isDenseInput)
+      constraintKind = ConstraintKind::DenseInput;
+    else if (isDenseOutput)
+      constraintKind = ConstraintKind::DenseOutput;
+
     if (idxReducBased)
-      addSliceBasedConstraints(env, t, skip, mask, adjM, inDegree);
+      addSliceBasedConstraints(env, t, skip, constraints[constraintKind]);
     else
-      addFilterLoopBasedConstraints(env, t, skip, mask, adjM, inDegree);
+      addFilterLoopBasedConstraints(env, t, skip, constraints[constraintKind]);
+  }
+}
+
+static void computeInitialLoopOrdering(CodegenEnv &env,
+                                       SmallVector<LoopId> &initialOrdering) {
+  const unsigned numLoops = env.merger().getNumLoops();
+  initialOrdering.resize(numLoops);
+
+  for (LoopId i = 0; i < numLoops; ++i) {
+    initialOrdering[i] = i;
+  }
+
+  const auto iteratorTypes = env.op().getIteratorTypesArray();
+
+  // For the initial sorting we use filter loop -> parallel loop -> reduction
+  // loop because
+  // 1. Putting reduction loop early might make the loop sequence
+  //    inadmissible.
+  // 2. Filter loops should be put as early as possible for better
+  //    performance, since only one (if any) iteration will carry the
+  //    computation. E.g., for (1 to N)
+  //                         for (1 to M)
+  //                           for (1 to K)
+  //                             if (xxx)
+  //                               O(X) computation  => O(NMK+NMX) time complexity
+  //
+  // By putting the filter loop one level up, we get
+  //
+  //      for (1 to N)
+  //        for (1 to K)
+  //          if (xxx)
+  //            for (1 to M)
+  //              O(X) computation  => O(NK+NMX) time complexity
+  auto loopRank = [&](LoopId idx) {
+    if (env.merger().isFilterLoop(idx))
+      return -1;
+    if (linalg::isReductionIterator(iteratorTypes[idx]))
+      return 1;
+    return 0;
+  };
+
+  // Use a stable sort so that loops with equal rank keep their original
+  // relative (index) order, which keeps the initial ordering deterministic.
+  std::stable_sort(
+      initialOrdering.begin(), initialOrdering.end(),
+      [&](LoopId a, LoopId b) { return loopRank(a) < loopRank(b); });
+}
+
+/// Given a list of constraints (edges) that defines the iteration graph, this
+/// returns a topological sort of the nodes that respects all sparse
+/// constraints. The remaining constraints are added greedily, skipping any
+/// that would introduce a cycle.
+static void computeTopSort(
+    CodegenEnv &env,
+    std::unordered_map<ConstraintKind, SmallVector<OrderingConstraint>>
+        &constraints,
+    bool &hasCycle, bool &isAdmissible) {
+  // By providing an initial sorting we can impose soft constraints on the
+  // ordering based on loop types.
+  // TODO: It might be better to impose the loop type constraints via edges as
+  // well.
+  SmallVector<LoopId> initialOrdering;
+  computeInitialLoopOrdering(env, initialOrdering);
+
+  IncrementalTopoSort topSort(initialOrdering);
+
+  hasCycle = false;
+  isAdmissible = true;
+
+  // First try to satisfy all required constraints; if this fails we report
+  // a cycle.
+  for (const OrderingConstraint &constraint :
+       constraints[ConstraintKind::Sparse]) {
+    LoopId i, j;
+    std::tie(i, j) = constraint;
+
+    if (topSort.createsCycle(i, j)) {
+      hasCycle = true;
+      return;
+    }
+
+    topSort.addEdge(i, j);
+  }
+
+  if (!env.isAdmissibleTopoOrder(topSort.getCurrent())) {
+    isAdmissible = false;
+    return;
   }
-  // Topologically sort the iteration graph to determine loop order.
-  // Report failure for a cyclic iteration graph.
-  env.topSortClear(numLoops);
-  return topSortOptimal(env, iteratorTypes, inDegree, adjM);
+
+  // Now we add relaxable constraints and skip any that lead to cycles.
+  SmallVector<ConstraintKind> relaxableConstraints = {
+      ConstraintKind::DenseInput, ConstraintKind::DenseOutput,
+      ConstraintKind::Undef};
+
+  // If we encounter an inadmissible ordering here, we return the last good
+  // configuration.
+  // TODO: We could also skip the constraint that led to an inadmissible
+  // ordering and continue.
+  std::vector<LoopId> lastAdmissible = topSort.getCurrent();
+
+  bool encounteredInadmissible = false;
+  for (ConstraintKind kind : relaxableConstraints) {
+    for (const OrderingConstraint &constraint : constraints[kind]) {
+      LoopId i, j;
+      std::tie(i, j) = constraint;
+      if (topSort.createsCycle(i, j))
+        continue;
+
+      topSort.addEdge(i, j);
+
+      if (!env.isAdmissibleTopoOrder(topSort.getCurrent())) {
+        encounteredInadmissible = true;
+        break;
+      }
+      lastAdmissible = topSort.getCurrent();
    }
+    if (encounteredInadmissible)
+      break;
+  }
+
+  env.topSortSet(lastAdmissible);
+}
+
+/// Computes a topologically sorted iteration graph for the linalg operation.
+/// Ensures all tensors are visited in natural index order. This is
+/// essential for sparse storage formats since these only support access
+/// along fixed dimensions. Even for dense storage formats, however, the natural
+/// index order yields innermost unit-stride access with better spatial
+/// locality.
+static void computeIterationGraph(CodegenEnv &env, OpOperand *skip,
+                                  bool &hasCycle, bool &isAdmissible,
+                                  bool idxReducBased = false) {
+  // Note that we use the terms 'constraint' and 'edge' somewhat
+  // interchangeably here.
+  std::unordered_map<ConstraintKind, SmallVector<OrderingConstraint>>
+      constraints;
+  collectOrderingConstraints(env, skip, idxReducBased, constraints);
+
+  computeTopSort(env, constraints, hasCycle, isAdmissible);
 }
 
@@ -1869,36 +1994,19 @@
   // constraints until an acyclic iteration graph results, such that sparse
   // code generation can proceed. As a last resort, an attempt is made
   // to resolve cycles by inserting a conversion.
-  bool isAdmissible = false;
-  bool hasCycle = true;
-
-  // An const list of all masks that we used for interation graph
-  // computation. Must be ordered from more strict to less strict.
-  // Ideally (though might not be guaranteed), the eariler a constraint mask
-  // can be satisfied, the faster the generated kernel will be.
-  const auto allMasks = {
-      SortMask::kIncludeAll,        SortMask::kIncludeDense,
-      SortMask::kIncludeDenseInput, SortMask::kIncludeDenseOutput,
-      SortMask::kIncludeUndef,      SortMask::kSparseOnly};
-  for (const SortMask mask : allMasks) {
-    if (computeIterationGraph(env, mask, nullptr, idxReducBased)) {
-      hasCycle = false;
-      if (env.isAdmissibleTopoOrder()) {
-        isAdmissible = true;
-        break;
-      }
-      // else try a set of less strict constraints.
-    }
-  }
+  bool hasCycle, isAdmissible;
+  computeIterationGraph(env, nullptr, hasCycle, isAdmissible, idxReducBased);
+
+  if (!isAdmissible) {
+    return failure();
   }
+
   if (hasCycle) {
     return idxReducBased
                ? failure() // TODO: should cycle be resolved differently?
                : resolveCycle(env, rewriter); // one last shot
   }
-  if (!isAdmissible)
-    return failure(); // inadmissible expression, reject
-
   // Recursively generates code if admissible.
   env.startEmit();
   genBuffers(env, rewriter);
@@ -1921,8 +2029,15 @@
       const TensorId tid = env.makeTensorId(t->getOperandNumber());
       Value tval = t->get();
       auto srcEnc = getSparseTensorEncoding(tval.getType());
-      if (!srcEnc || !computeIterationGraph(env, SortMask::kSparseOnly, t))
+
+      if (!srcEnc)
+        continue;
+
+      bool hasCycle, isAdmissible;
+      computeIterationGraph(env, t, hasCycle, isAdmissible);
+      if (hasCycle || !isAdmissible)
         continue;
+
       // Found an input tensor that resolves the cycle by inserting a
       // conversion into a sparse tensor that adheres to the iteration
       // graph order. Also releases the temporary sparse tensor.
diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
@@ -209,32 +209,31 @@
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<3x3xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x3xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xi32>
 // CHECK:           %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<6x6xi32>
-// CHECK:           scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
-// CHECK:             %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK:             %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK:             scf.for %[[VAL_16:.*]] = %[[VAL_14]] to %[[VAL_15]] step %[[VAL_5]] {
-// CHECK:               %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_16]]] : memref<?xindex>
-// CHECK:               scf.for %[[VAL_18:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
-// CHECK:                 %[[VAL_19:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_13]], %[[VAL_18]]] : memref<6x6xi32>
-// CHECK:                 %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_16]]] : memref<?xindex>
-// CHECK:                 %[[VAL_21:.*]] = arith.addi %[[VAL_16]], %[[VAL_5]] : index
-// CHECK:                 %[[VAL_22:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_21]]] : memref<?xindex>
-// CHECK:                 %[[VAL_23:.*]] = scf.for %[[VAL_24:.*]] = %[[VAL_20]] to %[[VAL_22]] step %[[VAL_5]] iter_args(%[[VAL_25:.*]] = %[[VAL_19]]) -> (i32) {
-// CHECK:                   %[[VAL_26:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_24]]] : memref<?xindex>
-// CHECK:                   %[[VAL_27:.*]] = arith.addi %[[VAL_13]], %[[VAL_17]] : index
-// CHECK:                   %[[VAL_28:.*]] = arith.addi %[[VAL_18]], %[[VAL_26]] : index
-// CHECK:                   %[[VAL_29:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_27]], %[[VAL_28]]] : memref<8x8xi32>
-// CHECK:                   %[[VAL_30:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_24]]] : memref<?xi32>
-// CHECK:                   %[[VAL_31:.*]] = arith.muli %[[VAL_29]], %[[VAL_30]] : i32
-// CHECK:                   %[[VAL_32:.*]] = arith.addi %[[VAL_25]], %[[VAL_31]] : i32
-// CHECK:                   scf.yield %[[VAL_32]] : i32
+// CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK:           scf.for %[[VAL_15:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_5]] {
+// CHECK:             %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK:             scf.for %[[VAL_17:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
+// CHECK:               %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK:               %[[VAL_19:.*]] = arith.addi %[[VAL_15]], %[[VAL_5]] : index
+// CHECK:               %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_19]]] : memref<?xindex>
+// CHECK:               scf.for %[[VAL_21:.*]] = %[[VAL_18]] to %[[VAL_20]] step %[[VAL_5]] {
+// CHECK:                 %[[VAL_22:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_21]]] : memref<?xindex>
+// CHECK:                 %[[VAL_23:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_21]]] : memref<?xi32>
+// CHECK:                 scf.for %[[VAL_24:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
+// CHECK:                   %[[VAL_25:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_17]], %[[VAL_24]]] : memref<6x6xi32>
+// CHECK:                   %[[VAL_26:.*]] = arith.addi %[[VAL_17]], %[[VAL_16]] : index
+// CHECK:                   %[[VAL_27:.*]] = arith.addi %[[VAL_24]], %[[VAL_22]] : index
+// CHECK:                   %[[VAL_28:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_26]], %[[VAL_27]]] : memref<8x8xi32>
+// CHECK:                   %[[VAL_29:.*]] = arith.muli %[[VAL_28]], %[[VAL_23]] : i32
+// CHECK:                   %[[VAL_30:.*]] = arith.addi %[[VAL_25]], %[[VAL_29]] : i32
+// CHECK:                   memref.store %[[VAL_30]], %[[VAL_12]]{{\[}}%[[VAL_17]], %[[VAL_24]]] : memref<6x6xi32>
 // CHECK:                 } {"Emitted from" = "linalg.generic"}
-// CHECK:                 memref.store %[[VAL_33:.*]], %[[VAL_12]]{{\[}}%[[VAL_13]], %[[VAL_18]]] : memref<6x6xi32>
 // CHECK:               } {"Emitted from" = "linalg.generic"}
 // CHECK:             } {"Emitted from" = "linalg.generic"}
 // CHECK:           } {"Emitted from" = "linalg.generic"}
-// CHECK:           %[[VAL_34:.*]] = bufferization.to_tensor %[[VAL_12]] : memref<6x6xi32>
-// CHECK:           return %[[VAL_34]] : tensor<6x6xi32>
+// CHECK:           %[[VAL_31:.*]] = bufferization.to_tensor %[[VAL_12]] : memref<6x6xi32>
+// CHECK:           return %[[VAL_31]] : tensor<6x6xi32>
 // CHECK:         }
 
 func.func @conv2d(%input:  tensor<8x8xi32>, %filter: tensor<3x3xi32, #DCSR>,