diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h
@@ -95,7 +95,7 @@
   /// Whether the iteration graph is sorted in admissible topoOrder.
   /// Sets outerParNest on success with sparse output
-  bool isAdmissibleTopoOrder();
+  bool isAdmissibleTopoOrder(const std::vector<LoopId> &topSort);
 
   //
   // Topological delegate and sort methods.
   //
@@ -103,10 +103,11 @@
   LoopOrd topSortSize() const { return topSort.size(); }
   LoopId topSortAt(LoopOrd n) const { return topSort.at(n); }
-  void topSortPushBack(LoopId i) { topSort.push_back(i); }
-  void topSortClear(size_t capacity = 0) {
-    topSort.clear();
-    topSort.reserve(capacity);
+  void topSortSet(const std::vector<LoopId> &value) {
+    assert(isAdmissibleTopoOrder(value));
+    if (hasSparseOutput())
+      outerParNest = computeTopoOrderNest(value);
+    topSort = value;
   }
 
   ArrayRef<LoopId> getTopSortSlice(LoopOrd n, LoopOrd m) const;
@@ -173,11 +174,15 @@
   // (cf., `getLoopVar` and `topSortAt`).
   std::vector<LoopId> topSort;
 
+  // Computes `outerParNest` for a given ordering. This is a separate function
+  // because the nest must also be computed when checking whether an ordering
+  // is admissible.
+  LoopOrd computeTopoOrderNest(const std::vector<LoopId> &topSort);
+
   // Sparse tensor as output. Implemented either through direct injective
   // insertion in lexicographic index order or through access pattern
   // expansion in the innermost loop nest (`expValues` through `expCount`).
   OpOperand *sparseOut;
-  // The count of outer non-filter loops, as defined by `isAdmissibleTopoOrder`.
+  // The count of outer non-filter loops, as defined by `computeTopoOrderNest`.
   LoopOrd outerParNest;
   Value insChain;
   Value expValues;
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp
@@ -183,16 +183,12 @@
   return isMaterializing(lhs->get());
 }
 
-bool CodegenEnv::isAdmissibleTopoOrder() {
-  if (!hasSparseOutput())
-    return true;
-
-  OpOperand *lhs = linalgOp.getDpsInitOperand(0);
+LoopOrd CodegenEnv::computeTopoOrderNest(const std::vector<LoopId> &topSort) {
   // Accept "truly dynamic" if the output tensor materializes uninitialized
   // into the computation and insertions occur in lexicographic index order.
   LoopOrd nest = 0;
   const auto iteratorTypes = linalgOp.getIteratorTypesArray();
-  assert(topSortSize() == latticeMerger.getNumLoops());
+  assert(topSort.size() == latticeMerger.getNumLoops());
   for (const LoopId i : topSort) {
     if (!latticeMerger.isFilterLoop(i)) {
       // We only count non-filter loops as filter loops should be considered
@@ -202,11 +198,22 @@
       nest++;
     }
   }
+
+  return nest;
+}
+
+bool CodegenEnv::isAdmissibleTopoOrder(const std::vector<LoopId> &topSort) {
+  if (!hasSparseOutput())
+    return true;
+
+  LoopOrd nest = computeTopoOrderNest(topSort);
+
+  OpOperand *lhs = linalgOp.getDpsInitOperand(0);
+
   // Determine admissible dynamic insertion situations:
   // (1) fully injective, since there are no reductions,
   // (2) admissible 1-d expansion in innermost dimension.
   if (static_cast<int64_t>(nest) >= linalgOp.getRank(lhs) - 1) {
-    outerParNest = nest;
     return true;
   }
   return false;
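(Editorial note, not part of the patch.) The admissibility rule encoded by `computeTopoOrderNest` and `isAdmissibleTopoOrder` above can be read as: walk the loop order from the outside in, count the non-filter parallel loops until the first reduction, and accept the order for a sparse output when that count is at least the output rank minus one. The standalone sketch below illustrates that rule on a toy iterator enum; `IterKind` and `outerParallelNest` are illustrative names only, and the sketch assumes no filter loops are present.

// Standalone sketch of the admissibility rule, assuming no filter loops.
#include <cassert>
#include <cstdint>
#include <vector>

enum class IterKind { Parallel, Reduction };

// Counts the outer parallel loops before the first reduction, mirroring the
// nest computed by computeTopoOrderNest for the no-filter-loop case.
static unsigned outerParallelNest(const std::vector<IterKind> &order) {
  unsigned nest = 0;
  for (IterKind k : order) {
    if (k == IterKind::Reduction)
      break;
    nest++;
  }
  return nest;
}

int main() {
  // A rank-2 sparse output with loops (i, j, k) where k is a reduction:
  // (i, j, k) is admissible (nest = 2 >= 2 - 1), but (k, i, j) is not.
  std::vector<IterKind> good = {IterKind::Parallel, IterKind::Parallel,
                                IterKind::Reduction};
  std::vector<IterKind> bad = {IterKind::Reduction, IterKind::Parallel,
                               IterKind::Parallel};
  int64_t outputRank = 2;
  assert(static_cast<int64_t>(outerParallelNest(good)) >= outputRank - 1);
  assert(!(static_cast<int64_t>(outerParallelNest(bad)) >= outputRank - 1));
  return 0;
}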
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -45,38 +45,6 @@
 
 namespace {
 
-/// Iteration graph sorting.
-enum class SortMask : unsigned {
-  // The individual mask bits.
-  kIncludeDenseOutput = 0x1, // b001
-  kIncludeDenseInput = 0x2,  // b010
-  kIncludeUndef = 0x4,       // b100
-  // The subsets of mask bits.
-  kIncludeAll = 0x7,   // b111
-  kIncludeDense = 0x3, // b011
-  kSparseOnly = 0x0,   // b000
-};
-
-inline static bool includesAny(SortMask mask1, SortMask mask2) {
-  return static_cast<unsigned>(mask1) & static_cast<unsigned>(mask2);
-}
-
-inline static bool includesDenseInput(SortMask mask) {
-  return includesAny(mask, SortMask::kIncludeDenseInput);
-}
-
-inline static bool includesDenseOutput(SortMask mask) {
-  return includesAny(mask, SortMask::kIncludeDenseOutput);
-}
-
-inline static bool includesDense(SortMask mask) {
-  return includesAny(mask, SortMask::kIncludeDense);
-}
-
-inline static bool includesUndef(SortMask mask) {
-  return includesAny(mask, SortMask::kIncludeUndef);
-}
-
 /// A helper class that visits an affine expression and tries to find an
 /// AffineDimExpr to which the corresponding iterator from a GenericOp matches
 /// the desired iterator type.
@@ -121,8 +89,117 @@
   SmallVector<AffineDimExpr> dims;
 };
 
+/// Allows constructing a topological sort incrementally. Loosely inspired by
+/// `llvm::ScheduleDAGTopologicalSort`.
+class IncrementalTopoSort {
+public:
+  // Initializes the sorting with the given order. As the sorting is 'stable',
+  // the ordering between nodes is kept unless a newly added edge requires it
+  // to change. The graph does not contain any edges at this point.
+  IncrementalTopoSort(SmallVector<LoopId> &initialOrder);
+
+  // Returns whether adding an edge from `a` to `b` would create a cycle and
+  // invalidate the sort.
+  bool createsCycle(LoopId a, LoopId b);
+
+  std::vector<LoopId> &getCurrent() { return index2Loop; }
+
+  // Adds a directed edge from `a` to `b` to the graph and updates the sorting.
+  void addEdge(LoopId a, LoopId b);
+
+private:
+  bool DFS(LoopId a, LoopOrd upperBound, llvm::SmallBitVector &visited);
+  void allocate(LoopId l, LoopOrd index) {
+    loop2Index[l] = index;
+    index2Loop[index] = l;
+  }
+
+  LoopOrd numLoops;
+  std::vector<LoopId> index2Loop;
+  SmallVector<LoopOrd> loop2Index;
+
+  SmallVector<SmallVector<LoopId>> succ;
+};
+
 } // namespace
 
+IncrementalTopoSort::IncrementalTopoSort(SmallVector<LoopId> &initialOrder)
+    : numLoops(initialOrder.size()) {
+  index2Loop.resize(numLoops);
+  loop2Index.resize(numLoops);
+
+  for (LoopOrd i = 0; i < numLoops; i++) {
+    allocate(initialOrder[i], i);
+  }
+
+  succ.resize(numLoops);
+}
+
+bool IncrementalTopoSort::createsCycle(LoopId a, LoopId b) {
+  LoopOrd lowerBound = loop2Index[b];
+  LoopOrd upperBound = loop2Index[a];
+
+  if (upperBound < lowerBound)
+    return false;
+
+  llvm::SmallBitVector visited(numLoops);
+  return DFS(b, upperBound, visited);
+}
+
+void IncrementalTopoSort::addEdge(LoopId a, LoopId b) {
+  LoopOrd lowerBound = loop2Index[b];
+  LoopOrd upperBound = loop2Index[a];
+
+  // Only reorder if the constraint is not already satisfied.
+  if (lowerBound < upperBound) {
+    llvm::SmallBitVector visited(numLoops);
+
+    // Find the nodes that need to be moved after `a`.
+    bool createsCycle = DFS(b, upperBound, visited);
+    assert(!createsCycle);
+
+    SmallVector<LoopId> moveToBack;
+
+    for (LoopOrd i = lowerBound; i <= upperBound; ++i) {
+      LoopId l = index2Loop[i];
+      if (visited.test(l)) {
+        moveToBack.push_back(l);
+      } else {
+        allocate(l, i - moveToBack.size());
+      }
+    }
+
+    for (unsigned j = 0; j < moveToBack.size(); ++j) {
+      LoopId l = moveToBack[j];
+      allocate(l, upperBound + 1 - moveToBack.size() + j);
+    }
+  }
+
+  succ[a].push_back(b);
+}
+
+bool IncrementalTopoSort::DFS(LoopId a, LoopOrd upperBound,
+                              llvm::SmallBitVector &visited) {
+  SmallVector<LoopId> queue = {a};
+
+  while (!queue.empty()) {
+    LoopId u = queue.pop_back_val();
+    visited.set(u);
+
+    for (LoopId v : succ[u]) {
+      if (v == index2Loop[upperBound])
+        return true;
+
+      // Ignore nodes that come after upper bound as there cannot be any back
+      // edges.
+      if (!visited.test(v) && loop2Index[v] < upperBound)
+        queue.push_back(v);
+    }
+  }
+
+  return false;
+}
+
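(Editorial note, not part of the patch.) To make the `addEdge` bookkeeping above easier to follow in isolation, here is a minimal standalone sketch of the same reorder-on-insertion idea, specialized to plain `unsigned` node ids and standard containers; `MiniTopoSort` and every name in it are illustrative only and do not exist in MLIR.

// Minimal incremental topological sort in the spirit of IncrementalTopoSort.
#include <cassert>
#include <cstddef>
#include <vector>

class MiniTopoSort {
public:
  explicit MiniTopoSort(std::vector<unsigned> initialOrder)
      : order(std::move(initialOrder)), pos(order.size()), succ(order.size()) {
    for (size_t i = 0; i < order.size(); ++i)
      pos[order[i]] = i;
  }

  // True iff adding the edge a -> b (a before b) would create a cycle, i.e.
  // a is already reachable from b.
  bool createsCycle(unsigned a, unsigned b) const {
    if (pos[a] < pos[b])
      return false;
    std::vector<bool> visited(order.size(), false);
    return reaches(b, pos[a], visited);
  }

  // Adds a -> b and restores topological order by moving the nodes reachable
  // from b (within the affected window) to just after a.
  void addEdge(unsigned a, unsigned b) {
    size_t lo = pos[b], hi = pos[a];
    if (lo < hi) {
      std::vector<bool> visited(order.size(), false);
      bool cycle = reaches(b, hi, visited);
      assert(!cycle && "edge would create a cycle");
      (void)cycle;
      std::vector<unsigned> moved;
      for (size_t i = lo; i <= hi; ++i) {
        unsigned n = order[i];
        if (visited[n])
          moved.push_back(n);
        else
          place(n, i - moved.size());
      }
      for (size_t j = 0; j < moved.size(); ++j)
        place(moved[j], hi + 1 - moved.size() + j);
    }
    succ[a].push_back(b);
  }

  const std::vector<unsigned> &current() const { return order; }

private:
  // DFS from n over successor edges, restricted to positions < bound; returns
  // true iff the node sitting at `bound` is reached.
  bool reaches(unsigned n, size_t bound, std::vector<bool> &visited) const {
    std::vector<unsigned> stack = {n};
    while (!stack.empty()) {
      unsigned u = stack.back();
      stack.pop_back();
      visited[u] = true;
      for (unsigned v : succ[u]) {
        if (v == order[bound])
          return true;
        if (!visited[v] && pos[v] < bound)
          stack.push_back(v);
      }
    }
    return false;
  }

  void place(unsigned n, size_t i) {
    order[i] = n;
    pos[n] = i;
  }

  std::vector<unsigned> order;             // index -> node
  std::vector<size_t> pos;                 // node -> index
  std::vector<std::vector<unsigned>> succ; // adjacency (a -> its successors)
};

int main() {
  // Start with order 0, 1, 2, 3 and require 3 before 1: only node 1 moves,
  // so the relative order of everything else is preserved.
  MiniTopoSort sort({0, 1, 2, 3});
  assert(!sort.createsCycle(3, 1));
  sort.addEdge(3, 1); // order becomes 0, 2, 3, 1
  assert(sort.current() == std::vector<unsigned>({0, 2, 3, 1}));
  assert(sort.createsCycle(1, 3)); // 1 -> 3 would now close a cycle
  return 0;
}

As in the class above, only the nodes reachable from `b` inside the affected window are moved, which is what keeps the sort stable for unrelated nodes.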
 //===----------------------------------------------------------------------===//
 // Sparse compiler analysis methods.
 //===----------------------------------------------------------------------===//
@@ -450,70 +527,11 @@
   return annotated;
 }
 
-/// A helper to compute a topological sort. O(n^2) time complexity
-/// as we use adj matrix for the graph.
-/// The sorted result will put the first Reduction iterator to the
-/// latest possible `LoopOrd`.
-///
-/// The `inDegree` is indexed by `LoopId`, and the `adjM` is indexed by
-/// `(LoopId,LoopId)`.
-static bool topSortOptimal(CodegenEnv &env,
-                           ArrayRef<utils::IteratorType> iteratorTypes,
-                           std::vector<unsigned> &inDegree,
-                           std::vector<std::vector<bool>> &adjM) {
-  std::vector<LoopId> redIt;    // reduce iterator with 0 degree
-  std::vector<LoopId> parIt;    // parallel iterator with 0 degree
-  std::vector<LoopId> filterIt; // filter loop with 0 degree
-  const LoopId numLoops = env.merger().getNumLoops();
-  for (LoopId i = 0; i < numLoops; i++) {
-    if (inDegree[i] == 0) {
-      if (env.merger().isFilterLoop(i))
-        filterIt.push_back(i);
-      else if (linalg::isReductionIterator(iteratorTypes[i]))
-        redIt.push_back(i);
-      else
-        parIt.push_back(i);
-    }
-  }
+namespace {
+enum ConstraintKind { Sparse, DenseInput, DenseOutput, Undef };
 
-  while (!redIt.empty() || !parIt.empty() || !filterIt.empty()) {
-    // We always choose in order of filter loop -> parallel loop -> reduction
-    // loop because
-    // 1. Putting reduction loop early might make the loop sequence
-    // inadmissible.
-    // 2. Filter loops should be put as early as possible for better
-    // performance, since only one (if any) iteration will carry the
-    // computation. E.g., for (1 to N)
-    //                      for (1 to M)
-    //                        for (1 to K)
-    //                          if (xxx)
-    //                            O(X) computation  => O(NMK+NMX) time complexity
-    //
-    // By putting the filter loop one level up, we got
-    //
-    //      for (1 to N)
-    //        for (1 to K)
-    //          if (xxx)
-    //            for (1 to M)
-    //              O(X) computation  => O(NK+NMX) time complexity
-    auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt);
-    auto src = it.back();
-    env.topSortPushBack(src);
-    it.pop_back();
-    // Update in-degree, and push 0-degree node into worklist.
-    for (LoopId dst = 0; dst < numLoops; dst++) {
-      if (adjM[src][dst] && --inDegree[dst] == 0) {
-        if (env.merger().isFilterLoop(dst))
-          filterIt.push_back(dst);
-        else if (linalg::isReductionIterator(iteratorTypes[dst]))
-          redIt.push_back(dst);
-        else
-          parIt.push_back(dst);
-      }
-    }
-  }
-  return env.topSortSize() == numLoops;
-}
+using OrderingConstraint = std::pair<LoopId, LoopId>;
+} // namespace
 
 /// Helper method to add all constraints from the indices in one affine
 /// expression before all indices in the other affine expression. For
@@ -525,18 +543,15 @@
 ///
 /// The `inDegree` is indexed by `LoopId`, and the `adjM` is indexed by
 /// `(LoopId,LoopId)`.
-static void addAffineOrderings(std::vector<std::vector<bool>> &adjM,
-                               std::vector<unsigned> &inDegree, AffineExpr a,
-                               AffineExpr b, std::optional<LoopId> fidx,
+static void addAffineOrderings(SmallVector<OrderingConstraint> &constraints,
+                               AffineExpr a, AffineExpr b,
+                               std::optional<LoopId> fidx,
                                std::optional<LoopId> tidx) {
   if (!a && !b) {
     // Recursion leaf.
     assert(fidx && tidx);
     const LoopId f = *fidx, t = *tidx;
-    if (!adjM[f][t]) {
-      adjM[f][t] = true;
-      inDegree[t]++;
-    }
+    constraints.push_back({f, t});
     return;
   }
   // Picks an affine expression and expand (recurse into) it.
@@ -546,20 +561,20 @@
     const std::optional<LoopId> idx{
         toExpand.cast<AffineDimExpr>().getPosition()};
     if (toExpand == a)
-      addAffineOrderings(adjM, inDegree, AffineExpr(), b, idx, tidx);
+      addAffineOrderings(constraints, AffineExpr(), b, idx, tidx);
     else // toExpand == b
-      addAffineOrderings(adjM, inDegree, a, AffineExpr(), fidx, idx);
+      addAffineOrderings(constraints, a, AffineExpr(), fidx, idx);
     break;
   }
   case AffineExprKind::Add:
   case AffineExprKind::Mul: {
     auto binOp = toExpand.cast<AffineBinaryOpExpr>();
     if (toExpand == a) {
-      addAffineOrderings(adjM, inDegree, binOp.getLHS(), b, fidx, tidx);
-      addAffineOrderings(adjM, inDegree, binOp.getRHS(), b, fidx, tidx);
+      addAffineOrderings(constraints, binOp.getLHS(), b, fidx, tidx);
+      addAffineOrderings(constraints, binOp.getRHS(), b, fidx, tidx);
    } else {
-      addAffineOrderings(adjM, inDegree, a, binOp.getLHS(), fidx, tidx);
-      addAffineOrderings(adjM, inDegree, a, binOp.getRHS(), fidx, tidx);
+      addAffineOrderings(constraints, a, binOp.getLHS(), fidx, tidx);
+      addAffineOrderings(constraints, a, binOp.getRHS(), fidx, tidx);
    }
    break;
  }
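(Editorial note, not part of the patch.) The recursion in `addAffineOrderings` now simply emits `(from, to)` pairs, so for an affine index such as `d0 + d1` on the outer ("from") side and `d2` on the inner ("to") side it produces the constraints `(d0, d2)` and `(d1, d2)`. The standalone sketch below reproduces that pair-collection behaviour on a toy expression type; `Expr`, `collectDims`, and `addOrderings` are illustrative stand-ins and not MLIR APIs.

// Toy reproduction of the pair-collection behaviour of addAffineOrderings.
#include <cassert>
#include <utility>
#include <vector>

struct Expr {            // either a dimension (leaf) or an addition
  bool isDim;
  unsigned dim;          // valid when isDim
  const Expr *lhs, *rhs; // valid when !isDim
};

using Constraint = std::pair<unsigned, unsigned>;

static void collectDims(const Expr &e, std::vector<unsigned> &dims) {
  if (e.isDim) {
    dims.push_back(e.dim);
    return;
  }
  collectDims(*e.lhs, dims);
  collectDims(*e.rhs, dims);
}

// Emits one (f, t) constraint per pair of dimensions in `from` x `to`.
static void addOrderings(const Expr &from, const Expr &to,
                         std::vector<Constraint> &constraints) {
  std::vector<unsigned> f, t;
  collectDims(from, f);
  collectDims(to, t);
  for (unsigned i : f)
    for (unsigned j : t)
      constraints.push_back({i, j});
}

int main() {
  Expr d0{true, 0, nullptr, nullptr}, d1{true, 1, nullptr, nullptr},
      d2{true, 2, nullptr, nullptr};
  Expr sum{false, 0, &d0, &d1}; // d0 + d1
  std::vector<Constraint> constraints;
  addOrderings(sum, d2, constraints);
  assert((constraints == std::vector<Constraint>{{0, 2}, {1, 2}}));
  return 0;
}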
@@ -602,10 +617,9 @@
   }
 }
 
-static void addFilterLoopBasedConstraints(CodegenEnv &env, OpOperand &t,
-                                          OpOperand *skip, SortMask mask,
-                                          std::vector<std::vector<bool>> &adjM,
-                                          std::vector<unsigned> &inDegree) {
+static void
+addFilterLoopBasedConstraints(CodegenEnv &env, OpOperand &t, OpOperand *skip,
+                              SmallVector<OrderingConstraint> &constraints) {
   // Get map, encoding, and tensor-identifier.
   const auto map = env.op().getMatchingIndexingMap(&t);
   const auto enc = getSparseTensorEncoding(t.get().getType());
@@ -626,7 +640,7 @@
     if (tldx && env.merger().isFilterLoop(*tldx)) {
       assert(!ta.isa<AffineDimExpr>() &&
              !isDenseDLT(enc.getDimLevelType()[lvl]));
-      addAffineOrderings(adjM, inDegree, ta, AffineExpr(), std::nullopt, tldx);
+      addAffineOrderings(constraints, ta, AffineExpr(), std::nullopt, tldx);
       // Now that the ordering of affine expression is captured by filter
       // loop idx, we only need to ensure the affine ordering against filter
       // loop. Thus, we reset the affine express to nil here to mark it as
@@ -651,31 +665,28 @@
       // requiring d0 < d2, d1 < d2, d0 < d3, d1 < d3.
       // We also relax the affine constraint when use slice-based algorithm
      // as there is no filter loop for affine index on sparse dimension.
-      // TODO: do we really need the condition?
-      if (!includesDense(mask))
-        tryRelaxAffineConstraints(env.op(), fldx, fa, tldx, ta);
+      tryRelaxAffineConstraints(env.op(), fldx, fa, tldx, ta);
 
       // (d0 + d1) < (d2 + d3), or
       // filter_loop_d-1 < (d2 + d3), or
       // (d0 + d1) < filter_loop_d, or
       // filter_loop_d-1 < filter_loop_d depending on whether fa/ta is reset
       // above.
-      addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx);
+      addAffineOrderings(constraints, fa, ta, fldx, tldx);
     }
   }
 }
 
-static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t,
-                                     OpOperand *skip, SortMask mask,
-                                     std::vector<std::vector<bool>> &adjM,
-                                     std::vector<unsigned> &inDegree) {
+static void
+addSliceBasedConstraints(CodegenEnv &env, OpOperand &t, OpOperand *skip,
+                         SmallVector<OrderingConstraint> &constraints) {
   // Get map and encoding.
   const auto map = env.op().getMatchingIndexingMap(&t);
   const auto enc = getSparseTensorEncoding(t.get().getType());
 
   // No special treatment for simple indices.
   if (getNumNonTrivialIdxExpOnSparseLvls(map, t.get()) == 0)
-    return addFilterLoopBasedConstraints(env, t, skip, mask, adjM, inDegree);
+    return addFilterLoopBasedConstraints(env, t, skip, constraints);
 
   // Skip tensor during cycle resolution, though order between filter loop
   // and dependent loops need to be guaranteed unconditionally.
@@ -705,10 +716,7 @@
         const LoopId tldx = env.makeLoopId(texp.getPosition());
 
         // d_x > d_y
-        if (!adjM[fldx][tldx]) {
-          adjM[fldx][tldx] = true;
-          inDegree[tldx]++;
-        }
+        constraints.push_back({fldx, tldx});
 
         AffineDimCollector fCollector;
         fCollector.walkPostOrder(fa);
@@ -720,19 +728,15 @@
           const LoopId f = env.makeLoopId(fd.getPosition());
           if (f == fldx)
             continue;
-          if (!adjM[f][fldx]) {
-            adjM[f][fldx] = true;
-            inDegree[fldx]++;
-          }
+
+          constraints.push_back({f, fldx});
         }
         for (auto td : tCollector.dims) {
           const LoopId t = env.makeLoopId(td.getPosition());
           if (t == tldx)
             continue;
-          if (!adjM[t][tldx]) {
-            adjM[t][tldx] = true;
-            inDegree[tldx]++;
-          }
+
+          constraints.push_back({t, tldx});
         }
         // Since we only support affine addition, the order between two dim
         // expression does not really matters.
@@ -752,71 +756,192 @@
           const LoopId t = env.makeLoopId(td.getPosition());
           if (t == tldx) // skip d_y
             continue;
-          if (!adjM[f][t]) {
-            adjM[f][t] = true;
-            inDegree[t]++;
-          }
+          constraints.push_back({f, t});
        }
      }
    }
  }
 
-/// Computes a topologically sorted iteration graph for the linalg operation.
-/// Ensures all tensors are visited in natural index order. This is
-/// essential for sparse storage formats since these only support access
-/// along fixed dimensions. Even for dense storage formats, however, the natural
-/// index order yields innermost unit-stride access with better spatial
-/// locality.
-static bool computeIterationGraph(CodegenEnv &env, SortMask mask,
-                                  OpOperand *skip, bool idxReducBased = false) {
-  // Set up an n x n from/to adjacency matrix of the iteration graph
-  // for the implicit loop indices i_0 .. i_n-1.
+static void collectOrderingConstraints(
+    CodegenEnv &env, OpOperand *skip, bool idxReducBased,
+    std::unordered_map<ConstraintKind, SmallVector<OrderingConstraint>>
+        &constraints) {
   const unsigned numLoops = env.merger().getNumLoops();
-  std::vector<std::vector<bool>> adjM(numLoops,
-                                      std::vector<bool>(numLoops, false));
-  std::vector<unsigned> inDegree(numLoops, 0); // in-degree of each node.
   const auto iteratorTypes = env.op().getIteratorTypesArray();
   // Iterate over the indexing maps of every tensor in the tensor expression.
   for (OpOperand &t : env.op()->getOpOperands()) {
     // Get map and encoding.
     const auto enc = getSparseTensorEncoding(t.get().getType());
-    // Skips dense inputs/outputs when not requested.
-    const bool isDenseInput = !enc && env.op().isDpsInput(&t);
-    const bool isDenseOutput = !enc && !isDenseInput;
-    if ((isDenseInput && !includesDenseInput(mask)) ||
-        (isDenseOutput && !includesDenseOutput(mask)))
-      continue;
 
     // Push unrelated loops into sparse iteration space, so these
     // will be skipped more often.
     // TODO: Do we really need this?
-    if (includesUndef(mask)) {
-      const TensorId tid = env.makeTensorId(t.getOperandNumber());
-      for (LoopId i = 0; i < numLoops; i++) {
-        const auto dltI = env.dlt(tid, i);
-        if (isCompressedDLT(dltI) || isCompressedWithHiDLT(dltI) ||
-            isSingletonDLT(dltI)) {
-          for (LoopId j = 0; j < numLoops; j++)
-            if (isUndefDLT(env.dlt(tid, j))) {
-              adjM[i][j] = true;
-              inDegree[j]++;
-            }
-        } else {
-          assert(isDenseDLT(dltI) || isUndefDLT(dltI));
+    const TensorId tid = env.makeTensorId(t.getOperandNumber());
+    for (LoopId i = 0; i < numLoops; i++) {
+
+      const auto dltI = env.dlt(tid, i);
+      if (isCompressedDLT(dltI) || isCompressedWithHiDLT(dltI) ||
+          isSingletonDLT(dltI)) {
+        for (LoopId j = 0; j < numLoops; j++) {
+          if (!env.merger().isFilterLoop(j))
+            constraints[ConstraintKind::Undef].push_back({i, j});
         }
+      } else {
+        assert(isDenseDLT(dltI) || isUndefDLT(dltI));
       }
     }
-    // Push unrelated loops into sparse iteration space, so these
-    // will be skipped more often.
+
+    ConstraintKind constraintKind = ConstraintKind::Sparse;
+
+    const bool isDenseInput = !enc && env.op().isDpsInput(&t);
+    const bool isDenseOutput = !enc && !isDenseInput;
+
+    if (isDenseInput)
+      constraintKind = ConstraintKind::DenseInput;
+    else if (isDenseOutput)
+      constraintKind = ConstraintKind::DenseOutput;
+
     if (idxReducBased)
-      addSliceBasedConstraints(env, t, skip, mask, adjM, inDegree);
+      addSliceBasedConstraints(env, t, skip, constraints[constraintKind]);
     else
-      addFilterLoopBasedConstraints(env, t, skip, mask, adjM, inDegree);
+      addFilterLoopBasedConstraints(env, t, skip, constraints[constraintKind]);
+  }
+}
+
+static void computeInitialLoopOrdering(CodegenEnv &env,
+                                       SmallVector<LoopId> &initialOrdering) {
+  const unsigned numLoops = env.merger().getNumLoops();
+  initialOrdering.resize(numLoops);
+
+  for (LoopId i = 0; i < numLoops; ++i) {
+    initialOrdering[i] = i;
+  }
+
+  const auto iteratorTypes = env.op().getIteratorTypesArray();
+
+  // For the initial sorting we use filter loop -> parallel loop -> reduction
+  // loop because
+  // 1. Putting reduction loop early might make the loop sequence
+  //    inadmissible.
+  // 2. Filter loops should be put as early as possible for better
+  //    performance, since only one (if any) iteration will carry the
+  //    computation. E.g., for (1 to N)
+  //                         for (1 to M)
+  //                           for (1 to K)
+  //                             if (xxx)
+  //                               O(X) computation  => O(NMK+NMX) time complexity
+  //
+  // By putting the filter loop one level up, we get
+  //
+  //      for (1 to N)
+  //        for (1 to K)
+  //          if (xxx)
+  //            for (1 to M)
+  //              O(X) computation  => O(NK+NMX) time complexity
+  auto loopRank = [&](LoopId idx) {
+    if (env.merger().isFilterLoop(idx))
+      return -1;
+    if (linalg::isReductionIterator(iteratorTypes[idx]))
+      return 1;
+    return 0;
+  };
+
+  // Use a stable sort so that loops with equal rank keep their original
+  // relative (index) order, which keeps the initial ordering deterministic.
+  std::stable_sort(
+      initialOrdering.begin(), initialOrdering.end(),
+      [&](LoopId a, LoopId b) { return loopRank(a) < loopRank(b); });
+}
+
+/// Given a list of constraints (edges) that defines the iteration graph, this
+/// returns a topological sort of the nodes that respects all sparse
+/// constraints. The remaining constraints are added greedily, skipping any
+/// that would introduce a cycle.
+static void computeTopSort(
+    CodegenEnv &env,
+    std::unordered_map<ConstraintKind, SmallVector<OrderingConstraint>>
+        &constraints,
+    bool &hasCycle, bool &isAdmissible) {
+  // By providing an initial sorting we can impose soft constraints on the
+  // ordering based on loop types.
+  // TODO: It might be better to impose the loop type constraints via edges as
+  // well.
+  SmallVector<LoopId> initialOrdering;
+  computeInitialLoopOrdering(env, initialOrdering);
+
+  IncrementalTopoSort topSort(initialOrdering);
+
+  hasCycle = false;
+  isAdmissible = true;
+
+  // First try to satisfy all required constraints; if this fails we report
+  // a cycle.
+  for (const OrderingConstraint &constraint :
+       constraints[ConstraintKind::Sparse]) {
+    LoopId i, j;
+    std::tie(i, j) = constraint;
+
+    if (topSort.createsCycle(i, j)) {
+      hasCycle = true;
+      return;
+    }
+
+    topSort.addEdge(i, j);
+  }
+
+  if (!env.isAdmissibleTopoOrder(topSort.getCurrent())) {
+    isAdmissible = false;
+    return;
   }
-  // Topologically sort the iteration graph to determine loop order.
-  // Report failure for a cyclic iteration graph.
-  env.topSortClear(numLoops);
-  return topSortOptimal(env, iteratorTypes, inDegree, adjM);
+
+  // Now we add relaxable constraints and skip any that lead to cycles.
+  SmallVector<ConstraintKind> relaxableConstraints = {
+      ConstraintKind::DenseInput, ConstraintKind::DenseOutput,
+      ConstraintKind::Undef};
+
+  // If we encounter an inadmissible ordering here, we return the last good
+  // configuration.
+  // TODO: We could also skip the constraint that led to an inadmissible
+  // ordering and continue.
+  std::vector<LoopId> lastAdmissible = topSort.getCurrent();
+
+  bool encounteredInadmissible = false;
+  for (ConstraintKind kind : relaxableConstraints) {
+    for (const OrderingConstraint &constraint : constraints[kind]) {
+      LoopId i, j;
+      std::tie(i, j) = constraint;
+      if (topSort.createsCycle(i, j))
+        continue;
+
+      topSort.addEdge(i, j);
+
+      if (!env.isAdmissibleTopoOrder(topSort.getCurrent())) {
+        encounteredInadmissible = true;
+        break;
+      }
+      lastAdmissible = topSort.getCurrent();
    }
+    if (encounteredInadmissible)
+      break;
+  }
+
+  env.topSortSet(lastAdmissible);
+}
+
+/// Computes a topologically sorted iteration graph for the linalg operation.
+/// Ensures all tensors are visited in natural index order. This is
+/// essential for sparse storage formats since these only support access
+/// along fixed dimensions. Even for dense storage formats, however, the natural
+/// index order yields innermost unit-stride access with better spatial
+/// locality.
+static void computeIterationGraph(CodegenEnv &env, OpOperand *skip,
+                                  bool &hasCycle, bool &isAdmissible,
+                                  bool idxReducBased = false) {
+  // Note that we use the terms 'constraint' and 'edge' somewhat
+  // interchangeably here.
+  std::unordered_map<ConstraintKind, SmallVector<OrderingConstraint>>
+      constraints;
+  collectOrderingConstraints(env, skip, idxReducBased, constraints);
+
+  computeTopSort(env, constraints, hasCycle, isAdmissible);
 }
 
@@ -1869,36 +1994,19 @@
   // constraints until an acyclic iteration graph results, such that sparse
   // code generation can proceed. As a last resort, an attempt is made
   // to resolve cycles by inserting a conversion.
-  bool isAdmissible = false;
-  bool hasCycle = true;
-
-  // An const list of all masks that we used for interation graph
-  // computation. Must be ordered from more strict to less strict.
-  // Ideally (though might not be guaranteed), the eariler a constraint mask
-  // can be satisfied, the faster the generated kernel will be.
-  const auto allMasks = {
-      SortMask::kIncludeAll,        SortMask::kIncludeDense,
-      SortMask::kIncludeDenseInput, SortMask::kIncludeDenseOutput,
-      SortMask::kIncludeUndef,      SortMask::kSparseOnly};
-  for (const SortMask mask : allMasks) {
-    if (computeIterationGraph(env, mask, nullptr, idxReducBased)) {
-      hasCycle = false;
-      if (env.isAdmissibleTopoOrder()) {
-        isAdmissible = true;
-        break;
-      }
-      // else try a set of less strict constraints.
-    }
-  }
+  bool hasCycle, isAdmissible;
+  computeIterationGraph(env, nullptr, hasCycle, isAdmissible, idxReducBased);
+
+  if (!isAdmissible) {
+    return failure();
   }
+
   if (hasCycle) {
     return idxReducBased
                ? failure() // TODO: should cycle be resolved differently?
                : resolveCycle(env, rewriter); // one last shot
   }
-  if (!isAdmissible)
-    return failure(); // inadmissible expression, reject
-
   // Recursively generates code if admissible.
   env.startEmit();
   genBuffers(env, rewriter);
@@ -1921,8 +2029,15 @@
       const TensorId tid = env.makeTensorId(t->getOperandNumber());
       Value tval = t->get();
       auto srcEnc = getSparseTensorEncoding(tval.getType());
-      if (!srcEnc || !computeIterationGraph(env, SortMask::kSparseOnly, t))
+
+      if (!srcEnc)
+        continue;
+
+      bool hasCycle, isAdmissible;
+      computeIterationGraph(env, t, hasCycle, isAdmissible);
+      if (hasCycle || !isAdmissible)
         continue;
+
       // Found an input tensor that resolves the cycle by inserting a
       // conversion into a sparse tensor that adheres to the iteration
       // graph order. Also releases the temporary sparse tensor.
diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
@@ -209,32 +209,31 @@
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<3x3xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x3xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xi32>
 // CHECK:           %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<6x6xi32>
-// CHECK:           scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
-// CHECK:             %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK:             %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK:             scf.for %[[VAL_16:.*]] = %[[VAL_14]] to %[[VAL_15]] step %[[VAL_5]] {
-// CHECK:               %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_16]]] : memref<?xindex>
-// CHECK:               scf.for %[[VAL_18:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
-// CHECK:                 %[[VAL_19:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_13]], %[[VAL_18]]] : memref<6x6xi32>
-// CHECK:                 %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_16]]] : memref<?xindex>
-// CHECK:                 %[[VAL_21:.*]] = arith.addi %[[VAL_16]], %[[VAL_5]] : index
-// CHECK:                 %[[VAL_22:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_21]]] : memref<?xindex>
-// CHECK:                 %[[VAL_23:.*]] = scf.for %[[VAL_24:.*]] = %[[VAL_20]] to %[[VAL_22]] step %[[VAL_5]] iter_args(%[[VAL_25:.*]] = %[[VAL_19]]) -> (i32) {
-// CHECK:                   %[[VAL_26:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_24]]] : memref<?xindex>
-// CHECK:                   %[[VAL_27:.*]] = arith.addi %[[VAL_13]], %[[VAL_17]] : index
-// CHECK:                   %[[VAL_28:.*]] = arith.addi %[[VAL_18]], %[[VAL_26]] : index
-// CHECK:                   %[[VAL_29:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_27]], %[[VAL_28]]] : memref<8x8xi32>
-// CHECK:                   %[[VAL_30:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_24]]] : memref<?xi32>
-// CHECK:                   %[[VAL_31:.*]] = arith.muli %[[VAL_29]], %[[VAL_30]] : i32
-// CHECK:                   %[[VAL_32:.*]] = arith.addi %[[VAL_25]], %[[VAL_31]] : i32
-// CHECK:                   scf.yield %[[VAL_32]] : i32
+// CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK:           scf.for %[[VAL_15:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_5]] {
+// CHECK:             %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK:             scf.for %[[VAL_17:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
+// CHECK:               %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK:               %[[VAL_19:.*]] = arith.addi %[[VAL_15]], %[[VAL_5]] : index
+// CHECK:               %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_19]]] : memref<?xindex>
+// CHECK:               scf.for %[[VAL_21:.*]] = %[[VAL_18]] to %[[VAL_20]] step %[[VAL_5]] {
+// CHECK:                 %[[VAL_22:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_21]]] : memref<?xindex>
+// CHECK:                 %[[VAL_23:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_21]]] : memref<?xi32>
+// CHECK:                 scf.for %[[VAL_24:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
+// CHECK:                   %[[VAL_25:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_17]], %[[VAL_24]]] : memref<6x6xi32>
+// CHECK:                   %[[VAL_26:.*]] = arith.addi %[[VAL_17]], %[[VAL_16]] : index
+// CHECK:                   %[[VAL_27:.*]] = arith.addi %[[VAL_24]], %[[VAL_22]] : index
+// CHECK:                   %[[VAL_28:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_26]], %[[VAL_27]]] : memref<8x8xi32>
+// CHECK:                   %[[VAL_29:.*]] = arith.muli %[[VAL_28]], %[[VAL_23]] : i32
+// CHECK:                   %[[VAL_30:.*]] = arith.addi %[[VAL_25]], %[[VAL_29]] : i32
+// CHECK:                   memref.store %[[VAL_30]], %[[VAL_12]]{{\[}}%[[VAL_17]], %[[VAL_24]]] : memref<6x6xi32>
 // CHECK:                 } {"Emitted from" = "linalg.generic"}
-// CHECK:                 memref.store %[[VAL_33:.*]], %[[VAL_12]]{{\[}}%[[VAL_13]], %[[VAL_18]]] : memref<6x6xi32>
 // CHECK:               } {"Emitted from" = "linalg.generic"}
 // CHECK:             } {"Emitted from" = "linalg.generic"}
 // CHECK:           } {"Emitted from" = "linalg.generic"}
-// CHECK:           %[[VAL_34:.*]] = bufferization.to_tensor %[[VAL_12]] : memref<6x6xi32>
-// CHECK:           return %[[VAL_34]] : tensor<6x6xi32>
+// CHECK:           %[[VAL_31:.*]] = bufferization.to_tensor %[[VAL_12]] : memref<6x6xi32>
+// CHECK:           return %[[VAL_31]] : tensor<6x6xi32>
 // CHECK:         }
 
 func.func @conv2d(%input:  tensor<8x8xi32>, %filter: tensor<3x3xi32, #DCSR>,