diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -407,12 +407,14 @@
   auto enc = getSparseTensorEncoding(tensors[tid].getType());
   const auto reassoc = getCollapseReassociation(tid, dim);
-  dim = reassoc.front();
   // TODO: support dynamic slices.
+  // Uses the first dimension here to build the loop bound (which is also the
+  // biggest range).
+  const auto fdim = reassoc.front();
   Value step = constantIndex(builder, loc, 1);
-  Value lo = isSparseInput ? pidxs[tid][dim]      // current offset
+  Value lo = isSparseInput ? pidxs[tid][fdim]     // current offset
                            : loopSeqStack.back(); // universal index
-  Value hi = highs[tid][dim];
+  Value hi = highs[tid][fdim];
 
   Operation *loop = nullptr;
   Value iv;
@@ -585,9 +587,17 @@
   for (auto [tid, dim] : llvm::zip(tids, dims)) {
     if (isCompressedDLT(dimTypes[tid][dim]) ||
         isSingletonDLT(dimTypes[tid][dim])) {
+      const auto reassoc = getCollapseReassociation(tid, dim);
+      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
+        if (!isUniqueDLT(dimTypes[tid][reassoc[i]])) {
+          // This is the segment high for each non-unique level.
+          types.push_back(indexType);
+          operands.push_back(constantIndex(builder, loc, 0));
+        }
+      }
       assert(pidxs[tid][dim]);
       types.push_back(indexType);
-      operands.push_back(pidxs[tid][dim]);
+      operands.push_back(pidxs[tid][reassoc.front()]);
     }
   }
   // The position where user-supplied reduction variable starts.
@@ -616,15 +626,22 @@
     unsigned tid = t; // Why `t` can not be captured by lambda?
     if (isCompressedDLT(dimTypes[tid][lvl]) ||
         isSingletonDLT(dimTypes[tid][lvl])) {
+      const auto reassoc = getCollapseReassociation(tid, lvl);
+      assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
+      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
+        if (!isUniqueDLT(dimTypes[tid][reassoc[i]])) {
+          // Links the SSA chain for segHi.
+          segHi[tid][reassoc[i]] = after->getArgument(o++);
+        }
+      }
       Value op1 = before->getArgument(o);
-      Value op2 = highs[tid][lvl];
+      // Uses the first level's bound as the bound for the collapsed set of levels.
+      Value op2 = highs[tid][reassoc.front()];
       Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
                                                 op1, op2);
       cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
       // Update positions
       Value pos = after->getArgument(o++);
-      const auto reassoc = getCollapseReassociation(tid, lvl);
-      assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
       // For COO, the position is the same across consecutive levels.
       llvm::for_each(reassoc,
                      [this, tid, pos](Level lvl) { pidxs[tid][lvl] = pos; });
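For context on the hunks below: `genSegmentHigh` computes the exclusive upper bound of the run of duplicate coordinates that starts at a given position of a non-unique level. A minimal scalar sketch of that semantics (`segmentHigh` is a hypothetical host-side model for illustration, not LoopEmitter API; the real routine emits an `scf.while` over the level's coordinates buffer):

```cpp
#include <cstdint>
#include <vector>

// Scalar model of the IR emitted by genSegmentHigh: starting at `pos`, scan
// forward while the coordinate at this level keeps repeating, and return the
// first position past the duplicates (an exclusive bound, clamped by `hi`).
uint64_t segmentHigh(const std::vector<uint64_t> &coords, uint64_t pos,
                     uint64_t hi) {
  uint64_t p = pos;
  while (p < hi && coords[p] == coords[pos])
    ++p;
  return p;
}
```

For the level-0 coordinates {0, 0, 1} of the COO example in the next hunk, `segmentHigh(coords, 0, 3)` returns 2.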
@@ -714,9 +731,48 @@
   assert(loopStack.size() == loopSeqStack.size());
 
   for (auto [tid, dim] : llvm::zip(tids, dims)) {
-    if (!isUniqueDLT(dimTypes[tid][dim])) {
-      segHi[tid][dim] = genSegmentHigh(builder, loc, tid, dim, pidxs[tid][dim],
-                                       highs[tid][dim]);
+    const auto reassoc = getCollapseReassociation(tid, dim);
+    assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
+    // TODO: Refactor this into smaller functions.
+    // NOTE: For all collapsed levels but the last one (which is why the loop
+    // below ends at `reassoc.size() - 1`), each iteration advances by the
+    // segment size of the last level, which does not always invalidate the
+    // segment sizes of the previous levels. We therefore need to propagate
+    // the segment sizes across loop iterations and only forward them when
+    // needed.
+    //
+    // E.g., for a COO tensor with the following coordinates array:
+    //   (0, 0, 1),
+    //   (0, 0, 2),
+    //   (1, 1, 1),
+    // we have
+    //   segHi[lvl=0] = segHi[lvl=1] = 2,
+    //   segHi[lvl=2] = 1,
+    // so the first iteration does not invalidate segHi[0] and segHi[1].
+    for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
+      const auto lvl = reassoc[i];
+      if (!isUniqueDLT(dimTypes[tid][lvl])) {
+        Value pos = pidxs[tid][lvl];
+        assert(segHi[tid][lvl]);
+        Value newSegHi = builder.create<arith::CmpIOp>(
+            loc, arith::CmpIPredicate::uge, pos, segHi[tid][lvl]);
+        auto ifOp = builder.create<scf::IfOp>(loc, builder.getIndexType(),
+                                              newSegHi, true);
+        {
+          OpBuilder::InsertionGuard guard(builder);
+          builder.setInsertionPointToStart(ifOp.thenBlock());
+          builder.create<scf::YieldOp>(
+              loc,
+              genSegmentHigh(builder, loc, tid, lvl, pos, highs[tid][lvl]));
+          // Else, reuses the same segment high.
+          builder.setInsertionPointToStart(ifOp.elseBlock());
+          builder.create<scf::YieldOp>(loc, segHi[tid][lvl]);
+        }
+        highs[tid][lvl + 1] = segHi[tid][lvl] = ifOp.getResult(0);
+      }
+    }
+    const auto lvl = reassoc.back();
+    if (!isUniqueDLT(dimTypes[tid][lvl])) {
+      segHi[tid][lvl] = genSegmentHigh(builder, loc, tid, lvl, pidxs[tid][lvl],
+                                       highs[tid][lvl]);
     }
   }
 
@@ -906,6 +962,15 @@
   for (auto [tid, dim] : llvm::zip(tids, dims)) {
     if (isCompressedDLT(dimTypes[tid][dim]) ||
         isSingletonDLT(dimTypes[tid][dim])) {
+      const auto reassoc = getCollapseReassociation(tid, dim);
+      assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
+      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
+        const auto lvl = reassoc[i];
+        if (!isUniqueDLT(dimTypes[tid][lvl])) {
+          operands.push_back(segHi[tid][lvl]);
+          o++;
+        }
+      }
       Value op1 = coord[tid][dim];
       Value op3 = pidxs[tid][dim];
       Value cmp =
@@ -913,13 +978,18 @@
       // If the loop contains a coiteration with non-unique level, we fast
       // forward all the duplicated coords by setting the position to the
       // segment high.
-      Value add = !isUniqueDLT(dimTypes[tid][dim])
-                      ? segHi[tid][dim]
+      // If this is a collapsed dim, we forward pidx based on the last level
+      // in the collapsed level set.
+      Value add = !isUniqueDLT(dimTypes[tid][reassoc.back()])
+                      ? segHi[tid][reassoc.back()]
                       : builder.create<arith::AddIOp>(loc, op3, one);
+      operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
       // Following loops continue iteration from the break point of the
       // current while loop.
-      pidxs[tid][dim] = whileOp->getResult(o++);
+      Value pos = whileOp->getResult(o++);
+      const auto t = tid;
+      llvm::for_each(reassoc, [this, t, pos](Level l) { pidxs[t][l] = pos; });
       // The coordinates are invalid now.
       coord[tid][dim] = nullptr;
       // The segment high are invalid now
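The forwarding rule implemented by the `scf.if` above, in scalar form: an outer collapsed level recomputes its cached segment high only once the shared position has run past it; otherwise the cached value survives into the next iteration. A hypothetical sketch under the same assumptions, reusing the `segmentHigh` model from the earlier note:

```cpp
#include <cstdint>
#include <vector>

// From the earlier sketch: exclusive end of the duplicate run at `pos`.
uint64_t segmentHigh(const std::vector<uint64_t> &coords, uint64_t pos,
                     uint64_t hi);

// Scalar model of the scf.if above: recompute the segment high for an outer
// collapsed level only when `pos` has moved past the cached one; otherwise
// reuse it (the else branch that yields segHi[tid][lvl]).
uint64_t forwardSegmentHigh(const std::vector<uint64_t> &coords, uint64_t pos,
                            uint64_t cachedSegHi, uint64_t hi) {
  return pos >= cachedSegHi ? segmentHigh(coords, pos, hi) : cachedSegHi;
}
```

On the coordinates from the NOTE, the first iteration advances the shared position to 1, which is still below segHi[0] == segHi[1] == 2, so both cached values are reused; they are recomputed only once the position reaches 2.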
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
@@ -56,6 +56,18 @@
     return %ret1 : tensor<?x?x?xf32>
   }
 
+  func.func @test_sparse_all_2(%arg0: tensor<5x6xf32, #COO_2D>, %arg1: tensor<2x3x6xf32, #COO_3D>) -> tensor<?x?x?xf32> {
+    // Collapses the first two levels this time, as these are the levels that require coiteration.
+    %collapsed = tensor.collapse_shape %arg1 [[0, 1], [2]] : tensor<2x3x6xf32, #COO_3D> into tensor<6x6xf32, #COO_2D>
+    %0 = tensor.empty() : tensor<5x6xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<5x6xf32>) -> tensor<5x6xf32>
+    %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32, #COO_2D>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
+    %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
+    %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
+    return %ret1 : tensor<?x?x?xf32>
+  }
+
   func.func @entry() {
     // Setup two sparse vectors.
@@ -68,9 +80,12 @@
       [ [0, 0, 0], [1, 1, 1], [2, 1, 1] ],
       [ 6.0, 7.0, 8.0]
     > : tensor<6x2x3xf32>
+    %shape = arith.constant dense<[2, 3, 6]> : tensor<3xi32>
+    %d3 = tensor.reshape %d2(%shape) : (tensor<6x2x3xf32>, tensor<3xi32>) -> tensor<2x3x6xf32>
 
     %s1 = sparse_tensor.convert %d1 : tensor<5x6xf32> to tensor<5x6xf32, #COO_2D>
     %s2 = sparse_tensor.convert %d2 : tensor<6x2x3xf32> to tensor<6x2x3xf32, #COO_3D>
+    %s3 = sparse_tensor.convert %d3 : tensor<2x3x6xf32> to tensor<2x3x6xf32, #COO_3D>
 
     // CHECK: Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [5, 2, 3] strides = [6, 3, 1] data =
     // CHECK-NEXT:[
@@ -134,11 +149,34 @@
     %so2 = call @test_sparse_all(%s1, %s2): (tensor<5x6xf32, #COO_2D>, tensor<6x2x3xf32, #COO_3D>) -> tensor<?x?x?xf32>
     call @printMemref3dF32(%so2) : (tensor<?x?x?xf32>) -> ()
 
+    // Same results.
+    // CHECK-NEXT: Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [5, 2, 3] strides = [6, 3, 1] data =
+    // CHECK-NEXT:[
+    // CHECK-SAME: [
+    // CHECK-SAME:  [6, 0, 0],
+    // CHECK-NEXT:  [0, 0, 0]],
+    // CHECK-NEXT: [
+    // CHECK-SAME:  [0, 0, 0],
+    // CHECK-NEXT:  [0, 14, 0]],
+    // CHECK-NEXT: [
+    // CHECK-SAME:  [0, 0, 0],
+    // CHECK-NEXT:  [0, 24, 0]],
+    // CHECK-NEXT: [
+    // CHECK-SAME:  [0, 0, 0],
+    // CHECK-NEXT:  [0, 0, 0]],
+    // CHECK-NEXT: [
+    // CHECK-SAME:  [0, 0, 0],
+    // CHECK-NEXT:  [0, 0, 0]]]
+    %so3 = call @test_sparse_all_2(%s1, %s3): (tensor<5x6xf32, #COO_2D>, tensor<2x3x6xf32, #COO_3D>) -> tensor<?x?x?xf32>
+    call @printMemref3dF32(%so3) : (tensor<?x?x?xf32>) -> ()
+
     bufferization.dealloc_tensor %s1 : tensor<5x6xf32, #COO_2D>
     bufferization.dealloc_tensor %s2 : tensor<6x2x3xf32, #COO_3D>
+    bufferization.dealloc_tensor %s3 : tensor<2x3x6xf32, #COO_3D>
     bufferization.dealloc_tensor %do1 : tensor<?x?x?xf32>
     bufferization.dealloc_tensor %so1 : tensor<?x?x?xf32>
     bufferization.dealloc_tensor %so2 : tensor<?x?x?xf32>
+    bufferization.dealloc_tensor %so3 : tensor<?x?x?xf32>
     return
   }
 }
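On why @test_sparse_all_2 expects the same output: the `[[0, 1], [2]]` reassociation folds the two outer levels row-major, so a coordinate (i, j, k) of the 2x3x6 tensor lands at (i * 3 + j, k) in the collapsed 6x6 view. A hypothetical helper showing just this index map (not part of the test):

```cpp
#include <cstdint>
#include <utility>

// Row-major folding of a (2 x 3 x 6) coordinate under reassociation
// [[0, 1], [2]]: (i, j, k) maps to (i * 3 + j, k) in the collapsed 6x6 view,
// which is the index space the COO coiteration walks.
std::pair<uint64_t, uint64_t> collapseIndex(uint64_t i, uint64_t j,
                                            uint64_t k) {
  return {i * 3 + j, k};
}
```

Since %d3 is %d2's data reshaped row-major to 2x3x6, collapsing the first two levels of %s3 yields the same 6x6 operand that @test_sparse_all derives from %s2, hence the "// Same results." CHECK block above.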