diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h @@ -453,6 +453,9 @@ return tid < lvlTypes.size() && lvl < lvlTypes[tid].size(); } + void forwardsReducedSliceLevelTreeIt(OpBuilder &builder, Location loc, + TensorId tid, Level lvl, Value fcnt); + /// Prepares loop for iterating over `tensor[lvl]`, under the assumption /// that `tensor[0...lvl-1]` loops have already been set up. void prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc, diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" using namespace mlir; using namespace mlir::sparse_tensor; @@ -41,6 +42,13 @@ // File local helper functions. //===----------------------------------------------------------------------===// +// For index reduction loops, since the tensors are sliced into discontinuous +// fragments, we need a tuple [pLo, pHi, pPtr], in which the pair (pLo, pHi) +// specifies the range of the fragment, and pPtr specifies the index of the +// corresponding fragment in the child level (i.e., a pointer to the sliced +// position array). 
+static constexpr unsigned kSliceIterWidth = 3; + static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor, Level lvl) { auto enc = getSparseTensorEncoding(tensor.getType()); @@ -123,6 +131,27 @@ return ifOp.getResult(0); } +static void dumpIndexMemRef(OpBuilder &builder, Location loc, Value memref) { + memref = builder.create( + loc, UnrankedMemRefType::get(builder.getIndexType(), 0), memref); + createFuncCall(builder, loc, "printMemrefInd", TypeRange{}, + ValueRange{memref}, EmitCInterface::On); +} + +static Value loadSlicePosPtr(OpBuilder &builder, Location loc, Value sPosBuf) { + return genIndexLoad(builder, loc, sPosBuf, C_IDX(1)); +} + +static void updateSlicePosPtr(OpBuilder &builder, Location loc, Value sPosBuf, + Value pPtr) { + builder.create(loc, pPtr, sPosBuf, C_IDX(1)); +} + +static Value loadSliceNextPosPtrStart(OpBuilder &builder, Location loc, + Value sPosBuf, Value pPtr) { + return genIndexLoad(builder, loc, sPosBuf, ADDI(pPtr, C_IDX(4))); +} + std::pair LoopEmitter::genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd, TensorId tid, Level lvl) { @@ -572,15 +601,17 @@ assert(sliceStack[tid].back().slicedOnLvl == lvl); sliceStack[tid].pop_back(); } else { + continue; + if (!isDenseDLT(lvlTypes[tid][lvl])) { // Else this is a resolved-slice, and advance posit similar to TACO. - Value c1 = C_IDX(1), c2 = C_IDX(2); + Value c1 = C_IDX(1); // pIdx += 2, we finished the current lvl, advance the pointer index of // the previous level by two to skip the [pLo, pHi] for current level. Value sPtrBuf = slicePosBuffer[tid][lvl].back(); Value curP = genIndexLoad(builder, loc, sPtrBuf, c1); // TODO: we could probably use an SSA value for it. - Value nexP = ADDI(curP, c2); + Value nexP = ADDI(curP, C_IDX(kSliceIterWidth)); builder.create(loc, nexP, sPtrBuf, c1); } } @@ -1297,11 +1328,11 @@ // Pushes sliced levels to build correct LoopInfo. 
bool unReduc = isAffineIdxUnRedCond(denseLoopCond); SliceInfo &info = sliceStack[tid].back(); + // Pushes sliced dense loop info to tell LoopEmitter how to exit it. + sliceInfo.emplace_back(tid, lvl, /*fullyReduced=*/!unReduc); if (unReduc) { - // Pushes sliced dense loop info to tell LoopEmitter how to exit it. - sliceInfo.emplace_back(tid, lvl, /*fullyReduced=*/false); - // Update the slice information as we enter the new loop. assert(*info.slicedOnLvl == lvl); + // Update the slice information as we enter the new loop. info.minCrd = info.offset = iv; info.isNonEmpty = constantI1(builder, loc, true); levelReducedDep[tid][lvl]++; @@ -1331,27 +1362,31 @@ } } -void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc, +void LoopEmitter::exitForLoop(RewriterBase &builder, Location loc, MutableArrayRef reduc) { const LoopInfo &loopInfo = loopStack.back(); for (auto [tid, lvl, reduced] : loopInfo.sliceDrivenInfo) { - SliceInfo &info = sliceStack[tid].back(); - assert(isDenseDLT(lvlTypes[tid][lvl])); - assert(*info.slicedOnLvl == lvl && !reduced); - (void)reduced; - // Resets slices pointers as the resolved slices are invalidated after we - // moves forward to the next slice. - invalidateSliceIterIdx(rewriter, loc, tid, lvl); - info.minCrd = info.offset = info.isNonEmpty = Value(); - levelReducedDep[tid][lvl]--; + if (!reduced) { + SliceInfo &info = sliceStack[tid].back(); + assert(isDenseDLT(lvlTypes[tid][lvl])); + assert(*info.slicedOnLvl == lvl); + (void)reduced; + // Resets slices pointers as the resolved slices are invalidated after we + // moves forward to the next slice. 
+ invalidateSliceIterIdx(builder, loc, tid, lvl); + info.minCrd = info.offset = info.isNonEmpty = Value(); + levelReducedDep[tid][lvl]--; + } else { + forwardsReducedSliceLevelTreeIt(builder, loc, tid, lvl, C_IDX(1)); + } } if (auto forOp = llvm::dyn_cast(loopInfo.loop)) { if (!reduc.empty()) { assert(reduc.size() == forOp.getNumResults()); - rewriter.create(loc, reduc); + builder.create(loc, reduc); } // Exit the loop. - rewriter.setInsertionPointAfter(forOp); + builder.setInsertionPointAfter(forOp); // In-place update reduction variables. for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++) reduc[i] = forOp.getResult(i); @@ -1387,22 +1422,22 @@ assert(numUsers == 1); #endif // NDEBUG - rewriter.setInsertionPointAfter(redExp); - auto redOp = rewriter.create(loc, curVal); + builder.setInsertionPointAfter(redExp); + auto redOp = builder.create(loc, curVal); // Attach to the reduction op. Block *redBlock = &redOp.getRegion().getBlocks().front(); - rewriter.setInsertionPointToEnd(redBlock); - Operation *newRed = rewriter.clone(*redExp); + builder.setInsertionPointToEnd(redBlock); + Operation *newRed = builder.clone(*redExp); // Replaces arguments of the reduction expression by using the block // arguments from scf.reduce. - rewriter.updateRootInPlace( + builder.updateRootInPlace( newRed, [&]() { newRed->setOperands(redBlock->getArguments()); }); // Erases the out-dated reduction expression. - rewriter.eraseOp(redExp); - rewriter.setInsertionPointToEnd(redBlock); - rewriter.create(loc, newRed->getResult(0)); + builder.eraseOp(redExp); + builder.setInsertionPointToEnd(redBlock); + builder.create(loc, newRed->getResult(0)); } - rewriter.setInsertionPointAfter(parOp); + builder.setInsertionPointAfter(parOp); // In-place update reduction variables. 
for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++) reduc[i] = parOp.getResult(i); @@ -1421,6 +1456,52 @@ } } +void LoopEmitter::forwardsReducedSliceLevelTreeIt(OpBuilder &builder, + Location loc, TensorId tid, + Level rootLvl, Value fcnt) { + auto stt = getSparseTensorType(tensors[tid]); + + // Find a [Lvl, leafLvl) range, where all levels in between are fully reduced + // levels (but not resolved). Since we forward an iterator at a higher level + // of the tree, the subtree needs to be pruned. + Level leafLvl = rootLvl + 1; + while (leafLvl < stt.getLvlRank() && !dependentLvlMap[tid][leafLvl].empty()) { + assert(depFullyReduced(tid, leafLvl)); + leafLvl++; + } + + Level curLvl = rootLvl + 1; + // Prunes all dense subtrees. + while (curLvl < leafLvl && isDenseDLT(lvlTypes[tid][curLvl])) { + fcnt = MULI(sliceSizes[tid][curLvl].back(), fcnt); + curLvl++; + } + + Value nxPosPtr = nullptr; + if (curLvl < leafLvl) { + assert(!isDenseDLT(lvlTypes[tid][curLvl])); + Value sPosBuf = slicePosBuffer[tid][curLvl].back(); + Value fPosPtr = MULI(fcnt, C_IDX(kSliceIterWidth)); // forward ptr + Value pPosPtr = loadSlicePosPtr(builder, loc, sPosBuf); // previous ptr + Value cPosPtr = ADDI(fPosPtr, pPosPtr); // current ptr + updateSlicePosPtr(builder, loc, sPosBuf, cPosPtr); + // dumpIndexMemRef(builder, loc, sPosBuf); + // Loads the position pointer start for next level. 
+ nxPosPtr = genIndexLoad(builder, loc, sPosBuf, ADDI(cPosPtr, C_IDX(1))); + curLvl++; + } + + for (; curLvl < leafLvl; curLvl++) { + assert(nxPosPtr); + if (!isDenseDLT(lvlTypes[tid][curLvl])) { + nxPosPtr = MULI(nxPosPtr, C_IDX(kSliceIterWidth)); + Value sPosBuf = slicePosBuffer[tid][curLvl].back(); + updateSlicePosPtr(builder, loc, sPosBuf, nxPosPtr); + nxPosPtr = genIndexLoad(builder, loc, sPosBuf, ADDI(nxPosPtr, C_IDX(1))); + } + } +} + void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc, MutableArrayRef reduc) { const LoopInfo &loopInfo = loopStack.back(); @@ -1448,17 +1529,25 @@ continue; } + Value forwarded = nullptr; if (loopInfo.trivialTidLvls.empty() && loopInfo.sliceDrivenInfo.size() == 1) { // Forwards the position iterator. operands.push_back(ADDI(posits[tid][lvl], one)); + forwarded = constantI1(builder, loc, true); } else { const Value pos = posits[tid][lvl]; const Value nxPos = ADDI(posits[tid][lvl], one); - Value cmp = CMPI(eq, coords[tid][lvl], iv); - operands.push_back(SELECT(cmp, nxPos, pos)); + forwarded = CMPI(eq, coords[tid][lvl], iv); + operands.push_back(SELECT(forwarded, nxPos, pos)); + } + { + OpBuilder::InsertionGuard guard(builder); + auto ifOp = builder.create(loc, TypeRange{}, forwarded, + /*else=*/false); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + forwardsReducedSliceLevelTreeIt(builder, loc, tid, lvl, one); } - // The coordinate is invalid now. coords[tid][lvl] = nullptr; @@ -1656,7 +1745,6 @@ } // Generates a loop nest that traverse all the unresolved levels in between. -// TODO: it can only handle all compressed tensors. // // for(int i = 0; i < slicePos.size(); i+=2) { // loopLo = slicePos[i]; @@ -1683,6 +1771,15 @@ OpBuilder::InsertPoint ip; SmallVector innerArgs(userReduc.begin(), userReduc.end()); scf::ForOp outerMost = nullptr; // the outtermost loop. + + // Wrap body builder and insert a extr counting instruction at the end. 
+ auto wrapped = [bodyBuilder](OpBuilder &builder, Location loc, Value iv, + MutableArrayRef reduc) { + bodyBuilder(builder, loc, iv, reduc.drop_back()); + // Increments the counter. + reduc.back() = ADDI(reduc.back(), C_IDX(1)); + }; + if (firstResLvl.has_value()) { // Overwrite position when the first level is fully resolved. pos = posits[firstResLvl->first][firstResLvl->second]; @@ -1692,13 +1789,18 @@ Level firstLvl = *frontSlice.slicedOnLvl; if (!lvlFullyResolved(tid, firstLvl)) { if (isCompressedDLT(lvlTypes[tid][firstLvl])) { + // An extra counter that tracks how many segments are there in the child + // compressed level. + innerArgs.push_back(c0); + // Overrides the user-provided builder. + bodyBuilder = wrapped; unsigned depth = frontSlice.depth - 1; Value offset = frontSlice.offset; Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth]; Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize outerMost = builder.create( - loc, c2, mSz, c2, innerArgs, - [this, c1, tid, firstLvl, offset, sPtrBuf, &ip, &pos, + loc, c2, mSz, C_IDX(kSliceIterWidth), innerArgs, + [this, c1, c2, tid, firstLvl, offset, sPtrBuf, &ip, &pos, &innerArgs](OpBuilder &builder, Location loc, Value iv, ValueRange iterArgs) { // generate traversal for each level. @@ -1715,6 +1817,9 @@ innerArgs.assign(reduc.begin(), reduc.end()); }) .second; + // Marks downs the pPtr for next level. 
+ builder.create(loc, itArgs.back(), sPtrBuf, + ADDI(iv, c2).getResult()); YIELD(itArgs); }); } else if (isDenseDLT(lvlTypes[tid][firstLvl])) { @@ -1855,8 +1960,7 @@ TensorId tid, Level lvl) { Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2); unsigned depth = levelReducedDep[tid][lvl]; - Value size = sliceSizes[tid][lvl][depth]; - // Dense slice begin is trivial + Value size = sliceSizes[tid][lvl][depth]; // Dense slice begin is trivial if (isDenseDLT(lvlTypes[tid][lvl])) { sliceStack[tid].emplace_back(c0, c0, constantI1(builder, loc, false), lvl, depth + 1); @@ -1902,9 +2006,8 @@ ValueRange result = genUnResolvedSliceTreeTraverse( builder, loc, tid, unResSlices, firstResLvl, reduc, - [this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc, - Value iv, - MutableArrayRef reduc) { + [this, c1, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc, Value iv, + MutableArrayRef reduc) { Value &nonEmpty = reduc[0]; Value &minCrd = reduc[1]; Value &curMemSz = reduc[2]; @@ -1942,8 +2045,8 @@ builder.create(loc, sPLo, sPtrBuf, curMemSz); Value nxtMemSize = ADDI(curMemSz, c1); builder.create(loc, sPHi, sPtrBuf, nxtMemSize); - // curMemSize += 2 - curMemSz = ADDI(curMemSz, c2); + // curMemSize += kSliceIterWidth + curMemSz = ADDI(curMemSz, C_IDX(kSliceIterWidth)); }); Value isNonEmpty = result[0]; @@ -1962,6 +2065,7 @@ Value c1 = C_IDX(1), c2 = C_IDX(2); if (depFullyReduced(tid, lvl)) { + // Do not need to prepare for slice driven loop on dense level after it is // fully reduced. if (isDenseDLT(lvlTypes[tid][lvl])) @@ -1969,8 +2073,9 @@ // If constraints on the tensor is fully resolved. We do not need to // generates slice begin any more, instead we fall back to TACO-based // algorithm to (co)iterates over the slice. 
+ // dumpIndexMemRef(builder, loc, slicePosBuffer[tid][lvl].back()); Value pLoPtr = - genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), c1); + loadSlicePosPtr(builder, loc, slicePosBuffer[tid][lvl].back()); pLoPtr = ADDI(pLoPtr, c2); Value pHiPtr = ADDI(pLoPtr, c1); posits[tid][lvl] = @@ -2022,10 +2127,10 @@ Value sz = *(sliceSizes[tid][lvl].rbegin() + depth - 1); bufSize = MULI(bufSize, sz); } - // For a pair of [pLo, pHi]. Note that we can not compress pHi because + // For a tuple of [pLo, pHi, N]. Note that we can not compress pHi because // slice creates segments in the index buffer so that the pHi for the // current level is no longer the pLo for the next level. - bufSize = MULI(bufSize, c2); + bufSize = MULI(bufSize, C_IDX(kSliceIterWidth)); // Additional two metadata {memSize, idx} at head. bufSize = ADDI(bufSize, c2); llvm::for_each( @@ -2049,8 +2154,7 @@ TensorId tid, Level lvl) { for (unsigned i = 0; i <= lvl; i++) { if (!isDenseDLT(lvlTypes[tid][i]) && !dependentLvlMap[tid][i].empty()) { - builder.create(loc, C_IDX(0), - slicePosBuffer[tid][i].back(), C_IDX(1)); + updateSlicePosPtr(builder, loc, slicePosBuffer[tid][i].back(), C_IDX(0)); } } } @@ -2103,7 +2207,7 @@ YIELD(reduc); // else /*minCrd == offset*/ { - // for (i = 0; i < slicePos.size(); i+=2) { + // for (i = 0; i < slicePos.size(); i+=kSliceIterWidth) { // if (crd[pos[slicePos[i]]] == minCrd) { // slicePos[i]++; // } @@ -2119,7 +2223,7 @@ reduc[1] = constantI1(builder, loc, false); // isNonEmpty auto loopArgs = static_cast(reduc).drop_back(); auto forOp = scf::buildLoopNest( - builder, loc, pSt, mSz, c2, loopArgs, + builder, loc, pSt, mSz, C_IDX(kSliceIterWidth), loopArgs, [this, tid, lvl, c1, sPtrBuf, &info](OpBuilder &builder, Location loc, ValueRange ivs, ValueRange iterArgs) -> scf::ValueVector { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir --- 
a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir @@ -2,7 +2,7 @@ // DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option} // DEFINE: %{run} = mlir-cpu-runner \ // DEFINE: -e entry -entry-point-result=void \ -// DEFINE: -shared-libs=%mlir_c_runner_utils | \ +// DEFINE: -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils | \ // DEFINE: FileCheck %s // // RUN: %{compile} | %{run} @@ -155,18 +155,18 @@ // FIXME: DCSR still wrong // // Should be the same as dense output - // C_HECK: ( ( 0, 0, -1, -6, -1, 6 ), - // C_HECK-SAME: ( -1, 0, 1, 0, 1, 0 ), - // C_HECK-SAME: ( 0, -1, 1, 0, 0, 0 ), - // C_HECK-SAME: ( -1, 0, 0, 0, 0, 0 ), - // C_HECK-SAME: ( 0, 0, 3, 6, -3, -6 ), - // C_HECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) // - // %all_sparse_DCSR = sparse_tensor.convert %2 - // : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> - // %v2 = vector.transfer_read %all_sparse_DCSR[%c0, %c0], %i0 - // : tensor<6x6xi32>, vector<6x6xi32> - // vector.print %v2 : vector<6x6xi32> + %all_sparse_DCSR = sparse_tensor.convert %2 + : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> + %v2 = vector.transfer_read %all_sparse_DCSR[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v2 : vector<6x6xi32> // // Should be the same as dense output diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir @@ -2,7 +2,7 @@ // DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option} // DEFINE: %{run} = mlir-cpu-runner \ // 
DEFINE: -e entry -entry-point-result=void \ -// DEFINE: -shared-libs=%mlir_c_runner_utils | \ +// DEFINE: -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils | \ // DEFINE: FileCheck %s // // RUN: %{compile} | %{run}