diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -1162,15 +1162,6 @@
 void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                               MutableArrayRef<Value> reduc) {
   const LoopInfo &loopInfo = loopStack.back();
-  rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
-  if (!loopInfo.userCodeBlock->empty() &&
-      llvm::isa<scf::ForOp>(loopInfo.loop) &&
-      llvm::isa<scf::YieldOp>(&loopInfo.userCodeBlock->back())) {
-    // scf::For inserts an implicit yield op when there is no loop iter args. In
-    // this case, we need to insert the code before the yield.
-    assert(reduc.empty());
-    rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back());
-  }
   for (auto [tid, lvl, reduced] : loopInfo.sliceDrivenInfo) {
     SliceInfo &info = sliceStack[tid].back();
     assert(isDenseDLT(lvlTypes[tid][lvl]));
@@ -1262,7 +1253,6 @@
                                 MutableArrayRef<Value> reduc) {
   const LoopInfo &loopInfo = loopStack.back();
   auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
-  builder.setInsertionPointToEnd(loopInfo.userCodeBlock);
   Value iv = loopInfo.iv;
 
   // Finalize the induction. Note that the induction could be performed
@@ -1361,7 +1351,9 @@
   }
 
   assert(o == operands.size() + delta);
-  YIELD(operands);
+  if (!operands.empty())
+    YIELD(operands);
+
   builder.setInsertionPointAfter(whileOp);
 }
 
@@ -1370,7 +1362,17 @@
   // Clean up the values, it would help use to discover potential bug at a
   // earlier stage (instead of silently using a wrong value).
   const LoopInfo &loopInfo = loopStack.back();
-  SmallVector<Value> red;
+
+  // Sets the insertion point to the end of the loop's user code block.
+  rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
+  if (!loopInfo.userCodeBlock->empty() &&
+      llvm::isa<scf::YieldOp>(&loopInfo.userCodeBlock->back())) {
+    // scf::While/For inserts an implicit yield op when there are no loop
+    // iter args. In this case, we need to insert the code before the yield.
+    assert(loopInfo.userCodeBlock->back().getNumResults() == 0);
+    rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back());
+  }
+
   if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
     exitWhileLoop(rewriter, loc, reduc);
   } else {
@@ -1449,7 +1451,8 @@
         Value cont = CMPI(ult, coord, sliceHi);
         TypeRange types = args.drop_front(2).getTypes();
 
-        auto ifOp = builder.create<scf::IfOp>(loc, types, cont, true);
+        auto ifOp = builder.create<scf::IfOp>(loc, types, cont,
+                                              /*withElseBlock=*/!types.empty());
         {
           // 2 reduction variable maintained by us.
           SmallVector<Value> ifRet = args.drop_front(2);
@@ -1457,8 +1460,10 @@
 
           OpBuilder::InsertionGuard guard(builder);
           // If coord >= sliceHi.
-          builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
-          YIELD(ifRet);
+          if (!ifRet.empty()) {
+            builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+            YIELD(ifRet);
+          }
 
           // If coord < sliceHi.
           builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -699,14 +699,18 @@
     const AffineExpr fa = map.getResult(toOrigDim(enc, lvl - 1));
     const AffineExpr ta = map.getResult(toOrigDim(enc, lvl));
 
-    if (auto fdim = fa.dyn_cast<AffineDimExpr>()) {
+    if (fa.isa<AffineDimExpr>() || ta.isa<AffineDimExpr>()) {
+      AffineDimCollector fCollector;
+      fCollector.walkPostOrder(fa);
+
       AffineDimCollector tCollector;
       tCollector.walkPostOrder(ta);
-
-      const LoopId f = env.makeLoopId(fdim.getPosition());
-      for (auto td : tCollector.dims) {
-        const LoopId t = env.makeLoopId(td.getPosition());
-        addIterOrdering(f, t, adjM, inDegree);
+      for (auto fd : fCollector.dims) {
+        for (auto td : tCollector.dims) {
+          const LoopId f = env.makeLoopId(fd.getPosition());
+          const LoopId t = env.makeLoopId(td.getPosition());
+          addIterOrdering(f, t, adjM, inDegree);
+        }
       }
       continue;
     }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir
@@ -0,0 +1,78 @@
+// DEFINE: %{option} = "enable-runtime-library=false enable-index-reduction=true"
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:  -e entry -entry-point-result=void  \
+// DEFINE:  -shared-libs=%mlir_c_runner_utils | \
+// DEFINE: FileCheck %s
+//
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with the runtime library enabled.
+// REDEFINE: %{option} = "enable-runtime-library=true enable-buffer-initialization=true enable-index-reduction=true"
+// RUN: %{compile} | %{run}
+
+#CCCC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ], posWidth = 32, crdWidth = 32 }>
+
+func.func @pooling_nhwc_sum_CCCC(%input: tensor<1x4x4x1xf32, #CCCC>, %filter: tensor<2x2xf32>) -> tensor<1x3x3x1xf32, #CCCC> {
+  %init = bufferization.alloc_tensor() : tensor<1x3x3x1xf32, #CCCC>
+  %0 = linalg.pooling_nhwc_sum {dilations = dense<1> : tensor<2xi64>,
+                                strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter: tensor<1x4x4x1xf32, #CCCC>, tensor<2x2xf32>)
+    outs (%init: tensor<1x3x3x1xf32, #CCCC>) -> tensor<1x3x3x1xf32, #CCCC>
+  return %0 : tensor<1x3x3x1xf32, #CCCC>
+}
+
+func.func @pooling_nhwc_sum(%input: tensor<1x4x4x1xf32>, %filter: tensor<2x2xf32>) -> tensor<1x3x3x1xf32> {
+  %init = arith.constant dense<[[ [[0.0], [0.0], [0.0]],
+                                  [[0.0], [0.0], [0.0]],
+                                  [[0.0], [0.0], [0.0]] ]]> : tensor<1x3x3x1xf32>
+  %0 = linalg.pooling_nhwc_sum {dilations = dense<1> : tensor<2xi64>,
+                                strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter: tensor<1x4x4x1xf32>, tensor<2x2xf32>)
+    outs (%init: tensor<1x3x3x1xf32>) -> tensor<1x3x3x1xf32>
+  return %0 : tensor<1x3x3x1xf32>
+}
+
+
+func.func @entry() {
+  %c0 = arith.constant 0 : index
+  %zero = arith.constant 0.00000e+00 : f32
+
+  %filter = arith.constant dense<
+     [[  1.0,  0.0],
+      [  0.0,  1.0]]
+  > : tensor<2x2xf32>
+
+  %in_dense = arith.constant dense<
+     [[[[1.0],  [2.0],  [1.0],  [2.0]],
+       [[1.0],  [2.0],  [1.0],  [2.0]],
+       [[1.0],  [2.0],  [1.0],  [2.0]],
+       [[1.0],  [2.0],  [1.0],  [2.0]]]]
+  > : tensor<1x4x4x1xf32>
+
+  %in_CCCC = sparse_tensor.convert %in_dense : tensor<1x4x4x1xf32> to tensor<1x4x4x1xf32, #CCCC>
+
+  %dense_ret = call @pooling_nhwc_sum(%in_dense, %filter) : (tensor<1x4x4x1xf32>, tensor<2x2xf32>) -> tensor<1x3x3x1xf32>
+  %CCCC_ret = call @pooling_nhwc_sum_CCCC(%in_CCCC, %filter) : (tensor<1x4x4x1xf32, #CCCC>, tensor<2x2xf32>) -> tensor<1x3x3x1xf32, #CCCC>
+
+  // CHECK: ( ( ( ( 6 ), ( 6 ), ( 6 ) ), ( ( 6 ), ( 6 ), ( 6 ) ), ( ( 6 ), ( 6 ), ( 6 ) ) ) )
+  %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+      : tensor<1x3x3x1xf32>, vector<1x3x3x1xf32>
+  vector.print %dense_v : vector<1x3x3x1xf32>
+
+  //
+  // Sparse pooling should have the same output.
+  //
+
+  // CHECK-NEXT: ( ( ( ( 6 ), ( 6 ), ( 6 ) ), ( ( 6 ), ( 6 ), ( 6 ) ), ( ( 6 ), ( 6 ), ( 6 ) ) ) )
+  %s1 = sparse_tensor.convert %CCCC_ret : tensor<1x3x3x1xf32, #CCCC> to tensor<1x3x3x1xf32>
+  %v1 = vector.transfer_read %s1[%c0, %c0, %c0, %c0], %zero
+      : tensor<1x3x3x1xf32>, vector<1x3x3x1xf32>
+  vector.print %v1 : vector<1x3x3x1xf32>
+
+  // Releases resources.
+  bufferization.dealloc_tensor %in_CCCC : tensor<1x4x4x1xf32, #CCCC>
+  bufferization.dealloc_tensor %CCCC_ret : tensor<1x3x3x1xf32, #CCCC>
+  bufferization.dealloc_tensor %dense_ret : tensor<1x3x3x1xf32>
+  return
+}