diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -60,7 +60,8 @@
   /// Constructor: take an array of tensors inputs, on which the generated loops
   /// will iterate on. The index of the tensor in the array is also the
   /// tensor id (tid) used in related functions.
-  explicit SparseTensorLoopEmitter(ValueRange tensors);
+  explicit SparseTensorLoopEmitter(ValueRange tensors,
+                                   bool isLastOutput = false);
 
   ///
   /// Core functions.
@@ -140,6 +141,7 @@
   std::vector<std::vector<Value>> idxBuffer; // to_indices
   std::vector<Value> valBuffer;              // to_value
+  bool isLastOutput; // Whether the last tensor is the output tensor.
 
   std::vector<Operation *> loopStack;
   // TODO: not yet used, it should track the current level for each tensor
   // to help eliminate `dim` paramters from above APIs.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -44,15 +44,19 @@
 // Sparse tensor loop emitter class implementations
 //===----------------------------------------------------------------------===//
 
-SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors)
+SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors,
+                                                 bool isLastOutput)
     : tensors(tensors.begin(), tensors.end()), dims(tensors.size()),
       pidxs(tensors.size()), coord(tensors.size()), highs(tensors.size()),
       sizes(tensors.size()), ptrBuffer(tensors.size()),
-      idxBuffer(tensors.size()), valBuffer(tensors.size()), loopStack(),
-      curLv(tensors.size(), 0) {
+      idxBuffer(tensors.size()), valBuffer(tensors.size()),
+      isLastOutput(isLastOutput), loopStack(), curLv(tensors.size(), 0) {
   for (size_t i = 0, e = tensors.size(); i < e; i++) {
     auto t = tensors[i];
-    auto rtp = t.getType().cast<RankedTensorType>();
+    auto rtp = t.getType().dyn_cast<RankedTensorType>();
+    if (!rtp) // a scalar (0-dimensional tensor)
+      continue;
+
     auto rank = static_cast<size_t>(rtp.getRank());
     auto enc = getSparseTensorEncoding(rtp);
     if (enc)
@@ -100,7 +104,14 @@
         ptrBuffer[t][d] = builder.create<ToPointersOp>(loc, ptrTp, tensor, dim);
         idxBuffer[t][d] = builder.create<ToIndicesOp>(loc, indTp, tensor, dim);
       } else if (isSingletonDim(dims[t][d])) {
-        llvm_unreachable("TODO: not implemented yet");
+        // Singleton dimension, fetch indices.
+        auto indTp =
+            MemRefType::get(dynShape, getIndexOverheadType(builder, enc));
+        auto dim = builder.getIndexAttr(d);
+        idxBuffer[t][d] = builder.create<ToIndicesOp>(loc, indTp, tensor, dim);
+      } else {
+        // Dense dimension, nothing to fetch.
+        assert(isDenseDim(dims[t][d]));
       }
 
       // Find upper bound in current dimension.
@@ -116,9 +127,11 @@
     if (!enc) {
       // Non-annotated dense tensors.
       auto denseTp = MemRefType::get(shape, elementType);
-      // This is not the output tensor
-      valBuffer[t] =
-          builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
+      if (isLastOutput && t == tensors.size() - 1)
+        llvm_unreachable("TODO: not yet handled");
+      else
+        valBuffer[t] =
+            builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
     } else {
       // Annotated sparse tensors.
       auto dynShape = {ShapedType::kDynamicSize};
@@ -137,10 +150,12 @@
   // We can not re-enter the same level.
   assert(!coord[tid][dim]);
   Value step = constantIndex(builder, loc, 1);
-  bool isCompressed = isCompressedDim(dims[tid][dim]);
-  assert(isDenseDim(dims[tid][dim]) || isCompressedDim(dims[tid][dim]));
+  auto dimType = dims[tid][dim];
+  bool isSparse = isCompressedDim(dimType) || isSingletonDim(dimType);
+  assert(isDenseDim(dimType) || isCompressedDim(dimType) ||
+         isSingletonDim(dimType));
 
-  Value lo = isCompressed ? pidxs[tid][dim] : constantIndex(builder, loc, 0);
+  Value lo = isSparse ? pidxs[tid][dim] : constantIndex(builder, loc, 0);
   Value hi = highs[tid][dim];
 
   // TODO: support reduction.
@@ -153,7 +168,7 @@
   Operation *loop = forOp;
 
   assert(iv);
-  if (isCompressed) {
+  if (isSparse) {
     pidxs[tid][dim] = iv;
     // Generating a load on the indices array yields the coordinate.
     Value ptr = idxBuffer[tid][dim];
@@ -191,26 +206,33 @@
   // TODO: generate loop iteration on output tensor based on the shape
   // instead of pointer/indices arrays.
   assert(dims[tid].size() > dim);
+  auto dimType = dims[tid][dim];
 
-  if (isDenseDim(dims[tid][dim]))
+  if (isDenseDim(dimType))
     return false;
 
   // Either the first dimension, or the previous dimension has been set.
   assert(dim == 0 || pidxs[tid][dim - 1]);
-  if (isCompressedDim(dims[tid][dim])) {
+  Value c0 = constantIndex(builder, loc, 0);
+  Value c1 = constantIndex(builder, loc, 1);
+  if (isCompressedDim(dimType)) {
     Value ptr = ptrBuffer[tid][dim];
-    Value c1 = constantIndex(builder, loc, 1);
-    Value pLo = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1];
+
+    Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
     Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
 
     pidxs[tid][dim] = genIndexLoad(builder, loc, ptr, pLo);
     highs[tid][dim] = genIndexLoad(builder, loc, ptr, pHi);
-
     return true;
   }
+  if (isSingletonDim(dimType)) {
+    Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
+    Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
 
-  if (isSingletonDim(dims[tid][dim]))
-    llvm_unreachable("TODO: not implemented yet");
+    pidxs[tid][dim] = pLo;
+    highs[tid][dim] = pHi;
+    return true;
+  }
 
   llvm_unreachable("Unrecognizable dimesion type!");
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir
@@ -16,6 +16,15 @@
   dimOrdering = affine_map<(i,j) -> (j,i)>
 }>
 
+#SortedCOO = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed-nu", "singleton" ]
+}>
+
+#SortedCOOPerm = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed-nu", "singleton" ],
+  dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
+
 module {
 
   /// uses foreach operator to print coords and values.
@@ -49,6 +58,26 @@
     return
   }
 
+  func.func @foreach_print_4(%arg0: tensor<2x2xf64, #SortedCOO>) {
+    sparse_tensor.foreach in %arg0 : tensor<2x2xf64, #SortedCOO> do {
+      ^bb0(%1: index, %2: index, %v: f64) :
+        vector.print %1: index
+        vector.print %2: index
+        vector.print %v: f64
+    }
+    return
+  }
+
+  func.func @foreach_print_5(%arg0: tensor<2x2xf64, #SortedCOOPerm>) {
+    sparse_tensor.foreach in %arg0 : tensor<2x2xf64, #SortedCOOPerm> do {
+      ^bb0(%1: index, %2: index, %v: f64) :
+        vector.print %1: index
+        vector.print %2: index
+        vector.print %v: f64
+    }
+    return
+  }
+
   //
   // Main driver.
   //
@@ -67,6 +96,8 @@
     %s1 = sparse_tensor.convert %src : tensor<2x2xf64> to tensor<2x2xf64, #Row>
     %s2 = sparse_tensor.convert %src : tensor<2x2xf64> to tensor<2x2xf64, #CSR>
     %s3 = sparse_tensor.convert %src : tensor<2x2xf64> to tensor<2x2xf64, #DCSC>
+    %s4 = sparse_tensor.convert %src : tensor<2x2xf64> to tensor<2x2xf64, #SortedCOO>
+    %s5 = sparse_tensor.convert %src : tensor<2x2xf64> to tensor<2x2xf64, #SortedCOOPerm>
     // CHECK: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -106,10 +137,38 @@
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_3(%s3) : (tensor<2x2xf64, #DCSC>) -> ()
+    // CHECK-NEXT: 0
+    // CHECK-NEXT: 0
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 0
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 2
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 0
+    // CHECK-NEXT: 5
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 6
+    call @foreach_print_4(%s4) : (tensor<2x2xf64, #SortedCOO>) -> ()
+    // CHECK-NEXT: 0
+    // CHECK-NEXT: 0
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 0
+    // CHECK-NEXT: 5
+    // CHECK-NEXT: 0
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 2
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 6
+    call @foreach_print_5(%s5) : (tensor<2x2xf64, #SortedCOOPerm>) -> ()
 
     bufferization.dealloc_tensor %s1 : tensor<2x2xf64, #Row>
     bufferization.dealloc_tensor %s2 : tensor<2x2xf64, #CSR>
    bufferization.dealloc_tensor %s3 : tensor<2x2xf64, #DCSC>
+    bufferization.dealloc_tensor %s4 : tensor<2x2xf64, #SortedCOO>
+    bufferization.dealloc_tensor %s5 : tensor<2x2xf64, #SortedCOOPerm>
 
     return
   }
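
Usage sketch (illustrative only, not part of the patch): since the new parameter defaults to false, existing call sites that construct the emitter from a plain ValueRange keep their current behavior, and a lowering pattern opts in explicitly when the trailing tensor should be treated as the output. The `op` below stands for an assumed surrounding operation and is not defined by this change.

// Hypothetical call site: hand all tensor operands to the emitter and mark
// the last one as the output; only the constructor signature comes from the
// patch, everything else here is assumed.
SparseTensorLoopEmitter loopEmitter(op->getOperands(), /*isLastOutput=*/true);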