diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -515,6 +515,15 @@
   return env.topSortSize() == numLoops;
 }
 
+static void addIterOrdering(LoopId f, LoopId t,
+                            std::vector<std::vector<bool>> &adjM,
+                            std::vector<unsigned> &inDegree) {
+  if (!adjM[f][t] && f != t) {
+    adjM[f][t] = true;
+    inDegree[t]++;
+  }
+}
+
 /// Helper method to add all constraints from the indices in one affine
 /// expression before all indices in the other affine expression. For
 /// example i0+i1 < i2+i3+1 yields i0<i2, i0<i3, i1<i2, and i1<i3.
@@ -679,3 +688,15 @@
+      if (auto fdim = fa.dyn_cast<AffineDimExpr>()) {
+        AffineDimCollector tCollector;
+        tCollector.walkPostOrder(ta);
+
+        const LoopId f = env.makeLoopId(fdim.getPosition());
+        for (auto td : tCollector.dims) {
+          const LoopId t = env.makeLoopId(td.getPosition());
+          addIterOrdering(f, t, adjM, inDegree);
+        }
+        continue;
+      }
+
       // This is a heuristic, we pick an arbitrary reduction loop from lhs and
       // rhs and use them as d_x and d_y.
       finder.walkPostOrder(fa);
@@ -704,10 +722,7 @@
       const LoopId tldx = env.makeLoopId(texp.getPosition());
 
       // d_x > d_y
-      if (!adjM[fldx][tldx]) {
-        adjM[fldx][tldx] = true;
-        inDegree[tldx]++;
-      }
+      addIterOrdering(fldx, tldx, adjM, inDegree);
 
       AffineDimCollector fCollector;
       fCollector.walkPostOrder(fa);
@@ -717,21 +732,11 @@
       // make sure d_x and d_y are the last;
       for (auto fd : fCollector.dims) {
         const LoopId f = env.makeLoopId(fd.getPosition());
-        if (f == fldx)
-          continue;
-        if (!adjM[f][fldx]) {
-          adjM[f][fldx] = true;
-          inDegree[fldx]++;
-        }
+        addIterOrdering(f, fldx, adjM, inDegree);
       }
       for (auto td : tCollector.dims) {
         const LoopId t = env.makeLoopId(td.getPosition());
-        if (t == tldx)
-          continue;
-        if (!adjM[t][tldx]) {
-          adjM[t][tldx] = true;
-          inDegree[tldx]++;
-        }
+        addIterOrdering(t, tldx, adjM, inDegree);
       }
       // Since we only support affine addition, the order between two dim
       // expressions does not really matter.
@@ -746,15 +751,11 @@
         const LoopId f = env.makeLoopId(fd.getPosition());
         if (f == fldx) // skip d_x
           continue;
-
         for (auto td : tCollector.dims) {
           const LoopId t = env.makeLoopId(td.getPosition());
           if (t == tldx) // skip d_y
             continue;
-          if (!adjM[f][t]) {
-            adjM[f][t] = true;
-            inDegree[t]++;
-          }
+          addIterOrdering(f, t, adjM, inDegree);
         }
       }
     }
@@ -796,7 +797,7 @@
     if (isCompressedDLT(dltI) || isCompressedWithHiDLT(dltI) ||
         isSingletonDLT(dltI)) {
       for (LoopId j = 0; j < numLoops; j++)
-        if (isUndefDLT(env.dlt(tid, j))) {
+        if (isUndefDLT(env.dlt(tid, j)) && !adjM[i][j]) {
           adjM[i][j] = true;
           inDegree[j]++;
         }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
new file
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
@@ -0,0 +1,178 @@
+// DEFINE: %{option} = "enable-runtime-library=true enable-index-reduction=true"
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:  -e entry -entry-point-result=void \
+// DEFINE:  -shared-libs=%mlir_c_runner_utils | \
+// DEFINE: FileCheck %s
+//
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true"
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true" +// RUN: %{compile} | %{run} + +// Do the same run, but now with direct IR generation and, if available, VLA +// vectorization. +// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true" +// REDEFINE: %{run} = %lli_host_or_aarch64_cmd \ +// REDEFINE: --entry-function=entry_lli \ +// REDEFINE: --extra-module=%S/Inputs/main_for_lli.ll \ +// REDEFINE: %VLA_ARCH_ATTR_OPTIONS \ +// REDEFINE: --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \ +// REDEFINE: FileCheck %s +// RUN: %{compile} | mlir-translate -mlir-to-llvmir | %{run} + + +// TODO: we can only support dense output for nchw input because 'c' is a reduction loop + + +#CCCD = #sparse_tensor.encoding<{ + lvlTypes = [ "dense", "dense", "dense", "compressed" ] +}> + + +#CCCC = #sparse_tensor.encoding<{ + lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ] +}> + +// FIXME: CDCD encoding crashes! + +// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f +func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nchw_fchw(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nchw_fchw_CCCD(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nchw_fchw_CCCC(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %filter2D_nhwc = call @alloc_4d_filled_f32(%c1, %c3, %c3, %c3, %val) :(index, index, index, index, f32) -> (tensor) + %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c3, %c8, %c8, %val) : (index, index, index, index, f32) -> (tensor) + %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c0, %c0, %c3] : tensor + %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) + %out2D_nhwc_CCCD = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) + %out2D_nhwc_CCCC = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) + + %in2D_nhwc_CCCD = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + + %dense_ret = call 
+  %dense_ret = call @conv_2d_nchw_fchw(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+  %CCCD_ret = call @conv_2d_nchw_fchw_CCCD(%in2D_nhwc_CCCD, %filter2D_nhwc, %out2D_nhwc_CCCD) : (tensor<?x?x?x?xf32, #CCCD>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+  %CCCC_ret = call @conv_2d_nchw_fchw_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc_CCCC) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+
+  // CHECK:      ( ( ( ( 108, 124, 124, 124, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ) )
+  %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+    : tensor<?x?x?x?xf32>, vector<3x1x6x6xf32>
+  vector.print %dense_v : vector<3x1x6x6xf32>
+
+  // CHECK:      ( ( ( ( 108, 124, 124, 124, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ) )
+  %v1 = vector.transfer_read %CCCD_ret[%c0, %c0, %c0, %c0], %zero
+    : tensor<?x?x?x?xf32>, vector<3x1x6x6xf32>
+  vector.print %v1 : vector<3x1x6x6xf32>
+
+  // CHECK:      ( ( ( ( 108, 124, 124, 124, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ) )
+  %v2 = vector.transfer_read %CCCC_ret[%c0, %c0, %c0, %c0], %zero
tensor, vector<3x1x6x6xf32> + vector.print %v2 : vector<3x1x6x6xf32> + + // Free the resources + bufferization.dealloc_tensor %in2D_nhwc : tensor + bufferization.dealloc_tensor %filter2D_nhwc : tensor + bufferization.dealloc_tensor %out2D_nhwc : tensor + bufferization.dealloc_tensor %out2D_nhwc_CCCD : tensor + bufferization.dealloc_tensor %out2D_nhwc_CCCC : tensor + + bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor + bufferization.dealloc_tensor %in2D_nhwc_CCCD : tensor + return +}