diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -1166,10 +1166,15 @@
     Type idxType = rewriter.getIndexType();
     // All initialization should be done on entry of the loop nest.
     rewriter.setInsertionPointAfter(op.getTensor().getDefiningOp());
-    // Determine the size for access expansion.
+    // Determine the size for access expansion (always the innermost stored
+    // dimension size, but we need to translate it back to the original
+    // dimension since the dim size utility applies dimension ordering).
     auto enc = getSparseTensorEncoding(srcType);
     Value src = adaptor.getOperands()[0];
-    Value sz = genDimSizeCall(rewriter, loc, enc, src, srcType.getRank() - 1);
+    unsigned innerDim = srcType.getRank() - 1;
+    if (AffineMap p = enc.getDimOrdering())
+      innerDim = p.getDimPosition(innerDim);
+    Value sz = genDimSizeCall(rewriter, loc, enc, src, innerDim);
     // Allocate temporary buffers for values, filled-switch, and indices.
     // We do not use stack buffers for this, since the expanded size may
     // be rather large (as it envelops a single expanded dense dimension).
diff --git a/mlir/test/Dialect/SparseTensor/conversion.mlir b/mlir/test/Dialect/SparseTensor/conversion.mlir
--- a/mlir/test/Dialect/SparseTensor/conversion.mlir
+++ b/mlir/test/Dialect/SparseTensor/conversion.mlir
@@ -494,7 +494,9 @@
 }
 
 // CHECK-LABEL: func @sparse_expansion()
-// CHECK: %[[S:.*]] = call @sparseDimSize
+// CHECK-DAG: %[[C:.*]] = arith.constant 1 : index
+// CHECK: %[[N:.*]] = call @newSparseTensor
+// CHECK: %[[S:.*]] = call @sparseDimSize(%[[N]], %[[C]]) : (!llvm.ptr<i8>, index) -> index
 // CHECK: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
 // CHECK: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
 // CHECK: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
@@ -502,9 +504,9 @@
 // CHECK-DAG: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
 // CHECK: return %[[C]] : memref<?xindex>
 func.func @sparse_expansion() -> memref<?xindex> {
-  %0 = bufferization.alloc_tensor() : tensor<8x8xf64, #SparseMatrix>
+  %0 = bufferization.alloc_tensor() : tensor<4x8xf64, #SparseMatrix>
   %values, %filled, %added, %count = sparse_tensor.expand %0
-    : tensor<8x8xf64, #SparseMatrix> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+    : tensor<4x8xf64, #SparseMatrix> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
   return %added : memref<?xindex>
 }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
@@ -1,8 +1,21 @@
-// RUN: mlir-opt %s -sparsification | \
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN:             --linalg-fuse-elementwise-ops \
+// RUN:             --sparsification | \
 // RUN:   FileCheck %s --check-prefix=CHECK-SPARSE
-// RUN: mlir-opt %s -sparsification -sparse-tensor-conversion | \
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN:             --linalg-fuse-elementwise-ops \
+// RUN:             --sparsification --sparse-tensor-conversion --cse | \
 // RUN:   FileCheck %s --check-prefix=CHECK-CONVERT
 
+#CSR = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed" ]
+}>
+
+#CSC = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed" ],
+  dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
+
 #DCSC = #sparse_tensor.encoding<{
   dimLevelType = [ "compressed", "compressed" ],
   dimOrdering = affine_map<(i,j) -> (j,i)>
@@ -24,22 +37,28 @@
 //
 // CHECK-SPARSE-LABEL: func @kernel(
 // CHECK-SPARSE: %[[A:.*]], %[[B:.*]], %[[C:.*]], %{{.*}} = sparse_tensor.expand
-// CHECK-SPARSE: scf.for
-// CHECK-SPARSE: scf.for
+// CHECK-SPARSE: scf.for {{.*}} {
+// CHECK-SPARSE: scf.for {{.*}} {
+// CHECK-SPARSE: }
+// CHECK-SPARSE: }
 // CHECK-SPARSE: sparse_tensor.compress %{{.*}}, %{{.*}}, %[[A]], %[[B]], %[[C]]
 // CHECK-SPARSE: %[[RET:.*]] = sparse_tensor.load %{{.*}} hasInserts
 // CHECK-SPARSE: return %[[RET]]
 //
 // CHECK-CONVERT-LABEL: func @kernel(
+// CHECK-CONVERT: %[[C:.*]] = arith.constant 0 : index
 // CHECK-CONVERT: %{{.*}} = call @sparseDimSize
-// CHECK-CONVERT: %[[S:.*]] = call @sparseDimSize
+// CHECK-CONVERT: %[[N:.*]] = call @newSparseTensor
+// CHECK-CONVERT: %[[S:.*]] = call @sparseDimSize(%[[N]], %[[C]])
 // CHECK-CONVERT: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
 // CHECK-CONVERT: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
 // CHECK-CONVERT: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
 // CHECK-CONVERT: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
 // CHECK-CONVERT: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
-// CHECK-CONVERT: scf.for
-// CHECK-CONVERT: scf.for
+// CHECK-CONVERT: scf.for {{.*}} {
+// CHECK-CONVERT: scf.for {{.*}} {
+// CHECK-CONVERT: }
+// CHECK-CONVERT: }
 // CHECK-CONVERT: call @expInsertF64
 // CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
 // CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
@@ -59,3 +78,99 @@
   } -> tensor<?xf64, #SV>
   return %0 : tensor<?xf64, #SV>
 }
+
+//
+// CHECK-SPARSE-LABEL: func @matmul1(
+// CHECK-SPARSE-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-SPARSE-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-SPARSE-DAG: %[[C8:.*]] = arith.constant 8 : index
+// CHECK-SPARSE: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] {
+// CHECK-SPARSE: %[[A:.*]], %[[B:.*]], %[[C:.*]], %{{.*}} = sparse_tensor.expand
+// CHECK-SPARSE: scf.for {{.*}} {
+// CHECK-SPARSE: scf.for {{.*}} {
+// CHECK-SPARSE: }
+// CHECK-SPARSE: }
+// CHECK-SPARSE: sparse_tensor.compress %{{.*}}, %{{.*}}, %[[A]], %[[B]], %[[C]]
+// CHECK-SPARSE: }
+// CHECK-SPARSE: %[[RET:.*]] = sparse_tensor.load %{{.*}} hasInserts
+// CHECK-SPARSE: return %[[RET]]
+//
+// CHECK-CONVERT-LABEL: func @matmul1(
+// CHECK-CONVERT-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-CONVERT-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-CONVERT-DAG: %[[C8:.*]] = arith.constant 8 : index
+// CHECK-CONVERT: %[[N:.*]] = call @newSparseTensor
+// CHECK-CONVERT: %[[S:.*]] = call @sparseDimSize(%[[N]], %[[C1]])
+// CHECK-CONVERT: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
+// CHECK-CONVERT: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
+// CHECK-CONVERT: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
+// CHECK-CONVERT: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
+// CHECK-CONVERT: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
+// CHECK-CONVERT: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] {
+// CHECK-CONVERT: scf.for {{.*}} {
+// CHECK-CONVERT: scf.for {{.*}} {
+// CHECK-CONVERT: }
+// CHECK-CONVERT: }
+// CHECK-CONVERT: call @expInsertF64
+// CHECK-CONVERT: }
+// CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
+// CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
+// CHECK-CONVERT: memref.dealloc %[[C]] : memref<?xindex>
+// CHECK-CONVERT: call @endInsert
+//
+func.func @matmul1(%A: tensor<8x2xf64, #CSR>,
+                   %B: tensor<2x4xf64, #CSR>) -> tensor<8x4xf64, #CSR> {
+  %C = bufferization.alloc_tensor() : tensor<8x4xf64, #CSR>
+  %D = linalg.matmul
+    ins(%A, %B: tensor<8x2xf64, #CSR>, tensor<2x4xf64, #CSR>)
+    outs(%C: tensor<8x4xf64, #CSR>) -> tensor<8x4xf64, #CSR>
+  return %D: tensor<8x4xf64, #CSR>
+}
+
+//
+// CHECK-SPARSE-LABEL: func @matmul2(
+// CHECK-SPARSE-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-SPARSE-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-SPARSE-DAG: %[[C4:.*]] = arith.constant 4 : index
+// CHECK-SPARSE: scf.for %{{.*}} = %[[C0]] to %[[C4]] step %[[C1]] {
+// CHECK-SPARSE: %[[A:.*]], %[[B:.*]], %[[C:.*]], %{{.*}} = sparse_tensor.expand
+// CHECK-SPARSE: scf.for {{.*}} {
+// CHECK-SPARSE: scf.for {{.*}} {
+// CHECK-SPARSE: }
+// CHECK-SPARSE: }
+// CHECK-SPARSE: sparse_tensor.compress %{{.*}}, %{{.*}}, %[[A]], %[[B]], %[[C]]
+// CHECK-SPARSE: }
+// CHECK-SPARSE: %[[RET:.*]] = sparse_tensor.load %{{.*}} hasInserts
+// CHECK-SPARSE: return %[[RET]]
+//
+// CHECK-CONVERT-LABEL: func @matmul2(
+// CHECK-CONVERT-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-CONVERT-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-CONVERT-DAG: %[[C4:.*]] = arith.constant 4 : index
+// CHECK-CONVERT: %[[N:.*]] = call @newSparseTensor
+// CHECK-CONVERT: %[[S:.*]] = call @sparseDimSize(%[[N]], %[[C1]])
+// CHECK-CONVERT: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
+// CHECK-CONVERT: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
+// CHECK-CONVERT: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
+// CHECK-CONVERT: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
+// CHECK-CONVERT: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
+// CHECK-CONVERT: scf.for %{{.*}} = %[[C0]] to %[[C4]] step %[[C1]] {
+// CHECK-CONVERT: scf.for {{.*}} {
+// CHECK-CONVERT: scf.for {{.*}} {
+// CHECK-CONVERT: }
+// CHECK-CONVERT: }
+// CHECK-CONVERT: call @expInsertF64
+// CHECK-CONVERT: }
+// CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
+// CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
+// CHECK-CONVERT: memref.dealloc %[[C]] : memref<?xindex>
+// CHECK-CONVERT: call @endInsert
+//
+func.func @matmul2(%A: tensor<8x2xf64, #CSC>,
+                   %B: tensor<2x4xf64, #CSC>) -> tensor<8x4xf64, #CSC> {
+  %C = bufferization.alloc_tensor() : tensor<8x4xf64, #CSC>
+  %D = linalg.matmul
+    ins(%A, %B: tensor<8x2xf64, #CSC>, tensor<2x4xf64, #CSC>)
+    outs(%C: tensor<8x4xf64, #CSC>) -> tensor<8x4xf64, #CSC>
+  return %D: tensor<8x4xf64, #CSC>
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir
@@ -0,0 +1,79 @@
+// RUN: mlir-opt %s --sparse-compiler | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
+// RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+#CSC = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed" ],
+  dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
+
+module {
+  //
+  // Column-wise storage forces the ijk loop to permute into jki,
+  // so that access pattern expansion (workspace) needs to be
+  // done along the dimension with size 8.
+  //
+  func.func @matmul(%A: tensor<8x2xf64, #CSC>,
+                    %B: tensor<2x4xf64, #CSC>) -> tensor<8x4xf64, #CSC> {
+    %C = bufferization.alloc_tensor() : tensor<8x4xf64, #CSC>
+    %D = linalg.matmul
+      ins(%A, %B: tensor<8x2xf64, #CSC>, tensor<2x4xf64, #CSC>)
+      outs(%C: tensor<8x4xf64, #CSC>) -> tensor<8x4xf64, #CSC>
+    return %D: tensor<8x4xf64, #CSC>
+  }
+
+  //
+  // Main driver.
+  //
+  func.func @entry() {
+    %c0 = arith.constant 0 : index
+    %d1 = arith.constant -1.0 : f64
+
+    // Initialize various dense matrices for stress testing.
+    %da = arith.constant dense<[
+        [ 1.1, 2.1 ],
+        [ 1.2, 2.2 ],
+        [ 1.3, 2.3 ],
+        [ 1.4, 2.4 ],
+        [ 1.5, 2.5 ],
+        [ 1.6, 2.6 ],
+        [ 1.7, 2.7 ],
+        [ 1.8, 2.8 ]
+    ]> : tensor<8x2xf64>
+    %db = arith.constant dense<[
+        [ 10.1, 11.1, 12.1, 13.1 ],
+        [ 10.2, 11.2, 12.2, 13.2 ]
+    ]> : tensor<2x4xf64>
+
+    // Convert all these matrices to sparse format.
+    %x1 = sparse_tensor.convert %da : tensor<8x2xf64> to tensor<8x2xf64, #CSC>
+    %x2 = sparse_tensor.convert %db : tensor<2x4xf64> to tensor<2x4xf64, #CSC>
+
+    // Call the kernel with the sparse matrices.
+    %x3 = call @matmul(%x1, %x2)
+       : (tensor<8x2xf64, #CSC>,
+          tensor<2x4xf64, #CSC>) -> tensor<8x4xf64, #CSC>
+
+    //
+    // CHECK:      ( ( 32.53, 35.73, 38.93, 42.13 ),
+    // CHECK-SAME:   ( 34.56, 37.96, 41.36, 44.76 ),
+    // CHECK-SAME:   ( 36.59, 40.19, 43.79, 47.39 ),
+    // CHECK-SAME:   ( 38.62, 42.42, 46.22, 50.02 ),
+    // CHECK-SAME:   ( 40.65, 44.65, 48.65, 52.65 ),
+    // CHECK-SAME:   ( 42.68, 46.88, 51.08, 55.28 ),
+    // CHECK-SAME:   ( 44.71, 49.11, 53.51, 57.91 ),
+    // CHECK-SAME:   ( 46.74, 51.34, 55.94, 60.54 ) )
+    //
+    %xc = sparse_tensor.convert %x3 : tensor<8x4xf64, #CSC> to tensor<8x4xf64>
+    %xv = vector.transfer_read %xc[%c0, %c0], %d1 : tensor<8x4xf64>, vector<8x4xf64>
+    vector.print %xv : vector<8x4xf64>
+
+    // Release the resources.
+    bufferization.dealloc_tensor %x1 : tensor<8x2xf64, #CSC>
+    bufferization.dealloc_tensor %x2 : tensor<2x4xf64, #CSC>
+    bufferization.dealloc_tensor %x3 : tensor<8x4xf64, #CSC>
+
+    return
+  }
+}
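A note on the dimension translation in the SparseTensorConversion.cpp hunk above. The sketch below is a minimal, standalone C++ illustration of why the patched code ends up sizing the workspace by the storage-innermost dimension: for a rank-2 CSC tensor with dimOrdering affine_map<(i,j) -> (j,i)>, the innermost stored position translates back to original dimension 0, which for the 8x4 output in @matmul2 has size 8 rather than 4. The Permutation struct here is a hypothetical stand-in for AffineMap::getDimPosition, not MLIR code.

#include <cassert>
#include <vector>

// Hypothetical stand-in for an MLIR permutation map: results[k] holds the
// original dimension that appears as the k-th storage dimension, mirroring
// what AffineMap::getDimPosition(k) returns for a permutation map.
struct Permutation {
  std::vector<unsigned> results;
  unsigned getDimPosition(unsigned k) const { return results[k]; }
};

int main() {
  // CSC ordering (i,j) -> (j,i): storage dim 0 is j (d1), storage dim 1 is i (d0).
  Permutation csc{{1, 0}};

  // Expansion always works on the innermost *stored* dimension; translate that
  // position back to the original dimension, as the patch does before calling
  // genDimSizeCall.
  unsigned rank = 2;
  unsigned innerDim = rank - 1;            // innermost storage position
  innerDim = csc.getDimPosition(innerDim); // original dimension i (= 0)
  assert(innerDim == 0);

  // For the 8x4 CSC output in @matmul2, dimension 0 has size 8, so the
  // values/filled/indices workspace buffers are allocated with size 8.
  return 0;
}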