diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
@@ -679,12 +679,17 @@
   /// in the argument differ from those in the current cursor.
   uint64_t lexDiff(const uint64_t *lvlCoords) const {
     const uint64_t lvlRank = getLvlRank();
-    for (uint64_t l = 0; l < lvlRank; ++l)
-      if (lvlCoords[l] > lvlCursor[l])
+    for (uint64_t l = 0; l < lvlRank; ++l) {
+      const auto crd = lvlCoords[l];
+      const auto cur = lvlCursor[l];
+      if (crd > cur || (crd == cur && !isUniqueLvl(l)))
         return l;
-      else
-        assert(lvlCoords[l] == lvlCursor[l] && "non-lexicographic insertion");
-    assert(0 && "duplicate insertion");
+      if (crd < cur) {
+        assert(false && "non-lexicographic insertion");
+        return -1u;
+      }
+    }
+    assert(false && "duplicate insertion");
     return -1u;
   }
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/nonunique_lexdiff.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/nonunique_lexdiff.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/nonunique_lexdiff.mlir
@@ -0,0 +1,49 @@
+// This is a regression test to ensure proper handling of non-unique levels.
+// In particular, without the D156946 changes to `SparseTensorStorage::lexDiff`
+// this code can cause the size-assertion in `SparseTensorStorage::toCOO`
+// to fail under certain conditions. (Alas, the specifics of those conditions
+// are obscure and difficult to replicate.)
+
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler=enable-runtime-library=true \
+// RUN: | mlir-cpu-runner \
+// RUN:   -e entry -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils \
+// RUN: | FileCheck %s
+
+#COO = #sparse_tensor.encoding<{lvlTypes = ["compressed-nu", "singleton"]}>
+module {
+  func.func @entry() {
+    %c0 = arith.constant 0 : index
+    %i0 = arith.constant 0 : i32
+
+    %input = arith.constant dense<[
+        [ 2, 4, 3, 8, 0, 12, 7, 16 ],
+        [ 3, 4, 4, 8, 0, 6, 6, 16 ],
+        [ 3, 2, 4, 8, 0, 6, 6, 16 ],
+        [ 3, 2, 3, 8, 0, 6, 7, 16 ],
+        [ 2, 5, 0, 8, 0, 6, 6, 16 ],
+        [ 4, 4, 0, 8, 0, 6, 7, 16 ],
+        [ 2, 5, 3, 8, 3, 12, 6, 8 ],
+        [ 2, 5, 3, 8, 3, 6, 7, 8 ]
+    ]> : tensor<8x8xi32>
+    %coo = sparse_tensor.convert %input : tensor<8x8xi32> to tensor<8x8xi32, #COO>
+    %output = sparse_tensor.convert %coo : tensor<8x8xi32, #COO> to tensor<8x8xi32>
+
+    // CHECK:      ( ( 2, 4, 3, 8, 0, 12, 7, 16 ),
+    // CHECK-SAME:   ( 3, 4, 4, 8, 0, 6, 6, 16 ),
+    // CHECK-SAME:   ( 3, 2, 4, 8, 0, 6, 6, 16 ),
+    // CHECK-SAME:   ( 3, 2, 3, 8, 0, 6, 7, 16 ),
+    // CHECK-SAME:   ( 2, 5, 0, 8, 0, 6, 6, 16 ),
+    // CHECK-SAME:   ( 4, 4, 0, 8, 0, 6, 7, 16 ),
+    // CHECK-SAME:   ( 2, 5, 3, 8, 3, 12, 6, 8 ),
+    // CHECK-SAME:   ( 2, 5, 3, 8, 3, 6, 7, 8 ) )
+    %v = vector.transfer_read %output[%c0, %c0], %i0 : tensor<8x8xi32>, vector<8x8xi32>
+    vector.print %v : vector<8x8xi32>
+
+    // Release the resources.
+    bufferization.dealloc_tensor %coo : tensor<8x8xi32, #COO>
+
+    return
+  }
+}