diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -1055,18 +1055,22 @@
   let summary = "Custom reduction operation utilized within linalg.generic";
   let description = [{
     Defines a computation with a `linalg.generic` operation that takes two
-    operands and an identity value and reduces all values down to a single
-    result based on the computation in the region.
+    operands and an identity value and reduces all stored values down to a
+    single result based on the computation in the region.

     The region must contain exactly one block taking two arguments. The block
     must end with a sparse_tensor.yield and the output must match the input
     argument types.

-    Note that this operation is only required for custom reductions beyond the
-    standard operations (add, mul, and, or, etc). The `linalg.generic`
-    `iterator_types` defines which indices are being reduced. When the associated
-    operands are used in an operation, a reduction will occur. The use of this
-    explicit `reduce` operation is not required in most cases.
+    Note that this operation is only required for custom reductions beyond
+    the standard reduction operations (add, sub, or, xor) that can be
+    sparsified by merely reducing the stored values. More elaborate reduction
+    operations (mul, and, min, max, etc.) would need to account for implicit
+    zeros as well. They can still be handled using this custom reduction
+    operation. The `linalg.generic` `iterator_types` defines which indices
+    are being reduced. When the associated operands are used in an operation,
+    a reduction will occur. The use of this explicit `reduce` operation
+    is not required in most cases.

     Example of Matrix->Vector reduction using max(product(x_i), 100):

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -1848,6 +1848,24 @@
     if (!findSparseAnnotations(env, idxReducBased))
       return failure();

+    // Only standard reduction operations (add, sub, or, xor) that can be
+    // sparsified by merely reducing the stored values are admissible. More
+    // elaborate reduction operations (such as mul, and, min, max) would need
+    // to know whether implicit zeros occur as well. They can still be
+    // implemented with a custom reduction operation, accepted here as well.
+    if (op.getNumReductionLoops() > 0) {
+      Operation *yield = op.getRegion().front().getTerminator();
+      assert(isa<linalg::YieldOp>(yield));
+      Operation *redop = yield->getOperand(0).getDefiningOp();
+      if (!isa<arith::AddFOp>(redop) && !isa<complex::AddOp>(redop) &&
+          !isa<arith::AddIOp>(redop) && !isa<arith::SubFOp>(redop) &&
+          !isa<complex::SubOp>(redop) && !isa<arith::SubIOp>(redop) &&
+          !isa<arith::OrIOp>(redop) && !isa<arith::XOrIOp>(redop) &&
+          !isa<ReduceOp>(redop)) {
+        return failure();
+      }
+    }
+
     // Constructs the tensor expressions tree from `op`, returns failure if the
     // tree can not be built or the tensor expression is inadmissible.
     if (failed(env.initTensorExp()))
diff --git a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
--- a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
+++ b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
@@ -5,213 +5,11 @@

 // -----

-// Check that we recognize a reduction with a mul operator.
-// We use two dimensions here to check that the vectorization -// is not affected by how the outer loop is layed out. -// In other words, we should be able to vectorize the sparse inner loop -// regardless of whether the outer loop is dense or sparse. -// -// For this particular test, we expect: -// With vectorization on: -// dense scf.for -// init vector_accumulator = {scalar_accumulator, 1.0, 1.0, ...} -// sparse scf.for -// vectorized mul in vector_accumulator, vector_input -// horizontal reduction of the vector_accumulator to scalar_accumulator -// final store of scalar_accumulaor -// -// With vectorization off: -// dense scf.for -// sparse scf.for -// mul in accumulator -// final store -// -// CHECK-ON-LABEL: func.func @sparse_product_reduction_dense_sparse( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index -// CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<1.000000e+00> : vector<8xf64> -// CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant dense<0.000000e+00> : vector<8xf64> -// CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-ON-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-ON-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor> -// CHECK-ON: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor> to memref -// CHECK-ON: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-ON: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref -// CHECK-ON: %[[VAL_12:.*]] = scf.for %[[VAL_13:.*]] = %[[VAL_5]] to %[[VAL_7]] step %[[VAL_6]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f64) { -// CHECK-ON: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_13]]] : memref -// CHECK-ON: %[[VAL_16:.*]] = arith.addi %[[VAL_13]], %[[VAL_6]] : index -// CHECK-ON: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_16]]] : memref -// CHECK-ON: %[[VAL_18:.*]] = vector.insertelement %[[VAL_14]], %[[VAL_3]]{{\[}}%[[VAL_5]] : index] : vector<8xf64> -// CHECK-ON: %[[VAL_19:.*]] = scf.for %[[VAL_20:.*]] = %[[VAL_15]] to %[[VAL_17]] step %[[VAL_2]] iter_args(%[[VAL_21:.*]] = %[[VAL_18]]) -> (vector<8xf64>) { -// CHECK-ON: %[[VAL_22:.*]] = affine.min #map(%[[VAL_17]], %[[VAL_20]]){{\[}}%[[VAL_2]]] -// CHECK-ON: %[[VAL_23:.*]] = vector.create_mask %[[VAL_22]] : vector<8xi1> -// CHECK-ON: %[[VAL_24:.*]] = vector.maskedload %[[VAL_9]]{{\[}}%[[VAL_20]]], %[[VAL_23]], %[[VAL_4]] : memref, vector<8xi1>, vector<8xf64> into vector<8xf64> -// CHECK-ON: %[[VAL_25:.*]] = arith.mulf %[[VAL_21]], %[[VAL_24]] : vector<8xf64> -// CHECK-ON: %[[VAL_26:.*]] = arith.select %[[VAL_23]], %[[VAL_25]], %[[VAL_21]] : vector<8xi1>, vector<8xf64> -// CHECK-ON: scf.yield %[[VAL_26]] : vector<8xf64> -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: %[[VAL_27:.*]] = vector.reduction , %[[VAL_28:.*]] : vector<8xf64> into f64 -// CHECK-ON: scf.yield %[[VAL_27]] : f64 -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: memref.store %[[VAL_29:.*]], %[[VAL_10]][] : memref -// CHECK-ON: %[[VAL_30:.*]] = bufferization.to_tensor %[[VAL_10]] : memref -// CHECK-ON: return %[[VAL_30]] : tensor -// CHECK-ON: } -// -// CHECK-OFF-LABEL: func.func @sparse_product_reduction_dense_sparse( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK-OFF-DAG: 
%[[VAL_3:.*]] = arith.constant 1 : index -// CHECK-OFF: %[[VAL_4:.*]] = tensor.dim %[[VAL_1]], %[[VAL_2]] : tensor> -// CHECK-OFF: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-OFF: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_7]][] : memref -// CHECK-OFF: %[[VAL_9:.*]] = scf.for %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_3]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (f64) { -// CHECK-OFF: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_10]]] : memref -// CHECK-OFF: %[[VAL_13:.*]] = arith.addi %[[VAL_10]], %[[VAL_3]] : index -// CHECK-OFF: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_13]]] : memref -// CHECK-OFF: %[[VAL_15:.*]] = scf.for %[[VAL_16:.*]] = %[[VAL_12]] to %[[VAL_14]] step %[[VAL_3]] iter_args(%[[VAL_17:.*]] = %[[VAL_11]]) -> (f64) { -// CHECK-OFF: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_16]]] : memref -// CHECK-OFF: %[[VAL_19:.*]] = arith.mulf %[[VAL_17]], %[[VAL_18]] : f64 -// CHECK-OFF: scf.yield %[[VAL_19]] : f64 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: scf.yield %[[VAL_20:.*]] : f64 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: memref.store %[[VAL_21:.*]], %[[VAL_7]][] : memref -// CHECK-OFF: %[[VAL_22:.*]] = bufferization.to_tensor %[[VAL_7]] : memref -// CHECK-OFF: return %[[VAL_22]] : tensor -// CHECK-OFF: } - -#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["dense","compressed"]}> - -#trait = { - indexing_maps = [ - affine_map<(i,j) -> (i,j)>, // a (in) - affine_map<(i,j) -> ()> // x (out) - ], - iterator_types = ["reduction", "reduction"] -} - -func.func @sparse_product_reduction_dense_sparse(%argx: tensor, - %arga: tensor) - -> tensor { - %0 = linalg.generic #trait - ins(%arga: tensor) - outs(%argx: tensor) { - ^bb(%a: f64, %x: f64): - %t = arith.mulf %x, %a: f64 - linalg.yield %t : f64 - } -> tensor - return %0 : tensor -} - -// ----- - -// Same as sparse_product_reduction_dense_sparse but with the outer loop being sparse. 
-// -// CHECK-ON-LABEL: func.func @sparse_product_reduction_sparse_sparse( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index -// CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<1.000000e+00> : vector<8xf64> -// CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant dense<0.000000e+00> : vector<8xf64> -// CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-ON-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-ON: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-ON: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor> to memref -// CHECK-ON: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-ON: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref -// CHECK-ON: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref -// CHECK-ON: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref -// CHECK-ON: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_6]] iter_args(%[[VAL_16:.*]] = %[[VAL_11]]) -> (f64) { -// CHECK-ON: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref -// CHECK-ON: %[[VAL_18:.*]] = arith.addi %[[VAL_15]], %[[VAL_6]] : index -// CHECK-ON: %[[VAL_19:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_18]]] : memref -// CHECK-ON: %[[VAL_20:.*]] = vector.insertelement %[[VAL_16]], %[[VAL_3]]{{\[}}%[[VAL_5]] : index] : vector<8xf64> -// CHECK-ON: %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_17]] to %[[VAL_19]] step %[[VAL_2]] iter_args(%[[VAL_23:.*]] = %[[VAL_20]]) -> (vector<8xf64>) { -// CHECK-ON: %[[VAL_24:.*]] = affine.min #map(%[[VAL_19]], %[[VAL_22]]){{\[}}%[[VAL_2]]] -// CHECK-ON: %[[VAL_25:.*]] = vector.create_mask %[[VAL_24]] : vector<8xi1> -// CHECK-ON: %[[VAL_26:.*]] = vector.maskedload %[[VAL_9]]{{\[}}%[[VAL_22]]], %[[VAL_25]], %[[VAL_4]] : memref, vector<8xi1>, vector<8xf64> into vector<8xf64> -// CHECK-ON: %[[VAL_27:.*]] = arith.mulf %[[VAL_23]], %[[VAL_26]] : vector<8xf64> -// CHECK-ON: %[[VAL_28:.*]] = arith.select %[[VAL_25]], %[[VAL_27]], %[[VAL_23]] : vector<8xi1>, vector<8xf64> -// CHECK-ON: scf.yield %[[VAL_28]] : vector<8xf64> -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: %[[VAL_29:.*]] = vector.reduction , %[[VAL_30:.*]] : vector<8xf64> into f64 -// CHECK-ON: scf.yield %[[VAL_29]] : f64 -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: memref.store %[[VAL_31:.*]], %[[VAL_10]][] : memref -// CHECK-ON: %[[VAL_32:.*]] = bufferization.to_tensor %[[VAL_10]] : memref -// CHECK-ON: return %[[VAL_32]] : tensor -// CHECK-ON: } -// -// CHECK-OFF-LABEL: func.func @sparse_product_reduction_sparse_sparse( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-OFF: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_7]][] : memref -// CHECK-OFF: %[[VAL_9:.*]] = 
memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref -// CHECK-OFF: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref -// CHECK-OFF: %[[VAL_11:.*]] = scf.for %[[VAL_12:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_3]] iter_args(%[[VAL_13:.*]] = %[[VAL_8]]) -> (f64) { -// CHECK-OFF: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_12]]] : memref -// CHECK-OFF: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_3]] : index -// CHECK-OFF: %[[VAL_16:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_15]]] : memref -// CHECK-OFF: %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_14]] to %[[VAL_16]] step %[[VAL_3]] iter_args(%[[VAL_19:.*]] = %[[VAL_13]]) -> (f64) { -// CHECK-OFF: %[[VAL_20:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_18]]] : memref -// CHECK-OFF: %[[VAL_21:.*]] = arith.mulf %[[VAL_19]], %[[VAL_20]] : f64 -// CHECK-OFF: scf.yield %[[VAL_21]] : f64 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: scf.yield %[[VAL_22:.*]] : f64 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: memref.store %[[VAL_23:.*]], %[[VAL_7]][] : memref -// CHECK-OFF: %[[VAL_24:.*]] = bufferization.to_tensor %[[VAL_7]] : memref -// CHECK-OFF: return %[[VAL_24]] : tensor -// CHECK-OFF: } -#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed","compressed"]}> - -#trait = { - indexing_maps = [ - affine_map<(i,j) -> (i,j)>, // a (in) - affine_map<(i,j) -> ()> // x (out) - ], - iterator_types = ["reduction", "reduction"] -} - -func.func @sparse_product_reduction_sparse_sparse(%argx: tensor, - %arga: tensor) - -> tensor { - %0 = linalg.generic #trait - ins(%arga: tensor) - outs(%argx: tensor) { - ^bb(%a: f64, %x: f64): - %t = arith.mulf %x, %a: f64 - linalg.yield %t : f64 - } -> tensor - return %0 : tensor -} - -// ----- - -// sparse_product_reduction_dense_sparse and -// sparse_product_reduction_sparse_sparse established that the outer loop -// doesn't matter for vectorization. -// As a result from this point forward, use tensors with fewer dimensions. - // Check that we vectorize reductions with ori. -// Note: The weird element type here is to check that we create the right -// constant type for the pass-through value. + // CHECK-ON-LABEL: func.func @sparse_reduction_ori( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index // CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi13> // CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index @@ -238,8 +36,8 @@ // CHECK-ON: } // // CHECK-OFF-LABEL: func.func @sparse_reduction_ori( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref @@ -268,7 +66,7 @@ } func.func @sparse_reduction_ori(%argx: tensor, - %arga: tensor) + %arga: tensor) -> tensor { %0 = linalg.generic #trait ins(%arga: tensor) @@ -283,13 +81,12 @@ // ----- // Same test as sparse_reduction_ori except that the accumulator is on the -// rhs of the operation. -// This checks that we can recognize a reduction irrespective to where the -// accumalator appears on commutative operations. 
+// rhs of the operation. This checks that we can recognize a reduction +// irrespective to where the accumulator appears on commutative operations. // CHECK-ON-LABEL: func.func @sparse_reduction_ori_accumulator_on_rhs( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index // CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi13> // CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index @@ -316,8 +113,8 @@ // CHECK-ON: } // // CHECK-OFF-LABEL: func.func @sparse_reduction_ori_accumulator_on_rhs( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref @@ -346,7 +143,7 @@ } func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor, - %arga: tensor) + %arga: tensor) -> tensor { %0 = linalg.generic #trait ins(%arga: tensor) @@ -360,11 +157,11 @@ // ----- -// Check that we vectorize reduction with subi. +// Check that we vectorize reductions with subi. // // CHECK-ON-LABEL: func.func @sparse_reduction_subi( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index // CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant dense<0> : vector<8xi32> @@ -391,8 +188,8 @@ // CHECK-ON: } // // CHECK-OFF-LABEL: func.func @sparse_reduction_subi( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref @@ -421,7 +218,7 @@ } func.func @sparse_reduction_subi(%argx: tensor, - %arga: tensor) + %arga: tensor) -> tensor { %0 = linalg.generic #trait ins(%arga: tensor) @@ -435,10 +232,8 @@ // ----- -// From this point forward, we essentially have the same test for all -// arithmetic operation. This is for a code coverage perspective. +// Check that we vectorize reductions with xor. -// Check that we vectorize xor. // CHECK-ON-LABEL: func.func @sparse_reduction_xor( // CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, // CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { @@ -512,156 +307,9 @@ } // ----- -// Check that we vectorize and. 
-// CHECK-ON-LABEL: func.func @sparse_reduction_and( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index -// CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi32> -// CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-ON: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-ON: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-ON: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-ON: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref -// CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK-ON: %[[VAL_12:.*]] = vector.broadcast %[[VAL_9]] : i32 to vector<8xi32> -// CHECK-ON: %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xi32>) { -// CHECK-ON: %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]] -// CHECK-ON: %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1> -// CHECK-ON: %[[VAL_18:.*]] = vector.maskedload %[[VAL_7]]{{\[}}%[[VAL_14]]], %[[VAL_17]], %[[VAL_3]] : memref, vector<8xi1>, vector<8xi32> into vector<8xi32> -// CHECK-ON: %[[VAL_19:.*]] = arith.andi %[[VAL_15]], %[[VAL_18]] : vector<8xi32> -// CHECK-ON: %[[VAL_20:.*]] = arith.select %[[VAL_17]], %[[VAL_19]], %[[VAL_15]] : vector<8xi1>, vector<8xi32> -// CHECK-ON: scf.yield %[[VAL_20]] : vector<8xi32> -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: %[[VAL_21:.*]] = vector.reduction , %[[VAL_22:.*]] : vector<8xi32> into i32 -// CHECK-ON: memref.store %[[VAL_21]], %[[VAL_8]][] : memref -// CHECK-ON: %[[VAL_23:.*]] = bufferization.to_tensor %[[VAL_8]] : memref -// CHECK-ON: return %[[VAL_23]] : tensor -// CHECK-ON: } -// -// CHECK-OFF-LABEL: func.func @sparse_reduction_and( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-OFF: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref -// CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref -// CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref -// CHECK-OFF: %[[VAL_10:.*]] = scf.for %[[VAL_11:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] iter_args(%[[VAL_12:.*]] = %[[VAL_7]]) -> (i32) { -// CHECK-OFF: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref -// CHECK-OFF: %[[VAL_14:.*]] = arith.andi %[[VAL_12]], %[[VAL_13]] : i32 -// CHECK-OFF: scf.yield %[[VAL_14]] : i32 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: memref.store %[[VAL_15:.*]], %[[VAL_6]][] : memref -// CHECK-OFF: %[[VAL_16:.*]] = bufferization.to_tensor %[[VAL_6]] : memref -// CHECK-OFF: return %[[VAL_16]] : tensor -// CHECK-OFF: } - -#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed"]}> - -#trait = { - indexing_maps = [ - affine_map<(i) -> (i)>, // a (in) - 
affine_map<(i) -> ()> // x (out) - ], - iterator_types = ["reduction"] -} - -func.func @sparse_reduction_and(%argx: tensor, - %arga: tensor) - -> tensor { - %0 = linalg.generic #trait - ins(%arga: tensor) - outs(%argx: tensor) { - ^bb(%a: i32, %x: i32): - %t = arith.andi %x, %a: i32 - linalg.yield %t : i32 - } -> tensor - return %0 : tensor -} - -// ----- -// Check that we vectorize muli. -// CHECK-ON-LABEL: func.func @sparse_reduction_muli( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index -// CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<1> : vector<8xi32> -// CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant dense<0> : vector<8xi32> -// CHECK-ON-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-ON: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-ON: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-ON: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_9]][] : memref -// CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref -// CHECK-ON: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref -// CHECK-ON: %[[VAL_13:.*]] = vector.insertelement %[[VAL_10]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xi32> -// CHECK-ON: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_2]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (vector<8xi32>) { -// CHECK-ON: %[[VAL_17:.*]] = affine.min #map(%[[VAL_12]], %[[VAL_15]]){{\[}}%[[VAL_2]]] -// CHECK-ON: %[[VAL_18:.*]] = vector.create_mask %[[VAL_17]] : vector<8xi1> -// CHECK-ON: %[[VAL_19:.*]] = vector.maskedload %[[VAL_8]]{{\[}}%[[VAL_15]]], %[[VAL_18]], %[[VAL_5]] : memref, vector<8xi1>, vector<8xi32> into vector<8xi32> -// CHECK-ON: %[[VAL_20:.*]] = arith.muli %[[VAL_16]], %[[VAL_19]] : vector<8xi32> -// CHECK-ON: %[[VAL_21:.*]] = arith.select %[[VAL_18]], %[[VAL_20]], %[[VAL_16]] : vector<8xi1>, vector<8xi32> -// CHECK-ON: scf.yield %[[VAL_21]] : vector<8xi32> -// CHECK-ON: } {"Emitted from" = "linalg.generic"} -// CHECK-ON: %[[VAL_22:.*]] = vector.reduction , %[[VAL_23:.*]] : vector<8xi32> into i32 -// CHECK-ON: memref.store %[[VAL_22]], %[[VAL_9]][] : memref -// CHECK-ON: %[[VAL_24:.*]] = bufferization.to_tensor %[[VAL_9]] : memref -// CHECK-ON: return %[[VAL_24]] : tensor -// CHECK-ON: } -// -// CHECK-OFF-LABEL: func.func @sparse_reduction_muli( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { -// CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref -// CHECK-OFF: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-OFF: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref -// CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref -// CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref -// CHECK-OFF: %[[VAL_10:.*]] = scf.for %[[VAL_11:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] iter_args(%[[VAL_12:.*]] = %[[VAL_7]]) -> (i32) { -// CHECK-OFF: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref -// CHECK-OFF: 
%[[VAL_14:.*]] = arith.muli %[[VAL_12]], %[[VAL_13]] : i32 -// CHECK-OFF: scf.yield %[[VAL_14]] : i32 -// CHECK-OFF: } {"Emitted from" = "linalg.generic"} -// CHECK-OFF: memref.store %[[VAL_15:.*]], %[[VAL_6]][] : memref -// CHECK-OFF: %[[VAL_16:.*]] = bufferization.to_tensor %[[VAL_6]] : memref -// CHECK-OFF: return %[[VAL_16]] : tensor -// CHECK-OFF: } -#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed"]}> +// Check that we vectorize reductions with addi. -#trait = { - indexing_maps = [ - affine_map<(i) -> (i)>, // a (in) - affine_map<(i) -> ()> // x (out) - ], - iterator_types = ["reduction"] -} - -func.func @sparse_reduction_muli(%argx: tensor, - %arga: tensor) - -> tensor { - %0 = linalg.generic #trait - ins(%arga: tensor) - outs(%argx: tensor) { - ^bb(%a: i32, %x: i32): - %t = arith.muli %x, %a: i32 - linalg.yield %t : i32 - } -> tensor - return %0 : tensor -} - -// ----- -// Check that we vectorize addi. // CHECK-ON-LABEL: func.func @sparse_reduction_addi( // CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, // CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { @@ -722,7 +370,7 @@ } func.func @sparse_reduction_addi(%argx: tensor, - %arga: tensor) + %arga: tensor) -> tensor { %0 = linalg.generic #trait ins(%arga: tensor) @@ -735,7 +383,9 @@ } // ----- -// Check that we vectorize subf. + +// Check that we vectorize reductions with subf. + // CHECK-ON-LABEL: func.func @sparse_reduction_subf( // CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, // CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { @@ -809,10 +459,12 @@ } // ----- -// Check that we vectorize addf. + +// Check that we vectorize reductions with addf. + // CHECK-ON-LABEL: func.func @sparse_reduction_addf( -// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-ON-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-ON-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-ON-DAG: %[[VAL_2:.*]] = arith.constant 8 : index // CHECK-ON-DAG: %[[VAL_3:.*]] = arith.constant dense<0.000000e+00> : vector<8xf32> // CHECK-ON-DAG: %[[VAL_4:.*]] = arith.constant 0 : index @@ -839,8 +491,8 @@ // CHECK-ON: } // // CHECK-OFF-LABEL: func.func @sparse_reduction_addf( -// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, -// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { +// CHECK-OFF-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-OFF-SAME: %[[VAL_1:.*]]: tensor>) -> tensor { // CHECK-OFF-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor> to memref diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir @@ -28,7 +28,6 @@ // Reduction in this file _are_ supported by the AArch64 SVE backend #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }> -#DV = #sparse_tensor.encoding<{ lvlTypes = [ "dense" ] }> #trait_reduction = { indexing_maps = [ @@ -66,18 +65,6 @@ return %0 : tensor } - func.func @and_reduction_i32(%arga: tensor<32xi32, #DV>, - %argx: tensor) -> tensor { - %0 = linalg.generic #trait_reduction - ins(%arga: tensor<32xi32, #DV>) - outs(%argx: tensor) { - ^bb(%a: i32, %x: i32): - %0 = arith.andi %x, %a : i32 - linalg.yield %0 : i32 - } -> tensor - return %0 : tensor - } - func.func @or_reduction_i32(%arga: tensor<32xi32, #SV>, 
%argx: tensor) -> tensor { %0 = linalg.generic #trait_reduction @@ -130,59 +117,37 @@ 2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0 ]> : tensor<32xf32> - %c_1_i32 = arith.constant dense<[ - 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3 - ]> : tensor<32xi32> - - %c_1_f32 = arith.constant dense<[ - 1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0 - ]> : tensor<32xf32> - // Convert constants to annotated tensors. %sparse_input_i32 = sparse_tensor.convert %c_0_i32 : tensor<32xi32> to tensor<32xi32, #SV> %sparse_input_f32 = sparse_tensor.convert %c_0_f32 : tensor<32xf32> to tensor<32xf32, #SV> - %dense_input_i32 = sparse_tensor.convert %c_1_i32 - : tensor<32xi32> to tensor<32xi32, #DV> - %dense_input_f32 = sparse_tensor.convert %c_1_f32 - : tensor<32xf32> to tensor<32xf32, #DV> // Call the kernels. %0 = call @sum_reduction_i32(%sparse_input_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor %1 = call @sum_reduction_f32(%sparse_input_f32, %rf) : (tensor<32xf32, #SV>, tensor) -> tensor - %4 = call @and_reduction_i32(%dense_input_i32, %ri) - : (tensor<32xi32, #DV>, tensor) -> tensor - %5 = call @or_reduction_i32(%sparse_input_i32, %ri) + %2 = call @or_reduction_i32(%sparse_input_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor - %6 = call @xor_reduction_i32(%sparse_input_i32, %ri) + %3 = call @xor_reduction_i32(%sparse_input_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor // Verify results. // // CHECK: 26 // CHECK: 27.5 - // CHECK: 1 // CHECK: 15 // CHECK: 10 // call @dump_i32(%0) : (tensor) -> () call @dump_f32(%1) : (tensor) -> () - call @dump_i32(%4) : (tensor) -> () - call @dump_i32(%5) : (tensor) -> () - call @dump_i32(%6) : (tensor) -> () + call @dump_i32(%2) : (tensor) -> () + call @dump_i32(%3) : (tensor) -> () // Release the resources. bufferization.dealloc_tensor %sparse_input_i32 : tensor<32xi32, #SV> bufferization.dealloc_tensor %sparse_input_f32 : tensor<32xf32, #SV> - bufferization.dealloc_tensor %dense_input_i32 : tensor<32xi32, #DV> - bufferization.dealloc_tensor %dense_input_f32 : tensor<32xf32, #DV> return } diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir @@ -15,10 +15,6 @@ // REDEFINE: %{option} = "enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true" // RUN: %{command} -// Product reductions - kept in a seperate file as these are not supported by -// the AArch64 SVE backend (so the set-up is a bit different to -// sparse_reducitons.mlir) - #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }> #DV = #sparse_tensor.encoding<{ lvlTypes = [ "dense" ] }> @@ -28,32 +24,76 @@ affine_map<(i) -> ()> // x (scalar out) ], iterator_types = ["reduction"], - doc = "x += OPER_i a(i)" + doc = "x += PROD_CUSTOM_i a(i)" } // An example of vector reductions. 
module { - func.func @prod_reduction_i32(%arga: tensor<32xi32, #DV>, - %argx: tensor) -> tensor { + func.func @prod_dreduction_i32(%arga: tensor<32xi32, #DV>, + %argx: tensor) -> tensor { + %c = tensor.extract %argx[] : tensor %0 = linalg.generic #trait_reduction ins(%arga: tensor<32xi32, #DV>) outs(%argx: tensor) { - ^bb(%a: i32, %x: i32): - %0 = arith.muli %x, %a : i32 - linalg.yield %0 : i32 + ^bb(%a: i32, %b: i32): + %1 = sparse_tensor.reduce %a, %b, %c : i32 { + ^bb0(%x: i32, %y: i32): + %2 = arith.muli %x, %y : i32 + sparse_tensor.yield %2 : i32 + } + linalg.yield %1 : i32 } -> tensor return %0 : tensor } - func.func @prod_reduction_f32(%arga: tensor<32xf32, #DV>, - %argx: tensor) -> tensor { + func.func @prod_dreduction_f32(%arga: tensor<32xf32, #DV>, + %argx: tensor) -> tensor { + %c = tensor.extract %argx[] : tensor %0 = linalg.generic #trait_reduction ins(%arga: tensor<32xf32, #DV>) outs(%argx: tensor) { - ^bb(%a: f32, %x: f32): - %0 = arith.mulf %x, %a : f32 - linalg.yield %0 : f32 + ^bb(%a: f32, %b: f32): + %1 = sparse_tensor.reduce %a, %b, %c : f32 { + ^bb0(%x: f32, %y: f32): + %2 = arith.mulf %x, %y : f32 + sparse_tensor.yield %2 : f32 + } + linalg.yield %1 : f32 + } -> tensor + return %0 : tensor + } + + func.func @prod_sreduction_i32(%arga: tensor<32xi32, #SV>, + %argx: tensor) -> tensor { + %c = tensor.extract %argx[] : tensor + %0 = linalg.generic #trait_reduction + ins(%arga: tensor<32xi32, #SV>) + outs(%argx: tensor) { + ^bb(%a: i32, %b: i32): + %1 = sparse_tensor.reduce %a, %b, %c : i32 { + ^bb0(%x: i32, %y: i32): + %2 = arith.muli %x, %y : i32 + sparse_tensor.yield %2 : i32 + } + linalg.yield %1 : i32 + } -> tensor + return %0 : tensor + } + + func.func @prod_sreduction_f32(%arga: tensor<32xf32, #SV>, + %argx: tensor) -> tensor { + %c = tensor.extract %argx[] : tensor + %0 = linalg.generic #trait_reduction + ins(%arga: tensor<32xf32, #SV>) + outs(%argx: tensor) { + ^bb(%a: f32, %b: f32): + %1 = sparse_tensor.reduce %a, %b, %c : f32 { + ^bb0(%x: f32, %y: f32): + %2 = arith.mulf %x, %y : f32 + sparse_tensor.yield %2 : f32 + } + linalg.yield %1 : f32 } -> tensor return %0 : tensor } @@ -74,6 +114,20 @@ %ri = arith.constant dense< 7 > : tensor %rf = arith.constant dense< 2.0 > : tensor + // Vectors with a few zeros. + %c_0_i32 = arith.constant dense<[ + 1, 1, 7, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 3, 0, 1, 1, 1, 1, 1, 0, 1, 1, 7, 3 + ]> : tensor<32xi32> + + %c_0_f32 = arith.constant dense<[ + 1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0, + 1.0, 0.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, + 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0 + ]> : tensor<32xf32> + + // Vectors with no zeros. %c_1_i32 = arith.constant dense<[ 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3 @@ -87,28 +141,64 @@ ]> : tensor<32xf32> // Convert constants to annotated tensors. 
- %dense_input_i32 = sparse_tensor.convert %c_1_i32 + %d0_i32 = sparse_tensor.convert %c_0_i32 : tensor<32xi32> to tensor<32xi32, #DV> - %dense_input_f32 = sparse_tensor.convert %c_1_f32 + %d0_f32 = sparse_tensor.convert %c_0_f32 : tensor<32xf32> to tensor<32xf32, #DV> + %s0_i32 = sparse_tensor.convert %c_0_i32 + : tensor<32xi32> to tensor<32xi32, #SV> + %s0_f32 = sparse_tensor.convert %c_0_f32 + : tensor<32xf32> to tensor<32xf32, #SV> + %d1_i32 = sparse_tensor.convert %c_1_i32 + : tensor<32xi32> to tensor<32xi32, #DV> + %d1_f32 = sparse_tensor.convert %c_1_f32 + : tensor<32xf32> to tensor<32xf32, #DV> + %s1_i32 = sparse_tensor.convert %c_1_i32 + : tensor<32xi32> to tensor<32xi32, #SV> + %s1_f32 = sparse_tensor.convert %c_1_f32 + : tensor<32xf32> to tensor<32xf32, #SV> // Call the kernels. - %2 = call @prod_reduction_i32(%dense_input_i32, %ri) - : (tensor<32xi32, #DV>, tensor) -> tensor - %3 = call @prod_reduction_f32(%dense_input_f32, %rf) - : (tensor<32xf32, #DV>, tensor) -> tensor - - // Verify results. + %0 = call @prod_dreduction_i32(%d0_i32, %ri) : (tensor<32xi32, #DV>, tensor) -> tensor + %1 = call @prod_dreduction_f32(%d0_f32, %rf) : (tensor<32xf32, #DV>, tensor) -> tensor + %2 = call @prod_sreduction_i32(%s0_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor + %3 = call @prod_sreduction_f32(%s0_f32, %rf) : (tensor<32xf32, #SV>, tensor) -> tensor + %4 = call @prod_dreduction_i32(%d1_i32, %ri) : (tensor<32xi32, #DV>, tensor) -> tensor + %5 = call @prod_dreduction_f32(%d1_f32, %rf) : (tensor<32xf32, #DV>, tensor) -> tensor + %6 = call @prod_sreduction_i32(%s1_i32, %ri) : (tensor<32xi32, #SV>, tensor) -> tensor + %7 = call @prod_sreduction_f32(%s1_f32, %rf) : (tensor<32xf32, #SV>, tensor) -> tensor + + // Verify results. Note that the custom reduction gave permission + // to treat an explicit vs implicit zero differently to compute the + // full product reduction. A "standard" product reduction would + // have to return 0 for any implicit zero occurrence too. // + // CHECK: 0 + // CHECK: 3087 + // CHECK: 14 + // CHECK: 3087 + // CHECK: 168 // CHECK: 3087 // CHECK: 168 // + call @dump_i32(%0) : (tensor) -> () + call @dump_f32(%1) : (tensor) -> () call @dump_i32(%2) : (tensor) -> () call @dump_f32(%3) : (tensor) -> () + call @dump_i32(%4) : (tensor) -> () + call @dump_f32(%5) : (tensor) -> () + call @dump_i32(%6) : (tensor) -> () + call @dump_f32(%7) : (tensor) -> () // Release the resources. - bufferization.dealloc_tensor %dense_input_i32 : tensor<32xi32, #DV> - bufferization.dealloc_tensor %dense_input_f32 : tensor<32xf32, #DV> + bufferization.dealloc_tensor %d0_i32 : tensor<32xi32, #DV> + bufferization.dealloc_tensor %d0_f32 : tensor<32xf32, #DV> + bufferization.dealloc_tensor %s0_i32 : tensor<32xi32, #SV> + bufferization.dealloc_tensor %s0_f32 : tensor<32xf32, #SV> + bufferization.dealloc_tensor %d1_i32 : tensor<32xi32, #DV> + bufferization.dealloc_tensor %d1_f32 : tensor<32xf32, #DV> + bufferization.dealloc_tensor %s1_i32 : tensor<32xi32, #SV> + bufferization.dealloc_tensor %s1_f32 : tensor<32xf32, #SV> return }
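For reference, the expected values in the CHECK lines of this last test can be
reproduced by hand from the constants it defines (a sketch of the arithmetic
only, not part of the patch): the i32 vector with zeros has nonzero entries
7, 3, 7, 3, so a product over just the stored values of its sparse form is
7 (the initial value) * 7 * 3 * 7 * 3 = 3087, while its dense form also stores
the zeros and therefore yields 0; the f32 vector with zeros likewise gives
2.0 * 3.5 * 2.0 = 14 in sparse form; and the zero-free vectors give 3087 (i32)
and 2.0 * 3.5 * 2.0 * 3.0 * 4.0 = 168 (f32) in either form. This is precisely
the explicit-versus-implicit-zero distinction that the custom
sparse_tensor.reduce makes explicit.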
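To summarize the two kernel shapes this patch distinguishes, here is a
condensed sketch (the function names are illustrative and the scalar tensor
types are spelled out as an assumption from the surrounding tests; it restates
code already shown above rather than adding anything new). The first form, a
plain product reduction, is no longer admitted by the sparsification pattern
after this change; the second form, which routes the product through
sparse_tensor.reduce, still is.

  #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>

  #trait = {
    indexing_maps = [
      affine_map<(i) -> (i)>,  // a (in)
      affine_map<(i) -> ()>    // x (scalar out)
    ],
    iterator_types = ["reduction"]
  }

  // Rejected by the sparsifier after this change: folding only the stored
  // values cannot account for the implicit zeros of %arga.
  func.func @prod_plain(%arga: tensor<32xi32, #SV>,
                        %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
      ^bb(%a: i32, %x: i32):
        %t = arith.muli %x, %a : i32
        linalg.yield %t : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Still accepted: the custom reduction states explicitly that only the
  // stored values are folded into the result, starting from the value %c.
  func.func @prod_custom(%arga: tensor<32xi32, #SV>,
                         %argx: tensor<i32>) -> tensor<i32> {
    %c = tensor.extract %argx[] : tensor<i32>
    %0 = linalg.generic #trait
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
      ^bb(%a: i32, %b: i32):
        %1 = sparse_tensor.reduce %a, %b, %c : i32 {
          ^bb0(%x: i32, %y: i32):
            %2 = arith.muli %x, %y : i32
            sparse_tensor.yield %2 : i32
        }
        linalg.yield %1 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }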