diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -49,26 +49,32 @@
   let hasVerifier = 1;
 }
 
-def SparseTensor_InitOp : SparseTensor_Op<"init", [NoSideEffect]>,
-    Arguments<(ins Variadic<Index>:$sizes)>,
+def SparseTensor_InitOp : SparseTensor_Op<"init",
+    [NoSideEffect,
+     AttrSizedOperandSegments]>,
+    Arguments<(ins Variadic<Index>:$sizes, Optional<AnyType>:$startValue)>,
     Results<(outs AnyTensor:$result)> {
-  string summary = "Materializes an unitialized sparse tensor";
+  string summary = "Materializes an uninitialized sparse tensor";
   string description = [{
     Materializes an uninitialized sparse tensor with given shape (either static
     or dynamic). The operation is provided as an anchor that materializes a
     properly typed but uninitialized sparse tensor into the output clause of
     a subsequent operation that yields a sparse tensor as the result.
 
+    When the computation performs a reduction or aggregation into the
+    uninitialized sparse tensor, every element is assumed to start at zero
+    unless the optional `start_value` (of the tensor's element type) is given.
+
     Example:
 
     ```mlir
-    %c = sparse_tensor.init_tensor [%d1, %d2] : tensor<?x?xf32, #SparseMatrix>
+    %c = sparse_tensor.init [%d1, %d2] start_value=%cf0[f32] : tensor<?x?xf32, #SparseMatrix>
     %0 = linalg.matmul
       ins(%a, %b: tensor<?x?xf32>, tensor<?x?xf32>)
       outs(%c: tensor<?x?xf32, #SparseMatrix>) -> tensor<?x?xf32, #SparseMatrix>
     ```
   }];
-  let assemblyFormat = "`[` $sizes `]` attr-dict `:` type($result)";
+  let assemblyFormat = "`[` $sizes `]` ( `start_value` `=` $startValue^ `[` type($startValue) `]` )? attr-dict `:` type($result)";
   let hasVerifier = 1;
 }
 
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -233,6 +233,11 @@
              << shape[i];
     }
   }
+  if (startValue()) {
+    Type svtype = startValue().getType();
+    if (svtype != ttp.getElementType())
+      return emitError("start_value must match type of tensor");
+  }
   return success();
 }
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -473,8 +473,7 @@
     // explicitly defined by the arguments to the init operator.
     SmallVector<Value, 8> params;
     ShapedType stp = resType.cast<ShapedType>();
-    newParams(rewriter, params, op, stp, enc, Action::kEmpty,
-              adaptor.getOperands());
+    newParams(rewriter, params, op, stp, enc, Action::kEmpty, adaptor.sizes());
     rewriter.replaceOp(op, genNewCall(rewriter, op, params));
     return success();
   }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -704,14 +704,38 @@
 static Value genInsertionLoad(CodeGen &codegen, OpBuilder &builder,
                               linalg::GenericOp op, OpOperand *t) {
   Location loc = op.getLoc();
+  Type tp = getElementTypeOrSelf(t->get().getType());
+  // An explicit start value, if any, comes from the `sparse_tensor.init`
+  // op that defines the sparse output; it replaces the implicit zero.
+  InitOp init = codegen.sparseOut->get().getDefiningOp<InitOp>();
+  Value startValue = init ? init.startValue() : Value();
   // Direct lexicographic index order, tensor loads as zero.
   if (!codegen.expValues) {
-    Type tp = getElementTypeOrSelf(t->get().getType());
+    if (startValue)
+      return startValue;
     return constantZero(builder, loc, tp);
   }
   // Load from expanded access pattern.
   Value index = genIndex(codegen, op, t);
-  return builder.create<memref::LoadOp>(loc, codegen.expValues, index);
+  if (!startValue)
+    return builder.create<memref::LoadOp>(loc, codegen.expValues, index);
+  // With a start value, load the expanded value only where the entry is
+  // marked as filled; otherwise yield the start value.
+  Value isFilled =
+      builder.create<memref::LoadOp>(loc, codegen.expFilled, index);
+  scf::IfOp ifStmt =
+      builder.create<scf::IfOp>(loc, tp, isFilled, /*else=*/true);
+  // Then-branch: yield the value stored at the index.
+  builder.setInsertionPointToStart(ifStmt.thenBlock());
+  Value valAtIndex =
+      builder.create<memref::LoadOp>(loc, codegen.expValues, index);
+  builder.create<scf::YieldOp>(loc, valAtIndex);
+  // Else-branch: yield the start value.
+  builder.setInsertionPointToStart(ifStmt.elseBlock());
+  builder.create<scf::YieldOp>(loc, startValue);
+  // Resume emitting code after the conditional.
+  builder.setInsertionPointAfter(ifStmt);
+  return ifStmt.getResult(0);
 }
 
 /// Generates insertion code to implement dynamic tensor store.
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_mat_reductions.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_mat_reductions.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_mat_reductions.mlir
@@ -0,0 +1,100 @@
+// RUN: mlir-opt %s --sparse-compiler | \
+// RUN: TENSOR0="%mlir_integration_test_dir/data/test.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void  \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+//
+// Do the same run, but now with SIMDization as well. This should not change the outcome.
+//
+// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=2 vl=2" | \
+// RUN: TENSOR0="%mlir_integration_test_dir/data/test.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void  \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+!Filename = !llvm.ptr<i8>
+
+#SparseMatrix = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed" ]
+}>
+
+#matmul_prodprod = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,k)>, // A
+    affine_map<(i,j,k) -> (k,j)>, // B
+    affine_map<(i,j,k) -> (i,j)>  // X (out)
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"],
+  doc = "X(i,j) *= A(i,k) * B(k,j)"
+}
+
+//
+// Integration test that lowers a kernel annotated as sparse to
+// actual sparse code, initializes a matching sparse storage scheme
+// from file, and runs the resulting code with the JIT compiler.
+//
+module {
+  //
+  // A kernel that multiplies a sparse matrix A with itself into a sparse
+  // matrix X, reducing with PROD (start value 1.0) instead of SUM.
+  //
+  func.func @kernel_prod(%arga: tensor<?x?xf64, #SparseMatrix>) -> tensor<?x?xf64, #SparseMatrix> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %cf1 = arith.constant 1.0 : f64
+    %d0 = tensor.dim %arga, %c0 : tensor<?x?xf64, #SparseMatrix>
+    %d1 = tensor.dim %arga, %c1 : tensor<?x?xf64, #SparseMatrix>
+    %argx = sparse_tensor.init [%d0, %d1] start_value=%cf1[f64] : tensor<?x?xf64, #SparseMatrix>
+    %0 = linalg.generic #matmul_prodprod
+      ins(%arga, %arga: tensor<?x?xf64, #SparseMatrix>, tensor<?x?xf64, #SparseMatrix>)
+      outs(%argx: tensor<?x?xf64, #SparseMatrix>) {
+      ^bb(%a: f64, %b: f64, %x: f64):
+        %0 = arith.mulf %a, %b : f64
+        %1 = arith.mulf %x, %0 : f64
+        linalg.yield %1 : f64
+    } -> tensor<?x?xf64, #SparseMatrix>
+    return %0 : tensor<?x?xf64, #SparseMatrix>
+  }
+
+  func.func private @getTensorFilename(index) -> (!Filename)
+
+  // Dump a sparse matrix as the leading 5x5 elements of its dense form.
+  func.func @dump_mat(%arg0: tensor<?x?xf64, #SparseMatrix>) {
+    %c0 = arith.constant 0 : index
+    %i0 = arith.constant -1.0 : f64
+    %dm = sparse_tensor.convert %arg0 : tensor<?x?xf64, #SparseMatrix> to tensor<?x?xf64>
+    %2 = bufferization.to_memref %dm : memref<?x?xf64>
+    %3 = vector.transfer_read %2[%c0, %c0], %i0: memref<?x?xf64>, vector<5x5xf64>
+    vector.print %3 : vector<5x5xf64>
+    memref.dealloc %2 : memref<?x?xf64>
+    return
+  }
+
+  //
+  // Main driver: reads the matrix from file, calls the kernel, prints result.
+  //
+  func.func @entry() {
+    %c0 = arith.constant 0 : index
+
+    // Read the sparse matrix from file, construct sparse storage.
+    %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
+    %a = sparse_tensor.new %fileName : !Filename to tensor<?x?xf64, #SparseMatrix>
+
+    // Call kernel.
+    %0 = call @kernel_prod(%a) : (tensor<?x?xf64, #SparseMatrix>) -> (tensor<?x?xf64, #SparseMatrix>)
+
+    // Print the result (in dense form) for verification.
+    //
+    // CHECK: ( ( 5.74, 0, 0, 7.84, 0 ), ( 0, 52, 0, 0, 62.5 ), ( 0, 0, 9, 0, 0 ), ( 67.24, 0, 0, 91.84, 0 ), ( 0, 270.4, 0, 0, 325 ) )
+    //
+    call @dump_mat(%0) : (tensor<?x?xf64, #SparseMatrix>) -> ()
+
+    // Release the sparse tensors; the dense copy is freed in @dump_mat.
+    sparse_tensor.release %a : tensor<?x?xf64, #SparseMatrix>
+    sparse_tensor.release %0 : tensor<?x?xf64, #SparseMatrix>
+
+    return
+  }
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_reduction.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_reduction.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_reduction.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_reduction.mlir
@@ -24,8 +24,7 @@
 
 module {
   func.func @redsum(%arga: tensor<?x?x?xi32, #SparseTensor>,
-               %argb: tensor<?x?x?xi32, #SparseTensor>)
-	           -> tensor<?x?xi32, #SparseMatrix> {
+                    %argb: tensor<?x?x?xi32, #SparseTensor>) -> tensor<?x?xi32, #SparseMatrix> {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %d0 = tensor.dim %arga, %c0 : tensor<?x?x?xi32, #SparseTensor>
@@ -43,11 +42,43 @@
     return %0 : tensor<?x?xi32, #SparseMatrix>
   }
 
-  // Driver method to call and verify tensor kernel.
-  func.func @entry() {
+  func.func @redprod(%arga: tensor<?x?x?xi32, #SparseTensor>,
+                     %argb: tensor<?x?x?xi32, #SparseTensor>) -> tensor<?x?xi32, #SparseMatrix> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %ci1 = arith.constant 1 : i32  // Start value for the product reduction.
+    %d0 = tensor.dim %arga, %c0 : tensor<?x?x?xi32, #SparseTensor>
+    %d1 = tensor.dim %arga, %c1 : tensor<?x?x?xi32, #SparseTensor>
+    %xinit = sparse_tensor.init [%d0, %d1] start_value=%ci1[i32] : tensor<?x?xi32, #SparseMatrix>
+    %0 = linalg.generic #redsum
+      ins(%arga, %argb: tensor<?x?x?xi32, #SparseTensor>,
+                        tensor<?x?x?xi32, #SparseTensor>)
+      outs(%xinit: tensor<?x?xi32, #SparseMatrix>) {
+        ^bb(%a: i32, %b: i32, %x: i32):
+          %0 = arith.muli %a, %b : i32
+          %1 = arith.muli %x, %0 : i32
+          linalg.yield %1 : i32
+    } -> tensor<?x?xi32, #SparseMatrix>
+    return %0 : tensor<?x?xi32, #SparseMatrix>
+  }
+
+  // Dump a sparse matrix: first 4 stored values, then its dense 3x3 view.
+  func.func @dump_mat(%arg0: tensor<?x?xi32, #SparseMatrix>) {
     %c0 = arith.constant 0 : index
     %i0 = arith.constant -1 : i32
+    %0 = sparse_tensor.values %arg0 : tensor<?x?xi32, #SparseMatrix> to memref<?xi32>
+    %1 = vector.transfer_read %0[%c0], %i0: memref<?xi32>, vector<4xi32>
+    vector.print %1 : vector<4xi32>
+    %dm = sparse_tensor.convert %arg0 : tensor<?x?xi32, #SparseMatrix> to tensor<?x?xi32>
+    %2 = bufferization.to_memref %dm : memref<?x?xi32>
+    %3 = vector.transfer_read %2[%c0, %c0], %i0: memref<?x?xi32>, vector<3x3xi32>
+    vector.print %3 : vector<3x3xi32>
+    memref.dealloc %2 : memref<?x?xi32>
+    return
+  }
 
+  // Driver method to call and verify tensor kernel.
+  func.func @entry() {
     // Setup very sparse 3-d tensors.
     %t1 = arith.constant sparse<
        [ [1,1,3], [2,0,0], [2,2,1], [2,2,2], [2,2,3] ], [ 1, 2, 3, 4, 5 ]
@@ -64,28 +95,26 @@
     %0 = call @redsum(%st1, %st2)
       : (tensor<?x?x?xi32, #SparseTensor>,
          tensor<?x?x?xi32, #SparseTensor>) -> tensor<?x?xi32, #SparseMatrix>
+    %1 = call @redprod(%st1, %st2)
+      : (tensor<?x?x?xi32, #SparseTensor>,
+         tensor<?x?x?xi32, #SparseTensor>) -> tensor<?x?xi32, #SparseMatrix>
 
     //
     // Verify results. Only two entries stored in result. Correct structure.
     //
     // CHECK: ( 7, 69, -1, -1 )
     // CHECK-NEXT: ( ( 0, 0, 0 ), ( 0, 7, 0 ), ( 0, 0, 69 ) )
+    // CHECK: ( 7, 1080, -1, -1 )
+    // CHECK-NEXT: ( ( 0, 0, 0 ), ( 0, 7, 0 ), ( 0, 0, 1080 ) )
     //
-    %val = sparse_tensor.values %0
-      : tensor<?x?xi32, #SparseMatrix> to memref<?xi32>
-    %vv = vector.transfer_read %val[%c0], %i0: memref<?xi32>, vector<4xi32>
-    vector.print %vv : vector<4xi32>
-    %dm = sparse_tensor.convert %0
-      : tensor<?x?xi32, #SparseMatrix> to tensor<?x?xi32>
-    %db = bufferization.to_memref %dm : memref<?x?xi32>
-    %vm = vector.transfer_read %db[%c0, %c0], %i0: memref<?x?xi32>, vector<3x3xi32>
-    vector.print %vm : vector<3x3xi32>
+    call @dump_mat(%0) : (tensor<?x?xi32, #SparseMatrix>) -> ()
+    call @dump_mat(%1) : (tensor<?x?xi32, #SparseMatrix>) -> ()
 
     // Release the resources.
     sparse_tensor.release %st1 : tensor<?x?x?xi32, #SparseTensor>
     sparse_tensor.release %st2 : tensor<?x?x?xi32, #SparseTensor>
     sparse_tensor.release %0 : tensor<?x?xi32, #SparseMatrix>
-    memref.dealloc %db : memref<?x?xi32>
+    sparse_tensor.release %1 : tensor<?x?xi32, #SparseMatrix>
     return
   }
 }