diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -362,7 +362,7 @@
                  ArrayRef<size_t> dims, SmallVectorImpl<Value> &reduc, bool needsUniv,
                  ArrayRef<size_t> extraTids = {}, ArrayRef<size_t> extraDims = {});
-  SmallVector<Value> exitCurrentLoop(OpBuilder &builder, Location loc,
+  SmallVector<Value> exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                      ArrayRef<Value> reduc = {});
 
   /// Return the array of coordinate for all the loop generated till now.
@@ -440,7 +440,7 @@
   ///    yield %val
   ///  }
   /// Return %ret to user, while %val is provided by users (`reduc`)
-  SmallVector<Value> exitForLoop(OpBuilder &builder, Location loc,
+  SmallVector<Value> exitForLoop(RewriterBase &rewriter, Location loc,
                                  ArrayRef<Value> reduc);
 
   /// Exits a while loop, returns the reduction results.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -166,9 +166,12 @@
     OpBuilder &builder, Location loc, size_t tid, size_t dim,
     SmallVectorImpl<Value> &reduc, bool isParallel, ArrayRef<size_t> extraTids,
     ArrayRef<size_t> extraDims) {
+  assert(dimTypes[tid].size() > dim);
   // We can not re-enter the same level.
   assert(!coord[tid][dim]);
+  // TODO: support multiple returns on parallel for?
+  assert(!isParallel || reduc.size() <= 1);
 
   Value step = constantIndex(builder, loc, 1);
   auto dimType = dimTypes[tid][dim];
@@ -182,11 +185,32 @@
   Value lo = isSparseInput ? pidxs[tid][dim]      // current offset
                            : loopSeqStack.back(); // univeral tid
   Value hi = highs[tid][dim];
+  Operation *loop = nullptr;
+  Value iv;
+  if (isParallel) {
+    scf::ParallelOp parOp =
+        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(parOp.getBody());
+    assert(parOp.getNumReductions() == reduc.size());
+    iv = parOp.getInductionVars()[0];
+
+    // In-place update on the reduction variable vector.
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = parOp.getInitVals()[i];
+    loop = parOp;
+  } else {
+    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(forOp.getBody());
+    iv = forOp.getInductionVar();
+
+    // In-place update on the reduction variable vector.
+    assert(forOp.getNumRegionIterArgs() == reduc.size());
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = forOp.getRegionIterArg(i);
+    loop = forOp;
+  }
+  assert(loop && iv);
 
-  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
-  builder.setInsertionPointToStart(forOp.getBody());
-  Value iv = forOp.getInductionVar();
-  assert(iv);
   if (isSparseInput) {
     pidxs[tid][dim] = iv;
     // Generating a load on the indices array yields the coordinate.
@@ -203,16 +227,12 @@
   // NOTE: we can also prepares for next dim here in advance
   // Push the loop into stack
-  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
+  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), loop,
                          coord[tid][dim]);
   // Emit extra locals.
   emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);
 
-  // In-place update on the reduction variable vector.
-  assert(forOp.getNumRegionIterArgs() == reduc.size());
-  for (int i = 0, e = reduc.size(); i < e; i++)
-    reduc[i] = forOp.getRegionIterArg(i);
-  return forOp;
+  return loop;
 }
 
 Operation *SparseTensorLoopEmitter::enterCoiterationOverTensorsAtDims(
@@ -387,16 +407,56 @@
 }
 
 SmallVector<Value>
-SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
+SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                                      ArrayRef<Value> reduc) {
   LoopLevelInfo &loopInfo = loopStack.back();
   auto &dims = loopStack.back().dims;
   auto &tids = loopStack.back().tids;
-  auto forOp = llvm::cast<scf::ForOp>(loopInfo.loop);
-  if (!reduc.empty()) {
-    assert(reduc.size() == forOp.getNumResults());
-    builder.setInsertionPointToEnd(forOp.getBody());
-    builder.create<scf::YieldOp>(loc, reduc);
+  auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop);
+  SmallVector<Value> results;
+  if (forOp) {
+    if (!reduc.empty()) {
+      assert(reduc.size() == forOp.getNumResults());
+      rewriter.setInsertionPointToEnd(forOp.getBody());
+      rewriter.create<scf::YieldOp>(loc, reduc);
+    }
+    // Exit the loop.
+    rewriter.setInsertionPointAfter(forOp);
+    results = forOp.getResults();
+  } else {
+    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
+    if (!reduc.empty()) {
+      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
+      Operation *redExp = reduc.front().getDefiningOp();
+      assert(redExp->getUses().empty());
+      // This must be a binary operation; it is the user's responsibility to
+      // ensure the operation is commutative.
+      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
+
+      Value redVal = parOp.getInitVals().front();
+      Value curVal;
+      if (redExp->getOperand(0) == redVal)
+        curVal = redExp->getOperand(1);
+      else if (redExp->getOperand(1) == redVal)
+        curVal = redExp->getOperand(0);
+      // One of the operands must be the init value (which is also the
+      // previous reduction value).
+      assert(curVal);
+      rewriter.setInsertionPointAfter(redExp);
+      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
+      redExp->remove(); // Detach from the previous parent.
+      // Attach to the reduction op.
+      Block *redBlock = &redOp.getRegion().getBlocks().front();
+      redBlock->push_back(redExp);
+      // Replace the operands with the reduction block's arguments.
+      rewriter.updateRootInPlace(
+          redExp, [&]() { redExp->setOperands(redBlock->getArguments()); });
+      rewriter.setInsertionPointToEnd(redBlock);
+      rewriter.create<scf::ReduceReturnOp>(loc, redExp->getResult(0));
+      rewriter.setInsertionPointToEnd(parOp.getBody());
+    }
+    rewriter.setInsertionPointAfter(parOp);
+    results = parOp.getResults();
   }
 
   // Finished iterating a tensor, clean up
@@ -410,9 +470,7 @@
     if (!isDenseDim(dimTypes[tid][dim]))
      highs[tid][dim] = Value();
   }
-  // exit the loop
-  builder.setInsertionPointAfter(forOp);
-  return forOp.getResults();
+  return results;
 }
 
 SmallVector<Value>
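
Note on the scf.parallel branch above: the reduction value handed back through `reduc` is produced by a dangling binary op in the loop body whose operands are the loop's init value and the per-iteration value; exitForLoop detaches that op, wraps it in an scf.reduce region, and rewires its operands to the region's block arguments. A minimal sketch of the IR this aims to produce, assuming an f32 sum reduction (all value names are illustrative, not taken from the patch):

  %sum = scf.parallel (%i) = (%lo) to (%hi) step (%c1) init (%init) -> f32 {
    %v = memref.load %values[%i] : memref<?xf32>
    // The op that computed `%red = arith.addf %init, %v` is re-attached
    // here, with its operands replaced by the block arguments %lhs/%rhs.
    scf.reduce(%v) : f32 {
    ^bb0(%lhs: f32, %rhs: f32):
      %red = arith.addf %lhs, %rhs : f32
      scf.reduce.return %red : f32
    }
  }
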
@@ -473,7 +531,7 @@
 }
 
 SmallVector<Value>
-SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc,
+SparseTensorLoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                          ArrayRef<Value> reduc) {
   // Clean up the values, it would help use to discover potential bug at a
   // earlier stage (instead of silently using a wrong value).
@@ -481,9 +539,9 @@
   assert(loopInfo.tids.size() == loopInfo.dims.size());
   SmallVector<Value> red;
   if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
-    red = exitCoiterationLoop(builder, loc, reduc);
+    red = exitCoiterationLoop(rewriter, loc, reduc);
   } else {
-    red = exitForLoop(builder, loc, reduc);
+    red = exitForLoop(rewriter, loc, reduc);
   }
 
   assert(loopStack.size() == loopSeqStack.size());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -1015,13 +1015,14 @@
   case SparseParallelizationStrategy::kNone:
     return false;
   case SparseParallelizationStrategy::kDenseOuterLoop:
-    return isOuter && !isSparse && !isReduction;
+    // We do not support parallel reduction on tensor expansion.
+    return isOuter && !isSparse && !codegen.expCount;
   case SparseParallelizationStrategy::kAnyStorageOuterLoop:
-    return isOuter && !isReduction;
+    return isOuter && !codegen.expCount;
   case SparseParallelizationStrategy::kDenseAnyLoop:
-    return !isSparse && !isReduction;
+    return !isSparse && !codegen.expCount;
   case SparseParallelizationStrategy::kAnyStorageAnyLoop:
-    return !isReduction;
+    return !codegen.expCount;
   }
   llvm_unreachable("unexpected parallelization strategy");
 }
@@ -1038,7 +1039,6 @@
   bool isSparse = merger.isDimLevelType(tid, idx, DimLvlType::kCompressed) ||
                   merger.isDimLevelType(tid, idx, DimLvlType::kSingleton);
   bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse);
-  assert(!isParallel);
 
   // Emit a sequential or vector loop.
   SmallVector<Value> operands;
@@ -1048,7 +1048,7 @@
     operands.push_back(codegen.expCount);
 
   Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim(
-      builder, loc, tid, dim, operands, false, extraTids, extraDims);
+      builder, loc, tid, dim, operands, isParallel, extraTids, extraDims);
 
   // The operands should be updated by loop emitter already.
   if (codegen.redVal)
@@ -1300,14 +1300,14 @@
 }
 
 /// Ends a single loop in current sequence. Returns new values for needsUniv.
-static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
+static bool endLoop(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                     linalg::GenericOp op, Operation *loop, unsigned idx,
                     unsigned li, bool needsUniv) {
   codegen.curVecLength = 1;
 
   // End a while-loop.
   if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
-    finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv,
+    finalizeWhileOp(merger, codegen, rewriter, op, idx, needsUniv,
                     merger.lat(li).bits, whileOp);
   } else {
     needsUniv = false;
@@ -1320,7 +1320,7 @@
     reduc.push_back(codegen.expCount);
 
   auto loopRet =
-      codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc);
+      codegen.loopEmitter.exitCurrentLoop(rewriter, op.getLoc(), reduc);
   assert(reduc.size() == loopRet.size());
 
   if (codegen.redVal)
diff --git a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
@@ -1,14 +1,14 @@
 // RUN: mlir-opt %s -sparsification="parallelization-strategy=none" | \
 // RUN:   FileCheck %s --check-prefix=CHECK-PAR0
 // FIXME: we do not support vectorization/parallel loops in loop emitter right now
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR1
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR2
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR3
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR4
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR1
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR2
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR3
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR4
 
 #DenseMatrix = #sparse_tensor.encoding<{
   dimLevelType = [ "dense", "dense" ]
@@ -151,7 +151,8 @@
 //
 // CHECK-PAR4-LABEL: func @matvec
 // CHECK-PAR4:       scf.parallel
-// CHECK-PAR4:       scf.for
+// CHECK-PAR4:       scf.parallel
+// CHECK-PAR4:       scf.reduce
 // CHECK-PAR4:       return
 //
 func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
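
The updated CHECK-PAR4 expectations above correspond to CSR matvec lowering to nested scf.parallel loops, with the inner dot product carried as an scf.parallel reduction. A hand-written sketch of that shape, not actual compiler output (buffer names and memref types are assumptions):

  func.func @matvec_sketch(%ptrs: memref<?xindex>, %idxs: memref<?xindex>,
                           %vals: memref<?xf32>, %x: memref<32xf32>,
                           %y: memref<16xf32>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    // Outer parallel loop over the dense row dimension; no reduction here.
    scf.parallel (%i) = (%c0) to (%c16) step (%c1) {
      %i1 = arith.addi %i, %c1 : index
      %lo = memref.load %ptrs[%i] : memref<?xindex>
      %hi = memref.load %ptrs[%i1] : memref<?xindex>
      %init = memref.load %y[%i] : memref<16xf32>
      // Inner parallel loop over the compressed column segment; the dot
      // product is a reduction seeded with the current output value.
      %sum = scf.parallel (%jj) = (%lo) to (%hi) step (%c1)
               init (%init) -> f32 {
        %j = memref.load %idxs[%jj] : memref<?xindex>
        %a = memref.load %vals[%jj] : memref<?xf32>
        %b = memref.load %x[%j] : memref<32xf32>
        %mul = arith.mulf %a, %b : f32
        scf.reduce(%mul) : f32 {
        ^bb0(%p: f32, %q: f32):
          %add = arith.addf %p, %q : f32
          scf.reduce.return %add : f32
        }
      }
      memref.store %sum, %y[%i] : memref<16xf32>
    }
    return
  }
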
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
@@ -5,6 +5,14 @@
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 //
+// Do the same run, but now with parallelization.
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
+// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:   -e entry -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
 // Do the same run, but now with SIMDization as well. This should not change the outcome.
 // FIXME: support vector in sparse loop emitter.
 // R_U_N: mlir-opt %s \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
@@ -6,13 +6,13 @@
 // RUN:   FileCheck %s
 //
 // Do the same run, but now with SIMDization as well. This should not change the outcome.
-// FIXME: we do not support vectorization/parallel loops in loop emitter right now
-// R_U_N: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
-// R_U_N: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
-// R_U_N: mlir-cpu-runner \
-// R_U_N:   -e entry -entry-point-result=void \
-// R_U_N:   -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// R_U_N:   FileCheck %s
+//
+// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
+// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:   -e entry -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN:   FileCheck %s
 
 !Filename = !llvm.ptr<i8>