diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -362,7 +362,7 @@
                  ArrayRef<size_t> dims, SmallVectorImpl<Value> &reduc, bool needsUniv,
                  ArrayRef<size_t> extraTids = {}, ArrayRef<size_t> extraDims = {});
-  SmallVector<Value> exitCurrentLoop(OpBuilder &builder, Location loc,
+  SmallVector<Value> exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                      ArrayRef<Value> reduc = {});
 
   /// Return the array of coordinate for all the loop generated till now.
@@ -440,7 +440,7 @@
   ///    yield %val
   ///  }
   /// Return %ret to user, while %val is provided by users (`reduc`)
-  SmallVector<Value> exitForLoop(OpBuilder &builder, Location loc,
+  SmallVector<Value> exitForLoop(RewriterBase &rewriter, Location loc,
                                  ArrayRef<Value> reduc);
 
   /// Exits a while loop, returns the reduction results.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -166,9 +166,12 @@
     OpBuilder &builder, Location loc, size_t tid, size_t dim,
     SmallVectorImpl<Value> &reduc, bool isParallel, ArrayRef<size_t> extraTids,
     ArrayRef<size_t> extraDims) {
+  assert(dimTypes[tid].size() > dim);
   // We can not re-enter the same level.
   assert(!coord[tid][dim]);
+  // TODO: support multiple returns on parallel for?
+  assert(!isParallel || reduc.size() <= 1);
 
   Value step = constantIndex(builder, loc, 1);
   auto dimType = dimTypes[tid][dim];
@@ -182,11 +185,32 @@
   Value lo = isSparseInput ? pidxs[tid][dim]      // current offset
                            : loopSeqStack.back(); // univeral tid
   Value hi = highs[tid][dim];
+  Operation *loop = nullptr;
+  Value iv;
+  if (isParallel) {
+    scf::ParallelOp parOp =
+        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(parOp.getBody());
+    assert(parOp.getNumReductions() == reduc.size());
+    iv = parOp.getInductionVars()[0];
+
+    // In-place update on the reduction variable vector.
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = parOp.getInitVals()[i];
+    loop = parOp;
+  } else {
+    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(forOp.getBody());
+    iv = forOp.getInductionVar();
+
+    // In-place update on the reduction variable vector.
+    assert(forOp.getNumRegionIterArgs() == reduc.size());
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = forOp.getRegionIterArg(i);
+    loop = forOp;
+  }
+  assert(loop && iv);
 
-  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
-  builder.setInsertionPointToStart(forOp.getBody());
-  Value iv = forOp.getInductionVar();
-  assert(iv);
   if (isSparseInput) {
     pidxs[tid][dim] = iv;
     // Generating a load on the indices array yields the coordinate.
@@ -203,16 +227,12 @@
   // NOTE: we can also prepares for next dim here in advance
   // Push the loop into stack
-  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
+  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), loop,
                          coord[tid][dim]);
   // Emit extra locals.
   emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);
 
-  // In-place update on the reduction variable vector.
-  assert(forOp.getNumRegionIterArgs() == reduc.size());
-  for (int i = 0, e = reduc.size(); i < e; i++)
-    reduc[i] = forOp.getRegionIterArg(i);
-  return forOp;
+  return loop;
 }
 
 Operation *SparseTensorLoopEmitter::enterCoiterationOverTensorsAtDims(
@@ -387,16 +407,56 @@
 }
 
 SmallVector<Value>
-SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
+SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                                      ArrayRef<Value> reduc) {
   LoopLevelInfo &loopInfo = loopStack.back();
   auto &dims = loopStack.back().dims;
   auto &tids = loopStack.back().tids;
-  auto forOp = llvm::cast<scf::ForOp>(loopInfo.loop);
-  if (!reduc.empty()) {
-    assert(reduc.size() == forOp.getNumResults());
-    builder.setInsertionPointToEnd(forOp.getBody());
-    builder.create<scf::YieldOp>(loc, reduc);
+  auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop);
+  SmallVector<Value> results;
+  if (forOp) {
+    if (!reduc.empty()) {
+      assert(reduc.size() == forOp.getNumResults());
+      rewriter.setInsertionPointToEnd(forOp.getBody());
+      rewriter.create<scf::YieldOp>(loc, reduc);
+    }
+    // Exit the loop.
+    rewriter.setInsertionPointAfter(forOp);
+    results = forOp.getResults();
+  } else {
+    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
+    if (!reduc.empty()) {
+      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
+      Operation *redExp = reduc.front().getDefiningOp();
+      assert(redExp->getUses().empty());
+      // This must be a binary operation; it is the user's responsibility to
+      // ensure the operation is commutative.
+      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
+
+      Value redVal = parOp.getInitVals().front();
+      Value curVal;
+      if (redExp->getOperand(0) == redVal)
+        curVal = redExp->getOperand(1);
+      else if (redExp->getOperand(1) == redVal)
+        curVal = redExp->getOperand(0);
+      // One of the operands must be the init value (which is also the
+      // previous reduction value).
+      assert(curVal);
+      rewriter.setInsertionPointAfter(redExp);
+      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
+      redExp->remove(); // Detach from the previous parent.
+      // Attach to the reduction op.
+      Block *redBlock = &redOp.getRegion().getBlocks().front();
+      redBlock->push_back(redExp);
+      // Replace the operands with the reduction block's arguments.
+      rewriter.updateRootInPlace(
+          redExp, [&]() { redExp->setOperands(redBlock->getArguments()); });
+      rewriter.setInsertionPointToEnd(redBlock);
+      rewriter.create<scf::ReduceReturnOp>(loc, redExp->getResult(0));
+      rewriter.setInsertionPointToEnd(parOp.getBody());
+    }
+    rewriter.setInsertionPointAfter(parOp);
+    results = parOp.getResults();
   }
 
   // Finished iterating a tensor, clean up
@@ -410,9 +470,7 @@
     if (!isDenseDim(dimTypes[tid][dim]))
      highs[tid][dim] = Value();
   }
-  // exit the loop
-  builder.setInsertionPointAfter(forOp);
-  return forOp.getResults();
+  return results;
 }
 
 SmallVector<Value>
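
Note on the scf.parallel branch above: the reduction value handed back through `reduc` is produced by a dangling binary op in the loop body whose operands are the loop's init value and the per-iteration value; exitForLoop detaches that op, wraps it in an scf.reduce region, and rewires its operands to the region's block arguments. A minimal sketch of the IR this aims to produce, assuming an f32 sum reduction (all value names are illustrative, not taken from the patch):

  %sum = scf.parallel (%i) = (%lo) to (%hi) step (%c1) init (%init) -> f32 {
    %v = memref.load %values[%i] : memref<?xf32>
    // The op that computed `%red = arith.addf %init, %v` is re-attached
    // here, with its operands replaced by the block arguments %lhs/%rhs.
    scf.reduce(%v) : f32 {
    ^bb0(%lhs: f32, %rhs: f32):
      %red = arith.addf %lhs, %rhs : f32
      scf.reduce.return %red : f32
    }
  }
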
@@ -473,7 +531,7 @@
 }
 
 SmallVector<Value>
-SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc,
+SparseTensorLoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                          ArrayRef<Value> reduc) {
   // Clean up the values, it would help use to discover potential bug at a
   // earlier stage (instead of silently using a wrong value).
@@ -481,9 +539,9 @@
   assert(loopInfo.tids.size() == loopInfo.dims.size());
   SmallVector<Value> red;
   if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
-    red = exitCoiterationLoop(builder, loc, reduc);
+    red = exitCoiterationLoop(rewriter, loc, reduc);
   } else {
-    red = exitForLoop(builder, loc, reduc);
+    red = exitForLoop(rewriter, loc, reduc);
   }
 
   assert(loopStack.size() == loopSeqStack.size());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -1015,13 +1015,14 @@
   case SparseParallelizationStrategy::kNone:
     return false;
   case SparseParallelizationStrategy::kDenseOuterLoop:
-    return isOuter && !isSparse && !isReduction;
+    // We do not support parallel reduction on tensor expansion.
+    return isOuter && !isSparse && !codegen.expCount;
   case SparseParallelizationStrategy::kAnyStorageOuterLoop:
-    return isOuter && !isReduction;
+    return isOuter && !codegen.expCount;
   case SparseParallelizationStrategy::kDenseAnyLoop:
-    return !isSparse && !isReduction;
+    return !isSparse && !codegen.expCount;
   case SparseParallelizationStrategy::kAnyStorageAnyLoop:
-    return !isReduction;
+    return !codegen.expCount;
   }
   llvm_unreachable("unexpected parallelization strategy");
 }
@@ -1038,7 +1039,6 @@
   bool isSparse = merger.isDimLevelType(tid, idx, DimLvlType::kCompressed) ||
                   merger.isDimLevelType(tid, idx, DimLvlType::kSingleton);
   bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse);
-  assert(!isParallel);
 
   // Emit a sequential or vector loop.
   SmallVector<Value> operands;
@@ -1048,7 +1048,7 @@
     operands.push_back(codegen.expCount);
 
   Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim(
-      builder, loc, tid, dim, operands, false, extraTids, extraDims);
+      builder, loc, tid, dim, operands, isParallel, extraTids, extraDims);
 
   // The operands should be updated by loop emitter already.
   if (codegen.redVal)
@@ -1300,14 +1300,14 @@
 }
 
 /// Ends a single loop in current sequence. Returns new values for needsUniv.
-static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
+static bool endLoop(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                     linalg::GenericOp op, Operation *loop, unsigned idx,
                     unsigned li, bool needsUniv) {
   codegen.curVecLength = 1;
 
   // End a while-loop.
   if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
-    finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv,
+    finalizeWhileOp(merger, codegen, rewriter, op, idx, needsUniv,
                     merger.lat(li).bits, whileOp);
   } else {
     needsUniv = false;
@@ -1320,7 +1320,7 @@
     reduc.push_back(codegen.expCount);
 
   auto loopRet =
-      codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc);
+      codegen.loopEmitter.exitCurrentLoop(rewriter, op.getLoc(), reduc);
   assert(reduc.size() == loopRet.size());
 
   if (codegen.redVal)
diff --git a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
@@ -1,14 +1,14 @@
 // RUN: mlir-opt %s -sparsification="parallelization-strategy=none" | \
 // RUN:   FileCheck %s --check-prefix=CHECK-PAR0
 // FIXME: we do not support vectorization/parallel loops in loop emitter right now
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR1
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR2
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR3
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR4
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR1
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR2
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR3
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR4
 
 #DenseMatrix = #sparse_tensor.encoding<{
   dimLevelType = [ "dense", "dense" ]
@@ -151,7 +151,8 @@
 //
 // CHECK-PAR4-LABEL: func @matvec
 // CHECK-PAR4:       scf.parallel
-// CHECK-PAR4:       scf.for
+// CHECK-PAR4:       scf.parallel
+// CHECK-PAR4:       scf.reduce
 // CHECK-PAR4:       return
 //
 func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
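
The updated CHECK-PAR4 expectations above correspond to CSR matvec lowering to nested scf.parallel loops, with the inner dot product carried as an scf.parallel reduction. A hand-written sketch of that shape, not actual compiler output (buffer names and memref types are assumptions):

  func.func @matvec_sketch(%ptrs: memref<?xindex>, %idxs: memref<?xindex>,
                           %vals: memref<?xf32>, %x: memref<32xf32>,
                           %y: memref<16xf32>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    // Outer parallel loop over the dense row dimension; no reduction here.
    scf.parallel (%i) = (%c0) to (%c16) step (%c1) {
      %i1 = arith.addi %i, %c1 : index
      %lo = memref.load %ptrs[%i] : memref<?xindex>
      %hi = memref.load %ptrs[%i1] : memref<?xindex>
      %init = memref.load %y[%i] : memref<16xf32>
      // Inner parallel loop over the compressed column segment; the dot
      // product is a reduction seeded with the current output value.
      %sum = scf.parallel (%jj) = (%lo) to (%hi) step (%c1)
               init (%init) -> f32 {
        %j = memref.load %idxs[%jj] : memref<?xindex>
        %a = memref.load %vals[%jj] : memref<?xf32>
        %b = memref.load %x[%j] : memref<32xf32>
        %mul = arith.mulf %a, %b : f32
        scf.reduce(%mul) : f32 {
        ^bb0(%p: f32, %q: f32):
          %add = arith.addf %p, %q : f32
          scf.reduce.return %add : f32
        }
      }
      memref.store %sum, %y[%i] : memref<16xf32>
    }
    return
  }
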
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
@@ -5,6 +5,14 @@
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 //
+// Do the same run, but now with parallelization.
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
+// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:   -e entry -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
 // Do the same run, but now with SIMDization as well. This should not change the outcome.
 // FIXME: support vector in sparse loop emitter.
 // R_U_N: mlir-opt %s \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
@@ -6,13 +6,13 @@
 // RUN:   FileCheck %s
 //
 // Do the same run, but now with SIMDization as well. This should not change the outcome.
-// FIXME: we do not support vectorization/parallel loops in loop emitter right now
-// R_U_N: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
-// R_U_N: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
-// R_U_N: mlir-cpu-runner \
-// R_U_N:   -e entry -entry-point-result=void \
-// R_U_N:   -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// R_U_N:   FileCheck %s
+//
+// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
+// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:   -e entry -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN:   FileCheck %s
 
 !Filename = !llvm.ptr<i8>