diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -362,8 +362,8 @@
       ArrayRef<size_t> dims, SmallVectorImpl<Value> &reduc, bool needsUniv,
       ArrayRef<size_t> extraTids = {}, ArrayRef<size_t> extraDims = {});
-  SmallVector<Value> exitCurrentLoop(OpBuilder &builder, Location loc,
-                                     ArrayRef<Value> reduc = {});
+  void exitCurrentLoop(RewriterBase &rewriter, Location loc,
+                       SmallVectorImpl<Value> &reduc);

   /// Return the array of coordinate for all the loop generated till now.
   void getCoordinateArray(SmallVectorImpl<Value> &coords) const {
@@ -435,17 +435,32 @@
   }

   /// Exits a for loop, returns the reduction results, e.g.,
+  /// For for loops:
   /// %ret = for () {
   ///   ...
+  ///   %val = addi %args, %c
   ///   yield %val
   /// }
+  /// For parallel loops, the following code generated by the user
+  /// %ret = parallel () init(%args) {
+  ///   ...
+  ///   %val = addi %args, %c
+  /// }
+  /// will be transformed into
+  /// %ret = parallel () init(%args) {
+  ///   ...
+  ///   scf.reduce(%c) ^bb0(%0, %1) {
+  ///     %val = addi %0, %1
+  ///     scf.reduce.return %val
+  ///   }
+  /// }
   /// Return %ret to user, while %val is provided by users (`reduc`)
-  SmallVector<Value> exitForLoop(OpBuilder &builder, Location loc,
-                                 ArrayRef<Value> reduc);
+  void exitForLoop(RewriterBase &rewriter, Location loc,
+                   SmallVectorImpl<Value> &reduc);

   /// Exits a while loop, returns the reduction results.
-  SmallVector<Value> exitCoiterationLoop(OpBuilder &builder, Location loc,
-                                         ArrayRef<Value> reduc);
+  void exitCoiterationLoop(OpBuilder &builder, Location loc,
+                           SmallVectorImpl<Value> &reduc);

   struct LoopLevelInfo {
     LoopLevelInfo(ArrayRef<size_t> tids, ArrayRef<size_t> dims, Operation *loop,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -166,9 +166,12 @@
     OpBuilder &builder, Location loc, size_t tid, size_t dim,
     SmallVectorImpl<Value> &reduc, bool isParallel, ArrayRef<size_t> extraTids,
    ArrayRef<size_t> extraDims) {
+  assert(dimTypes[tid].size() > dim);
   // We can not re-enter the same level.
   assert(!coord[tid][dim]);
+  // TODO: support multiple return on parallel for?
+  assert(!isParallel || reduc.size() <= 1);
   Value step = constantIndex(builder, loc, 1);
   auto dimType = dimTypes[tid][dim];
@@ -182,11 +185,38 @@
   Value lo = isSparseInput ? pidxs[tid][dim]      // current offset
                            : loopSeqStack.back(); // univeral tid
   Value hi = highs[tid][dim];
+  Operation *loop = nullptr;
+  Value iv;
+  if (isParallel) {
+    scf::ParallelOp parOp =
+        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(parOp.getBody());
+    assert(parOp.getNumReductions() == reduc.size());
+    iv = parOp.getInductionVars()[0];
+
+    // In-place update on the reduction variable vector.
+    // Note that the init vals are not the actual reduction variables but are
+    // instead used as a `special handle` to (temporarily) represent them. The
+    // expression on the init vals will be moved into scf.reduce and replaced
+    // with the block arguments when exiting the loop (see exitForLoop). This
+    // is needed as we can not build the actual reduction block and get the
+    // actual reduction variable before the user fills the parallel loop body.
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = parOp.getInitVals()[i];
+    loop = parOp;
+  } else {
+    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(forOp.getBody());
+    iv = forOp.getInductionVar();
+
+    // In-place update on the reduction variable vector.
+    assert(forOp.getNumRegionIterArgs() == reduc.size());
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = forOp.getRegionIterArg(i);
+    loop = forOp;
+  }
+  assert(loop && iv);

-  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
-  builder.setInsertionPointToStart(forOp.getBody());
-  Value iv = forOp.getInductionVar();
-  assert(iv);
   if (isSparseInput) {
     pidxs[tid][dim] = iv;
     // Generating a load on the indices array yields the coordinate.
@@ -203,16 +233,12 @@
   // NOTE: we can also prepares for next dim here in advance
   // Push the loop into stack
-  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
+  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), loop,
                          coord[tid][dim]);
   // Emit extra locals.
   emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);

-  // In-place update on the reduction variable vector.
-  assert(forOp.getNumRegionIterArgs() == reduc.size());
-  for (int i = 0, e = reduc.size(); i < e; i++)
-    reduc[i] = forOp.getRegionIterArg(i);
-  return forOp;
+  return loop;
 }

 Operation *SparseTensorLoopEmitter::enterCoiterationOverTensorsAtDims(
@@ -386,17 +412,68 @@
   }
 }

-SmallVector<Value>
-SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
-                                     ArrayRef<Value> reduc) {
+void SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
+                                          SmallVectorImpl<Value> &reduc) {
   LoopLevelInfo &loopInfo = loopStack.back();
   auto &dims = loopStack.back().dims;
   auto &tids = loopStack.back().tids;
-  auto forOp = llvm::cast<scf::ForOp>(loopInfo.loop);
-  if (!reduc.empty()) {
-    assert(reduc.size() == forOp.getNumResults());
-    builder.setInsertionPointToEnd(forOp.getBody());
-    builder.create<scf::YieldOp>(loc, reduc);
+  auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop);
+  if (forOp) {
+    if (!reduc.empty()) {
+      assert(reduc.size() == forOp.getNumResults());
+      rewriter.setInsertionPointToEnd(forOp.getBody());
+      rewriter.create<scf::YieldOp>(loc, reduc);
+    }
+    // Exit the loop.
+    rewriter.setInsertionPointAfter(forOp);
+    // In-place update reduction variables.
+    for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
+      reduc[i] = forOp.getResult(i);
+  } else {
+    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
+    if (!reduc.empty()) {
+      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
+      Operation *redExp = reduc.front().getDefiningOp();
+      // The reduction expression should have no use yet.
+      assert(redExp->getUses().empty());
+      // This must be a binary operation; it is the user's responsibility to
+      // ensure the operation is commutative.
+      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
+
+      Value redVal = parOp.getInitVals().front();
+      Value curVal;
+      if (redExp->getOperand(0) == redVal)
+        curVal = redExp->getOperand(1);
+      else if (redExp->getOperand(1) == redVal)
+        curVal = redExp->getOperand(0);
+      // One of the operands must be the init value (which is also the
+      // previous reduction value).
+      assert(curVal);
+      // The reduction expression should be the only user of the reduction
+      // value inside the parallel for.
+      unsigned numUsers = 0;
+      for (Operation *op : redVal.getUsers()) {
+        if (op->getParentOp() == parOp)
+          numUsers++;
+      }
+      assert(numUsers == 1);
+      (void)numUsers; // Silence unused-variable warning in release builds.
+      rewriter.setInsertionPointAfter(redExp);
+      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
+      redExp->remove(); // Detach from the previous parent.
+      // Attach to the reduction op.
+      Block *redBlock = &redOp.getRegion().getBlocks().front();
+      redBlock->push_back(redExp);
+      // Replace the operands with the reduction block arguments.
+      rewriter.updateRootInPlace(
+          redExp, [&]() { redExp->setOperands(redBlock->getArguments()); });
+      rewriter.setInsertionPointToEnd(redBlock);
+      rewriter.create<scf::ReduceReturnOp>(loc, redExp->getResult(0));
+    }
+    rewriter.setInsertionPointAfter(parOp);
+    // In-place update reduction variables.
+    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
+      reduc[i] = parOp.getResult(i);
   }

   // Finished iterating a tensor, clean up
@@ -410,14 +487,10 @@
     if (!isDenseDim(dimTypes[tid][dim]))
       highs[tid][dim] = Value();
   }
-  // exit the loop
-  builder.setInsertionPointAfter(forOp);
-  return forOp.getResults();
 }

-SmallVector<Value>
-SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
-                                             ArrayRef<Value> reduc) {
+void SparseTensorLoopEmitter::exitCoiterationLoop(
+    OpBuilder &builder, Location loc, SmallVectorImpl<Value> &reduc) {
   auto whileOp = llvm::cast<scf::WhileOp>(loopStack.back().loop);
   auto &dims = loopStack.back().dims;
   auto &tids = loopStack.back().tids;
@@ -451,10 +524,10 @@
   }

   // Reduction value from users.
-  SmallVector<Value> ret;
-  for (auto red : reduc) {
-    operands.push_back(red);
-    ret.push_back(whileOp->getResult(o++));
+  for (unsigned i = 0, e = reduc.size(); i < e; i++) {
+    operands.push_back(reduc[i]);
+    // In-place update reduction variable.
+    reduc[i] = whileOp->getResult(o++);
   }

   // An (optional) universal index.
@@ -469,26 +542,24 @@
   assert(o == operands.size());
   builder.create<scf::YieldOp>(loc, operands);
   builder.setInsertionPointAfter(whileOp);
-  return ret;
 }

-SmallVector<Value>
-SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc,
-                                         ArrayRef<Value> reduc) {
+void SparseTensorLoopEmitter::exitCurrentLoop(RewriterBase &rewriter,
+                                              Location loc,
+                                              SmallVectorImpl<Value> &reduc) {
   // Clean up the values, it would help use to discover potential bug at a
   // earlier stage (instead of silently using a wrong value).
   LoopLevelInfo &loopInfo = loopStack.back();
   assert(loopInfo.tids.size() == loopInfo.dims.size());
-  SmallVector<Value> red;
   if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
-    red = exitCoiterationLoop(builder, loc, reduc);
+    exitCoiterationLoop(rewriter, loc, reduc);
   } else {
-    red = exitForLoop(builder, loc, reduc);
+    exitForLoop(rewriter, loc, reduc);
   }

   assert(loopStack.size() == loopSeqStack.size());
   loopStack.pop_back();
-  return red;
 }

 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -500,7 +500,7 @@
   rewriter.eraseOp(srcBlock->getTerminator());

   for (int64_t i = 0; i < rank; i++) {
-    loopEmitter.exitCurrentLoop(rewriter, loc);
+    loopEmitter.exitCurrentLoop(rewriter, loc, reduc);
     loopEmitter.exitCurrentLoopSeq();
   }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -473,6 +473,25 @@
 // Sparse compiler synthesis methods (statements and expressions).
 //===----------------------------------------------------------------------===//

+/// Generates statements that update the reduction variables.
+static void
+genReducUpdateStmt(CodeGen &codegen, Merger &merger,
+                   function_ref<void(SmallVectorImpl<Value> &reduc)> callback) {
+  SmallVector<Value> reduc;
+  if (codegen.redVal)
+    reduc.push_back(codegen.redVal);
+  if (codegen.expValues)
+    reduc.push_back(codegen.expCount);
+
+  callback(reduc);
+
+  // The callback is expected to update the reduction value vector in place.
+  if (codegen.redVal)
+    updateReduc(merger, codegen, reduc.front());
+  if (codegen.expValues)
+    codegen.expCount = reduc.back();
+}
+
 /// Local bufferization of all dense and sparse data structures.
 static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                        linalg::GenericOp op) {
@@ -1005,8 +1024,7 @@
 /// Returns parallelization strategy. Any implicit loop in the Linalg
 /// operation that is marked "parallel" is a candidate. Whether it is actually
 /// converted to a parallel operation depends on the requested strategy.
-static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
-                          bool isSparse) {
+static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isSparse) {
   // Reject parallelization of sparse output.
   if (codegen.sparseOut)
     return false;
@@ -1015,13 +1033,14 @@
   case SparseParallelizationStrategy::kNone:
     return false;
   case SparseParallelizationStrategy::kDenseOuterLoop:
-    return isOuter && !isSparse && !isReduction;
+    // We do not support parallel reduction on tensor expansion.
+    return isOuter && !isSparse && !codegen.expCount;
   case SparseParallelizationStrategy::kAnyStorageOuterLoop:
-    return isOuter && !isReduction;
+    return isOuter && !codegen.expCount;
   case SparseParallelizationStrategy::kDenseAnyLoop:
-    return !isSparse && !isReduction;
+    return !isSparse && !codegen.expCount;
   case SparseParallelizationStrategy::kAnyStorageAnyLoop:
-    return !isReduction;
+    return !codegen.expCount;
   }
   llvm_unreachable("unexpected parallelization strategy");
 }
@@ -1034,28 +1053,16 @@
                    ArrayRef<size_t> extraDims) {
   Location loc = op.getLoc();
   auto iteratorTypes = op.getIteratorTypesArray();
-  bool isReduction = linalg::isReductionIterator(iteratorTypes[idx]);
   bool isSparse = merger.isDimLevelType(tid, idx, DimLvlType::kCompressed) ||
                   merger.isDimLevelType(tid, idx, DimLvlType::kSingleton);
-  bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse);
-  assert(!isParallel);
-
-  // Emit a sequential or vector loop.
-  SmallVector<Value> operands;
-  if (codegen.redVal)
-    operands.push_back(codegen.redVal);
-  if (codegen.expValues)
-    operands.push_back(codegen.expCount);
-
-  Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim(
-      builder, loc, tid, dim, operands, false, extraTids, extraDims);
-
-  // The operands should be updated by loop emitter already.
-  if (codegen.redVal)
-    updateReduc(merger, codegen, operands.front());
-  if (codegen.expValues)
-    codegen.expCount = operands.back();
-
+  bool isParallel = isParallelFor(codegen, isOuter, isSparse);
+
+  Operation *loop = nullptr;
+  genReducUpdateStmt(codegen, merger, [&](SmallVectorImpl<Value> &reduc) {
+    loop = codegen.loopEmitter.enterLoopOverTensorAtDim(
+        builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
+  });
+  assert(loop);
   return loop;
 }

@@ -1066,23 +1073,14 @@
                    ArrayRef<size_t> extraTids, ArrayRef<size_t> extraDims) {
-  SmallVector<Value> operands;
-  // Construct the while-loop with a parameter for each index.
-
-  if (codegen.redVal)
-    operands.push_back(codegen.redVal);
-  if (codegen.expValues)
-    operands.push_back(codegen.expCount);
-
-  Operation *loop = codegen.loopEmitter.enterCoiterationOverTensorsAtDims(
-      builder, op.getLoc(), condTids, condDims, operands, needsUniv, extraTids,
-      extraDims);
-
-  if (codegen.redVal)
-    updateReduc(merger, codegen, operands.front());
-  if (codegen.expValues)
-    codegen.expCount = operands.back();
-
+  Operation *loop = nullptr;
+  genReducUpdateStmt(codegen, merger, [&](SmallVectorImpl<Value> &reduc) {
+    // Construct the while-loop with a parameter for each index.
+    loop = codegen.loopEmitter.enterCoiterationOverTensorsAtDims(
+        builder, op.getLoc(), condTids, condDims, reduc, needsUniv, extraTids,
+        extraDims);
+  });
+  assert(loop);
   return loop;
 }

@@ -1300,33 +1298,20 @@
 }

 /// Ends a single loop in current sequence. Returns new values for needsUniv.
-static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
+static bool endLoop(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                     linalg::GenericOp op, Operation *loop, unsigned idx,
                     unsigned li, bool needsUniv) {
-  codegen.curVecLength = 1;
-
   // End a while-loop.
   if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
-    finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv,
+    finalizeWhileOp(merger, codegen, rewriter, op, idx, needsUniv,
                     merger.lat(li).bits, whileOp);
   } else {
     needsUniv = false;
   }

-  SmallVector<Value> reduc;
-  if (codegen.redVal)
-    reduc.push_back(codegen.redVal);
-  if (codegen.expValues)
-    reduc.push_back(codegen.expCount);
-
-  auto loopRet =
-      codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc);
-  assert(reduc.size() == loopRet.size());
-
-  if (codegen.redVal)
-    updateReduc(merger, codegen, loopRet.front());
-  if (codegen.expValues)
-    codegen.expCount = loopRet.back();
+  genReducUpdateStmt(codegen, merger, [&](SmallVectorImpl<Value> &reduc) {
+    codegen.loopEmitter.exitCurrentLoop(rewriter, op.getLoc(), reduc);
+  });

   return needsUniv;
 }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
@@ -1,14 +1,14 @@
 // RUN: mlir-opt %s -sparsification="parallelization-strategy=none" | \
 // RUN:   FileCheck %s --check-prefix=CHECK-PAR0
-// FIXME: we do not support vectorization/parallel loops in loop emitter right now
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR1
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR2
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR3
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR4
+// FIXME: we do not support vectorization in the loop emitter right now
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR1
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR2
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR3
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR4

 #DenseMatrix = #sparse_tensor.encoding<{
   dimLevelType = [ "dense", "dense" ]
@@ -151,7 +151,8 @@
 //
 // CHECK-PAR4-LABEL: func @matvec
 // CHECK-PAR4:       scf.parallel
-// CHECK-PAR4:       scf.for
+// CHECK-PAR4:       scf.parallel
+// CHECK-PAR4:       scf.reduce
 // CHECK-PAR4:       return
 //
 func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
@@ -5,6 +5,14 @@
 // RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 //
+// Do the same run, but now with parallelization.
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
+// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void \
+// RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
 // Do the same run, but now with SIMDization as well. This should not change the outcome.
 // FIXME: support vector in sparse loop emitter.
 // R_U_N: mlir-opt %s \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
@@ -6,13 +6,13 @@
 // RUN: FileCheck %s
 //
 // Do the same run, but now with SIMDization as well. This should not change the outcome.
-// FIXME: we do not support vectorization/parallel loops in loop emitter right now
-// R_U_N: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
-// R_U_N: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
-// R_U_N: mlir-cpu-runner \
-// R_U_N:  -e entry -entry-point-result=void \
-// R_U_N:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// R_U_N: FileCheck %s
+//
+// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
+// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void \
+// RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s

 !Filename = !llvm.ptr<i8>
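
For reference, a rough hand-written sketch of the loop structure the parallelized path is expected to emit for a single reduction loop, after exitForLoop has moved the reduction expression into an scf.reduce region. This is not part of the patch; the value names, bounds, and the f64 element type are made up for illustration:

  %res = scf.parallel (%i) = (%c0) to (%n) step (%c1) init (%init) -> f64 {
    // Load the value to be folded into the reduction.
    %v = memref.load %values[%i] : memref<?xf64>
    // The reduction expression built by the user on the init value has been
    // re-attached here, with its operands replaced by the block arguments.
    scf.reduce(%v) : f64 {
    ^bb0(%lhs: f64, %rhs: f64):
      %sum = arith.addf %lhs, %rhs : f64
      scf.reduce.return %sum : f64
    }
  }

Using the init value as a temporary stand-in for the reduction variable lets callers build the loop body with the same reduction-chaining code they use for scf.for, while the creation of the scf.reduce region is deferred until the loop is exited.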