diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -380,8 +380,8 @@
       ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc = {},
       ArrayRef<size_t> extraTids = {}, ArrayRef<size_t> extraDims = {});
-  SmallVector<Value> exitCurrentLoop(OpBuilder &builder, Location loc,
-                                     ArrayRef<Value> reduc = {});
+  void exitCurrentLoop(RewriterBase &rewriter, Location loc,
+                       MutableArrayRef<Value> reduc = {});
 
   /// Returns the array of coordinate for all the loop generated till now.
   void getCoordinateArray(SmallVectorImpl<Value> &coords) const {
@@ -452,17 +452,35 @@
                        ArrayRef<size_t> dims);
 
   /// Exits a for loop, returns the reduction results, e.g.,
+  /// For sequential for loops:
   /// %ret = for () {
   ///   ...
+  ///   %val = addi %args, %c
   ///   yield %val
   /// }
-  /// Return %ret to user, while %val is provided by users (`reduc`)
-  SmallVector<Value> exitForLoop(OpBuilder &builder, Location loc,
-                                 ArrayRef<Value> reduc);
+  /// For parallel loops, the following code generated by users:
+  /// %ret = parallel () init(%args) {
+  ///   ...
+  ///   %val = op %args, %c
+  /// }
+  /// will be transformed into
+  /// %ret = parallel () init(%args) {
+  ///   ...
+  ///   scf.reduce(%c) bb0(%0, %1){
+  ///     %val = op %0, %1
+  ///     scf.reduce.return %val
+  ///   }
+  /// }
+  /// NOTE: only one instruction will be moved into the reduce block; the
+  /// transformation will fail if multiple instructions are used to compute
+  /// the reduction value.
+  /// Return %ret to the user, while %val is provided by users (`reduc`).
+  void exitForLoop(RewriterBase &rewriter, Location loc,
+                   MutableArrayRef<Value> reduc);
 
   /// Exits a while loop, returns the reduction results.
-  SmallVector<Value> exitCoiterationLoop(OpBuilder &builder, Location loc,
-                                         ArrayRef<Value> reduc);
+  void exitCoIterationLoop(OpBuilder &builder, Location loc,
+                           MutableArrayRef<Value> reduc);
 
   // Whether the loop emitter needs to treat the last tensor as the output
   // tensor.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -219,9 +219,12 @@
     OpBuilder &builder, Location loc, size_t tid, size_t dim,
     MutableArrayRef<Value> reduc, bool isParallel, ArrayRef<size_t> extraTids,
     ArrayRef<size_t> extraDims) {
+  assert(dimTypes[tid].size() > dim);
   // We can not re-enter the same level.
   assert(!coord[tid][dim]);
+  // TODO: support multiple returns on parallel for?
+  assert(!isParallel || reduc.size() <= 1);
   Value step = constantIndex(builder, loc, 1);
   auto dimType = dimTypes[tid][dim];
@@ -232,11 +235,38 @@
   Value lo = isSparseInput ? pidxs[tid][dim]      // current offset
                            : loopSeqStack.back(); // univeral tid
   Value hi = highs[tid][dim];
 
+  Operation *loop = nullptr;
+  Value iv;
+  if (isParallel) {
+    scf::ParallelOp parOp =
+        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(parOp.getBody());
+    assert(parOp.getNumReductions() == reduc.size());
+    iv = parOp.getInductionVars()[0];
+
+    // In-place update on the reduction variable vector.
+    // Note that the init vals are not the actual reduction variables but are
+    // instead used as `special handles` to (temporarily) represent them. The
+    // expressions on the init vals will be moved into scf.reduce and replaced
+    // with the block arguments when exiting the loop (see exitForLoop). This
+    // is needed because we cannot build the actual reduction block and get
+    // the actual reduction variable before users fill the parallel loop body.
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = parOp.getInitVals()[i];
+    loop = parOp;
+  } else {
+    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(forOp.getBody());
+    iv = forOp.getInductionVar();
+
+    // In-place update on the reduction variable vector.
+    assert(forOp.getNumRegionIterArgs() == reduc.size());
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = forOp.getRegionIterArg(i);
+    loop = forOp;
+  }
+  assert(loop && iv);
 
-  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
-  builder.setInsertionPointToStart(forOp.getBody());
-  Value iv = forOp.getInductionVar();
-  assert(iv);
   if (isSparseInput) {
     pidxs[tid][dim] = iv;
     // Generating a load on the indices array yields the coordinate.
@@ -253,16 +283,12 @@
 
   // NOTE: we can also prepares for next dim here in advance
   // Push the loop into stack
-  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
+  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), loop,
                          coord[tid][dim]);
   // Emit extra locals.
   emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);
 
-  // In-place update on the reduction variable vector.
-  assert(forOp.getNumRegionIterArgs() == reduc.size());
-  for (int i = 0, e = reduc.size(); i < e; i++)
-    reduc[i] = forOp.getRegionIterArg(i);
-  return forOp;
+  return loop;
 }
 
 Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims(
@@ -434,17 +460,73 @@
   }
 }
 
-SmallVector<Value>
-SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
-                                     ArrayRef<Value> reduc) {
+void SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
+                                          MutableArrayRef<Value> reduc) {
   LoopLevelInfo &loopInfo = loopStack.back();
   auto &dims = loopStack.back().dims;
   auto &tids = loopStack.back().tids;
-  auto forOp = llvm::cast<scf::ForOp>(loopInfo.loop);
-  if (!reduc.empty()) {
-    assert(reduc.size() == forOp.getNumResults());
-    builder.setInsertionPointToEnd(forOp.getBody());
-    builder.create<scf::YieldOp>(loc, reduc);
+  auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop);
+  if (forOp) {
+    if (!reduc.empty()) {
+      assert(reduc.size() == forOp.getNumResults());
+      rewriter.setInsertionPointToEnd(forOp.getBody());
+      rewriter.create<scf::YieldOp>(loc, reduc);
+    }
+    // Exit the loop.
+    rewriter.setInsertionPointAfter(forOp);
+    // In-place update of the reduction variables.
+    for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
+      reduc[i] = forOp.getResult(i);
+  } else {
+    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
+    if (!reduc.empty()) {
+      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
+      Operation *redExp = reduc.front().getDefiningOp();
+      // The reduction expression should have no uses.
+      assert(redExp->getUses().empty());
+      // This must be a binary operation.
+      // NOTE: It is the users' responsibility to ensure that the operation is
+      // commutative.
+      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
+
+      Value redVal = parOp.getInitVals().front();
+      Value curVal;
+      if (redExp->getOperand(0) == redVal)
+        curVal = redExp->getOperand(1);
+      else if (redExp->getOperand(1) == redVal)
+        curVal = redExp->getOperand(0);
+      // One of the operands must be the init value (which is also the
+      // previous reduction value).
+      assert(curVal);
+      // The reduction expression should be the only user of the reduction
+      // value inside the parallel for.
+      unsigned numUsers = 0;
+      for (Operation *op : redVal.getUsers()) {
+        if (op->getParentOp() == parOp)
+          numUsers++;
+      }
+      assert(numUsers == 1);
+      (void)numUsers; // to silence unused variable warning in release build
+
+      rewriter.setInsertionPointAfter(redExp);
+      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
+      // Attach to the reduction op.
+      Block *redBlock = &redOp.getRegion().getBlocks().front();
+      rewriter.setInsertionPointToEnd(redBlock);
+      Operation *newRed = rewriter.clone(*redExp);
+      // Replaces arguments of the reduction expression by using the block
+      // arguments from scf.reduce.
+      rewriter.updateRootInPlace(
+          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
+      // Erases the outdated reduction expression.
+      rewriter.eraseOp(redExp);
+      rewriter.setInsertionPointToEnd(redBlock);
+      rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
+    }
+    rewriter.setInsertionPointAfter(parOp);
+    // In-place update of the reduction variables.
+    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
+      reduc[i] = parOp.getResult(i);
   }
 
   // Finished iterating a tensor, clean up
@@ -458,14 +540,10 @@
     if (!isDenseDLT(dimTypes[tid][dim]))
       highs[tid][dim] = Value();
   }
-  // exit the loop
-  builder.setInsertionPointAfter(forOp);
-  return forOp.getResults();
 }
 
-SmallVector<Value>
-SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
-                                             ArrayRef<Value> reduc) {
+void SparseTensorLoopEmitter::exitCoIterationLoop(
+    OpBuilder &builder, Location loc, MutableArrayRef<Value> reduc) {
   auto whileOp = llvm::cast<scf::WhileOp>(loopStack.back().loop);
   auto &dims = loopStack.back().dims;
   auto &tids = loopStack.back().tids;
@@ -499,10 +577,10 @@
   }
 
   // Reduction value from users.
-  SmallVector<Value> ret;
-  for (auto red : reduc) {
-    operands.push_back(red);
-    ret.push_back(whileOp->getResult(o++));
+  for (unsigned i = 0, e = reduc.size(); i < e; i++) {
+    operands.push_back(reduc[i]);
+    // In-place update of the reduction variable.
+    reduc[i] = whileOp->getResult(o++);
   }
 
   // An (optional) universal index.
@@ -517,26 +595,24 @@
   assert(o == operands.size());
   builder.create<scf::YieldOp>(loc, operands);
   builder.setInsertionPointAfter(whileOp);
-  return ret;
 }
 
-SmallVector<Value>
-SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc,
-                                         ArrayRef<Value> reduc) {
+void SparseTensorLoopEmitter::exitCurrentLoop(RewriterBase &rewriter,
+                                              Location loc,
+                                              MutableArrayRef<Value> reduc) {
   // Clean up the values, it would help use to discover potential bug at a
   // earlier stage (instead of silently using a wrong value).
   LoopLevelInfo &loopInfo = loopStack.back();
   assert(loopInfo.tids.size() == loopInfo.dims.size());
-  SmallVector<Value> red;
   if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
-    red = exitCoiterationLoop(builder, loc, reduc);
+    exitCoIterationLoop(rewriter, loc, reduc);
   } else {
-    red = exitForLoop(builder, loc, reduc);
+    exitForLoop(rewriter, loc, reduc);
   }
 
   assert(loopStack.size() == loopSeqStack.size());
   loopStack.pop_back();
-  return red;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -410,6 +410,34 @@
 // Sparse compiler synthesis methods (statements and expressions).
 //===----------------------------------------------------------------------===//
 
+/// Generates loop boundary statements (entering/exiting loops). The function
+/// passes and updates the reduction values.
+static Optional<Operation *> genLoopBoundary(
+    CodeGen &codegen, Merger &merger,
+    function_ref<Optional<Operation *>(MutableArrayRef<Value> reduc)>
+        callback) {
+  SmallVector<Value> reduc;
+  if (codegen.redVal)
+    reduc.push_back(codegen.redVal);
+  if (codegen.expValues)
+    reduc.push_back(codegen.expCount);
+  if (codegen.insChain)
+    reduc.push_back(codegen.insChain);
+
+  auto r = callback(reduc);
+
+  // Callback should do in-place update on reduction value vector.
+  unsigned i = 0;
+  if (codegen.redVal)
+    updateReduc(merger, codegen, reduc[i++]);
+  if (codegen.expValues)
+    codegen.expCount = reduc[i++];
+  if (codegen.insChain)
+    codegen.insChain = reduc[i];
+
+  return r;
+}
+
 /// Local bufferization of all dense and sparse data structures.
 static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                        linalg::GenericOp op) {
@@ -869,23 +897,25 @@
 /// Returns parallelization strategy. Any implicit loop in the Linalg
 /// operation that is marked "parallel" is a candidate. Whether it is actually
 /// converted to a parallel operation depends on the requested strategy.
-static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
-                          bool isSparse) {
+static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isSparse) {
   // Reject parallelization of sparse output.
   if (codegen.sparseOut)
     return false;
+  // Parallel loops on tensor expansion can cause data races.
+  if (codegen.expCount)
+    return false;
   // Inspect strategy.
   switch (codegen.options.parallelizationStrategy) {
   case SparseParallelizationStrategy::kNone:
     return false;
   case SparseParallelizationStrategy::kDenseOuterLoop:
-    return isOuter && !isSparse && !isReduction;
+    return isOuter && !isSparse;
   case SparseParallelizationStrategy::kAnyStorageOuterLoop:
-    return isOuter && !isReduction;
+    return isOuter;
   case SparseParallelizationStrategy::kDenseAnyLoop:
-    return !isSparse && !isReduction;
+    return !isSparse;
   case SparseParallelizationStrategy::kAnyStorageAnyLoop:
-    return !isReduction;
+    return true;
   }
   llvm_unreachable("unexpected parallelization strategy");
 }
@@ -898,33 +928,16 @@
                   ArrayRef<size_t> extraDims) {
   Location loc = op.getLoc();
   auto iteratorTypes = op.getIteratorTypesArray();
-  bool isReduction = linalg::isReductionIterator(iteratorTypes[idx]);
   bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) ||
                   isSingletonDLT(merger.getDimLevelType(tid, idx));
-  bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse);
-  assert(!isParallel);
-
-  // Emit a sequential for loop.
-  SmallVector<Value> operands;
-  if (codegen.redVal)
-    operands.push_back(codegen.redVal);
-  if (codegen.expValues)
-    operands.push_back(codegen.expCount);
-  if (codegen.insChain)
-    operands.push_back(codegen.insChain);
-
-  Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim(
-      builder, loc, tid, dim, operands, isParallel, extraTids, extraDims);
-
-  unsigned o = 0;
-  if (codegen.redVal)
-    updateReduc(merger, codegen, operands[o++]);
-  if (codegen.expValues)
-    codegen.expCount = operands[o++];
-  if (codegen.insChain)
-    codegen.insChain = operands[o++];
-  assert(o == operands.size());
-
+  bool isParallel = isParallelFor(codegen, isOuter, isSparse);
+
+  Operation *loop =
+      genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+        return codegen.loopEmitter.enterLoopOverTensorAtDim(
+            builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
+      }).value();
+  assert(loop);
   return loop;
 }
@@ -934,29 +947,15 @@
                     ArrayRef<size_t> condTids, ArrayRef<size_t> condDims,
                     ArrayRef<size_t> extraTids, ArrayRef<size_t> extraDims) {
-  SmallVector<Value> operands;
-
-  // Construct the while-loop with a parameter for each index.
-  if (codegen.redVal)
-    operands.push_back(codegen.redVal);
-  if (codegen.expValues)
-    operands.push_back(codegen.expCount);
-  if (codegen.insChain)
-    operands.push_back(codegen.insChain);
-
-  Operation *loop = codegen.loopEmitter.enterCoIterationOverTensorsAtDims(
-      builder, op.getLoc(), condTids, condDims, needsUniv, operands, extraTids,
-      extraDims);
-
-  unsigned o = 0;
-  if (codegen.redVal)
-    updateReduc(merger, codegen, operands[o++]);
-  if (codegen.expValues)
-    codegen.expCount = operands[o++];
-  if (codegen.insChain)
-    codegen.insChain = operands[o++];
-  assert(o == operands.size());
+  Operation *loop =
+      genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+        // Construct the while-loop with a parameter for each index.
+        return codegen.loopEmitter.enterCoIterationOverTensorsAtDims(
+            builder, op.getLoc(), condTids, condDims, needsUniv, reduc,
+            extraTids, extraDims);
+      }).value();
+  assert(loop);
   return loop;
 }
@@ -1186,37 +1185,21 @@
 }
 
 /// Ends a single loop in current sequence. Returns new values for needsUniv.
-static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
+static bool endLoop(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                     linalg::GenericOp op, Operation *loop, unsigned idx,
                     unsigned li, bool needsUniv) {
   // End a while-loop.
   if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
-    finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv,
+    finalizeWhileOp(merger, codegen, rewriter, op, idx, needsUniv,
                     merger.lat(li).bits, whileOp);
   } else {
     needsUniv = false;
   }
 
-  SmallVector<Value> reduc;
-  if (codegen.redVal)
-    reduc.push_back(codegen.redVal);
-  if (codegen.expValues)
-    reduc.push_back(codegen.expCount);
-  if (codegen.insChain)
-    reduc.push_back(codegen.insChain);
-
-  auto loopRet =
-      codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc);
-  assert(reduc.size() == loopRet.size());
-
-  unsigned o = 0;
-  if (codegen.redVal)
-    updateReduc(merger, codegen, loopRet[o++]);
-  if (codegen.expValues)
-    codegen.expCount = loopRet[o++];
-  if (codegen.insChain)
-    codegen.insChain = loopRet[o++];
-  assert(o == loopRet.size());
+  genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+    codegen.loopEmitter.exitCurrentLoop(rewriter, op.getLoc(), reduc);
+    return llvm::None;
+  });
 
   return needsUniv;
 }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
@@ -1,14 +1,13 @@
 // RUN: mlir-opt %s -sparsification="parallelization-strategy=none" | \
 // RUN: FileCheck %s --check-prefix=CHECK-PAR0
-// FIXME: we do not support vectorization/parallel loops in loop emitter right now
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
-// R_U_N: FileCheck %s --check-prefix=CHECK-PAR1
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
-// R_U_N: FileCheck %s --check-prefix=CHECK-PAR2
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
-// R_U_N: FileCheck %s --check-prefix=CHECK-PAR3
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
-// R_U_N: FileCheck %s --check-prefix=CHECK-PAR4
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
+// RUN: FileCheck %s --check-prefix=CHECK-PAR1
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
+// RUN: FileCheck %s --check-prefix=CHECK-PAR2
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
+// RUN: FileCheck %s --check-prefix=CHECK-PAR3
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
+// RUN: FileCheck %s --check-prefix=CHECK-PAR4
 
 #DenseMatrix = #sparse_tensor.encoding<{
   dimLevelType = [ "dense", "dense" ]
 }>
@@ -151,7 +150,8 @@
 //
 // CHECK-PAR4-LABEL: func @matvec
 // CHECK-PAR4: scf.parallel
-// CHECK-PAR4: scf.for
+// CHECK-PAR4: scf.parallel
+// CHECK-PAR4: scf.reduce
 // CHECK-PAR4: return
 //
 func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
diff --git a/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir b/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir
@@ -0,0 +1,63 @@
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
+// RUN: FileCheck %s
+
+#CSR = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed" ]
+}>
+
+#trait_matvec = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>, // A
+    affine_map<(i,j) -> (j)>,   // b
+    affine_map<(i,j) -> (i)>    // x (out)
+  ],
+  iterator_types = ["parallel", "reduction"],
+  doc = "x(i) += A(i,j) * b(j)"
+}
+// CHECK-LABEL: func.func @matvec(
+// CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>>,
+// CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<32xf32>,
+// CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<16xf32>) -> tensor<16xf32> {
+// CHECK-DAG: %[[TMP_c16:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[TMP_c0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[TMP_c1:.*]] = arith.constant 1 : index
+// CHECK: %[[TMP_0:.*]] = sparse_tensor.pointers %[[TMP_arg0]] {dimension = 1 : index}
+// CHECK: %[[TMP_1:.*]] = sparse_tensor.indices %[[TMP_arg0]] {dimension = 1 : index}
+// CHECK: %[[TMP_2:.*]] = sparse_tensor.values %[[TMP_arg0]]
+// CHECK: %[[TMP_3:.*]] = bufferization.to_memref %[[TMP_arg1]] : memref<32xf32>
+// CHECK: %[[TMP_4:.*]] = bufferization.to_memref %[[TMP_arg2]] : memref<16xf32>
+// CHECK: scf.parallel (%[[TMP_arg3:.*]]) = (%[[TMP_c0]]) to (%[[TMP_c16]]) step (%[[TMP_c1]]) {
+// CHECK:   %[[TMP_6:.*]] = memref.load %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32>
+// CHECK:   %[[TMP_7:.*]] = memref.load %[[TMP_0]][%[[TMP_arg3]]] : memref<?xindex>
+// CHECK:   %[[TMP_8:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+// CHECK:   %[[TMP_9:.*]] = memref.load %[[TMP_0]][%[[TMP_8]]] : memref<?xindex>
+// CHECK:   %[[TMP_10:.*]] = scf.parallel (%[[TMP_arg4:.*]]) = (%[[TMP_7]]) to (%[[TMP_9]]) step (%[[TMP_c1]]) init (%[[TMP_6]]) -> f32 {
+// CHECK:     %[[TMP_11:.*]] = memref.load %[[TMP_1]][%[[TMP_arg4]]] : memref<?xindex>
+// CHECK:     %[[TMP_12:.*]] = memref.load %[[TMP_2]][%[[TMP_arg4]]] : memref<?xf32>
+// CHECK:     %[[TMP_13:.*]] = memref.load %[[TMP_3]][%[[TMP_11]]] : memref<32xf32>
+// CHECK:     %[[TMP_14:.*]] = arith.mulf %[[TMP_12]], %[[TMP_13]] : f32
+// CHECK:     scf.reduce(%[[TMP_14]]) : f32 {
+// CHECK:     ^bb0(%[[TMP_arg5:.*]]: f32, %[[TMP_arg6:.*]]: f32):
+// CHECK:       %[[TMP_15:.*]] = arith.addf %[[TMP_arg5]], %[[TMP_arg6]] : f32
+// CHECK:       scf.reduce.return %[[TMP_15]] : f32
+// CHECK:     }
+// CHECK:     scf.yield
+// CHECK:   }
+// CHECK:   memref.store %[[TMP_10]], %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32>
+// CHECK:   scf.yield
+// CHECK: }
+// CHECK: %[[TMP_5:.*]] = bufferization.to_tensor %[[TMP_4]] : memref<16xf32>
+// CHECK: return %[[TMP_5]] : tensor<16xf32>
+func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
+                  %argb: tensor<32xf32>,
+                  %argx: tensor<16xf32>) -> tensor<16xf32> {
+  %0 = linalg.generic #trait_matvec
+    ins(%arga, %argb : tensor<16x32xf32, #CSR>, tensor<32xf32>)
+    outs(%argx: tensor<16xf32>) {
+    ^bb(%A: f32, %b: f32, %x: f32):
+      %0 = arith.mulf %A, %b : f32
+      %1 = arith.addf %0, %x : f32
+      linalg.yield %1 : f32
+  } -> tensor<16xf32>
+  return %0 : tensor<16xf32>
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
@@ -2,6 +2,14 @@
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
+//
+// Do the same run, but now with parallelization.
+//
+// RUN: mlir-opt %s --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
+// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
 
 #CSR = #sparse_tensor.encoding<{
   dimLevelType = [ "dense", "compressed" ],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
@@ -4,6 +4,16 @@
 // RUN: -e entry -entry-point-result=void \
 // RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
+//
+// Do the same run, but now with parallelization.
+//
+// RUN: mlir-opt %s \
+// RUN: --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
+// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN: -e entry -entry-point-result=void \
+// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
 
 !Filename = !llvm.ptr<i8>
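
As an illustration of the reduction rewrite that exitForLoop performs on scf.parallel
loops, here is a minimal sketch; the SSA names, the memref.load producing the
per-iteration contribution, and the choice of arith.addf as the commutative reduction
operation are illustrative assumptions, not verbatim compiler output.

While the loop body is being built, the reduction is expressed directly on the init
value, which enterLoopOverTensorAtDim hands back as a temporary handle (this transient
form does not verify on its own):

  %ret = scf.parallel (%i) = (%lo) to (%hi) step (%c1) init (%init) -> f32 {
    %x = memref.load %mem[%i] : memref<?xf32>
    // Reduction computed on the init-value handle; moved on loop exit.
    %val = arith.addf %init, %x : f32
    scf.yield
  }

On exit, exitForLoop clones that single binary operation into an scf.reduce region,
rewires its operands to the region's block arguments, and erases the original
expression:

  %ret = scf.parallel (%i) = (%lo) to (%hi) step (%c1) init (%init) -> f32 {
    %x = memref.load %mem[%i] : memref<?xf32>
    scf.reduce(%x) : f32 {
    ^bb0(%lhs: f32, %rhs: f32):
      %val = arith.addf %lhs, %rhs : f32
      scf.reduce.return %val : f32
    }
    scf.yield
  }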