diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -219,6 +219,81 @@
                        " type($tensor) `,` type($indices) `,` type($value)";
 }
 
+def SparseTensor_ExpandOp : SparseTensor_Op<"expand", []>,
+    Arguments<(ins AnyTensor:$tensor)>,
+    Results<(outs AnyStridedMemRefOfRank<1>:$values,
+                  StridedMemRefRankOf<[I1],[1]>:$filled,
+                  StridedMemRefRankOf<[Index],[1]>:$added,
+                  Index:$count)> {
+  string summary = "Expands an access pattern for insertion";
+  string description = [{
+    Performs an access pattern expansion for the innermost dimensions of the
+    given tensor. This operation is useful to implement kernels in which a
+    sparse tensor appears as output. The technique is known under several
+    different names and has several alternative implementations, for example,
+    phase counter [Gustavson71], expanded or switch array [Pissanetzky84],
+    in phase scan [Duff90], access pattern expansion [Bik96], and
+    workspaces [Kjolstad2018].
+
+    The values and filled arrays have sizes that suffice for a *dense*
+    innermost dimension (e.g. a full row for matrices). The added array and
+    count are used to store new indices when a false value is encountered in
+    the filled array. All arrays should be allocated before the loop (possibly
+    even shared between loops in a future optimization) so that their *dense*
+    initialization can be amortized over many iterations. Setting and
+    resetting the dense arrays in the loop nest itself is kept *sparse* by
+    only iterating over set elements through an indirection using the added
+    array, so that the operations are kept proportional to the number of
+    nonzeros.
+
+    Note that this operation is "impure" in the sense that its behavior
+    is solely defined by side-effects and not SSA values. The semantics
+    may be refined over time as our sparse abstractions evolve.
+
+    Example:
+
+    ```mlir
+    %values, %filled, %added, %count = sparse_tensor.expand %0
+      : tensor<4x4xf64, #CSR> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+    ```
+  }];
+  let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($values)"
+                       " `,` type($filled) `,` type($added) `,` type($count)";
+}
+
+def SparseTensor_CompressOp : SparseTensor_Op<"compress", []>,
+    Arguments<(ins AnyTensor:$tensor,
+                   StridedMemRefRankOf<[Index],[1]>:$indices,
+                   AnyStridedMemRefOfRank<1>:$values,
+                   StridedMemRefRankOf<[I1],[1]>:$filled,
+                   StridedMemRefRankOf<[Index],[1]>:$added,
+                   Index:$count)> {
+  string summary = "Compresses an access pattern for insertion";
+  string description = [{
+    Finishes a single access pattern expansion by moving the inserted elements
+    into the sparse storage scheme. The values and filled arrays are reset
+    in a *sparse* fashion by only iterating over set elements through an
+    indirection using the added array, so that the operations are kept
+    proportional to the number of nonzeros. See the 'expand' operation
+    for more details.
+
+    Note that this operation is "impure" in the sense that its behavior
+    is solely defined by side-effects and not SSA values. The semantics
+    may be refined over time as our sparse abstractions evolve.
+
+    Example:
+
+    ```mlir
+    sparse_tensor.compress %0, %1, %values, %filled, %added, %2
+      : tensor<4x4xf64, #CSR>, memref<?xindex>, memref<?xf64>,
+        memref<?xi1>, memref<?xindex>, index
+    ```
+  }];
+  let assemblyFormat = "$tensor `,` $indices `,` $values `,` $filled `,`"
+                       " $added `,` $count attr-dict `:` type($tensor) `,`"
+                       " type($indices) `,` type($values) `,` type($filled) `,`"
+                       " type($added) `,` type($count)";
+}
+
 def SparseTensor_LoadOp : SparseTensor_Op<"load", [SameOperandsAndResultType]>,
     Arguments<(ins AnyTensor:$tensor, UnitAttr:$hasInserts)>,
     Results<(outs AnyTensor:$result)> {
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -305,6 +305,18 @@
   return success();
 }
 
+static LogicalResult verify(ExpandOp op) {
+  if (!getSparseTensorEncoding(op.tensor().getType()))
+    return op.emitError("expected a sparse tensor for expansion");
+  return success();
+}
+
+static LogicalResult verify(CompressOp op) {
+  if (!getSparseTensorEncoding(op.tensor().getType()))
+    return op.emitError("expected a sparse tensor for compression");
+  return success();
+}
+
 static LogicalResult verify(LoadOp op) {
   if (!getSparseTensorEncoding(op.tensor().getType()))
     return op.emitError("expected a sparse tensor to materialize");
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -253,10 +253,17 @@
 /// type, but returns it as type `memref<? x $tp>` (rather than as type
 /// `memref<$sz x $tp>`).
 static Value genAlloca(ConversionPatternRewriter &rewriter, Location loc,
-                       unsigned sz, Type tp) {
+                       Value sz, Type tp) {
   auto memTp = MemRefType::get({ShapedType::kDynamicSize}, tp);
-  Value a = constantIndex(rewriter, loc, sz);
-  return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{a});
+  return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{sz});
+}
+
+/// Generates an uninitialized temporary buffer of the given size and
+/// type, but returns it as type `memref<? x $tp>` (rather than as type
+/// `memref<$sz x $tp>`).
+static Value genAlloca(ConversionPatternRewriter &rewriter, Location loc,
+                       unsigned sz, Type tp) {
+  return genAlloca(rewriter, loc, constantIndex(rewriter, loc, sz), tp);
 }
 
 /// Generates an uninitialized temporary buffer with room for one value
@@ -906,6 +913,78 @@
   }
 };
 
+class SparseTensorExpandConverter : public OpConversionPattern<ExpandOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(ExpandOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
+    ShapedType srcType = op.tensor().getType().cast<ShapedType>();
+    Type eltType = srcType.getElementType();
+    Type boolType = rewriter.getIntegerType(1);
+    Type idxType = rewriter.getIndexType();
+    // All initialization should be done on entry of the loop nest.
+    rewriter.setInsertionPointAfter(op.tensor().getDefiningOp());
+    // Determine the size for access expansion.
+    auto enc = getSparseTensorEncoding(srcType);
+    Value src = adaptor.getOperands()[0];
+    Value sz = genDimSizeCall(rewriter, op, enc, src, srcType.getRank() - 1);
+    // Allocate temporary stack buffers for values, filled-switch, and indices.
+    Value values = genAlloca(rewriter, loc, sz, eltType);
+    Value filled = genAlloca(rewriter, loc, sz, boolType);
+    Value indices = genAlloca(rewriter, loc, sz, idxType);
+    Value zero = constantZero(rewriter, loc, idxType);
+    // Reset the values/filled-switch to all-zero/false. Note that this
+    // introduces an O(N) operation into the computation, but this reset
+    // operation is amortized over the innermost loops for the access
+    // pattern expansion.
+    rewriter.create<linalg::FillOp>(loc, constantZero(rewriter, loc, eltType),
+                                    values);
+    rewriter.create<linalg::FillOp>(loc, constantZero(rewriter, loc, boolType),
+                                    filled);
+    // Replace expansion op with these buffers and initial index.
+    assert(op.getNumResults() == 4);
+    rewriter.replaceOp(op, {values, filled, indices, zero});
+    return success();
+  }
+};
+
+class SparseTensorCompressConverter : public OpConversionPattern<CompressOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(CompressOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Note that this method call resets the values/filled-switch back to
+    // all-zero/false by only iterating over the set elements, so the
+    // complexity remains proportional to the sparsity of the expanded
+    // access pattern.
+    Type srcType = op.tensor().getType();
+    Type eltType = srcType.cast<ShapedType>().getElementType();
+    StringRef name;
+    if (eltType.isF64())
+      name = "expInsertF64";
+    else if (eltType.isF32())
+      name = "expInsertF32";
+    else if (eltType.isInteger(64))
+      name = "expInsertI64";
+    else if (eltType.isInteger(32))
+      name = "expInsertI32";
+    else if (eltType.isInteger(16))
+      name = "expInsertI16";
+    else if (eltType.isInteger(8))
+      name = "expInsertI8";
+    else
+      return failure();
+    TypeRange noTp;
+    auto fn =
+        getFunc(op, name, noTp, adaptor.getOperands(), /*emitCInterface=*/true);
+    rewriter.replaceOpWithNewOp<CallOp>(op, noTp, fn, adaptor.getOperands());
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -921,6 +1000,7 @@
       SparseTensorInitConverter, SparseTensorConvertConverter,
       SparseTensorReleaseConverter, SparseTensorToPointersConverter,
       SparseTensorToIndicesConverter, SparseTensorToValuesConverter,
-      SparseTensorLoadConverter, SparseTensorLexInsertConverter>(
+      SparseTensorLoadConverter, SparseTensorLexInsertConverter,
+      SparseTensorExpandConverter, SparseTensorCompressConverter>(
       typeConverter, patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -54,7 +54,8 @@
         pidxs(numTensors, std::vector<Value>(numLoops)),
         idxs(numTensors, std::vector<Value>(numLoops)), redExp(-1u), redVal(),
         redKind(kNoReduc), sparseOut(op), outerParNest(nest), lexIdx(),
-        curVecLength(1), curVecMask() {}
+        expValues(), expFilled(), expAdded(), expCount(), curVecLength(1),
+        curVecMask() {}
   /// Sparsification options.
   SparsificationOptions options;
   /// Universal dense indices and upper bounds (by index). The loops array
@@ -81,10 +82,15 @@
   Reduction redKind;
   // Sparse tensor as output.
Implemented either through direct injective // insertion in lexicographic index order (where indices are updated - // in the temporary array `lexIdx`) or TODO: access pattern expansion + // in the temporary array `lexIdx`) or through access pattern expansion + // in the innermost loop nest (`expValues` through `expCount`). OpOperand *sparseOut; unsigned outerParNest; Value lexIdx; + Value expValues; + Value expFilled; + Value expAdded; + Value expCount; // Current vector length and mask. unsigned curVecLength; Value curVecMask; @@ -334,8 +340,8 @@ } // Determine admissable dynamic insertion situations: // (1) fully injective, since there are no reductions, - // (2) admissable 1-d expansion in innermost dimension. TODO: accept - if (nest == op.getRank(lhs)) { + // (2) admissable 1-d expansion in innermost dimension. + if (nest >= op.getRank(lhs) - 1) { *sparseOut = lhs; outerParNest = nest; return true; @@ -680,6 +686,16 @@ } } +/// Generates index for load/store on sparse tensor. +static Value genIndex(CodeGen &codegen, linalg::GenericOp op, OpOperand *t) { + auto map = op.getTiedIndexingMap(t); + auto enc = getSparseTensorEncoding(t->get().getType()); + AffineExpr a = map.getResult(perm(enc, map.getNumResults() - 1)); + assert(a.getKind() == AffineExprKind::DimId); + unsigned idx = a.cast().getPosition(); + return codegen.loops[idx]; +} + /// Generates subscript for load/store on a dense or sparse tensor. static Value genSubscript(CodeGen &codegen, PatternRewriter &rewriter, linalg::GenericOp op, OpOperand *t, @@ -705,6 +721,62 @@ return codegen.buffers[tensor]; } +/// Generates insertion code to implement dynamic tensor load. +static Value genInsertionLoad(CodeGen &codegen, PatternRewriter &rewriter, + linalg::GenericOp op, OpOperand *t) { + Location loc = op.getLoc(); + // Direct lexicographic index order, tensor loads as zero. + if (!codegen.expValues) { + Type tp = getElementTypeOrSelf(t->get().getType()); + return rewriter.create(loc, tp, + rewriter.getZeroAttr(tp)); + } + // Load from expanded access pattern. + Value index = genIndex(codegen, op, t); + return rewriter.create(loc, codegen.expValues, index); +} + +/// Generates insertion code to implement dynamic tensor store. +static void genInsertionStore(CodeGen &codegen, PatternRewriter &rewriter, + linalg::GenericOp op, OpOperand *t, Value rhs) { + Location loc = op.getLoc(); + // Direct insertion in lexicographic index order. + if (!codegen.expValues) { + rewriter.create(loc, t->get(), codegen.lexIdx, rhs); + return; + } + // Generates insertion code along expanded access pattern. + // if (!expFilled[i]) then + // expFilled[i] = true + // expAdded[inserts++] = i + // endif + // values[i] = rhs + Value index = genIndex(codegen, op, t); + Value fval = rewriter.create(loc, 0, 1); // false + Value tval = rewriter.create(loc, 1, 1); // true + // If statement. + Value filled = rewriter.create(loc, codegen.expFilled, index); + Value cond = rewriter.create(loc, arith::CmpIPredicate::eq, + filled, fval); + scf::IfOp ifOp = rewriter.create(loc, rewriter.getIndexType(), + cond, /*else=*/true); + // True branch. + rewriter.setInsertionPointToStart(&ifOp.thenRegion().front()); + rewriter.create(loc, tval, codegen.expFilled, index); + rewriter.create(loc, index, codegen.expAdded, + codegen.expCount); + Value one = rewriter.create(loc, 1); + Value add = rewriter.create(loc, codegen.expCount, one); + rewriter.create(loc, add); + // False branch. 
+ rewriter.setInsertionPointToStart(&ifOp.elseRegion().front()); + rewriter.create(loc, codegen.expCount); + codegen.expCount = ifOp.getResult(0); + rewriter.setInsertionPointAfter(ifOp); + // Value assignment. + rewriter.create(loc, rhs, codegen.expValues, index); +} + /// Generates a load on a dense or sparse tensor. static Value genTensorLoad(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter, linalg::GenericOp op, @@ -716,13 +788,10 @@ return genVectorInvariantValue(codegen, rewriter, val); return val; } - // Insertion (a sparse tensor output "loads" as zero). + // Load during insertion. OpOperand *t = op.getInputAndOutputOperands()[merger.exp(exp).tensor]; - if (t == codegen.sparseOut) { - Type tp = getElementTypeOrSelf(t->get().getType()); - return rewriter.create(op.getLoc(), tp, - rewriter.getZeroAttr(tp)); - } + if (t == codegen.sparseOut) + return genInsertionLoad(codegen, rewriter, op, t); // Actual load. SmallVector args; Value ptr = genSubscript(codegen, rewriter, op, t, args); @@ -744,10 +813,10 @@ updateReduc(merger, codegen, rhs); return; } - // Insertion. + // Store during insertion. OpOperand *t = op.getOutputOperand(0); if (t == codegen.sparseOut) { - rewriter.create(loc, t->get(), codegen.lexIdx, rhs); + genInsertionStore(codegen, rewriter, op, t, rhs); return; } // Actual store. @@ -916,6 +985,42 @@ } } +/// Generates an expanded access pattern in innermost dimension. +static void genExpansion(Merger &merger, CodeGen &codegen, + PatternRewriter &rewriter, linalg::GenericOp op, + unsigned at, bool atStart) { + OpOperand *lhs = codegen.sparseOut; + if (!lhs || codegen.outerParNest != op.getRank(lhs) - 1 || + at != codegen.outerParNest) + return; // not needed at this level + // Generate start or end of an expanded access pattern. + Value tensor = lhs->get(); + Location loc = op.getLoc(); + if (atStart) { + auto dynShape = {ShapedType::kDynamicSize}; + Type etp = tensor.getType().cast().getElementType(); + Type t1 = MemRefType::get(dynShape, etp); + Type t2 = MemRefType::get(dynShape, genIntType(rewriter, 1)); + Type t3 = MemRefType::get(dynShape, genIntType(rewriter, 0)); + Type t4 = rewriter.getIndexType(); + auto res = + rewriter.create(loc, TypeRange({t1, t2, t3, t4}), tensor); + assert(res.getNumResults() == 4); + assert(!codegen.expValues); + codegen.expValues = res.getResult(0); + codegen.expFilled = res.getResult(1); + codegen.expAdded = res.getResult(2); + codegen.expCount = res.getResult(3); + } else { + assert(codegen.expValues); + rewriter.create(loc, tensor, codegen.lexIdx, codegen.expValues, + codegen.expFilled, codegen.expAdded, + codegen.expCount); + codegen.expValues = codegen.expFilled = codegen.expAdded = + codegen.expCount = Value(); + } +} + /// Generates initialization code for the subsequent loop sequence at /// current index level. Returns true if the loop sequence needs to /// maintain the universal index. @@ -1069,9 +1174,13 @@ } operands.push_back(codegen.redVal); } + if (codegen.expValues) + operands.push_back(codegen.expCount); scf::ForOp forOp = rewriter.create(loc, lo, hi, step, operands); if (codegen.redVal) updateReduc(merger, codegen, forOp.getRegionIterArgs().front()); + if (codegen.expValues) + codegen.expCount = forOp.getRegionIterArgs().back(); // Assign induction variable to sparse or dense index. 
Value iv = forOp.getInductionVar(); if (isSparse) @@ -1106,6 +1215,10 @@ types.push_back(codegen.redVal.getType()); operands.push_back(codegen.redVal); } + if (codegen.expValues) { + types.push_back(indexType); + operands.push_back(codegen.expCount); + } if (needsUniv) { types.push_back(indexType); operands.push_back(codegen.loops[idx]); @@ -1135,6 +1248,8 @@ } if (codegen.redVal) updateReduc(merger, codegen, after->getArgument(o++)); + if (codegen.expValues) + codegen.expCount = after->getArgument(o++); if (needsUniv) codegen.loops[idx] = after->getArgument(o++); assert(o == operands.size()); @@ -1215,8 +1330,9 @@ } } - // Move the insertion indices in lexicographic index order. - if (codegen.sparseOut) { + // Move the insertion indices in lexicographic index order. During access + // pattern expansion, we can skip setting the innermost dimension. + if (codegen.sparseOut && !codegen.expValues) { Value pos = rewriter.create(loc, at); rewriter.create(loc, codegen.loops[idx], codegen.lexIdx, pos); @@ -1231,11 +1347,21 @@ scf::WhileOp whileOp) { Location loc = op.getLoc(); // Finalize each else branch of all if statements. - if (codegen.redVal) { + if (codegen.redVal || codegen.expValues) { while (auto ifOp = dyn_cast_or_null( rewriter.getInsertionBlock()->getParentOp())) { - rewriter.create(loc, codegen.redVal); - updateReduc(merger, codegen, ifOp.getResult(0)); + unsigned y = 0; + SmallVector yields; + if (codegen.redVal) { + yields.push_back(codegen.redVal); + updateReduc(merger, codegen, ifOp.getResult(y++)); + } + if (codegen.expValues) { + yields.push_back(codegen.expCount); + codegen.expCount = ifOp->getResult(y++); + } + assert(y == yields.size()); + rewriter.create(loc, yields); rewriter.setInsertionPointAfter(ifOp); } } @@ -1266,6 +1392,10 @@ operands.push_back(codegen.redVal); updateReduc(merger, codegen, whileOp->getResult(o++)); } + if (codegen.expValues) { + operands.push_back(codegen.expCount); + codegen.expCount = whileOp->getResult(o++); + } if (needsUniv) { operands.push_back( rewriter.create(loc, codegen.loops[idx], one)); @@ -1287,6 +1417,10 @@ operands.push_back(codegen.redVal); updateReduc(merger, codegen, loop->getResult(o++)); } + if (codegen.expValues) { + operands.push_back(codegen.expCount); + codegen.expCount = loop->getResult(o++); + } assert(o == operands.size()); if (o > 0) rewriter.create(loc, operands); @@ -1318,6 +1452,8 @@ } if (codegen.redVal) types.push_back(codegen.redVal.getType()); + if (codegen.expValues) + types.push_back(rewriter.getIndexType()); scf::IfOp ifOp = rewriter.create(loc, types, cond, /*else=*/true); rewriter.setInsertionPointToStart(&ifOp.thenRegion().front()); return ifOp; @@ -1325,11 +1461,19 @@ /// Generates end of true branch of if-statement within a while-loop. static void endIf(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter, - linalg::GenericOp op, scf::IfOp ifOp, Value ifInput) { + linalg::GenericOp op, scf::IfOp ifOp, Operation *loop, + Value redInput, Value cntInput) { + SmallVector operands; if (codegen.redVal) { - rewriter.create(op.getLoc(), codegen.redVal); - updateReduc(merger, codegen, ifInput); + operands.push_back(codegen.redVal); + updateReduc(merger, codegen, redInput); + } + if (codegen.expValues) { + operands.push_back(codegen.expCount); + codegen.expCount = cntInput; } + if (!operands.empty()) + rewriter.create(op.getLoc(), operands); rewriter.setInsertionPointToStart(&ifOp.elseRegion().front()); } @@ -1348,6 +1492,8 @@ assert(!codegen.loops[idx]); // Emit invariants at this loop sequence level. 
genInvariants(merger, codegen, rewriter, op, exp, ldx, /*atStart=*/true); + // Emit access pattern expansion for sparse tensor output. + genExpansion(merger, codegen, rewriter, op, at, /*atStart=*/true); // Emit further intitialization at this loop sequence level. unsigned l0 = merger.set(lts)[0]; bool needsUniv = @@ -1399,7 +1545,7 @@ /// Ends a loop sequence at given level. static void endLoopSeq(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter, linalg::GenericOp op, - unsigned exp, unsigned idx, unsigned ldx) { + unsigned exp, unsigned at, unsigned idx, unsigned ldx) { assert(codegen.curVecLength == 1); codegen.loops[idx] = Value(); // Bring a pending reduction back from SIMD form when sequence ends. @@ -1409,6 +1555,8 @@ genVectorReducEnd(codegen, rewriter, op.getLoc(), vtp)); // Unmark bookkeeping of invariants and loop index. genInvariants(merger, codegen, rewriter, op, exp, ldx, /*atStart=*/false); + // Finalize access pattern expansion for sparse tensor output. + genExpansion(merger, codegen, rewriter, op, at, /*atStart=*/false); } /// Recursively generates code while computing iteration lattices in order @@ -1443,7 +1591,8 @@ // Visit all lattices points with Li >= Lj to generate the // loop-body, possibly with if statements for coiteration. - Value ifInput = codegen.redVal; + Value redInput = codegen.redVal; + Value cntInput = codegen.expCount; bool isWhile = dyn_cast(loop) != nullptr; for (unsigned j = 0; j < lsize; j++) { unsigned lj = merger.set(lts)[j]; @@ -1454,7 +1603,7 @@ scf::IfOp ifOp = genIf(merger, codegen, rewriter, op, idx, merger.lat(lj).simple); genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1); - endIf(merger, codegen, rewriter, op, ifOp, ifInput); + endIf(merger, codegen, rewriter, op, ifOp, loop, redInput, cntInput); } else { genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1); } @@ -1467,7 +1616,7 @@ } // End a loop sequence. - endLoopSeq(merger, codegen, rewriter, op, exp, idx, ldx); + endLoopSeq(merger, codegen, rewriter, op, exp, at, idx, ldx); } /// Converts the result computed by the sparse kernel into the required form. diff --git a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp --- a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp @@ -163,10 +163,10 @@ /// function overloading to implement "partial" method specialization. class SparseTensorStorageBase { public: - // Dimension size query. + /// Dimension size query. virtual uint64_t getDimSize(uint64_t) = 0; - // Overhead storage. + /// Overhead storage. virtual void getPointers(std::vector **, uint64_t) { fatal("p64"); } virtual void getPointers(std::vector **, uint64_t) { fatal("p32"); } virtual void getPointers(std::vector **, uint64_t) { fatal("p16"); } @@ -176,7 +176,7 @@ virtual void getIndices(std::vector **, uint64_t) { fatal("i16"); } virtual void getIndices(std::vector **, uint64_t) { fatal("i8"); } - // Primary storage. + /// Primary storage. virtual void getValues(std::vector **) { fatal("valf64"); } virtual void getValues(std::vector **) { fatal("valf32"); } virtual void getValues(std::vector **) { fatal("vali64"); } @@ -184,13 +184,35 @@ virtual void getValues(std::vector **) { fatal("vali16"); } virtual void getValues(std::vector **) { fatal("vali8"); } - // Element-wise insertion in lexicographic index order. + /// Element-wise insertion in lexicographic index order. 
virtual void lexInsert(uint64_t *, double) { fatal("insf64"); } virtual void lexInsert(uint64_t *, float) { fatal("insf32"); } virtual void lexInsert(uint64_t *, int64_t) { fatal("insi64"); } virtual void lexInsert(uint64_t *, int32_t) { fatal("insi32"); } virtual void lexInsert(uint64_t *, int16_t) { fatal("ins16"); } virtual void lexInsert(uint64_t *, int8_t) { fatal("insi8"); } + + /// Expanded insertion. + virtual void expInsert(uint64_t *, double *, bool *, uint64_t *, uint64_t) { + fatal("expf64"); + } + virtual void expInsert(uint64_t *, float *, bool *, uint64_t *, uint64_t) { + fatal("expf32"); + } + virtual void expInsert(uint64_t *, int64_t *, bool *, uint64_t *, uint64_t) { + fatal("expi64"); + } + virtual void expInsert(uint64_t *, int32_t *, bool *, uint64_t *, uint64_t) { + fatal("expi32"); + } + virtual void expInsert(uint64_t *, int16_t *, bool *, uint64_t *, uint64_t) { + fatal("expi16"); + } + virtual void expInsert(uint64_t *, int8_t *, bool *, uint64_t *, uint64_t) { + fatal("expi8"); + } + + /// Finishes insertion. virtual void endInsert() = 0; virtual ~SparseTensorStorageBase() {} @@ -289,6 +311,35 @@ insPath(cursor, diff, top, val); } + /// Partially specialize expanded insertions based on template types. + /// Note that this method resets the values/filled-switch array back + /// to all-zero/false while only iterating over the nonzero elements. + void expInsert(uint64_t *cursor, V *values, bool *filled, uint64_t *added, + uint64_t count) override { + if (count == 0) + return; + // Sort. + std::sort(added, added + count); + // Restore insertion path for first insert. + uint64_t rank = getRank(); + uint64_t index = added[0]; + cursor[rank - 1] = index; + lexInsert(cursor, values[index]); + assert(filled[index]); + values[index] = 0; + filled[index] = false; + // Subsequent insertions are quick. + for (uint64_t i = 1; i < count; i++) { + assert(index < added[i] && "non-lexicographic insertion"); + index = added[i]; + cursor[rank - 1] = index; + insPath(cursor, rank - 1, added[i - 1] + 1, values[index]); + assert(filled[index]); + values[index] = 0.0; + filled[index] = false; + } + } + /// Finalizes lexicographic insertions. 
void endInsert() override { if (values.empty()) @@ -683,8 +734,7 @@ #define IMPL_SPARSEVALUES(NAME, TYPE, LIB) \ void _mlir_ciface_##NAME(StridedMemRefType *ref, void *tensor) { \ - assert(ref); \ - assert(tensor); \ + assert(ref &&tensor); \ std::vector *v; \ static_cast(tensor)->LIB(&v); \ ref->basePtr = ref->data = v->data(); \ @@ -696,8 +746,7 @@ #define IMPL_GETOVERHEAD(NAME, TYPE, LIB) \ void _mlir_ciface_##NAME(StridedMemRefType *ref, void *tensor, \ index_t d) { \ - assert(ref); \ - assert(tensor); \ + assert(ref &&tensor); \ std::vector *v; \ static_cast(tensor)->LIB(&v, d); \ ref->basePtr = ref->data = v->data(); \ @@ -710,9 +759,7 @@ void *_mlir_ciface_##NAME(void *tensor, TYPE value, \ StridedMemRefType *iref, \ StridedMemRefType *pref) { \ - assert(tensor); \ - assert(iref); \ - assert(pref); \ + assert(tensor &&iref &&pref); \ assert(iref->strides[0] == 1 && pref->strides[0] == 1); \ assert(iref->sizes[0] == pref->sizes[0]); \ const index_t *indx = iref->data + iref->offset; \ @@ -726,10 +773,11 @@ } #define IMPL_GETNEXT(NAME, V) \ - bool _mlir_ciface_##NAME(void *tensor, StridedMemRefType *iref, \ + bool _mlir_ciface_##NAME(void *tensor, StridedMemRefType *iref, \ StridedMemRefType *vref) { \ + assert(tensor &&iref &&vref); \ assert(iref->strides[0] == 1); \ - uint64_t *indx = iref->data + iref->offset; \ + index_t *indx = iref->data + iref->offset; \ V *value = vref->data + vref->offset; \ const uint64_t isize = iref->sizes[0]; \ auto iter = static_cast *>(tensor); \ @@ -747,12 +795,32 @@ #define IMPL_LEXINSERT(NAME, V) \ void _mlir_ciface_##NAME(void *tensor, StridedMemRefType *cref, \ V val) { \ + assert(tensor &&cref); \ assert(cref->strides[0] == 1); \ - uint64_t *cursor = cref->data + cref->offset; \ + index_t *cursor = cref->data + cref->offset; \ assert(cursor); \ static_cast(tensor)->lexInsert(cursor, val); \ } +#define IMPL_EXPINSERT(NAME, V) \ + void _mlir_ciface_##NAME( \ + void *tensor, StridedMemRefType *cref, \ + StridedMemRefType *vref, StridedMemRefType *fref, \ + StridedMemRefType *aref, index_t count) { \ + assert(tensor &&cref &&vref &&fref &&aref); \ + assert(cref->strides[0] == 1); \ + assert(vref->strides[0] == 1); \ + assert(fref->strides[0] == 1); \ + assert(aref->strides[0] == 1); \ + assert(vref->sizes[0] == fref->sizes[0]); \ + index_t *cursor = cref->data + cref->offset; \ + V *values = vref->data + vref->offset; \ + bool *filled = fref->data + fref->offset; \ + index_t *added = aref->data + aref->offset; \ + static_cast(tensor)->expInsert( \ + cursor, values, filled, added, count); \ + } + /// Constructs a new sparse tensor. This is the "swiss army knife" /// method for materializing sparse tensors into the computation. /// @@ -912,12 +980,21 @@ IMPL_LEXINSERT(lexInsertI16, int16_t) IMPL_LEXINSERT(lexInsertI8, int8_t) +/// Helper to insert using expansion, one per value type. 
+IMPL_EXPINSERT(expInsertF64, double) +IMPL_EXPINSERT(expInsertF32, float) +IMPL_EXPINSERT(expInsertI64, int64_t) +IMPL_EXPINSERT(expInsertI32, int32_t) +IMPL_EXPINSERT(expInsertI16, int16_t) +IMPL_EXPINSERT(expInsertI8, int8_t) + #undef CASE #undef IMPL_SPARSEVALUES #undef IMPL_GETOVERHEAD #undef IMPL_ADDELT #undef IMPL_GETNEXT -#undef IMPL_INSERTLEX +#undef IMPL_LEXINSERT +#undef IMPL_EXPINSERT //===----------------------------------------------------------------------===// // diff --git a/mlir/test/Dialect/SparseTensor/conversion.mlir b/mlir/test/Dialect/SparseTensor/conversion.mlir --- a/mlir/test/Dialect/SparseTensor/conversion.mlir +++ b/mlir/test/Dialect/SparseTensor/conversion.mlir @@ -441,3 +441,30 @@ return } +// CHECK-LABEL: func @sparse_expansion() +// %[[S:.*]] = call @sparseDimSize +// %[[V:.*]] = memref.alloca(%[[S]]) : memref +// %[[F:.*]] = memref.alloca(%[[S]]) : memref +// %[[A:.*]] = memref.alloca(%[[S]]) : memref +// linalg.fill(%{{.*}}, %[[V]]) : f64, memref +// linalg.fill(%{{.*}}, %[[F]]) : i1, memref +// CHECK: return +func @sparse_expansion() { + %c = arith.constant 8 : index + %0 = sparse_tensor.init [%c, %c] : tensor<8x8xf64, #SparseMatrix> + %values, %filled, %added, %count = sparse_tensor.expand %0 + : tensor<8x8xf64, #SparseMatrix> to memref, memref, memref, index + return +} + +// CHECK-LABEL: func @sparse_compression( +// CHECK-SAME: %[[A:.*]]: !llvm.ptr, +// CHECK: call @expInsertF64(%[[A]], +// CHECK: return +func @sparse_compression(%arg0: tensor<8x8xf64, #SparseMatrix>, + %arg1: memref, %arg2: memref, %arg3: memref, + %arg4: memref, %arg5: index) { + sparse_tensor.compress %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 + : tensor<8x8xf64, #SparseMatrix>, memref, memref, memref, memref, index + return +} diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir --- a/mlir/test/Dialect/SparseTensor/invalid.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid.mlir @@ -160,6 +160,25 @@ // ----- +func @sparse_unannotated_expansion(%arg0: tensor<128xf64>) { + // expected-error@+1 {{expected a sparse tensor for expansion}} + %values, %filled, %added, %count = sparse_tensor.expand %arg0 + : tensor<128xf64> to memref, memref, memref, index + return +} + +// ----- + +func @sparse_unannotated_compression(%arg0: tensor<128xf64>, %arg1: memref, + %arg2: memref, %arg3: memref, + %arg4: memref, %arg5: index) { + // expected-error@+1 {{expected a sparse tensor for compression}} + sparse_tensor.compress %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 + : tensor<128xf64>, memref, memref, memref, memref, index +} + +// ----- + func @sparse_convert_unranked(%arg0: tensor<*xf32>) -> tensor<10xf32> { // expected-error@+1 {{unexpected type in convert}} %0 = sparse_tensor.convert %arg0 : tensor<*xf32> to tensor<10xf32> diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir --- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir @@ -149,3 +149,33 @@ sparse_tensor.lex_insert %arg0, %arg1, %arg2 : tensor<128xf64, #SparseVector>, memref, f64 return } + +// ----- + +#SparseMatrix = #sparse_tensor.encoding<{dimLevelType = ["compressed", "compressed"]}> + +// CHECK-LABEL: func @sparse_expansion( +// CHECK-SAME: %[[A:.*]]: tensor<8x8xf64, #sparse_tensor.encoding<{{.*}}>>) +// CHECK: sparse_tensor.expand %[[A]] +// CHECK: return +func @sparse_expansion(%arg0: tensor<8x8xf64, #SparseMatrix>) { + %values, %filled, %added, %count = sparse_tensor.expand %arg0 + : 
tensor<8x8xf64, #SparseMatrix> to memref, memref, memref, index + return +} + +// ----- + +#SparseMatrix = #sparse_tensor.encoding<{dimLevelType = ["compressed", "compressed"]}> + +// CHECK-LABEL: func @sparse_compression( +// CHECK-SAME: %[[A:.*]]: tensor<8x8xf64, #sparse_tensor.encoding<{{.*}}>>, +// CHECK: sparse_tensor.compress %[[A]] +// CHECK: return +func @sparse_compression(%arg0: tensor<8x8xf64, #SparseMatrix>, + %arg1: memref, %arg2: memref, %arg3: memref, + %arg4: memref, %arg5: index) { + sparse_tensor.compress %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 + : tensor<8x8xf64, #SparseMatrix>, memref, memref, memref, memref, index + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir @@ -0,0 +1,274 @@ +// RUN: mlir-opt %s \ +// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \ +// RUN: --sparsification --sparse-tensor-conversion \ +// RUN: --linalg-bufferize --convert-linalg-to-loops \ +// RUN: --convert-vector-to-scf --convert-scf-to-std \ +// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \ +// RUN: --std-bufferize --finalizing-bufferize --lower-affine \ +// RUN: --convert-vector-to-llvm --convert-memref-to-llvm \ +// RUN: --convert-std-to-llvm --reconcile-unrealized-casts | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s +// + +#CSR = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "compressed" ], + dimOrdering = affine_map<(i,j) -> (i,j)> +}> + +#DCSR = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed" ], + dimOrdering = affine_map<(i,j) -> (i,j)> +}> + +module { + // + // Computes C = A x B with all matrices dense. + // + func @matmul1(%A: tensor<4x8xf64>, + %B: tensor<8x4xf64>) -> tensor<4x4xf64> { + %C = arith.constant dense<0.0> : tensor<4x4xf64> + %D = linalg.matmul + ins(%A, %B: tensor<4x8xf64>, tensor<8x4xf64>) + outs(%C: tensor<4x4xf64>) -> tensor<4x4xf64> + return %D: tensor<4x4xf64> + } + + // + // Computes C = A x B with all matrices sparse (SpMSpM) in CSR. + // + func @matmul2(%A: tensor<4x8xf64, #CSR>, + %B: tensor<8x4xf64, #CSR>) -> tensor<4x4xf64, #CSR> { + %c4 = arith.constant 4 : index + %C = sparse_tensor.init [%c4, %c4] : tensor<4x4xf64, #CSR> + %D = linalg.matmul + ins(%A, %B: tensor<4x8xf64, #CSR>, tensor<8x4xf64, #CSR>) + outs(%C: tensor<4x4xf64, #CSR>) -> tensor<4x4xf64, #CSR> + return %D: tensor<4x4xf64, #CSR> + } + + // + // Computes C = A x B with all matrices sparse (SpMSpM) in DCSR. + // + func @matmul3(%A: tensor<4x8xf64, #DCSR>, + %B: tensor<8x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR> { + %c4 = arith.constant 4 : index + %C = sparse_tensor.init [%c4, %c4] : tensor<4x4xf64, #DCSR> + %D = linalg.matmul + ins(%A, %B: tensor<4x8xf64, #DCSR>, tensor<8x4xf64, #DCSR>) + outs(%C: tensor<4x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR> + return %D: tensor<4x4xf64, #DCSR> + } + + // + // Main driver. + // + func @entry() { + %c0 = arith.constant 0 : index + %d1 = arith.constant -1.0 : f64 + + // Initialize various matrices, dense for stress testing, + // and sparse to verify correct nonzero structure. 
+ %da = arith.constant dense<[ + [ 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1 ], + [ 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2 ], + [ 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3 ], + [ 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4 ] + ]> : tensor<4x8xf64> + %db = arith.constant dense<[ + [ 10.1, 11.1, 12.1, 13.1 ], + [ 10.2, 11.2, 12.2, 13.2 ], + [ 10.3, 11.3, 12.3, 13.3 ], + [ 10.4, 11.4, 12.4, 13.4 ], + [ 10.5, 11.5, 12.5, 13.5 ], + [ 10.6, 11.6, 12.6, 13.6 ], + [ 10.7, 11.7, 12.7, 13.7 ], + [ 10.8, 11.8, 12.8, 13.8 ] + ]> : tensor<8x4xf64> + %sa = arith.constant dense<[ + [ 0.0, 2.1, 0.0, 0.0, 0.0, 6.1, 0.0, 0.0 ], + [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ], + [ 0.0, 2.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ], + [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0 ] + ]> : tensor<4x8xf64> + %sb = arith.constant dense<[ + [ 0.0, 0.0, 0.0, 1.0 ], + [ 0.0, 0.0, 2.0, 0.0 ], + [ 0.0, 3.0, 0.0, 0.0 ], + [ 4.0, 0.0, 0.0, 0.0 ], + [ 0.0, 0.0, 0.0, 0.0 ], + [ 0.0, 5.0, 0.0, 0.0 ], + [ 0.0, 0.0, 6.0, 0.0 ], + [ 0.0, 0.0, 7.0, 8.0 ] + ]> : tensor<8x4xf64> + + // Convert all these matrices to sparse format. + %a1 = sparse_tensor.convert %da : tensor<4x8xf64> to tensor<4x8xf64, #CSR> + %a2 = sparse_tensor.convert %da : tensor<4x8xf64> to tensor<4x8xf64, #DCSR> + %a3 = sparse_tensor.convert %sa : tensor<4x8xf64> to tensor<4x8xf64, #CSR> + %a4 = sparse_tensor.convert %sa : tensor<4x8xf64> to tensor<4x8xf64, #DCSR> + %b1 = sparse_tensor.convert %db : tensor<8x4xf64> to tensor<8x4xf64, #CSR> + %b2 = sparse_tensor.convert %db : tensor<8x4xf64> to tensor<8x4xf64, #DCSR> + %b3 = sparse_tensor.convert %sb : tensor<8x4xf64> to tensor<8x4xf64, #CSR> + %b4 = sparse_tensor.convert %sb : tensor<8x4xf64> to tensor<8x4xf64, #DCSR> + + // Call kernels with dense. + %0 = call @matmul1(%da, %db) + : (tensor<4x8xf64>, tensor<8x4xf64>) -> tensor<4x4xf64> + %1 = call @matmul2(%a1, %b1) + : (tensor<4x8xf64, #CSR>, + tensor<8x4xf64, #CSR>) -> tensor<4x4xf64, #CSR> + %2 = call @matmul3(%a2, %b2) + : (tensor<4x8xf64, #DCSR>, + tensor<8x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR> + + // Call kernels with one sparse. + %3 = call @matmul1(%sa, %db) + : (tensor<4x8xf64>, tensor<8x4xf64>) -> tensor<4x4xf64> + %4 = call @matmul2(%a3, %b1) + : (tensor<4x8xf64, #CSR>, + tensor<8x4xf64, #CSR>) -> tensor<4x4xf64, #CSR> + %5 = call @matmul3(%a4, %b2) + : (tensor<4x8xf64, #DCSR>, + tensor<8x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR> + + // Call kernels with sparse. 
+ %6 = call @matmul1(%sa, %sb) + : (tensor<4x8xf64>, tensor<8x4xf64>) -> tensor<4x4xf64> + %7 = call @matmul2(%a3, %b3) + : (tensor<4x8xf64, #CSR>, + tensor<8x4xf64, #CSR>) -> tensor<4x4xf64, #CSR> + %8 = call @matmul3(%a4, %b4) + : (tensor<4x8xf64, #DCSR>, + tensor<8x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR> + + // + // CHECK: ( ( 388.76, 425.56, 462.36, 499.16 ), + // CHECK-SAME: ( 397.12, 434.72, 472.32, 509.92 ), + // CHECK-SAME: ( 405.48, 443.88, 482.28, 520.68 ), + // CHECK-SAME: ( 413.84, 453.04, 492.24, 531.44 ) ) + // + %m0 = bufferization.to_memref %0 : memref<4x4xf64> + %v0 = vector.transfer_read %m0[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64> + vector.print %v0 : vector<4x4xf64> + + // + // CHECK: ( ( 388.76, 425.56, 462.36, 499.16 ), + // CHECK-SAME: ( 397.12, 434.72, 472.32, 509.92 ), + // CHECK-SAME: ( 405.48, 443.88, 482.28, 520.68 ), + // CHECK-SAME: ( 413.84, 453.04, 492.24, 531.44 ) ) + // + %c1 = sparse_tensor.convert %1 : tensor<4x4xf64, #CSR> to tensor<4x4xf64> + %m1 = bufferization.to_memref %c1 : memref<4x4xf64> + %v1 = vector.transfer_read %m1[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64> + vector.print %v1 : vector<4x4xf64> + + // + // CHECK: ( ( 388.76, 425.56, 462.36, 499.16 ), + // CHECK-SAME: ( 397.12, 434.72, 472.32, 509.92 ), + // CHECK-SAME: ( 405.48, 443.88, 482.28, 520.68 ), + // CHECK-SAME: ( 413.84, 453.04, 492.24, 531.44 ) ) + // + %c2 = sparse_tensor.convert %2 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64> + %m2 = bufferization.to_memref %c2 : memref<4x4xf64> + %v2 = vector.transfer_read %m2[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64> + vector.print %v2 : vector<4x4xf64> + + // + // CHECK: ( ( 86.08, 94.28, 102.48, 110.68 ), + // CHECK-SAME: ( 0, 0, 0, 0 ), + // CHECK-SAME: ( 23.46, 25.76, 28.06, 30.36 ), + // CHECK-SAME: ( 10.8, 11.8, 12.8, 13.8 ) ) + // + %m3 = bufferization.to_memref %3 : memref<4x4xf64> + %v3 = vector.transfer_read %m3[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64> + vector.print %v3 : vector<4x4xf64> + + // + // CHECK: ( ( 86.08, 94.28, 102.48, 110.68 ), + // CHECK-SAME: ( 0, 0, 0, 0 ), + // CHECK-SAME: ( 23.46, 25.76, 28.06, 30.36 ), + // CHECK-SAME: ( 10.8, 11.8, 12.8, 13.8 ) ) + // + %c4 = sparse_tensor.convert %4 : tensor<4x4xf64, #CSR> to tensor<4x4xf64> + %m4 = bufferization.to_memref %c4 : memref<4x4xf64> + %v4 = vector.transfer_read %m4[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64> + vector.print %v4 : vector<4x4xf64> + + // + // CHECK: ( ( 86.08, 94.28, 102.48, 110.68 ), + // CHECK-SAME: ( 0, 0, 0, 0 ), + // CHECK-SAME: ( 23.46, 25.76, 28.06, 30.36 ), + // CHECK-SAME: ( 10.8, 11.8, 12.8, 13.8 ) ) + // + %c5 = sparse_tensor.convert %5 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64> + %m5 = bufferization.to_memref %c5 : memref<4x4xf64> + %v5 = vector.transfer_read %m5[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64> + vector.print %v5 : vector<4x4xf64> + + // + // CHECK: ( ( 0, 30.5, 4.2, 0 ), ( 0, 0, 0, 0 ), ( 0, 0, 4.6, 0 ), ( 0, 0, 7, 8 ) ) + // + %m6 = bufferization.to_memref %6 : memref<4x4xf64> + %v6 = vector.transfer_read %m6[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64> + vector.print %v6 : vector<4x4xf64> + + // + // CHECK: ( ( 0, 30.5, 4.2, 0 ), ( 0, 0, 0, 0 ), ( 0, 0, 4.6, 0 ), ( 0, 0, 7, 8 ) ) + // + %c7 = sparse_tensor.convert %7 : tensor<4x4xf64, #CSR> to tensor<4x4xf64> + %m7 = bufferization.to_memref %c7 : memref<4x4xf64> + %v7 = vector.transfer_read %m7[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64> + vector.print %v7 : vector<4x4xf64> + + // + // CHECK: ( ( 0, 30.5, 4.2, 0 ), ( 0, 
0, 0, 0 ), ( 0, 0, 4.6, 0 ), ( 0, 0, 7, 8 ) ) + // + %c8 = sparse_tensor.convert %8 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64> + %m8 = bufferization.to_memref %c8 : memref<4x4xf64> + %v8 = vector.transfer_read %m8[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64> + vector.print %v8 : vector<4x4xf64> + + // + // Sanity check on nonzeros. + // + // CHECK: ( 30.5, 4.2, 4.6, 7, 8, -1, -1, -1 ) + // CHECK: ( 30.5, 4.2, 4.6, 7, 8, -1, -1, -1 ) + // + %val7 = sparse_tensor.values %7 : tensor<4x4xf64, #CSR> to memref + %val8 = sparse_tensor.values %8 : tensor<4x4xf64, #DCSR> to memref + %nz7 = vector.transfer_read %val7[%c0], %d1 : memref, vector<8xf64> + %nz8 = vector.transfer_read %val8[%c0], %d1 : memref, vector<8xf64> + vector.print %nz7 : vector<8xf64> + vector.print %nz8 : vector<8xf64> + + // Release the resources. + sparse_tensor.release %a1 : tensor<4x8xf64, #CSR> + sparse_tensor.release %a2 : tensor<4x8xf64, #DCSR> + sparse_tensor.release %a3 : tensor<4x8xf64, #CSR> + sparse_tensor.release %a4 : tensor<4x8xf64, #DCSR> + sparse_tensor.release %b1 : tensor<8x4xf64, #CSR> + sparse_tensor.release %b2 : tensor<8x4xf64, #DCSR> + sparse_tensor.release %b3 : tensor<8x4xf64, #CSR> + sparse_tensor.release %b4 : tensor<8x4xf64, #DCSR> + sparse_tensor.release %1 : tensor<4x4xf64, #CSR> + sparse_tensor.release %2 : tensor<4x4xf64, #DCSR> + sparse_tensor.release %4 : tensor<4x4xf64, #CSR> + sparse_tensor.release %5 : tensor<4x4xf64, #DCSR> + sparse_tensor.release %7 : tensor<4x4xf64, #CSR> + sparse_tensor.release %8 : tensor<4x4xf64, #DCSR> + memref.dealloc %m0 : memref<4x4xf64> + memref.dealloc %m1 : memref<4x4xf64> + memref.dealloc %m2 : memref<4x4xf64> + memref.dealloc %m3 : memref<4x4xf64> + memref.dealloc %m4 : memref<4x4xf64> + memref.dealloc %m5 : memref<4x4xf64> + memref.dealloc %m6 : memref<4x4xf64> + memref.dealloc %m7 : memref<4x4xf64> + memref.dealloc %m8 : memref<4x4xf64> + + return + } +}
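
---

Illustrative note (not part of the patch): the following self-contained C++ sketch shows the access-pattern-expansion ("workspace") technique that `sparse_tensor.expand`/`sparse_tensor.compress` and the runtime `expInsert()` rely on, applied to a toy CSR output built row by row. The `CSRMatrix` struct and all names in it are invented for this example; only the mechanism mirrors the patch: dense `values`/`filled` buffers sized for one full row, an `added` list of touched columns, a `count`, and resets restricted to touched entries.

```cpp
// Standalone sketch of the workspace technique behind expand/compress.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

struct CSRMatrix {
  uint64_t rows, cols;
  std::vector<uint64_t> pointers; // row start offsets, size rows+1
  std::vector<uint64_t> indices;  // column index per nonzero
  std::vector<double> values;     // value per nonzero
};

int main() {
  // Output matrix under construction, row by row (lexicographic order).
  CSRMatrix C{2, 8, {0}, {}, {}};

  // "expand": allocate the workspace once, sized for a dense row.
  std::vector<double> values(C.cols, 0.0); // expanded row values
  std::vector<bool> filled(C.cols, false); // switch array
  std::vector<uint64_t> added(C.cols);     // columns touched this row
  uint64_t count = 0;

  // Some per-row updates, e.g. the partial products of a sparse matmul.
  std::vector<std::vector<std::pair<uint64_t, double>>> updates = {
      {{5, 1.0}, {2, 3.0}, {5, 2.5}}, // row 0: column 5 hit twice
      {{7, 4.0}}};                    // row 1

  for (uint64_t i = 0; i < C.rows; i++) {
    // Scatter into the expanded row; O(1) random access per update.
    for (auto [j, v] : updates[i]) {
      if (!filled[j]) { // first hit of this column in this row
        filled[j] = true;
        added[count++] = j;
      }
      values[j] += v;
    }
    // "compress": sort the touched columns, append them to the CSR storage,
    // and reset only the touched workspace entries (proportional to the
    // number of nonzeros, like expInsert in the runtime library).
    std::sort(added.begin(), added.begin() + count);
    for (uint64_t k = 0; k < count; k++) {
      uint64_t j = added[k];
      C.indices.push_back(j);
      C.values.push_back(values[j]);
      values[j] = 0.0;
      filled[j] = false;
    }
    C.pointers.push_back(C.indices.size());
    count = 0;
  }

  // Resulting CSR structure: pointers (0 2 3), indices (2 5 7).
  for (uint64_t p : C.pointers) std::cout << p << " ";
  std::cout << "\n";
  for (uint64_t j : C.indices) std::cout << j << " ";
  std::cout << "\n";
  for (double v : C.values) std::cout << v << " ";
  std::cout << "\n";
}
```

The key property is that, apart from the one-time dense initialization of the workspace, every per-row step (scatter, sort of `added`, compression, reset) costs time proportional to the number of nonzeros in that row rather than to the full row length, which is exactly why the sparsifier keeps the expansion outside the innermost loops and the resets inside `compress`/`expInsert`.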