diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp @@ -352,6 +352,32 @@ rewriter.create(loc, elemV, tensor, ivs); } +/// Determine if the runtime library supports direct conversion to the +/// given target `dimTypes`. +static bool canUseDirectConversion( + ArrayRef dimTypes) { + bool alreadyCompressed = false; + for (uint64_t rank = dimTypes.size(), r = 0; r < rank; r++) { + switch (dimTypes[r]) { + case SparseTensorEncodingAttr::DimLevelType::Compressed: + if (alreadyCompressed) + return false; // Multiple compressed dimensions not yet supported. + alreadyCompressed = true; + break; + case SparseTensorEncodingAttr::DimLevelType::Dense: + if (alreadyCompressed) + return false; // Dense after Compressed not yet supported. + break; + case SparseTensorEncodingAttr::DimLevelType::Singleton: + // Although Singleton isn't generally supported yet, the direct + // conversion method doesn't have any particular problems with + // singleton after compressed. + break; + } + } + return true; +} + //===----------------------------------------------------------------------===// // Conversion rules. //===----------------------------------------------------------------------===// @@ -489,19 +515,39 @@ SmallVector params; ShapedType stp = srcType.cast(); sizesFromPtr(rewriter, sizes, op, encSrc, stp, src); - // Set up encoding with right mix of src and dst so that the two - // method calls can share most parameters, while still providing - // the correct sparsity information to either of them. - auto enc = SparseTensorEncodingAttr::get( - op->getContext(), encDst.getDimLevelType(), encDst.getDimOrdering(), - encSrc.getPointerBitWidth(), encSrc.getIndexBitWidth()); - newParams(rewriter, params, op, stp, enc, Action::kToCOO, sizes, src); - Value coo = genNewCall(rewriter, op, params); - params[3] = constantPointerTypeEncoding(rewriter, loc, encDst); - params[4] = constantIndexTypeEncoding(rewriter, loc, encDst); - params[6] = constantAction(rewriter, loc, Action::kFromCOO); - params[7] = coo; - rewriter.replaceOp(op, genNewCall(rewriter, op, params)); + bool useDirectConversion; + switch (options.sparseToSparseStrategy) { + case SparseToSparseConversionStrategy::kViaCOO: + useDirectConversion = false; + break; + case SparseToSparseConversionStrategy::kDirect: + useDirectConversion = true; + assert(canUseDirectConversion(encDst.getDimLevelType()) && + "Unsupported target for direct sparse-to-sparse conversion"); + break; + case SparseToSparseConversionStrategy::kAuto: + useDirectConversion = canUseDirectConversion(encDst.getDimLevelType()); + break; + } + if (useDirectConversion) { + newParams(rewriter, params, op, stp, encDst, Action::kSparseToSparse, + sizes, src); + rewriter.replaceOp(op, genNewCall(rewriter, op, params)); + } else { // use via-COO conversion. + // Set up encoding with right mix of src and dst so that the two + // method calls can share most parameters, while still providing + // the correct sparsity information to either of them. 
+      auto enc = SparseTensorEncodingAttr::get(
+          op->getContext(), encDst.getDimLevelType(), encDst.getDimOrdering(),
+          encSrc.getPointerBitWidth(), encSrc.getIndexBitWidth());
+      newParams(rewriter, params, op, stp, enc, Action::kToCOO, sizes, src);
+      Value coo = genNewCall(rewriter, op, params);
+      params[3] = constantPointerTypeEncoding(rewriter, loc, encDst);
+      params[4] = constantIndexTypeEncoding(rewriter, loc, encDst);
+      params[6] = constantAction(rewriter, loc, Action::kFromCOO);
+      params[7] = coo;
+      rewriter.replaceOp(op, genNewCall(rewriter, op, params));
+    }
     return success();
   }
   if (!encDst && encSrc) {
diff --git a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
--- a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
@@ -73,6 +73,22 @@
 static constexpr int kColWidth = 1025;
+// TODO(wrengr): adjust this so it can be used by `openSparseTensorCOO`
+// too. That version doesn't have the permutation, and the `sizes`
+// are a pointer/C-array rather than `std::vector`.
+/// Precondition: `perm` and `shape` must be valid for `rank`.
+static inline void assertPermutedSizesMatchShape(
+    const std::vector<uint64_t> &sizes, // in target order.
+    const uint64_t rank,
+    const uint64_t *perm,    // semantic order -> target order.
+    const uint64_t *shape) { // in semantic order.
+  assert(rank == sizes.size() && "Rank mismatch");
+  for (uint64_t r = 0; r < rank; r++) {
+    const uint64_t s = shape[r];
+    assert((s == 0 || s == sizes[perm[r]]) && "Dimension size mismatch");
+  }
+}
+
 /// A sparse tensor element in coordinate scheme (value and indices).
 /// For example, a rank-1 vector element would look like
 ///   ({i}, a[i])
@@ -171,6 +187,7 @@
   unsigned iteratorPos;
 };
+//===----------------------------------------------------------------------===//
 /// Abstract base class of sparse tensor storage. Note that we use
 /// function overloading to implement "partial" method specialization.
 class SparseTensorStorageBase {
@@ -373,6 +390,132 @@
   }
 }; // class SparseTensorEnumerator
+//===----------------------------------------------------------------------===//
+// Forward declaration.
+template <typename P, typename I, typename V>
+class SparseTensorStorage;
+
+/// Statistics regarding the number of nonzero subtensors in
+/// a source tensor, for direct sparse=>sparse conversion.
+///
+/// N.B., this class stores a reference to the dimension-sizes passed
+/// to the constructor; thus, objects of this class must not outlive
+/// the dimension-sizes they depend on.
+class SparseTensorNNZ {
+  // All of these are in the target storage-order.
+  const std::vector<uint64_t> &dimSizes;
+  const std::vector<DimLevelType> dimTypes;
+  std::vector<std::vector<uint64_t>> nnz;
+
+public:
+  SparseTensorNNZ(const SparseTensorNNZ &) = delete;
+  SparseTensorNNZ(SparseTensorNNZ &&) = delete;
+  SparseTensorNNZ &operator=(const SparseTensorNNZ &) = delete;
+  SparseTensorNNZ &operator=(SparseTensorNNZ &&) = delete;
+
+  /// Returns the rank of the target tensor.
+  uint64_t getRank() const { return dimSizes.size(); }
+
+  /// Allocate the statistics structure for the desired sizes and
+  /// sparsity (in the target tensor's storage-order).
+  ///
+  /// Preconditions:
+  /// (1) `sparsity` must be valid for `szs.size()`
+  /// (2) `szs` must not contain zeros.
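+  ///
+  /// For example, with dimension sizes (2, 3) and dimension types
+  /// (dense, compressed), only `nnz[1]` gets any counters: one per row
+  /// (two counters total), each recording how many entries of that row
+  /// are stored.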
+  SparseTensorNNZ(const std::vector<uint64_t> &szs,
+                  const DimLevelType *sparsity)
+      : dimSizes(szs), dimTypes(sparsity, sparsity + getRank()),
+        nnz(getRank()) {
+    bool uncompressed = true;
+    uint64_t sz = 1; // sz_r := product(dimSizes[0:r-1]);
+    for (uint64_t rank = getRank(), r = 0; r < rank; r++) {
+      switch (dimTypes[r]) {
+      case DimLevelType::kCompressed:
+        assert(uncompressed &&
+               "Multiple compressed layers not currently supported");
+        uncompressed = false;
+        nnz[r].resize(sz, 0); // Both allocate and zero-initialize.
+        break;
+      case DimLevelType::kDense:
+        assert(uncompressed &&
+               "Dense after compressed not currently supported");
+        break;
+      case DimLevelType::kSingleton:
+        // Singleton after Compressed causes no problems for allocating
+        // `nnz` nor for the yieldPos loop. This remains true even
+        // when adding support for multiple compressed dimensions or
+        // for dense-after-compressed.
+        break;
+      }
+      // TODO(wrengr): this division is expensive, but replacing it
+      // with ffs/clz or intrinsics is non-portable; does mlir have
+      // any cpp macro to enable this only when we want "paranoia mode"
+      // for extra debugging info? Or should we just rely on Clang's
+      // -fsanitize=integer ?
+      assert(dimSizes[r] <= std::numeric_limits<uint64_t>::max() / sz);
+      sz *= dimSizes[r];
+    }
+  }
+
+  /// Enumerate the source tensor to fill in the statistics. The
+  /// enumerator should already incorporate the permutation of dimensions
+  /// from the semantic-order to the target tensor's storage-order.
+  template <typename V>
+  void initialize(SparseTensorEnumerator<V> &enumerator) {
+    assert(enumerator.getRank() == getRank() && "Tensor rank mismatch");
+    assert(enumerator.permutedSizes() == dimSizes && "Tensor size mismatch");
+    enumerator.forallElements(
+        [this](const std::vector<uint64_t> &ind, V) { add(ind); });
+  }
+
+private:
+  /// Adds a new element (i.e., increment its statistics). We use
+  /// a method rather than inlining into the lambda in `initialize`,
+  /// to avoid spurious templating over `V`. And this method is private
+  /// to avoid needing to re-assert validity of `ind` (which is guaranteed
+  /// by `forallElements`).
+  void add(const std::vector<uint64_t> &ind) {
+    uint64_t parentPos = 0;
+    for (uint64_t rank = getRank(), r = 0; r < rank; r++) {
+      if (dimTypes[r] == DimLevelType::kCompressed)
+        nnz[r][parentPos]++;
+      parentPos = parentPos * dimSizes[r] + ind[r];
+    }
+  }
+
+public:
+  /// The type of callback functions which receive an nnz-statistic.
+  typedef const std::function<void(uint64_t)> &NNZConsumer;
+
+  /// Lexicographically enumerates all indices for dimensions strictly
+  /// less than `stopDim`, and passes their nnz statistic to the callback.
+  /// Since our use case only requires the statistic not the coordinates
+  /// themselves, we do not bother to construct them.
+  void forallIndices(uint64_t stopDim, NNZConsumer yield) const {
+    assert(stopDim < getRank() && "Stopping-dimension is out of bounds");
+    assert(dimTypes[stopDim] == DimLevelType::kCompressed &&
+           "Cannot look up non-compressed dimensions");
+    forallIndices(yield, stopDim, 0, 0);
+  }
+
+private:
+  /// Recursive component of the public `forallIndices`.
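+  /// Here `parentPos` is the linearized index over the coordinates already
+  /// chosen for dimensions `[0, d)`; once `d` reaches `stopDim`, it selects
+  /// which nnz counter to yield.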
+ void forallIndices(NNZConsumer yield, uint64_t stopDim, uint64_t parentPos, + uint64_t d) const { + assert(d <= stopDim); + if (d == stopDim) { + assert(parentPos < nnz[d].size() && "Cursor is out of range"); + yield(nnz[d][parentPos]); + } else { + const uint64_t sz = dimSizes[d]; + const uint64_t pstart = parentPos * sz; + for (uint64_t i = 0; i < sz; i++) + forallIndices(yield, stopDim, pstart + i, d + 1); + } + } +}; // class SparseTensorNNZ + //===----------------------------------------------------------------------===// /// A memory-resident sparse tensor using a storage scheme based on /// per-dimension sparse/dense annotations. This data structure provides a @@ -382,40 +525,58 @@ /// and annotations to implement all required setup in a general manner. template class SparseTensorStorage : protected EnumerableSparseTensorStorage { -public: - /// Constructs a sparse tensor storage scheme with the given dimensions, - /// permutation, and per-dimension dense/sparse annotations, using - /// the coordinate scheme tensor for the initial contents if provided. +private: + /// Partial constructor for sharing code between the public + /// constructors. This only does the bare minimum setup for the + /// other constructors, thus the resulting object is not guaranteed + /// to be in a valid state. In particular `isCompressedDim` (and + /// hence `appendCurrentPointer` etc) will not work. /// /// Precondition: `perm` and `sparsity` must be valid for `szs.size()`. SparseTensorStorage(const std::vector &szs, const uint64_t *perm, - const DimLevelType *sparsity, - SparseTensorCOO *tensor = nullptr) + const DimLevelType *sparsity) : sizes(szs), rev(getRank()), idx(getRank()), pointers(getRank()), indices(getRank()) { - uint64_t rank = getRank(); + // Validate parameters. + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + assert(sizes[r] > 0 && "Dimension size zero has trivial storage"); + assert((sparsity[r] == DimLevelType::kDense || + sparsity[r] == DimLevelType::kCompressed) && + "Unsupported DimLevelType"); + } // Store "reverse" permutation. - for (uint64_t r = 0; r < rank; r++) + for (uint64_t rank = getRank(), r = 0; r < rank; r++) rev[perm[r]] = r; + } + +public: + /// Constructs a sparse tensor storage scheme with the given dimensions, + /// permutation, and per-dimension dense/sparse annotations, using + /// the coordinate scheme tensor for the initial contents if provided. + /// + /// Precondition: `perm` and `sparsity` must be valid for `szs.size()`. + SparseTensorStorage(const std::vector &szs, const uint64_t *perm, + const DimLevelType *sparsity, SparseTensorCOO *tensor) + : SparseTensorStorage(szs, perm, sparsity) { // Provide hints on capacity of pointers and indices. // TODO: needs fine-tuning based on sparsity bool allDense = true; uint64_t sz = 1; - for (uint64_t r = 0; r < rank; r++) { - assert(sizes[r] > 0 && "Dimension size zero has trivial storage"); + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + // TODO(wrengr): this division is expensive, but replacing it + // with ffs/clz or intrinsics is non-portable; does mlir have + // any cpp macro to enable this only when we want "paranoia mode" + // for extra debugging info? Or should we just rely on Clang's + // -fsanitize=integer ? + assert(sizes[r] <= std::numeric_limits::max() / sz); sz *= sizes[r]; if (sparsity[r] == DimLevelType::kCompressed) { pointers[r].reserve(sz + 1); indices[r].reserve(sz); sz = 1; allDense = false; - // Prepare the pointer structure. 
We cannot use `appendPointer` - // here, because `isCompressedDim` won't work until after this - // preparation has been done. + // Can't use `isCompressedDim` until after we push this zero. pointers[r].push_back(0); - } else { - assert(sparsity[r] == DimLevelType::kDense && - "singleton not yet supported"); } } // Then assign contents from coordinate scheme tensor if provided. @@ -433,6 +594,114 @@ } } + /// Constructs a sparse tensor storage scheme with the given dimensions, + /// permutation, and per-dimension dense/sparse annotations, using + /// the given sparse tensor for the initial contents. + /// + /// Precondition: `perm` and `sparsity` must be valid for `szs.size()`. + SparseTensorStorage(const std::vector &szs, const uint64_t *perm, + const DimLevelType *sparsity, + const EnumerableSparseTensorStorage &tensor) + : SparseTensorStorage(szs, perm, sparsity) { + SparseTensorEnumerator enumerator(tensor, getRank(), perm); + { + // Initialize the statistics structure. + SparseTensorNNZ nnz(sizes, sparsity); + nnz.initialize(enumerator); + // Initialize "pointers" overhead (and allocate "indices", "values"). + uint64_t parentSz = 1; // assembled-size (not dimension-size) of `r-1`. + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + if (sparsity[r] == DimLevelType::kCompressed) { + pointers[r].reserve(parentSz + 1); + // Can't use `isCompressedDim` until after we push this zero. + pointers[r].push_back(0); + uint64_t currentPos = 0; + nnz.forallIndices(r, [this, ¤tPos, r](uint64_t n) { + currentPos += n; + appendPointer(r, currentPos); + }); + assert(pointers[r].size() == parentSz + 1 && + "Final pointers size doesn't match allocated size"); + // That assertion entails `assembledSize(parentSz, r)` + // is now in a valid state. That is, `pointers[r][parentSz]` + // equals the present value of `currentPos`, which is the + // correct assembled-size for `indices[r]`. + } + // Update assembled-size for the next iteration. + parentSz = assembledSize(parentSz, r); + // Ideally we need only `indices[r].reserve(parentSz)`, however + // the `std::vector` implementation forces us to initialize it too. + // That is, in the yieldPos loop we need random-access assignment + // to `indices[r]`; however, `std::vector`'s subscript-assignment + // only allows assigning to already-initialized positions. + if (isCompressedDim(r)) + indices[r].resize(parentSz, 0); + } + values.resize(parentSz, 0); // Both allocate and zero-initialize. + } // end initializing "pointers" overhead + // The yieldPos loop + enumerator.forallElements([this](const std::vector &ind, V val) { + uint64_t parentSz = 1, parentPos = 0; + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + if (isCompressedDim(r)) { + // If `parentPos == parentSz` then it's valid as an array-lookup; + // however, it's semantically invalid here since that entry + // does not represent a segment of `indices[r]`. Moreover, that + // entry must be immutable for `assembledSize` to remain valid. + assert(parentPos < parentSz && "Pointers position is out of bounds"); + const uint64_t currentPos = pointers[r][parentPos]; + // This increment won't overflow the `P` type, since it can't + // exceed the original value of `pointers[r][parentPos+1]` + // which was already verified to be within bounds for `P` + // when it was written to the array. 
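+          // For example, if the assembled pointers were [0, 2, 3], then
+          // these cursor advances leave them as [2, 3, 3] once every
+          // element has been visited, and the finalizeYieldPos loop
+          // below shifts them back to [0, 2, 3].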
+ pointers[r][parentPos]++; + writeIndex(r, currentPos, ind[r]); + parentPos = currentPos; + } else { // if dense + parentPos = parentPos * sizes[r] + ind[r]; + } // else if singleton: the new `parentPos` is the same as the old one. + parentSz = assembledSize(parentSz, r); + } + assert(parentPos < values.size() && "Value position is out of bounds"); + values[parentPos] = val; + }); // end yieldPos loop + // The finalizeYieldPos loop + for (uint64_t parentSz = 1, rank = getRank(), r = 0; r < rank; r++) { + if (isCompressedDim(r)) { + assert(parentSz == pointers[r].size() - 1 && + "Actual pointers size doesn't match the expected size"); + // Can't check all of them, but at least we can check the last one. + assert(pointers[r][parentSz - 1] == pointers[r][parentSz] && + "Pointers got corrupted"); + // TODO: optimize this by using `memmove` or similar. + for (uint64_t n = 0; n < parentSz; n++) { + uint64_t parentPos = parentSz - n; + pointers[r][parentPos] = pointers[r][parentPos - 1]; + } + pointers[r][0] = 0; + } + parentSz = assembledSize(parentSz, r); + } // end finalizeYieldPos loop + } + +private: + /// Computes the assembled-size associated with the `d`-th dimension, + /// given the assembled-size associated with the `(d-1)`-th dimension. + /// "Assembled-sizes" correspond to the (nominal) sizes of overhead + /// storage, as opposed to "dimension-sizes" which are the cardinality + /// of coordinates for that dimension. + /// + /// Precondition: the `pointers[d]` array must be fully initialized + /// before calling this method. + inline uint64_t assembledSize(uint64_t parentSz, uint64_t d) const { + if (isCompressedDim(d)) + return pointers[d][parentSz]; + // else if dense: + return parentSz * sizes[d]; + // else if singleton: `return parentSz;` + } + +public: ~SparseTensorStorage() override = default; /// Get the rank of the tensor. @@ -533,39 +802,71 @@ /// Precondition: `shape`, `perm`, and `sparsity` must be valid for `rank`. static SparseTensorStorage * newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm, - const DimLevelType *sparsity, SparseTensorCOO *tensor) { + const DimLevelType *sparsity, SparseTensorCOO *coo) { SparseTensorStorage *n = nullptr; - if (tensor) { - assert(tensor->getRank() == rank && "Tensor rank mismatch"); - for (uint64_t r = 0; r < rank; r++) - assert(shape[r] == 0 || shape[r] == tensor->getSizes()[perm[r]]); - n = new SparseTensorStorage(tensor->getSizes(), perm, sparsity, - tensor); - delete tensor; + if (coo) { + const auto &coosz = coo->getSizes(); + assertPermutedSizesMatchShape(coosz, rank, perm, shape); + n = new SparseTensorStorage(coosz, perm, sparsity, coo); + delete coo; } else { std::vector permsz(rank); for (uint64_t r = 0; r < rank; r++) { assert(shape[r] > 0 && "Dimension size zero has trivial storage"); permsz[perm[r]] = shape[r]; } - n = new SparseTensorStorage(permsz, perm, sparsity); + // We pass the null `coo` to ensure we select the intended constructor. + n = new SparseTensorStorage(permsz, perm, sparsity, coo); } return n; } + /// Factory method. Constructs a sparse tensor storage scheme with + /// the given dimensions, permutation, and per-dimension dense/sparse + /// annotations, using the sparse tensor for the initial contents. + /// + /// Precondition: `shape`, `perm`, and `sparsity` must be valid for `rank`. 
+  static SparseTensorStorage<P, I, V> *
+  newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm,
+                  const DimLevelType *sparsity,
+                  const EnumerableSparseTensorStorage<V> *source) {
+    assert(source && "Got nullptr for source");
+    SparseTensorEnumerator<V> enumerator(*source, rank, perm);
+    const auto &permsz = enumerator.permutedSizes();
+    assertPermutedSizesMatchShape(permsz, rank, perm, shape);
+    // N.B., do *not* `delete source`. The other `newSparseTensor`
+    // only deletes the COO because the end-user has no way to do so
+    // themselves. Whereas if we delete the source here, then we'll
+    // get a double-free error in `delSparseTensor`.
+    return new SparseTensorStorage<P, I, V>(permsz, perm, sparsity, *source);
+  }
+
 private:
   /// Appends the next free position of `indices[d]` to `pointers[d]`.
   /// Thus, when called after inserting the last element of a segment,
   /// it will append the position where the next segment begins.
-  inline void appendPointer(uint64_t d) {
+  inline void appendCurrentPointer(uint64_t d) {
     assert(isCompressedDim(d)); // Entails `d < getRank()`.
-    uint64_t p = indices[d].size();
-    assert(p <= std::numeric_limits<P>::max() &&
+    appendPointer(d, indices[d].size());
+  }
+
+  /// Appends an arbitrary new position to `pointers[d]`. This method
+  /// checks that `pos` is representable in the `P` type; however, it
+  /// does not check that `pos` is semantically valid (i.e., larger than
+  /// the previous position and smaller than `indices[d].capacity()`).
+  inline void appendPointer(uint64_t d, uint64_t pos) {
+    // This first assertion is trivially satisfied by both use sites;
+    // but we leave it in just to be safe.
+    assert(isCompressedDim(d)); // Entails `d < getRank()`.
+    assert(pos <= std::numeric_limits<P>
::max() && "Pointer value is too large for the P-type"); - pointers[d].push_back(p); // Here is where we convert to `P`. + pointers[d].push_back(pos); // Here is where we convert to `P`. } - /// Appends the given index to `indices[d]`. + /// Appends the given index to `indices[d]`. This method checks that + /// `i` is representable in the `I` type; however, it does not check + /// that `i` is semantically valid (i.e., in bounds for `sizes[d]` + /// and not previously occurring in the same segment). inline void appendIndex(uint64_t d, uint64_t i) { assert(isCompressedDim(d)); // Entails `d < getRank()`. assert(i <= std::numeric_limits::max() && @@ -573,6 +874,23 @@ indices[d].push_back(i); // Here is where we convert to `I`. } + /// Writes the given coordinate to `indices[d][pos]`. This method + /// checks that `i` is representable in the `I` type; however, it + /// does not check that `i` is semantically valid (i.e., in bounds + /// for `sizes[d]` and not elsewhere occurring in the same segment). + inline void writeIndex(uint64_t d, uint64_t pos, uint64_t i) { + // This first assertion trivially entailed at the one call-site; + // but just to be safe. + assert(isCompressedDim(d)); // Entails `d < getRank()`. + // Subscript assignment to `std::vector` requires that the `pos`-th + // entry has been initialized; thus we must be sure to check `size()` + // here, instead of `capacity()` as would be ideal. + assert(pos < indices[d].size() && "Index position is out of bounds"); + assert(i <= std::numeric_limits::max() && + "Index value is too large for the I-type"); + indices[d][pos] = i; // Here is where we convert to `I`. + } + /// Initializes sparse tensor storage scheme from a memory-resident sparse /// tensor in coordinate scheme. This method prepares the pointers and /// indices arrays under the given per-dimension dense/sparse annotations. @@ -615,7 +933,7 @@ } // Finalize the sparse pointer structure at this dimension. if (isCompressedDim(d)) { - appendPointer(d); + appendCurrentPointer(d); } else { // For dense storage we must fill in all the zero values after // the last element. @@ -630,7 +948,7 @@ if (d == getRank()) { values.push_back(0); } else if (isCompressedDim(d)) { - appendPointer(d); + appendCurrentPointer(d); } else { for (uint64_t full = 0, sz = sizes[d]; full < sz; full++) endDim(d + 1); @@ -644,7 +962,7 @@ for (uint64_t i = 0; i < rank - diff; i++) { uint64_t d = rank - i - 1; if (isCompressedDim(d)) { - appendPointer(d); + appendCurrentPointer(d); } else { for (uint64_t full = idx[d] + 1, sz = sizes[d]; full < sz; full++) endDim(d + 1); @@ -718,6 +1036,7 @@ std::vector values; }; +//===----------------------------------------------------------------------===// /// Helper to convert string to lower case. static char *toLower(char *token) { for (char *c = token; *c; c++) @@ -832,7 +1151,7 @@ "dimension size mismatch"); SparseTensorCOO *tensor = SparseTensorCOO::newSparseTensorCOO(rank, idata + 2, perm, nnz); - // Read all nonzero elements. + // Read all nonzero elements. 
 std::vector<uint64_t> indices(rank);
   for (uint64_t k = 0; k < nnz; k++) {
     if (!fgets(line, kColWidth, file)) {
@@ -987,28 +1306,34 @@
 #define CASE(p, i, v, P, I, V)                                                 \
   if (ptrTp == (p) && indTp == (i) && valTp == (v)) {                          \
-    SparseTensorCOO<V> *tensor = nullptr;                                      \
+    SparseTensorCOO<V> *coo = nullptr;                                         \
     if (action <= Action::kFromCOO) {                                          \
       if (action == Action::kFromFile) {                                       \
         char *filename = static_cast<char *>(ptr);                             \
-        tensor = openSparseTensorCOO<V>(filename, rank, shape, perm);          \
+        coo = openSparseTensorCOO<V>(filename, rank, shape, perm);             \
       } else if (action == Action::kFromCOO) {                                 \
-        tensor = static_cast<SparseTensorCOO<V> *>(ptr);                       \
+        coo = static_cast<SparseTensorCOO<V> *>(ptr);                          \
       } else {                                                                 \
         assert(action == Action::kEmpty);                                      \
       }                                                                        \
+      return SparseTensorStorage<P, I, V>::newSparseTensor(rank, shape, perm,  \
+                                                           sparsity, coo);     \
+    }                                                                          \
+    if (action == Action::kSparseToSparse) {                                   \
+      EnumerableSparseTensorStorage<V> *tensor =                               \
+          static_cast<EnumerableSparseTensorStorage<V> *>(ptr);                \
       return SparseTensorStorage<P, I, V>::newSparseTensor(rank, shape, perm,  \
                                                            sparsity, tensor);  \
     }                                                                          \
     if (action == Action::kEmptyCOO)                                           \
       return SparseTensorCOO<V>::newSparseTensorCOO(rank, shape, perm);        \
-    tensor = static_cast<SparseTensorStorage<P, I, V> *>(ptr)->toCOO(perm);    \
+    coo = static_cast<SparseTensorStorage<P, I, V> *>(ptr)->toCOO(perm);       \
     if (action == Action::kToIterator) {                                       \
-      tensor->startIterator();                                                 \
+      coo->startIterator();                                                    \
    } else {                                                                    \
       assert(action == Action::kToCOO);                                        \
     }                                                                          \
-    return tensor;                                                             \
+    return coo;                                                                \
   }

 #define CASE_SECSAME(p, v, P, V) CASE(p, p, v, P, P, V)
diff --git a/mlir/test/Dialect/SparseTensor/conversion.mlir b/mlir/test/Dialect/SparseTensor/conversion.mlir
--- a/mlir/test/Dialect/SparseTensor/conversion.mlir
+++ b/mlir/test/Dialect/SparseTensor/conversion.mlir
@@ -1,4 +1,10 @@
-// RUN: mlir-opt %s --sparse-tensor-conversion --canonicalize --cse | FileCheck %s
+// First use with `kViaCOO` for sparse2sparse conversion (the old way).
+// RUN: mlir-opt %s --sparse-tensor-conversion="s2s-strategy=1" \ +// RUN: --canonicalize --cse | FileCheck %s +// +// Now again with `kAuto` (the new default) +// RUN: mlir-opt %s --sparse-tensor-conversion="s2s-strategy=0" \ +// RUN: --canonicalize --cse | FileCheck %s -check-prefix=CHECKAUTO #SparseVector = #sparse_tensor.encoding<{ dimLevelType = ["compressed"] @@ -191,15 +197,28 @@ // CHECK-LABEL: func @sparse_convert_1d_ss( // CHECK-SAME: %[[A:.*]]: !llvm.ptr) +// CHECK-DAG: %[[ToCOO:.*]] = arith.constant 5 : i32 +// CHECK-DAG: %[[FromCOO:.*]] = arith.constant 2 : i32 // CHECK-DAG: %[[P:.*]] = memref.alloca() : memref<1xi8> // CHECK-DAG: %[[Q:.*]] = memref.alloca() : memref<1xindex> // CHECK-DAG: %[[R:.*]] = memref.alloca() : memref<1xindex> // CHECK-DAG: %[[X:.*]] = memref.cast %[[P]] : memref<1xi8> to memref // CHECK-DAG: %[[Y:.*]] = memref.cast %[[Q]] : memref<1xindex> to memref // CHECK-DAG: %[[Z:.*]] = memref.cast %[[R]] : memref<1xindex> to memref -// CHECK: %[[C:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[A]]) -// CHECK: %[[T:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[C]]) +// CHECK: %[[C:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %[[ToCOO]], %[[A]]) +// CHECK: %[[T:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %[[FromCOO]], %[[C]]) // CHECK: return %[[T]] : !llvm.ptr +// CHECKAUTO-LABEL: func @sparse_convert_1d_ss( +// CHECKAUTO-SAME: %[[A:.*]]: !llvm.ptr) +// CHECKAUTO-DAG: %[[SparseToSparse:.*]] = arith.constant 3 : i32 +// CHECKAUTO-DAG: %[[P:.*]] = memref.alloca() : memref<1xi8> +// CHECKAUTO-DAG: %[[Q:.*]] = memref.alloca() : memref<1xindex> +// CHECKAUTO-DAG: %[[R:.*]] = memref.alloca() : memref<1xindex> +// CHECKAUTO-DAG: %[[X:.*]] = memref.cast %[[P]] : memref<1xi8> to memref +// CHECKAUTO-DAG: %[[Y:.*]] = memref.cast %[[Q]] : memref<1xindex> to memref +// CHECKAUTO-DAG: %[[Z:.*]] = memref.cast %[[R]] : memref<1xindex> to memref +// CHECKAUTO: %[[T:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %[[SparseToSparse]], %[[A]]) +// CHECKAUTO: return %[[T]] : !llvm.ptr func @sparse_convert_1d_ss(%arg0: tensor) -> tensor { %0 = sparse_tensor.convert %arg0 : tensor to tensor return %0 : tensor diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir @@ -0,0 +1,102 @@ +// Force this file to use the kDirect method for sparse2sparse +// RUN: mlir-opt %s --sparse-compiler="s2s-strategy=2" | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#Tensor1 = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "dense", "compressed" ] +}> + +// NOTE: dense after compressed is not currently supported for the target +// of direct-sparse2sparse conversion. (It's fine for the source though.) +#Tensor2 = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "compressed", "dense" ] +}> + +#Tensor3 = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "dense", "compressed" ], + dimOrdering = affine_map<(i,j,k) -> (i,k,j)> +}> + +module { + // + // Utilities for output and releasing memory. 
+ // + func @dump(%arg0: tensor<2x3x4xf64>) { + %c0 = arith.constant 0 : index + %d0 = arith.constant -1.0 : f64 + %0 = vector.transfer_read %arg0[%c0, %c0, %c0], %d0: tensor<2x3x4xf64>, vector<2x3x4xf64> + vector.print %0 : vector<2x3x4xf64> + return + } + func @dumpAndRelease_234(%arg0: tensor<2x3x4xf64>) { + call @dump(%arg0) : (tensor<2x3x4xf64>) -> () + %1 = bufferization.to_memref %arg0 : memref<2x3x4xf64> + memref.dealloc %1 : memref<2x3x4xf64> + return + } + + // + // Main driver. + // + func @entry() { + // + // Initialize a 3-dim dense tensor. + // + %src = arith.constant dense<[ + [ [ 1.0, 2.0, 3.0, 4.0 ], + [ 5.0, 6.0, 7.0, 8.0 ], + [ 9.0, 10.0, 11.0, 12.0 ] ], + [ [ 13.0, 14.0, 15.0, 16.0 ], + [ 17.0, 18.0, 19.0, 20.0 ], + [ 21.0, 22.0, 23.0, 24.0 ] ] + ]> : tensor<2x3x4xf64> + + // + // Convert dense tensor directly to various sparse tensors. + // + %s1 = sparse_tensor.convert %src : tensor<2x3x4xf64> to tensor<2x3x4xf64, #Tensor1> + %s2 = sparse_tensor.convert %src : tensor<2x3x4xf64> to tensor<2x3x4xf64, #Tensor2> + %s3 = sparse_tensor.convert %src : tensor<2x3x4xf64> to tensor<2x3x4xf64, #Tensor3> + + // + // Convert sparse tensor directly to another sparse format. + // + %t13 = sparse_tensor.convert %s1 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64, #Tensor3> + %t21 = sparse_tensor.convert %s2 : tensor<2x3x4xf64, #Tensor2> to tensor<2x3x4xf64, #Tensor1> + %t23 = sparse_tensor.convert %s2 : tensor<2x3x4xf64, #Tensor2> to tensor<2x3x4xf64, #Tensor3> + %t31 = sparse_tensor.convert %s3 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64, #Tensor1> + + // + // Convert sparse tensor back to dense. + // + %d13 = sparse_tensor.convert %t13 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64> + %d21 = sparse_tensor.convert %t21 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64> + %d23 = sparse_tensor.convert %t23 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64> + %d31 = sparse_tensor.convert %t31 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64> + + // + // Check round-trip equality. And release dense tensors. + // + // CHECK-COUNT-5: ( ( ( 1, 2, 3, 4 ), ( 5, 6, 7, 8 ), ( 9, 10, 11, 12 ) ), ( ( 13, 14, 15, 16 ), ( 17, 18, 19, 20 ), ( 21, 22, 23, 24 ) ) ) + call @dump(%src) : (tensor<2x3x4xf64>) -> () + call @dumpAndRelease_234(%d13) : (tensor<2x3x4xf64>) -> () + call @dumpAndRelease_234(%d21) : (tensor<2x3x4xf64>) -> () + call @dumpAndRelease_234(%d23) : (tensor<2x3x4xf64>) -> () + call @dumpAndRelease_234(%d31) : (tensor<2x3x4xf64>) -> () + + // + // Release sparse tensors. + // + sparse_tensor.release %t13 : tensor<2x3x4xf64, #Tensor3> + sparse_tensor.release %t21 : tensor<2x3x4xf64, #Tensor1> + sparse_tensor.release %t23 : tensor<2x3x4xf64, #Tensor3> + sparse_tensor.release %t31 : tensor<2x3x4xf64, #Tensor1> + sparse_tensor.release %s1 : tensor<2x3x4xf64, #Tensor1> + sparse_tensor.release %s2 : tensor<2x3x4xf64, #Tensor2> + sparse_tensor.release %s3 : tensor<2x3x4xf64, #Tensor3> + + return + } +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py @@ -189,11 +189,17 @@ vec = 0 vl = 1 e = False + # Disable direct sparse2sparse conversion, because it doubles the time! 
+ # TODO: While direct s2s is far too slow for per-commit testing, + # we should have some framework ensure that we run this test with + # `s2s=0` on a regular basis, to ensure that it does continue to work. + s2s = 1 sparsification_options = ( f'parallelization-strategy={par} ' f'vectorization-strategy={vec} ' f'vl={vl} ' - f'enable-simd-index32={e}') + f'enable-simd-index32={e} ' + f's2s-strategy={s2s}') compiler = sparse_compiler.SparseCompiler(options=sparsification_options) f64 = ir.F64Type.get() # Be careful about increasing this because