diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -106,6 +106,7 @@
   let dependentDialects = [
     "arith::ArithmeticDialect",
     "LLVM::LLVMDialect",
+    "linalg::LinalgDialect",
     "memref::MemRefDialect",
     "scf::SCFDialect",
     "sparse_tensor::SparseTensorDialect",
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -275,6 +275,56 @@
       params);
 }
 
+/// Generates a call to `SparseTensorCOO<V>::Iterator::getNext()`.
+/// To avoid needing to handle multiple outputs and avoid defining
+/// a bunch of new MLIR types for `Element<V>`, we instead have both
+/// `ind` and `elemPtr` serve as out-parameters and return a bool
+/// to indicate whether those out-parameters are filled or whether we
+/// have no more elements to iterate.
+///
+/// \param [in] elemTp The type of the tensor's elements/values, called
+///             `V` below.
+/// \param [in] iter A value of MLIR-type `!llvm.ptr<i8>` which we
+///             static-cast to C++-type `SparseTensorCOO<V>::Iterator*`.
+/// \param [out] ind A value of `memref<?xindex>` type, where the dynamic
+///             size matches the iterator/sparse-tensor.
+/// \param [out] elemPtr A value of MLIR-type `memref<V>`.
+///
+/// \returns `i1` indicating whether `ind` and `elemPtr` were filled.
+//
+// TODO(wrengr): once we're sure this is actually working as
+// intended/desired, then we ought to be able to deduce the `elemTp`
+// directly from the `elemPtr`.
+static Value genGetNextCall(ConversionPatternRewriter &rewriter, Operation *op,
+                            Type elemTp, Value iter, Value ind, Value elemPtr) {
+  Location loc = op->getLoc();
+  StringRef name;
+  if (elemTp.isF64())
+    name = "getNextF64";
+  else if (elemTp.isF32())
+    name = "getNextF32";
+  else if (elemTp.isInteger(64))
+    name = "getNextI64";
+  else if (elemTp.isInteger(32))
+    name = "getNextI32";
+  else if (elemTp.isInteger(16))
+    name = "getNextI16";
+  else if (elemTp.isInteger(8))
+    name = "getNextI8";
+  else
+    llvm_unreachable("Unknown element type");
+  SmallVector<Value, 3> params;
+  params.push_back(iter);
+  params.push_back(ind);
+  params.push_back(elemPtr);
+  // TODO(wrengr): Need we do anything extra to massage between C++'s
+  // `bool` type and MLIR's `i1` type?
+  Type i1 = rewriter.getI1Type();
+  auto call = rewriter.create<CallOp>(
+      loc, i1, getFunc(op, name, i1, params, /*emitCInterface=*/true), params);
+  return call.getResult(0);
+}
+
 /// If the tensor is a sparse constant, generates and returns the pair of
 /// the constants for the indices and the values.
 static Optional<std::pair<Value, Value>>
@@ -313,6 +363,10 @@
 /// is the given `rank`. This array is intended to serve as a reusable
 /// buffer for storing the indices of a single tensor element, to avoid
 /// allocation in the body of loops.
+// TODO(wrengr): can we push the type-erasure of the rank down into
+// where we actually generate calls to the ExecutionEngine, instead
+// of doing them here? (so that the intervening generated code can
+// still be type-checked wrt the rank)
 static Value allocaIndices(ConversionPatternRewriter &rewriter, Location loc,
                            int64_t rank) {
   auto indexTp = rewriter.getIndexType();
@@ -321,6 +375,71 @@
   return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{arg});
 }
 
+/// Generates code to stack-allocate room for one value of the given
+/// type, and return the pointer to that memory. At present we leave
+/// the memory for that value uninitialized.
+static Value allocaValue(ConversionPatternRewriter &rewriter, Location loc,
+                         Type t) {
+  return rewriter.create<memref::AllocaOp>(loc, MemRefType::get({}, t));
+}
+
+/// Generates code to allocate a tensor of the given type, and zero
+/// initialize it. This function assumes the TensorType is fully
+/// specified (i.e., has static rank and sizes).
+static Value allocDenseTensor(ConversionPatternRewriter &rewriter, Location loc,
+                              RankedTensorType tensorTp) {
+  /* DO_NOT_SUBMIT(wrengr): {notes to self}
+  // This code works (a) to generate `%0 = constant dense<0> : $tensorTp`
+  // which MLIR calls a "splat tensor", and (b) not cause mlir-opt to
+  // crash like it used to with tensor::GenerateOp (which was misleadingly
+  // attributed to an unlegalized builtin.unrealized_conversion_cast).
+  //
+  // Unfortunately, splat-tensors may only live in read-only buffers
+  // eventually; thus we must instead revert back to using linalg::FillOp
+  // like we had originally. But it looks like we're allowed to have that
+  // work on the `tensor<>` type instead of just `memref<>`.
+  //
+  // cf.
+  // re overhead of using attributes.
+  // SmallVector does have emplace_back() after all; see line 1419 of that same file.
+  //
+  // There's also `SparseElementsAttr::getZeroValue()` but that
+  // requires us to have the `T` at C++ compile-time, rather than the
+  // `RankedTensorType` at C++ runtime / MLIR codegen time. We could
+  // paper over this staging issue via templates, sure, but I'm not sure
+  // the benefit would be worth the boilerplate to do so.
+  //
+  DenseIntOrFPElementsAttr zeroAttr;
+  Type elemTp = tensorTp.getElementType();
+  if (auto floatTp = elemTp.dyn_cast<FloatType>()) {
+    // Cribbed from the implementation of SparseElementsAttr::getZeroAPFloat()
+    APFloat zeroElem(floatTp.getFloatSemantics());
+    zeroAttr = DenseFPElementsAttr::get(tensorTp, zeroElem);
+  } else if (auto intTp = elemTp.dyn_cast<IntegerType>()) {
+    // Cribbed from the implementation of SparseElementsAttr::getZeroAPInt()
+    APInt zeroElem = APInt::getZero(intTp.getWidth());
+    zeroAttr = DenseIntElementsAttr::get(tensorTp, zeroElem);
+  } else
+    llvm_unreachable("The element type is not an IntOrFP type");
+  // DO_NOT_SUBMIT(wrengr): this assertion isn't dynamic, so we can/should
+  // probably remove it before landing the CL. I just have it here to make
+  // sure I didn't mess up the C++ code above.
+  assert(zeroAttr.isSplat() && "Failed to codegen a 'splat' tensor");
+  return rewriter.create<arith::ConstantOp>(loc, tensorTp, zeroAttr);
+  */
+  Type elemTp = tensorTp.getElementType();
+  // TODO(wrengr): support dynamic sizes too.
+  // BUG(wrengr): Why won't this legalize?
+  Value initTensor =
+      rewriter.create<linalg::InitTensorOp>(loc, tensorTp.getShape(), elemTp)
+          .getResult();
+  Value zero = constantZero(rewriter, loc, elemTp);
+  // Re legalization, beware that linalg::FillOp implicitly uses linalg::YieldOp
+  return rewriter.create<linalg::FillOp>(loc, zero, initTensor).getResult(0);
+}
+
 //===----------------------------------------------------------------------===//
 // Conversion rules.
 //===----------------------------------------------------------------------===//
@@ -406,8 +525,71 @@
       rewriter.replaceOp(op, genNewCall(rewriter, op, encDst, 1, perm, coo));
       return success();
     }
-    if (!encDst || encSrc) {
-      // TODO: sparse => dense
+    if (!encDst && encSrc) {
+      // This is sparse => dense conversion, which is handled as follows:
+      //   dst = new Tensor(0);
+      //   iter = src->toCOO()->getIterator();
+      //   while (elem = iter->getNext()) {
+      //     dst[elem.indices] = elem.value;
+      //   }
+      // While it would be more efficient to inline the iterator logic
+      // directly rather than allocating an object and calling methods,
+      // this is good enough for now.
+      Location loc = op->getLoc();
+      ShapedType shapeTp = resType.cast<ShapedType>();
+      unsigned rank = shapeTp.getRank();
+      Type elemTp = shapeTp.getElementType();
+      Value dst =
+          allocDenseTensor(rewriter, loc, resType.cast<RankedTensorType>());
+      Value ind = allocaIndices(rewriter, loc, rank);
+      Value elemPtr = allocaValue(rewriter, loc, elemTp);
+      Value perm;
+      Value iter = genNewCall(rewriter, op, encSrc, 4, perm, src);
+      // Generate the while-loop Op itself.
+      // (We cannot call `argTypes` a "TypeRange" here, lest mlir-opt segfault.)
+      SmallVector<Type, 1> argTypes{resType};
+      auto whileOp =
+          rewriter.create<scf::WhileOp>(loc, argTypes, ValueRange{dst});
+      Block *before = rewriter.createBlock(&whileOp.before(), {}, argTypes);
+      Block *after = rewriter.createBlock(&whileOp.after(), {}, argTypes);
+      // Build the while-loop's "before" region.
+      {
+        rewriter.setInsertionPointToEnd(before);
+        Value cond = genGetNextCall(rewriter, op, elemTp, iter, ind, elemPtr);
+        rewriter.create<scf::ConditionOp>(loc, cond, before->getArguments());
+      }
+      // Build the while-loop's "after" region.
+      // BUG(wrengr): this part is still causing mlir-opt to segfault,
+      // just in a new/different way than before.
+      {
+        rewriter.setInsertionPointToStart(after);
+        // Can't pass `ind` directly to tensor::InsertOp::build();
+        // instead must explicitly convert it into a ValueRange `ivs`.
+        // (Though, of course, we cannot call it a "ValueRange" here,
+        // lest mlir-opt segfault.)
+        SmallVector<Value> ivs;
+        ivs.reserve(rank);
+        for (unsigned i = 0; i < rank; i++) {
+          Value idx = constantIndex(rewriter, loc, i);
+          ivs.push_back(rewriter.create<memref::LoadOp>(loc, ind, idx));
+        }
+        Value elemV = rewriter.create<memref::LoadOp>(loc, elemPtr);
+        Value dstIn = after->getArgument(0);
+        Value dstOut =
+            rewriter.create<tensor::InsertOp>(loc, elemV, dstIn, ivs);
+        // BUG(wrengr): evidently, the TypeRange argument to
+        // scf::YieldOp::build() is only allowed to be passed when it's
+        // empty: because they assert(resultTypes.size() == 0u) for
+        // some bizarre reason.
+        rewriter.create<scf::YieldOp>(loc, ValueRange{dstOut});
+      }
+      // Finish up.
+      rewriter.setInsertionPointAfter(whileOp);
+      rewriter.replaceOp(op, whileOp.getResult(0));
+      return success();
+    }
+    if (!encDst && !encSrc) {
+      // dense => dense
       return failure();
     }
     // This is a dense => sparse conversion or a sparse constant in COO =>
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -114,11 +114,15 @@
     });
     // The following operations and dialects may be introduced by the
     // rewriting rules, and are therefore marked as legal.
+    // BUG(wrengr): why aren't linalg::InitTensorOp and tensor::InsertOp getting
+    // legalized?!
-    target.addLegalOp<arith::CmpFOp, arith::CmpIOp, tensor::CastOp,
-                      tensor::ExtractOp>();
+    target.addLegalOp<arith::CmpFOp, arith::CmpIOp, tensor::CastOp,
+                      tensor::ExtractOp, tensor::InsertOp, linalg::InitTensorOp,
+                      linalg::FillOp, linalg::YieldOp>();
-    target.addLegalDialect<LLVM::LLVMDialect, memref::MemRefDialect,
-                           scf::SCFDialect>();
+    target.addLegalDialect<LLVM::LLVMDialect, scf::SCFDialect,
+                           linalg::LinalgDialect, memref::MemRefDialect,
+                           tensor::TensorDialect>();
     // Populate with rules and apply rewriting rules.
     populateFuncOpTypeConversionPattern(patterns, converter);
     populateCallOpTypeConversionPattern(patterns, converter);
diff --git a/mlir/lib/ExecutionEngine/SparseUtils.cpp b/mlir/lib/ExecutionEngine/SparseUtils.cpp
--- a/mlir/lib/ExecutionEngine/SparseUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseUtils.cpp
@@ -107,6 +107,12 @@
   /// Getter for elements array.
   const std::vector<Element<V>> &getElements() const { return elements; }
 
+  // Forward declaration of the class required by getIterator. We make
+  // it a nested class so that it can access the private fields.
+  class Iterator;
+  /// Returns an iterator over the elements of a SparseTensorCOO.
+  Iterator *getIterator() const { return new Iterator(*this); }
+
   /// Factory method. Permutes the original dimensions according to
   /// the given ordering and expects subsequent add() calls to honor
   /// that same ordering for the given indices. The result is a
@@ -132,10 +138,40 @@
     }
     return false;
   }
-  std::vector<uint64_t> sizes; // per-rank dimension sizes
+  const std::vector<uint64_t> sizes; // per-rank dimension sizes
   std::vector<Element<V>> elements;
 };
 
+/// An iterator over the elements of a sparse tensor in coordinate-scheme
+/// format. This iterator is not designed for use by C++ code itself,
+/// but rather by generated MLIR code (i.e., by calls to `IMPL_COO_GETNEXT`);
+/// hence it may look idiosyncratic or unconventional when compared to
+/// ordinary C++ iterators.
+template <typename V>
+class SparseTensorCOO<V>::Iterator {
+  // TODO(wrengr): really this class should be a thin wrapper/subclass
+  // of the std::vector, rather than needing to do a dereference every
+  // time a method is called; but we don't want to actually copy the whole
+  // contents of the underlying array(s) when this class is initialized.
+  // Maybe we should be a thin wrapper/subclass of SparseTensorCOO?
+  // Or have a variant of SparseTensorStorage::toCOO() to construct this
+  // iterator directly?
+  const std::vector<Element<V>> &elements;
+  unsigned pos;
+
+public:
+  // TODO(wrengr): to guarantee safety we'd either need to consume the
+  // SparseTensorCOO (e.g., requiring an rvalue-reference) or get notified
+  // somehow whenever the SparseTensorCOO adds new elements, sorts, etc.
+  Iterator(const SparseTensorCOO<V> &coo) : elements(coo.elements), pos(0) {}
+
+  const Element<V> *getNext() {
+    if (pos < elements.size())
+      return &(elements[pos++]);
+    return nullptr;
+  }
+};
+
 /// Abstract base class of sparse tensor storage. Note that we use
 /// function overloading to implement "partial" method specialization.
 class SparseTensorStorageBase {
@@ -518,7 +554,13 @@
   kI8 = 6
 };
 
-enum Action : uint32_t { kFromFile = 0, kFromCOO = 1, kNewCOO = 2, kToCOO = 3 };
+enum Action : uint32_t {
+  kFromFile = 0,
+  kFromCOO = 1,
+  kNewCOO = 2,
+  kToCOO = 3,
+  kToIter = 4
+};
 
 #define CASE(p, i, v, P, I, V)                                                 \
   if (ptrTp == (p) && indTp == (i) && valTp == (v)) {                          \
@@ -530,10 +572,15 @@
       tensor = static_cast<SparseTensorCOO<V> *>(ptr);                         \
     else if (action == kNewCOO)                                                \
       return SparseTensorCOO<V>::newSparseTensorCOO(size, sizes, perm);        \
-    else if (action == kToCOO)                                                 \
-      return static_cast<SparseTensorStorage<P, I, V> *>(ptr)->toCOO(perm);    \
-    else                                                                       \
-      assert(0);                                                               \
+    else {                                                                     \
+      tensor = static_cast<SparseTensorStorage<P, I, V> *>(ptr)->toCOO(perm);  \
+      if (action == kToCOO)                                                    \
+        return tensor;                                                         \
+      else if (action == kToIter)                                              \
+        return tensor->getIterator();                                          \
+      else                                                                     \
+        assert(0);                                                             \
+    }                                                                          \
     return SparseTensorStorage<P, I, V>::newSparseTensor(tensor, sparsity,     \
                                                          perm);                \
   }
@@ -582,6 +629,29 @@
     return tensor;                                                             \
   }
 
+/// Calls `SparseTensorCOO<V>::Iterator::getNext()` with the following semantics.
+/// To avoid needing to handle multiple outputs and avoid defining
+/// a bunch of new MLIR types for `Element<V>`, we instead have both
+/// `iref` and `vref` serve as out-parameters and return a bool to
+/// indicate whether those out-parameters are filled or whether we have
+/// no more elements to iterate.
+#define IMPL_COO_GETNEXT(NAME, V)                                              \
+  bool _mlir_ciface_##NAME(void *ptr, StridedMemRefType<uint64_t, 1> *iref,    \
+                           StridedMemRefType<V, 0> *vref) {                    \
+    assert(iref->strides[0] == 1);                                             \
+    uint64_t *indx = iref->data + iref->offset;                                \
+    V *value = vref->data + vref->offset;                                      \
+    const uint64_t isize = iref->sizes[0];                                     \
+    auto iter = static_cast<SparseTensorCOO<V>::Iterator *>(ptr);              \
+    const Element<V> *elem = iter->getNext();                                  \
+    if (elem == nullptr)                                                       \
+      return false;                                                            \
+    for (uint64_t r = 0; r < isize; r++)                                       \
+      indx[r] = elem->indices[r];                                              \
+    *value = elem->value;                                                      \
+    return true;                                                               \
+  }
+
 /// Constructs a new sparse tensor. This is the "swiss army knife"
 /// method for materializing sparse tensors into the computation.
 ///
@@ -590,6 +660,8 @@
 ///   kFromCOO = ptr contains coordinate scheme to assign to new storage
 ///   kNewCOO = returns empty coordinate scheme to fill and use with kFromCOO
 ///   kToCOO = returns coordinate scheme from storage in ptr to use with kFromCOO
+///   kToIter = returns iterator from storage in ptr (call IMPL_COO_GETNEXT to
+///             use)
 void *
 _mlir_ciface_newSparseTensor(StridedMemRefType<DimLevelType, 1> *aref, // NOLINT
                              StridedMemRefType<index_t, 1> *sref,
@@ -687,10 +759,19 @@
 IMPL3(addEltI16, int16_t)
 IMPL3(addEltI8, int8_t)
 
+/// Helper to enumerate elements of coordinate scheme, one per value type.
+IMPL_COO_GETNEXT(getNextF64, double)
+IMPL_COO_GETNEXT(getNextF32, float)
+IMPL_COO_GETNEXT(getNextI64, int64_t)
+IMPL_COO_GETNEXT(getNextI32, int32_t)
+IMPL_COO_GETNEXT(getNextI16, int16_t)
+IMPL_COO_GETNEXT(getNextI8, int8_t)
+
 #undef CASE
 #undef IMPL1
 #undef IMPL2
 #undef IMPL3
+#undef IMPL_COO_GETNEXT
 
 //===----------------------------------------------------------------------===//
 //
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir
@@ -1,11 +1,14 @@
 // RUN: mlir-opt %s \
+// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
 // RUN:   --sparsification --sparse-tensor-conversion \
+// RUN:   --linalg-bufferize --convert-linalg-to-loops \
 // RUN:   --convert-vector-to-scf --convert-scf-to-std \
 // RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
-// RUN:   --std-bufferize --finalizing-bufferize  \
-// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm --convert-std-to-llvm --reconcile-unrealized-casts | \
+// RUN:   --std-bufferize --finalizing-bufferize \
+// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
+// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner \
-// RUN:  -e entry -entry-point-result=void  \
+// RUN:   -e entry -entry-point-result=void \
 // RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 
@@ -75,6 +78,27 @@
     }
     return
   }
+  func @checkTensor(%arg0: tensor<2x3x4xf64>, %arg1: tensor<2x3x4xf64>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c3 = arith.constant 3 : index
+    %c4 = arith.constant 4 : index
+    // Same content?
+    scf.for %i = %c0 to %c2 step %c1 {
+      scf.for %j = %c0 to %c3 step %c1 {
+        scf.for %k = %c0 to %c4 step %c1 {
+          %a = tensor.extract %arg0[%i, %j, %k] : tensor<2x3x4xf64>
+          %b = tensor.extract %arg1[%i, %j, %k] : tensor<2x3x4xf64>
+          %c = arith.cmpf une, %a, %b : f64
+          scf.if %c {
+            call @exit(%c1) : (index) -> ()
+          }
+        }
+      }
+    }
+    return
+  }
 
   //
   // Output utility.
  //
@@ -132,6 +156,13 @@
     %h = sparse_tensor.convert %2 : tensor<2x3x4xf64, #Tensor2> to tensor<2x3x4xf64, #Tensor3>
     %i = sparse_tensor.convert %3 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64, #Tensor3>
 
+    //
+    // Convert sparse tensor back to dense.
+    //
+    %x = sparse_tensor.convert %1 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64>
+    %y = sparse_tensor.convert %2 : tensor<2x3x4xf64, #Tensor2> to tensor<2x3x4xf64>
+    %z = sparse_tensor.convert %3 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64>
+
     //
     // Check values equality.
     //
@@ -160,6 +191,10 @@
     call @checkf64(%v3, %hv) : (memref<?xf64>, memref<?xf64>) -> ()
     call @checkf64(%v3, %iv) : (memref<?xf64>, memref<?xf64>) -> ()
 
+    call @checkTensor(%t, %x) : (tensor<2x3x4xf64>, tensor<2x3x4xf64>) -> ()
+    call @checkTensor(%t, %y) : (tensor<2x3x4xf64>, tensor<2x3x4xf64>) -> ()
+    call @checkTensor(%t, %z) : (tensor<2x3x4xf64>, tensor<2x3x4xf64>) -> ()
+
     //
     // Check index equality.
     //
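Note (editorial sketch, not part of the patch): to make the new rewrite easier to review, the IR below shows roughly what the sparse => dense path is intended to emit for the test's `%x = sparse_tensor.convert %1 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64>`, before bufferization. All SSA names, the wrapper function, and the declaration of the runtime entry point are illustrative assumptions; the elided `newSparseTensor` call (Action kToIter = 4) is not shown, and the authoritative logic is the C++ in SparseTensorConversion.cpp above.

    // Hypothetical runtime declaration (generated with emitCInterface); the
    // iterator pointer would come from a newSparseTensor call with kToIter.
    func private @getNextF64(!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1

    func @sparse_to_dense_sketch(%iter: !llvm.ptr<i8>) -> tensor<2x3x4xf64> {
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c2 = arith.constant 2 : index
      %c3 = arith.constant 3 : index
      %zero = arith.constant 0.0 : f64
      // allocDenseTensor: zero-initialized destination; this is why
      // linalg::InitTensorOp/linalg::FillOp are now marked legal and why the
      // RUN line gained the extra linalg passes.
      %init = linalg.init_tensor [2, 3, 4] : tensor<2x3x4xf64>
      %dst = linalg.fill(%zero, %init) : f64, tensor<2x3x4xf64> -> tensor<2x3x4xf64>
      // allocaIndices/allocaValue: reusable buffers filled by each getNext call.
      %ind = memref.alloca(%c3) : memref<?xindex>
      %val = memref.alloca() : memref<f64>
      // Iterate until getNextF64 returns false, inserting one element per step.
      %res = scf.while (%arg = %dst) : (tensor<2x3x4xf64>) -> tensor<2x3x4xf64> {
        %ok = call @getNextF64(%iter, %ind, %val)
            : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
        scf.condition(%ok) %arg : tensor<2x3x4xf64>
      } do {
      ^bb0(%arg: tensor<2x3x4xf64>):
        %i = memref.load %ind[%c0] : memref<?xindex>
        %j = memref.load %ind[%c1] : memref<?xindex>
        %k = memref.load %ind[%c2] : memref<?xindex>
        %v = memref.load %val[] : memref<f64>
        %t = tensor.insert %v into %arg[%i, %j, %k] : tensor<2x3x4xf64>
        scf.yield %t : tensor<2x3x4xf64>
      }
      return %res : tensor<2x3x4xf64>
    }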