diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -148,14 +148,29 @@
 /// independently from the basic algorithm if bottlenecks are identified.
 class Merger {
 public:
-  /// Constructs a merger for the given number of tensors and loops. The
-  /// user supplies the number of tensors involved in the kernel, with the
-  /// last tensor in this set denoting the output tensor. The merger adds an
-  /// additional synthetic tensor at the end of this set to represent all
-  /// invariant expressions in the kernel.
-  Merger(unsigned t, unsigned l)
-      : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1), numLoops(l),
-        hasSparseOut(false),
+  /// Constructs a merger for the given number of tensors, native loops, and
+  /// filter loops. The user supplies the number of tensors involved in the
+  /// kernel, with the last tensor in this set denoting the output tensor. The
+  /// merger adds an additional synthetic tensor at the end of this set to
+  /// represent all invariant expressions in the kernel.
+  /// In addition to native
+  /// loops (which are specified by the GenericOp), extra filter loops are
+  /// needed in order to handle affine expressions on sparse dimensions.
+  /// E.g., for (d0, d1, d2) => (d0 + d1, d2), a naive implementation of the
+  /// filter loop could be generated as:
+  ///   for (coord : sparse_dim[0]) {
+  ///     if (coord == d0 + d1) {
+  ///       generated_code;
+  ///     }
+  ///   }
+  /// to filter out coordinates that are not equal to the affine expression
+  /// result.
+  /// TODO: we want to make the filter loop more efficient in the future, e.g.,
+  /// by avoiding a scan over the full stored sparse index list (keeping the
+  /// last position in the ordered list) or even applying binary search to find
+  /// the index.
+  Merger(unsigned t, unsigned l, unsigned fl)
+      : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1),
+        numNativeLoops(l), numLoops(l + fl), hasSparseOut(false),
         dimTypes(numTensors,
                  std::vector<DimLevelType>(numLoops, DimLevelType::Undef)),
         loopIdxToDim(numTensors,
@@ -231,6 +246,15 @@
   /// Bit translation (get loop index).
   unsigned index(unsigned b) const { return b / numTensors; }
 
+  /// Get the total number of loops (native loops + filter loops).
+  unsigned getNumLoops() const { return numLoops; }
+  /// Get the number of native loops.
+  unsigned getNumNativeLoops() const { return numNativeLoops; }
+  /// Get the number of filter loops.
+  unsigned getNumFilterLoops() const { return numLoops - numNativeLoops; }
+  /// Get the starting filter loop index.
+  unsigned getFilterLoopStartingIdx() const { return getNumNativeLoops(); }
+
   /// Returns true if bit corresponds to index of output tensor.
   bool isOutTensor(unsigned b, unsigned i) const {
     return tensor(b) == outTensor && index(b) == i;
@@ -242,6 +266,11 @@
   /// expressions).
   unsigned getSynTensorID() const { return syntheticTensor; }
 
+  /// Returns true if `ldx` refers to a filter loop.
+  bool isFilterLoop(unsigned ldx) const {
+    assert(ldx < numLoops);
+    return ldx >= numNativeLoops;
+  }
+
   /// Returns true if given tensor iterates *only* in the given tensor
   /// expression. For the output tensor, this defines a "simply dynamic"
   /// operation [Bik96]. For instance: a(i) *= 2.0 or a(i) += a(i) for
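
Note: the following sketch is illustrative and not part of the patch. It shows how the new constructor partitions the merger's loop index space: native loops from the GenericOp occupy indices [0, numNativeLoops) and filter loops are appended after them. The kernel shape and the concrete numbers are hypothetical; only the Merger API used here is introduced by this patch.

  #include <cassert>
  #include "mlir/Dialect/SparseTensor/Utils/Merger.h"

  using mlir::sparse_tensor::Merger;

  // Hypothetical kernel: 2 inputs + 1 output (t = 3), 3 native loops
  // (d0, d1, d2), and one sparse level indexed by d0 + d1, which needs a
  // single filter loop (fl = 1).
  void illustrateLoopPartition() {
    Merger merger(/*t=*/3, /*l=*/3, /*fl=*/1);
    assert(merger.getNumLoops() == 4);              // 3 native + 1 filter
    assert(merger.getNumNativeLoops() == 3);
    assert(merger.getNumFilterLoops() == 1);
    assert(merger.getFilterLoopStartingIdx() == 3); // filter loops start at 3
    assert(!merger.isFilterLoop(2));                // d2 is a native loop
    assert(merger.isFilterLoop(3));                 // loop filtering d0 + d1
  }
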
@@ -345,6 +374,7 @@
   const unsigned outTensor;
   const unsigned syntheticTensor;
   const unsigned numTensors;
+  const unsigned numNativeLoops;
   const unsigned numLoops;
   bool hasSparseOut;
   // Map that converts pair<tensor id, loop idx> to the corresponding dimension
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -406,6 +406,11 @@
                                       ArrayRef<size_t> extraTids = {},
                                       ArrayRef<size_t> extraDims = {});
 
+  Operation *enterFilterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
+                                            size_t tid, size_t dim,
+                                            AffineExpr affine,
+                                            MutableArrayRef<Value> reduc = {});
+
   void genDenseAffineAddressAtCurLevel(OpBuilder &builder, Location loc,
                                        size_t tid, size_t dim,
                                        AffineExpr affine);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -329,6 +329,12 @@
   return loop;
 }
 
+Operation *SparseTensorLoopEmitter::enterFilterLoopOverTensorAtDim(
+    OpBuilder &builder, Location loc, size_t tid, size_t dim,
+    AffineExpr affine, MutableArrayRef<Value> reduc) {
+  llvm_unreachable("not implemented yet");
+}
+
 void SparseTensorLoopEmitter::genDenseAffineAddressAtCurLevel(
     OpBuilder &builder, Location loc, size_t tid, size_t dim,
     AffineExpr affine) {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -155,8 +155,11 @@
 /// Helper method to inspect affine expressions. Rejects cases where the
 /// same index is used more than once. Also rejects compound affine
 /// expressions in sparse dimensions.
+/// filterLdx stores the filter loop index that should be used for the next
+/// compound affine expression on a sparse level; it is incremented by one
+/// each time it is used.
 static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
-                       AffineExpr a, DimLevelType dlt,
+                       AffineExpr a, DimLevelType dlt, unsigned &filterLdx,
                        bool setLvlFormat = true) {
   switch (a.getKind()) {
   case AffineExprKind::DimId: {
@@ -169,22 +172,68 @@
     return true;
   }
   case AffineExprKind::Add:
-  case AffineExprKind::Mul: {
-    if (!isDenseDLT(dlt))
-      return false; // compound only in dense dim
-    auto binOp = a.cast<AffineBinaryOpExpr>();
-    // We do not set dim level format for affine expresssion like d0 + d1 on
-    // both loop index at d0 and d1,
-    return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, false) &&
-           findAffine(merger, tensor, dim, binOp.getRHS(), dlt, false);
+  case AffineExprKind::Mul:
+  case AffineExprKind::Constant: {
+    if (!isDenseDLT(dlt) && setLvlFormat) {
+      assert(isUndefDLT(merger.getDimLevelType(tensor, filterLdx)));
+      // Use a filter loop for sparse affine expression.
+      merger.setDimAndDimLevelType(tensor, filterLdx++, dim, dlt);
+    }
+
+    if (auto binOp = a.dyn_cast<AffineBinaryOpExpr>()) {
+      // We do not set the dim level format for an affine expression like
+      // d0 + d1 on either loop index d0 or d1.
+      // We continue the recursion merely to check whether the current affine
+      // expression is admissible or not.
+      return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, filterLdx,
+                        false) &&
+             findAffine(merger, tensor, dim, binOp.getRHS(), dlt, filterLdx,
+                        false);
+    }
+    // Falls through when it is a constant affine expression.
+    return true;
   }
-  case AffineExprKind::Constant:
-    return isDenseDLT(dlt); // const only in dense dim
   default:
     return false;
   }
 }
 
+/// Get the total number of compound affine expressions in affineMap that are
+/// attached to the given tensor. For the following inputs:
+///
+///   affineMap = (d0, d1, d2) => (d0 + d1, d2)
+///   tensor    = ["compressed", "compressed"]
+///
+/// Returns 1 (because the first level is compressed and its corresponding
+/// index expression is d0 + d1).
+static unsigned getNumCompoundAffineOnSparseDims(AffineMap affineMap,
+                                                 Value tensor) {
+  unsigned num = 0;
+  auto enc = getSparseTensorEncoding(tensor.getType());
+  if (enc) {
+    ArrayRef<AffineExpr> exps = affineMap.getResults();
+    for (unsigned rank = 0; rank < exps.size(); rank++) {
+      auto aidx = toOrigDim(enc, rank);
+      auto affine = exps[aidx];
+      if (!affine.isa<AffineDimExpr>())
+        if (!isDenseDLT(getDimLevelType(enc, rank)))
+          num++;
+    }
+  }
+
+  return num;
+}
+
+/// Get the total number of compound affine expressions attached to a sparse
+/// level in the given GenericOp.
+static unsigned getNumCompoundAffineOnSparseDims(linalg::GenericOp op) {
+  unsigned num = 0;
+  for (OpOperand &t : op->getOpOperands())
+    num += getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(&t),
+                                            t.get());
+  return num;
+}
+
 /// Helper method to inspect sparse encodings in the tensor types.
 /// Fills the per-dimension sparsity information for all tensors.
 /// Returns true if the sparse annotations and affine subscript
@@ -192,19 +241,22 @@
 /// no annotations are found or inadmissible constructs occur.
 static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
   bool annotated = false;
+  unsigned filterLdx = merger.getFilterLoopStartingIdx();
   for (OpOperand &t : op->getOpOperands()) {
     auto map = op.getMatchingIndexingMap(&t);
     auto enc = getSparseTensorEncoding(t.get().getType());
     if (enc)
       annotated = true;
     assert(map.getNumResults() == op.getRank(&t));
+
     for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
       unsigned tensor = t.getOperandNumber();
       AffineExpr a = map.getResult(toOrigDim(enc, d));
-      if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d)))
+      if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d), filterLdx))
         return false; // inadmissible affine expression
     }
   }
+  assert(filterLdx == merger.getNumLoops());
   return annotated;
 }
 
@@ -214,34 +266,58 @@
 /// latest possible index.
 static bool topSortOptimal(unsigned n,
                            ArrayRef<utils::IteratorType> iteratorTypes,
-                           std::vector<unsigned> &topSort,
+                           const Merger &merger, std::vector<unsigned> &topSort,
                            std::vector<unsigned> &inDegree,
                            std::vector<std::vector<bool>> &adjM) {
-  std::vector<unsigned> redIt; // reduce iterator with 0 degree
-  std::vector<unsigned> parIt; // parallel iterator with 0 degree
+  std::vector<unsigned> redIt;    // reduce iterator with 0 degree
+  std::vector<unsigned> parIt;    // parallel iterator with 0 degree
+  std::vector<unsigned> filterIt; // filter loop with 0 degree
   for (unsigned i = 0; i < n; i++) {
     if (inDegree[i] == 0) {
-      if (linalg::isReductionIterator(iteratorTypes[i]))
+      if (merger.isFilterLoop(i))
+        filterIt.push_back(i);
+      else if (linalg::isReductionIterator(iteratorTypes[i]))
         redIt.push_back(i);
       else
         parIt.push_back(i);
     }
   }
-  while (!redIt.empty() || !parIt.empty()) {
-    // We always choose parallel iterator if there is any.
-    auto &it = !parIt.empty() ? parIt : redIt;
+  while (!redIt.empty() || !parIt.empty() || !filterIt.empty()) {
+    // We always choose in the order of filter loop -> parallel loop ->
+    // reduction loop because
+    // 1. Putting a reduction loop early might make the loop sequence
+    //    inadmissible.
+    // 2. Filter loops should be put as early as possible for better
+    //    performance, since only one (if any) iteration will carry the
+    //    computation. E.g.,
+    //      for (1 to N)
+    //        for (1 to M)
+    //          for (1 to K)
+    //            if (xxx)
+    //              O(X) computation  => O(NMK+NMX) time complexity
+    //
+    //    By putting the filter loop one level up, we get
+    //
+    //      for (1 to N)
+    //        for (1 to K)
+    //          if (xxx)
+    //            for (1 to M)
+    //              O(X) computation  => O(NK+NMX) time complexity
+    auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt);
     auto src = it.back();
     topSort.push_back(src);
     it.pop_back();
     // Update in-degree, and push 0-degree node into worklist.
-    for (unsigned dst = 0; dst < n; dst++)
+    for (unsigned dst = 0; dst < n; dst++) {
       if (adjM[src][dst] && --inDegree[dst] == 0) {
-        if (linalg::isReductionIterator(iteratorTypes[dst]))
+        if (merger.isFilterLoop(dst))
+          filterIt.push_back(dst);
+        else if (linalg::isReductionIterator(iteratorTypes[dst]))
           redIt.push_back(dst);
         else
           parIt.push_back(dst);
       }
+    }
   }
   return topSort.size() == n;
 }
@@ -340,7 +416,7 @@
                                   OpOperand *skip = nullptr) {
   // Set up an n x n from/to adjacency matrix of the iteration graph
   // for the implicit loop indices i_0 .. i_n-1.
-  unsigned n = op.getNumLoops();
+  unsigned n = merger.getNumLoops();
   std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
   std::vector<unsigned> inDegree(n, 0); // in-degree of each node.
   auto iteratorTypes = op.getIteratorTypesArray();
@@ -352,7 +428,7 @@
     // Get map and encoding.
     auto map = op.getMatchingIndexingMap(&t);
     auto enc = getSparseTensorEncoding(t.get().getType());
-    assert(map.getNumDims() == n);
+    assert(map.getNumDims() + getNumCompoundAffineOnSparseDims(op) == n);
     // Skip dense tensor constraints when not requested.
     if (!(mask & SortMask::kIncludeDense) && !enc)
       continue;
@@ -364,6 +440,19 @@
       AffineExpr ta = map.getResult(toOrigDim(enc, d));
       Optional<unsigned> tldx = merger.getLoopIdx(t.getOperandNumber(), d);
+      // Filter loops should be constructed after all the dependent loops,
+      // i.e., d0 + d1 < filter_loop(d0 + d1).
+      if (tldx && merger.isFilterLoop(tldx.value())) {
+        assert(!ta.isa<AffineDimExpr>() &&
+               !isDenseDLT(getDimLevelType(enc, d)));
+        addAffineOrderings(adjM, inDegree, ta, AffineExpr(), llvm::None, tldx);
+        // Now that the ordering of the affine expression is captured by the
+        // filter loop idx, we only need to ensure the affine ordering against
+        // the filter loop. Thus, we reset the affine expression to nil here to
+        // mark it as resolved.
+        ta = AffineExpr();
+      }
+
       if (d > 0) {
         AffineExpr fa = map.getResult(toOrigDim(enc, d - 1));
         Optional<unsigned> fldx =
@@ -377,6 +466,11 @@
         if (!(mask & SortMask::kIncludeDense))
           tryLoosenAffineDenseConstraints(op, fldx, fa, tldx, ta);
 
+        // (d0 + d1) < (d2 + d3), or
+        // filter_loop_d-1 < (d2 + d3), or
+        // (d0 + d1) < filter_loop_d, or
+        // filter_loop_d-1 < filter_loop_d, depending on whether fa/ta were
+        // reset above.
         addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx);
       }
     }
@@ -402,7 +496,7 @@
   // Report failure for a cyclic iteration graph.
   topSort.clear();
   topSort.reserve(n);
-  return topSortOptimal(n, iteratorTypes, topSort, inDegree, adjM);
+  return topSortOptimal(n, iteratorTypes, merger, topSort, inDegree, adjM);
 }
 
 /// Returns true if tensor materializes uninitialized into the computation.
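
Note: the following standalone sketch is not part of the patch. It mirrors the worklist-based topological sort in topSortOptimal above, showing how ready filter loops are scheduled before parallel loops, and parallel loops before reduction loops. The LoopKind enum and function name are illustrative stand-ins for the merger and iterator-type queries used in the real code.

  #include <cassert>
  #include <vector>

  enum class LoopKind { Filter, Parallel, Reduction };

  // Topologically orders the loop graph `adjM` (adjM[i][j] == true means loop
  // i must appear before loop j). Among all loops whose in-degree has dropped
  // to zero, filter loops are picked first, then parallel loops, then
  // reduction loops.
  std::vector<unsigned>
  topoSortLoops(const std::vector<std::vector<bool>> &adjM,
                const std::vector<LoopKind> &kinds) {
    unsigned n = static_cast<unsigned>(kinds.size());
    std::vector<unsigned> inDegree(n, 0), order;
    for (unsigned i = 0; i < n; i++)
      for (unsigned j = 0; j < n; j++)
        if (adjM[i][j])
          inDegree[j]++;
    std::vector<unsigned> filterIt, parIt, redIt;
    auto enqueue = [&](unsigned i) {
      if (kinds[i] == LoopKind::Filter)
        filterIt.push_back(i);
      else if (kinds[i] == LoopKind::Reduction)
        redIt.push_back(i);
      else
        parIt.push_back(i);
    };
    for (unsigned i = 0; i < n; i++)
      if (inDegree[i] == 0)
        enqueue(i);
    while (!filterIt.empty() || !parIt.empty() || !redIt.empty()) {
      auto &it = !filterIt.empty() ? filterIt
                                   : (!parIt.empty() ? parIt : redIt);
      unsigned src = it.back();
      it.pop_back();
      order.push_back(src);
      for (unsigned dst = 0; dst < n; dst++)
        if (adjM[src][dst] && --inDegree[dst] == 0)
          enqueue(dst);
    }
    assert(order.size() == n && "cyclic iteration graph");
    return order;
  }

With this ordering, a filter loop becomes ready (and is emitted) as soon as all the native loops it depends on have been scheduled, which is exactly what enables the hoisting illustrated by the O(NMK+NMX) vs. O(NK+NMX) comment above.
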
@@ -430,9 +524,8 @@
   // An all-dense annotated "sparse" output tensor becomes a linearized random
   // access 1-dim memref. Also admissible since insertions cannot occur.
   bool allDense = true;
-  auto iteratorTypes = op.getIteratorTypesArray();
-  unsigned numLoops = iteratorTypes.size();
-  for (unsigned i = 0; i < numLoops; i++)
+  unsigned numLoops = merger.getNumLoops(); // numNativeLoops + numFilterLoops
+  for (unsigned i = 0; i < merger.getNumLoops(); i++)
     if (isCompressedDLT(merger.getDimLevelType(tensor, i)) ||
         isSingletonDLT(merger.getDimLevelType(tensor, i))) {
       allDense = false;
@@ -443,19 +536,31 @@
     }
   if (allDense)
     return true;
+
+  // TODO: support compound affine expression on sparse output.
+  if (getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(lhs),
+                                       lhs->get()) != 0)
+    return false;
+
   // A tensor expression with a sparse output tensor that changes its values
   // but not its nonzero structure, an operation called "simply dynamic" in
   // [Bik96,Ch9], is also admissible without special codegen.
   if (merger.isSingleCondition(tensor, exp))
     return true;
+
   // Accept "truly dynamic" if the output tensor materializes uninitialized
   // into the computation and insertions occur in lexicographic index order.
   if (isMaterializing(lhs->get())) {
+    auto iteratorTypes = op.getIteratorTypesArray();
     unsigned nest = 0;
     for (unsigned i = 0; i < numLoops; i++) {
-      if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
-        break; // terminate at first reduction
-      nest++;
+      if (!merger.isFilterLoop(topSort[i])) {
+        // We only count non-filter loops, as filter loops should be treated
+        // as a special type of parallel loop.
+        if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
+          break; // terminate at first reduction
+        nest++;
+      }
     }
     // Determine admissible dynamic insertion situations:
     // (1) fully injective, since there are no reductions,
@@ -878,7 +983,14 @@
     auto enc = getSparseTensorEncoding(t.get().getType());
     for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
       AffineExpr a = map.getResult(toOrigDim(enc, d));
-      if (!isInvariantAffine(codegen, a, ldx, atLevel))
+      Optional<unsigned> sldx = merger.getLoopIdx(t.getOperandNumber(), d);
+      if (sldx && merger.isFilterLoop(sldx.value())) {
+        if (!codegen.getLoopIdxValue(sldx.value()))
+          // The filter loop has not been constructed yet.
+          return;
+        if (sldx.value() == ldx)
+          atLevel = true;
+      } else if (!isInvariantAffine(codegen, a, ldx, atLevel))
         return; // still in play
     }
     // All exhausted at this level (atLevel denotes exactly at this level).
@@ -1003,6 +1115,16 @@
   Operation *loop =
       genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+        if (merger.isFilterLoop(idx)) {
+          assert(isSparse);
+          OpOperand *t = &op->getOpOperand(tid);
+          auto enc = getSparseTensorEncoding(t->get().getType());
+          // Retrieves the affine expression for the filter loop.
+          AffineExpr a =
+              op.getMatchingIndexingMap(t).getResult(toOrigDim(enc, dim));
+          return codegen.loopEmitter.enterFilterLoopOverTensorAtDim(
+              builder, loc, tid, dim, a, reduc);
+        }
         return codegen.loopEmitter.enterLoopOverTensorAtDim(
             builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
       }).value();
@@ -1488,7 +1610,8 @@
       return failure();
     unsigned numTensors = op->getNumOperands();
     unsigned numLoops = op.getNumLoops();
-    Merger merger(numTensors, numLoops);
+    unsigned numFilterLoops = getNumCompoundAffineOnSparseDims(op);
+    Merger merger(numTensors, numLoops, numFilterLoops);
     if (!findSparseAnnotations(merger, op))
       return failure();
diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -127,7 +127,7 @@
 protected:
   MergerTestBase(unsigned numTensors, unsigned numLoops)
       : numTensors(numTensors), numLoops(numLoops),
-        merger(numTensors, numLoops) {}
+        merger(numTensors, numLoops, /*numFilterLoops=*/0) {}
 
   ///
   /// Expression construction helpers.
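
Note: the following self-contained sketch is not part of the patch. It restates the counting rule implemented by getNumCompoundAffineOnSparseDims using a simplified stand-in representation; the Level struct and countFilterLoops name are made up for illustration, while the real helper walks the AffineMap results and the sparse tensor encoding. The resulting count is what the rewriter above passes as the new numFilterLoops argument of the Merger constructor.

  #include <cassert>
  #include <vector>

  // Simplified stand-in for one storage level of an annotated tensor: whether
  // the level is dense, and whether its subscript is a compound affine
  // expression (e.g. d0 + d1, 2 * d0, or a constant) rather than a plain d_i.
  struct Level {
    bool isDense;
    bool hasCompoundAffine;
  };

  // One filter loop is needed per level that is both non-dense and indexed by
  // a compound affine expression; dense levels can use direct address
  // computation instead.
  unsigned countFilterLoops(const std::vector<Level> &levels) {
    unsigned num = 0;
    for (const Level &lvl : levels)
      if (!lvl.isDense && lvl.hasCompoundAffine)
        num++;
    return num;
  }

  int main() {
    // The example from the helper's comment: map (d0, d1, d2) => (d0 + d1, d2)
    // on a ["compressed", "compressed"] tensor needs exactly one filter loop.
    std::vector<Level> levels = {
        {/*isDense=*/false, /*hasCompoundAffine=*/true},
        {/*isDense=*/false, /*hasCompoundAffine=*/false}};
    assert(countFilterLoops(levels) == 1);
    return 0;
  }
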