diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h @@ -44,12 +44,15 @@ // General methods. // + LogicalResult initTensorExp(); + unsigned getTensorExp() const { return tensorExp; } + linalg::GenericOp op() const { return linalgOp; } const SparsificationOptions &options() const { return sparseOptions; } Merger &merger() { return latticeMerger; } LoopEmitter &emitter() { return loopEmitter; } - void startEmit(OpOperand *so, unsigned lv); + void startEmit(); /// Generates loop boundary statements (entering/exiting loops). The function /// passes and updates the passed-in parameters. @@ -88,6 +91,16 @@ ArrayRef getLoopCurStack() const; Value getLoopIdxValue(size_t loopIdx) const; + // + // Code generation environment verify functions. + // + + // Whether the iteration graph is sorted in admissible topoOrder. + bool isAdmissibleTopoOrder(); + // Whether the tensor expression is admissible for codegen. + // It also sets the sparseOut if the output tensor is sparse. + bool isAdmissibleTensorExp(unsigned exp); + // // Sparse tensor output and expansion methods. // @@ -156,6 +169,9 @@ Value redVal; unsigned redExp; unsigned redCustom; + + // The root tensor expression of the kernel. + unsigned tensorExp; }; } // namespace sparse_tensor diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp @@ -7,11 +7,25 @@ //===----------------------------------------------------------------------===// #include "CodegenEnv.h" + +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include using namespace mlir; using namespace mlir::sparse_tensor; +//===----------------------------------------------------------------------===// +// Code generation environment helper functions +//===----------------------------------------------------------------------===// + +/// Returns true if tensor materializes uninitialized into the computation. +static bool isMaterializing(Value val) { + return val.getDefiningOp() || + val.getDefiningOp(); +} + //===----------------------------------------------------------------------===// // Code generation environment constructor and general methods //===----------------------------------------------------------------------===// @@ -25,11 +39,18 @@ expFilled(), expAdded(), expCount(), redVal(), redExp(-1u), redCustom(-1u) {} -void CodegenEnv::startEmit(OpOperand *so, unsigned lv) { - assert(sparseOut == nullptr && insChain == nullptr && - "must only start emitting once"); - sparseOut = so; - outerParNest = lv; +LogicalResult CodegenEnv::initTensorExp() { + // Builds the tensor expression for the Linalg operation in SSA form. + std::optional optExp = latticeMerger.buildTensorExpFromLinalg(op()); + if (!optExp || !isAdmissibleTensorExp(*optExp)) + return failure(); + + tensorExp = *optExp; + return success(); +} + +void CodegenEnv::startEmit() { + assert(insChain == nullptr && "must only start emitting once"); if (sparseOut) { insChain = sparseOut->get(); latticeMerger.setHasSparseOut(true); @@ -66,6 +87,71 @@ return r; } +//===----------------------------------------------------------------------===// +// Code generation environment verify functions. +//===----------------------------------------------------------------------===// + +bool CodegenEnv::isAdmissibleTensorExp(unsigned exp) { + // We reject any expression that makes a reduction from `-outTensor`, as those + // expressions create a dependency between the current iteration (i) and the + // previous iteration (i-1). It would require iterating over the whole + // coordinate space, which prevent exploiting sparsity for faster code. + for (utils::IteratorType it : linalgOp.getIteratorTypesArray()) { + if (it == utils::IteratorType::reduction) { + if (latticeMerger.hasNegateOnOut(exp)) + return false; + break; + } + } + + OpOperand *lhs = linalgOp.getDpsInitOperand(0); + unsigned tensor = lhs->getOperandNumber(); + auto enc = getSparseTensorEncoding(lhs->get().getType()); + // An non-annotated output tensor is assumed dense, and becomes a random + // access n-dim memref. Admissible since insertions cannot occur. + if (!enc || enc.isAllDense()) + return true; + + // A tensor expression with a sparse output tensor that changes its values + // but not its nonzero structure, an operation called "simply dynamic" in + // [Bik96,Ch9], is also admissible without special env. + if (latticeMerger.isSingleCondition(tensor, exp)) + return true; + + // Accept "truly dynamic" if the output tensor materializes uninitialized + // into the computation and insertions occur in lexicographic index order. + sparseOut = lhs; + return isMaterializing(lhs->get()); +} + +bool CodegenEnv::isAdmissibleTopoOrder() { + if (!hasSparseOutput()) + return true; + + OpOperand *lhs = op().getDpsInitOperand(0); + // Accept "truly dynamic" if the output tensor materializes uninitialized + // into the computation and insertions occur in lexicographic index order. + unsigned nest = 0; + auto iteratorTypes = op().getIteratorTypesArray(); + for (unsigned i = 0, e = merger().getNumLoops(); i < e; i++) { + if (!merger().isFilterLoop(topSortAt(i))) { + // We only count non-filter loops as filter loops should be considered + // as a special type of parallel loops. + if (linalg::isReductionIterator(iteratorTypes[topSortAt(i)])) + break; // terminate at first reduction + nest++; + } + } + // Determine admissible dynamic insertion situations: + // (1) fully injective, since there are no reductions, + // (2) admissible 1-d expansion in innermost dimension. + if (nest >= op().getRank(lhs) - 1) { + outerParNest = nest; + return true; + } + return false; +} + //===----------------------------------------------------------------------===// // Code generation environment topological sort methods //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -250,6 +250,16 @@ return num; } +static bool hasCompoundAffineOnSparseOut(linalg::GenericOp op) { + OpOperand *out = op.getDpsInitOperand(0); + auto enc = getSparseTensorEncoding(out->get().getType()); + if (!enc || enc.isAllDense()) + return false; + + return getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(out), + out->get()); +} + /// Helper method to inspect sparse encodings in the tensor types. /// Fills the per-dimension sparsity information for all tensors. /// Returns true if the sparse annotations and affine subscript @@ -496,7 +506,7 @@ // will be skipped more often. if (mask & SortMask::kIncludeUndef) { unsigned tensor = t.getOperandNumber(); - for (unsigned i = 0; i < n; i++) + for (unsigned i = 0; i < n; i++) { if (isCompressedDLT(env.dlt(tensor, i)) || isSingletonDLT(env.dlt(tensor, i))) { for (unsigned j = 0; j < n; j++) @@ -508,6 +518,7 @@ assert(isDenseDLT(env.dlt(tensor, i)) || isUndefDLT(env.dlt(tensor, i))); } + } } } // Topologically sort the iteration graph to determine loop order. @@ -516,92 +527,6 @@ return topSortOptimal(env, n, iteratorTypes, inDegree, adjM); } -/// Returns true if tensor materializes uninitialized into the computation. -static bool isMaterializing(Value val) { - return val.getDefiningOp() || - val.getDefiningOp(); -} - -/// Returns true when the tensor expression is admissible for codegen. -/// Since all sparse input tensors are admissible, we just need to check -/// whether the out tensor in the tensor expression codegen is admissible. -/// Sets `sparseOut` to the tensor and `outerParNest` to the outer injective -/// nesting depth when a "truly dynamic" sparse tensor output occurs. -static bool isAdmissibleTensorExp(CodegenEnv &env, unsigned exp, - OpOperand **sparseOut, - unsigned *outerParNest) { - // We reject any expression that makes a reduction from `-outTensor`, as those - // expressions create a dependency between the current iteration (i) and the - // previous iteration (i-1). It would require iterating over the whole - // coordinate space, which prevent exploiting sparsity for faster code. - for (utils::IteratorType it : env.op().getIteratorTypesArray()) { - if (it == utils::IteratorType::reduction) { - if (env.merger().hasNegateOnOut(exp)) - return false; - break; - } - } - - OpOperand *lhs = env.op().getDpsInitOperand(0); - unsigned tensor = lhs->getOperandNumber(); - auto enc = getSparseTensorEncoding(lhs->get().getType()); - // An non-annotated output tensor is assumed dense, and becomes a random - // access n-dim memref. Admissible since insertions cannot occur. - if (!enc) - return true; - // An all-dense annotated "sparse" output tensor becomes a linearized random - // access 1-dim memref. Also admissible since insertions cannot occur. - bool allDense = true; - unsigned numLoops = - env.merger().getNumLoops(); // numNativeLoops + numFilterLoops - for (unsigned i = 0; i < env.merger().getNumLoops(); i++) - if (isCompressedDLT(env.dlt(tensor, i)) || - isSingletonDLT(env.dlt(tensor, i))) { - allDense = false; - break; - } else { - assert(isDenseDLT(env.dlt(tensor, i)) || isUndefDLT(env.dlt(tensor, i))); - } - if (allDense) - return true; - - // TODO: support compound affine expression on sparse output. - if (getNumCompoundAffineOnSparseDims(env.op().getMatchingIndexingMap(lhs), - lhs->get()) != 0) - return false; - - // A tensor expression with a sparse output tensor that changes its values - // but not its nonzero structure, an operation called "simply dynamic" in - // [Bik96,Ch9], is also admissible without special env. - if (env.merger().isSingleCondition(tensor, exp)) - return true; - - // Accept "truly dynamic" if the output tensor materializes uninitialized - // into the computation and insertions occur in lexicographic index order. - if (isMaterializing(lhs->get())) { - auto iteratorTypes = env.op().getIteratorTypesArray(); - unsigned nest = 0; - for (unsigned i = 0; i < numLoops; i++) { - if (!env.merger().isFilterLoop(env.topSortAt(i))) { - // We only count non-filter loops as filter loops should be considered - // as a special type of parallel loops. - if (linalg::isReductionIterator(iteratorTypes[env.topSortAt(i)])) - break; // terminate at first reduction - nest++; - } - } - // Determine admissible dynamic insertion situations: - // (1) fully injective, since there are no reductions, - // (2) admissible 1-d expansion in innermost dimension. - if (nest >= env.op().getRank(lhs) - 1) { - *sparseOut = lhs; - *outerParNest = nest; - return true; - } - } - return false; -} - //===----------------------------------------------------------------------===// // Sparse compiler synthesis methods (statements and expressions). //===----------------------------------------------------------------------===// @@ -1271,72 +1196,73 @@ unsigned numloopCond = 0; // Converts bits to array + dim pair - env.merger().foreachTidDimPairInBits(all, [&, - idx](unsigned b, unsigned tid, - std::optional dim, - DimLevelType dlt) { - if (simple.test(b)) { - if (isUndefDLT(dlt)) { - // An undefined dlt in the lattices, we probably mean to iterate based - // on the dim of output tensor. - // E.g., this could be a synthetic tensor (for invariants and sparse - // output tensor). - // out[i][j] = invariant; or a broadcast - // out[i][j] = in[i] (j is undef for input) - tid = env.merger().getOutTensorID(); - dim = env.merger().getDimNum(tid, idx); - // Skips invalid dim (e.g., when this is a zero ranked tensor). - if (!dim) - return; - } - tids.push_back(tid); - dims.push_back(*dim); - numloopCond++; - } else if (isDenseDLT(dlt)) { - tids.push_back(tid); - dims.push_back(*dim); - } else { - assert(isUndefDLT(dlt)); - linalg::GenericOp op = env.op(); - if (tid >= op.getNumDpsInputs()) - // We only handle affine expression on input tensors (for now). - return; - OpOperand *operand = &op->getOpOperand(tid); - auto enc = getSparseTensorEncoding(operand->get().getType()); - // Non-annotated dense tensors requires no special handling. - if (!enc) - return; - - ArrayRef affines = - op.getMatchingIndexingMap(operand).getResults(); - assert(affines.size() == enc.getDimLevelType().size()); - for (unsigned i = 0, e = affines.size(); i < e; i++) { - AffineExpr exp = affines[toOrigDim(enc, i)]; - // Skip simple affine expression and non dense dimensions (which has - // it own filter loop). - if (exp.isa() || !isDenseDLT(getDimLevelType(enc, i))) - continue; - - // Constant affine expression are handled in genLoop - if (!exp.isa()) { - bool atLevel = false; - if (isInvariantAffine(env, exp, idx, atLevel) && atLevel) { - // If the compound affine is invariant and we are right at the - // level. We need to generate the address according to the affine - // expression. This is also the best place we can do it to avoid - // putting it inside inner loops. - // NOTE: It assumes that the levels of the input tensor are - // initialized in order (and it is also currently guaranteed by - // computeIterationGraph), another more admissible approach might be - // accepting out-of-order access between consecutive dense levels. - affineTids.push_back(tid); - affineDims.push_back(i); - exps.push_back(exp); + env.merger().foreachTidDimPairInBits( + all, [&, idx](unsigned b, unsigned tid, std::optional dim, + DimLevelType dlt) { + if (simple.test(b)) { + if (isUndefDLT(dlt)) { + // An undefined dlt in the lattices, we probably mean to iterate + // based on the dim of output tensor. + // E.g., this could be a synthetic tensor (for invariants and sparse + // output tensor). + // out[i][j] = invariant; or a broadcast + // out[i][j] = in[i] (j is undef for input) + tid = env.merger().getOutTensorID(); + dim = env.merger().getDimNum(tid, idx); + // Skips invalid dim (e.g., when this is a zero ranked tensor). + if (!dim) + return; + } + tids.push_back(tid); + dims.push_back(*dim); + numloopCond++; + } else if (isDenseDLT(dlt)) { + tids.push_back(tid); + dims.push_back(*dim); + } else { + assert(isUndefDLT(dlt)); + linalg::GenericOp op = env.op(); + if (tid >= op.getNumDpsInputs()) + // We only handle affine expression on input tensors (for now). + return; + OpOperand *operand = &op->getOpOperand(tid); + auto enc = getSparseTensorEncoding(operand->get().getType()); + // Non-annotated dense tensors requires no special handling. + if (!enc) + return; + + ArrayRef affines = + op.getMatchingIndexingMap(operand).getResults(); + assert(affines.size() == enc.getDimLevelType().size()); + for (unsigned i = 0, e = affines.size(); i < e; i++) { + AffineExpr exp = affines[toOrigDim(enc, i)]; + // Skip simple affine expression and non dense dimensions (which has + // it own filter loop). + if (exp.isa() || + !isDenseDLT(getDimLevelType(enc, i))) + continue; + + // Constant affine expression are handled in genLoop + if (!exp.isa()) { + bool atLevel = false; + if (isInvariantAffine(env, exp, idx, atLevel) && atLevel) { + // If the compound affine is invariant and we are right at the + // level. We need to generate the address according to the + // affine expression. This is also the best place we can do it + // to avoid putting it inside inner loops. + // NOTE: It assumes that the levels of the input tensor are + // initialized in order (and it is also currently guaranteed by + // computeIterationGraph), another more admissible approach + // might be accepting out-of-order access between consecutive + // dense levels. + affineTids.push_back(tid); + affineDims.push_back(i); + exps.push_back(exp); + } + } } } - } - } - }); + }); if (isDenseDLT(env.dlt(env.merger().getOutTensorID(), idx))) { // Note that we generate dense indices of the output tensor @@ -1513,8 +1439,9 @@ LogicalResult matchAndRewrite(linalg::GenericOp op, PatternRewriter &rewriter) const override { - // Only accept single output operations. - if (op.getNumDpsInits() != 1) + // Only accept single output operations without affine index on sparse + // output. + if (op.getNumDpsInits() != 1 || hasCompoundAffineOnSparseOut(op)) return failure(); // Sets up a code generation environment. @@ -1528,11 +1455,8 @@ if (!findSparseAnnotations(env)) return failure(); - // Builds the tensor expression for the Linalg operation in SSA form. - std::optional optExp = env.merger().buildTensorExpFromLinalg(op); - if (!optExp) + if (failed(env.initTensorExp())) return failure(); - unsigned exp = *optExp; // Computes a topologically sorted iteration graph to ensure tensors // are visited in natural index order. Gradually relaxes the considered @@ -1541,31 +1465,31 @@ // to resolve cycles by inserting a conversion. bool isAdmissible = false; bool hasCycle = true; - OpOperand *sparseOut = nullptr; - unsigned outerParNest = -1u; + // An const list of all masks that we used for interation graph // computation. Must be ordered from more strict to less strict. const auto allMask = {SortMask::kIncludeAll, SortMask::kIncludeUndef, SortMask::kIncludeDense, SortMask::kSparseOnly}; - for (auto mask : allMask) + for (auto mask : allMask) { if (computeIterationGraph(env, mask)) { hasCycle = false; - if (isAdmissibleTensorExp(env, exp, &sparseOut, &outerParNest)) { + if (env.isAdmissibleTopoOrder()) { isAdmissible = true; break; } // else try a set of less strict constraints. } + } if (hasCycle) return resolveCycle(env, rewriter); // one last shot if (!isAdmissible) return failure(); // inadmissible expression, reject // Recursively generates code if admissible. - env.startEmit(sparseOut, outerParNest); + env.startEmit(); genBuffers(env, rewriter); genInitConstantDenseAddress(env, rewriter); - genStmt(env, rewriter, exp, 0); + genStmt(env, rewriter, env.getTensorExp(), 0); genResult(env, rewriter); return success(); }