diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h --- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h @@ -153,11 +153,15 @@ /// last tensor in this set denoting the output tensor. The merger adds an /// additional synthetic tensor at the end of this set to represent all /// invariant expressions in the kernel. - Merger(unsigned t, unsigned l) - : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1), numLoops(l), - hasSparseOut(false), - dimTypes(t + 1, std::vector(l, DimLevelType::Undef)), - loopIdxToDim(t + 1, std::vector>(l, llvm::None)) {} + Merger(unsigned t, unsigned l, unsigned fl) + : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1), + numNativeLoops(l), numLoops(l + fl), hasSparseOut(false), + dimTypes(numTensors, + std::vector(numLoops, DimLevelType::Undef)), + loopIdxToDim(numTensors, + std::vector>(numLoops, llvm::None)), + dimToLoopIdx(numTensors, + std::vector>(numLoops, llvm::None)) {} /// Adds a tensor expression. Returns its index. unsigned addExp(Kind k, unsigned e0, unsigned e1 = -1u, Value v = Value(), @@ -227,6 +231,15 @@ /// Bit translation (get loop index). unsigned index(unsigned b) const { return b / numTensors; } + /// Get the number of total loops (native loops + filter loops). + unsigned getNumLoops() const { return numLoops; } + /// Get the number of native loops. + unsigned getNumNativeLoops() const { return numNativeLoops; } + /// Get the number of filter loops. + unsigned getNumFilterLoops() const { return numLoops - numNativeLoops; } + /// Get the starting filter loop index. + unsigned getFilterLoopStartingIdx() const { return getNumNativeLoops(); } + /// Returns true if bit corresponds to index of output tensor. bool isOutTensor(unsigned b, unsigned i) const { return tensor(b) == outTensor && index(b) == i; @@ -238,6 +251,11 @@ /// expressions). 
unsigned getSynTensorID() const { return syntheticTensor; } + bool isFilterLoop(unsigned ldx) const { + assert(ldx < numLoops); + return ldx >= numNativeLoops; + } + /// Returns true if given tensor iterates *only* in the given tensor /// expression. For the output tensor, this defines a "simply dynamic" /// operation [Bik96]. For instance: a(i) *= 2.0 or a(i) += a(i) for @@ -258,6 +276,11 @@ return getDimLevelType(tensor(b), index(b)); } + Optional getLoopIdx(unsigned t, unsigned dim) const { + assert(t < numTensors && dim < numLoops); + return dimToLoopIdx[t][dim]; + } + /// Gets the dimension number of the the `t`th tensor on `i`th loop. Optional getDimNum(unsigned t, unsigned i) const { assert(t < numTensors && i < numLoops); @@ -276,6 +299,8 @@ assert(isValidDLT(dlt)); dimTypes[t][i] = dlt; loopIdxToDim[t][i] = dim; + assert(dim < numLoops); + dimToLoopIdx[t][dim] = i; } // Iterates the bits of a lattice, for each set bit, converts it into the @@ -334,6 +359,7 @@ const unsigned outTensor; const unsigned syntheticTensor; const unsigned numTensors; + const unsigned numNativeLoops; const unsigned numLoops; bool hasSparseOut; // Map that converts pair to the corresponding dimension @@ -341,6 +367,8 @@ std::vector> dimTypes; // Map that converts pair to the corresponding dimension. std::vector>> loopIdxToDim; + // Map that converts pair to the corresponding loop id. + std::vector>> dimToLoopIdx; llvm::SmallVector tensorExps; llvm::SmallVector latPoints; llvm::SmallVector, 8> latSets; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h @@ -318,7 +318,6 @@ // loopEmiter.exitCurrentLoop(); // exit i //===----------------------------------------------------------------------===// -// TODO: Sparsification should also rely on this class to generate loops. 
class SparseTensorLoopEmitter { public: /// Optional callback function to setup dense output tensors when @@ -332,13 +331,17 @@ /// If isSparseOut is set, loop emitter assume that the sparse output tensor /// is empty, and will always generate loops on it based on the dim sizes. explicit SparseTensorLoopEmitter(ValueRange tensors, bool hasOutput = false, - bool isSparseOut = false); + bool isSparseOut = false, + ArrayRef topSort = {}); /// Starts a loop emitting session by generating all the buffers needed to /// iterate tensors. void initializeLoopEmit(OpBuilder &builder, Location loc, OutputUpdater updater = nullptr); + /// Generates a list of operations to compute the affine expression. + Value genAffine(OpBuilder &builder, AffineExpr a, Location loc); + /// Enters a new loop sequence, the loops within the same sequence starts from /// the break points of previous loop instead of starting over from 0. /// e.g., @@ -377,6 +380,15 @@ ArrayRef extraTids = {}, ArrayRef extraDims = {}); + Operation *enterFilterLoopOverTensorAtDim(OpBuilder &builder, Location loc, + size_t tid, size_t dim, + AffineExpr affine, + MutableArrayRef reduc = {}); + + void genDenseAffineAddressAtCurLevel(OpBuilder &builder, Location loc, + size_t tid, size_t dim, + AffineExpr affine); + /// Emits a co-iteration loop over a set of tensors. Operation *enterCoIterationOverTensorsAtDims( OpBuilder &builder, Location loc, ArrayRef tids, @@ -392,6 +404,9 @@ coords.push_back(l.iv); } + /// Gets the current depth of the loop stack. + unsigned getCurrentDepth() const { return loopStack.size(); } + /// Gets loop induction variable at the given level. Value getLoopIV(size_t level) const { if (level < loopStack.size()) @@ -509,6 +524,11 @@ // sequence. std::vector loopSeqStack; + // Maps AffineDimExpr to the index of the loop in loopStack. + // TODO: We should probably use a callback function here to make it more + // general. 
+ std::vector sparsiferLoopLvlMap; + // TODO: not yet used, it should track the current level for each tensor // to help eliminate `dim` paramters from above APIs. // std::vector curLv; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp @@ -96,12 +96,14 @@ SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors, bool hasOutput, - bool isSparseOut) + bool isSparseOut, + ArrayRef topSort) : hasOutput(hasOutput), isSparseOut(isSparseOut), tensors(tensors.begin(), tensors.end()), dimTypes(tensors.size()), pidxs(tensors.size()), coord(tensors.size()), highs(tensors.size()), ptrBuffer(tensors.size()), idxBuffer(tensors.size()), - valBuffer(tensors.size()), loopStack() { + valBuffer(tensors.size()), loopStack(), + sparsiferLoopLvlMap(topSort.size(), 0) { for (size_t tid = 0, e = tensors.size(); tid < e; tid++) { auto t = tensors[tid]; // a scalar or 0-dimension tensors @@ -125,6 +127,13 @@ ptrBuffer[tid].assign(rank, Value()); idxBuffer[tid].assign(rank, Value()); } + + for (unsigned i = 0, e = topSort.size(); i < e; i++) { + // This is an inverse map of the topologically sorted loop index from + // sparsifier. This is needed to map the AffineDimExpr back to the loopStack + // index used in loop emitter. 
+ sparsiferLoopLvlMap[topSort[i]] = i; + } } void SparseTensorLoopEmitter::initializeLoopEmit( @@ -291,6 +300,108 @@ return loop; } +Value SparseTensorLoopEmitter::genAffine(OpBuilder &builder, AffineExpr a, + Location loc) { + switch (a.getKind()) { + case AffineExprKind::DimId: { + unsigned idx = a.cast().getPosition(); + return loopStack[sparsiferLoopLvlMap[idx]].iv; + } + case AffineExprKind::Add: { + auto binOp = a.cast(); + return builder.create( + loc, genAffine(builder, binOp.getLHS(), loc), + genAffine(builder, binOp.getRHS(), loc)); + } + case AffineExprKind::Mul: { + auto binOp = a.cast(); + return builder.create( + loc, genAffine(builder, binOp.getLHS(), loc), + genAffine(builder, binOp.getRHS(), loc)); + } + case AffineExprKind::Constant: { + int64_t c = a.cast().getValue(); + return constantIndex(builder, loc, c); + } + default: + llvm_unreachable("unexpected affine subscript"); + } +} + +Operation *SparseTensorLoopEmitter::enterFilterLoopOverTensorAtDim( + OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine, + MutableArrayRef reduc) { + assert(!affine.isa() && !isDenseDLT(dimTypes[tid][dim])); + assert(dimTypes[tid].size() > dim); + // We can not re-enter the same level. + assert(!coord[tid][dim]); + + Value step = constantIndex(builder, loc, 1); + + Value lo = pidxs[tid][dim]; + Value hi = highs[tid][dim]; + + // TODO: We should instead use a whileOp for filter loop to allow early + // break when exceeding (for ordered dimensions). + // TODO: There are many other potential opportunities that we might apply in + // the future. E.g., we could use binary search to locate the pointer index. + scf::ForOp forOp = builder.create(loc, lo, hi, step, reduc); + + // In-place update on the reduction variable vector. 
+ assert(forOp.getNumRegionIterArgs() == reduc.size()); + for (int i = 0, e = reduc.size(); i < e; i++) + reduc[i] = forOp.getRegionIterArg(i); + + builder.setInsertionPointToStart(forOp.getBody()); + Value iv = forOp.getInductionVar(); + + pidxs[tid][dim] = iv; + // Generating a load on the indices array yields the coordinate. + Value ptr = idxBuffer[tid][dim]; + coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv); + + // Generate an if condition to filter out indices that are not equal to the + // result of the affine expression. + Value expected = genAffine(builder, affine, loc); + auto pred = builder.create(loc, arith::CmpIPredicate::eq, + coord[tid][dim], expected); + SmallVector types; + for (Value red : reduc) { + types.push_back(red.getType()); + } + + bool hasReduc = !types.empty(); + scf::IfOp ifOp = + builder.create(loc, types, pred, /*else*/ hasReduc); + if (hasReduc) { + // scf.for (a) -> v + // %s = scf.if (a) -> v + // user-generated code. + // else + // yield a + // yield %s + builder.create(loc, ifOp.getResults()); + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + // On mismatch. + builder.create(loc, reduc); + } + // Set the insert point to matched branch. 
+ builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + + // NOTE: we can also prepares for next dim here in advance + // Push the loop into stack + loopStack.emplace_back(ArrayRef(tid), ArrayRef(dim), forOp, + coord[tid][dim]); + return forOp; +} + +void SparseTensorLoopEmitter::genDenseAffineAddressAtCurLevel( + OpBuilder &builder, Location loc, size_t tid, size_t dim, + AffineExpr affine) { + Value affineV = genAffine(builder, affine, loc); + pidxs[tid][dim] = genAddress(builder, loc, tid, dim, affineV); +} + Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims( OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef dims, bool needsUniv, MutableArrayRef reduc, @@ -469,7 +580,6 @@ if (forOp) { if (!reduc.empty()) { assert(reduc.size() == forOp.getNumResults()); - rewriter.setInsertionPointToEnd(forOp.getBody()); rewriter.create(loc, reduc); } // Exit the loop. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -27,6 +27,7 @@ #include "mlir/Dialect/SparseTensor/Transforms/Passes.h" #include "mlir/Dialect/SparseTensor/Utils/Merger.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/TensorEncoding.h" #include "llvm/ADT/SmallBitVector.h" @@ -57,7 +58,7 @@ unsigned numLoops, OpOperand *op, unsigned nest, std::vector &ts) : options(o), loopEmitter(tensors, /*hasOutput=*/true, - /*isSparseOut=*/op != nullptr), + /*isSparseOut=*/op != nullptr, ts), sparseOut(op), outerParNest(nest), topSort(ts) { if (op) insChain = op->get(); @@ -85,6 +86,11 @@ // Topsort (reference should remain in scope). 
std::vector &topSort; + ArrayRef getLoopCurStack() const { + ArrayRef topSortRef = topSort; + return topSortRef.slice(0, loopEmitter.getCurrentDepth()); + } + Value getLoopIdxValue(size_t loopIdx) const { for (unsigned lv = 0; lv < topSort.size(); lv++) if (topSort[lv] == loopIdx) @@ -94,27 +100,110 @@ } }; +class ParallelAffineDimFinder + : public AffineExprVisitor { + AffineExpr paraDim; + utils::IteratorType pickIterType; + SmallVector iterTypes; + +public: + explicit ParallelAffineDimFinder(linalg::GenericOp op) + : iterTypes(op.getIteratorTypesArray()) {} + void visitDimExpr(AffineDimExpr expr) { + if (paraDim == nullptr || pickIterType == iterTypes[expr.getPosition()]) { + paraDim = expr; + } + } + + void setPickedIterType(utils::IteratorType iterType) { + pickIterType = iterType; + } + + AffineDimExpr getDimExpr() const { return paraDim.cast(); } +}; } // namespace //===----------------------------------------------------------------------===// // Sparse compiler analysis methods. //===----------------------------------------------------------------------===// +/// Determines if affine expression is invariant. +static bool isInvariantAffine(AffineExpr a, ArrayRef loopStack, + unsigned ldx, bool &atLevel) { + switch (a.getKind()) { + case AffineExprKind::DimId: { + unsigned idx = a.cast().getPosition(); + if (idx == ldx) { + atLevel = true; + // Must be invariant if we are at the level. + return true; + } + bool isInvariant = false; + for (unsigned loop : loopStack) { + isInvariant = (loop == idx); + if (isInvariant) + break; + } + return isInvariant; + } + case AffineExprKind::Add: + case AffineExprKind::Mul: { + auto binOp = a.cast(); + return isInvariantAffine(binOp.getLHS(), loopStack, ldx, atLevel) && + isInvariantAffine(binOp.getRHS(), loopStack, ldx, atLevel); + } + default: { + assert(a.isa()); + return true; + } + } +} + +/// Determines if affine expression is invariant. 
+static bool isInvariantAffine(const CodeGen &codegen, AffineExpr a, + unsigned ldx, bool &atLevel) { + return isInvariantAffine(a, codegen.getLoopCurStack(), ldx, atLevel); +} + /// Helper method to construct a permuted dimension ordering /// that adheres to the given topological sort. -static AffineMap permute(MLIRContext *context, AffineMap m, - std::vector &topSort) { +static AffineMap permute(const Merger &merger, MLIRContext *context, + AffineMap m, ArrayRef topSort) { unsigned sz = topSort.size(); - assert(m.getNumResults() == sz && "TopoSort/AffineMap size mismatch"); + assert(m.getNumDims() + merger.getNumFilterLoops() == sz && + "TopoSort/AffineMap size mismatch"); // Construct the inverse of `m`; to avoid the asymptotic complexity // of calling `m.getPermutedPosition` repeatedly. - SmallVector inv(sz); - for (unsigned i = 0; i < sz; i++) - inv[i] = m.getDimPosition(i); + SmallVector perm; + unsigned numResults = m.getNumResults(); + BitVector worklist(numResults, true); + unsigned loopDepth = 1; + // Construct the permutation. - SmallVector perm(sz); - for (unsigned i = 0; i < sz; i++) - perm[i] = inv[topSort[i]]; + while (worklist.any() && loopDepth <= topSort.size()) { + unsigned preSize = perm.size(); + for (auto dim : worklist.set_bits()) { + bool atLevel = false; + if (m.getResult(dim).isa() || + (isInvariantAffine(m.getResult(dim), topSort.slice(0, loopDepth), + topSort[loopDepth - 1], atLevel) && + atLevel)) { + // If the matching affine is constant expression or just become + // invariant. We can visit the dimension now without breaking the + // topSort constraint. + perm.push_back(dim); + } + } + + // Removes resolved dimension. + for (unsigned i = preSize, e = perm.size(); i < e; i++) + worklist.reset(perm[i]); + + // Tries to entering the next loop level. + loopDepth += 1; + } + + assert(perm.size() == numResults); return AffineMap::getPermutationMap(perm, context); } @@ -122,7 +211,7 @@ /// same index is used more than once. 
Also rejects compound affine /// expressions in sparse dimensions. static bool findAffine(Merger &merger, unsigned tensor, unsigned dim, - AffineExpr a, DimLevelType dlt, + AffineExpr a, DimLevelType dlt, unsigned &filterLdx, bool setLvlFormat = true) { switch (a.getKind()) { case AffineExprKind::DimId: { @@ -135,22 +224,58 @@ return true; } case AffineExprKind::Add: - case AffineExprKind::Mul: { - if (!isDenseDLT(dlt)) - return false; // compound only in dense dim - auto binOp = a.cast(); - // We do not set dim level format for affine expresssion like d0 + d1 on - // both loop index at d0 and d1, - return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, false) && - findAffine(merger, tensor, dim, binOp.getRHS(), dlt, false); + case AffineExprKind::Mul: + case AffineExprKind::Constant: { + if (!isDenseDLT(dlt) && setLvlFormat) { + assert(isUndefDLT(merger.getDimLevelType(tensor, filterLdx))); + // Use a filter loop for sparse affine expression. + merger.setDimAndDimLevelType(tensor, filterLdx++, dim, dlt); + } + + if (auto binOp = a.dyn_cast()) { + // We do not set dim level format for affine expression like d0 + d1 on + // either loop index at d0 or d1. + // We continue the recursion merely to check whether current affine is + // admissible or not. 
+ return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, filterLdx, + false) && + findAffine(merger, tensor, dim, binOp.getRHS(), dlt, filterLdx, + false); + } + // Falls through when it is a constant Affine + return true; } - case AffineExprKind::Constant: - return isDenseDLT(dlt); // const only in dense dim default: return false; } } +static unsigned getNumCompoundAffineOnSparseDims(AffineMap affineMap, + Value tensor) { + unsigned num = 0; + auto enc = getSparseTensorEncoding(tensor.getType()); + if (enc) { + ArrayRef exps = affineMap.getResults(); + for (unsigned rank = 0; rank < exps.size(); rank++) { + auto aidx = toOrigDim(enc, rank); + auto affine = exps[aidx]; + if (!affine.isa()) + if (!isDenseDLT(getDimLevelType(enc, rank))) + num++; + } + } + + return num; +} + +static unsigned getNumCompoundAffineOnSparseDims(linalg::GenericOp op) { + unsigned num = 0; + for (OpOperand &t : op->getOpOperands()) + num += getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(&t), + t.get()); + return num; +} + /// Helper method to inspect sparse encodings in the tensor types. /// Fills the per-dimension sparsity information for all tensors. /// Returns true if the sparse annotations and affine subscript @@ -158,19 +283,22 @@ /// no annotations are found or inadmissible constructs occur. 
static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) { bool annotated = false; + unsigned filterLdx = merger.getFilterLoopStartingIdx(); for (OpOperand &t : op->getOpOperands()) { auto map = op.getMatchingIndexingMap(&t); auto enc = getSparseTensorEncoding(t.get().getType()); if (enc) annotated = true; assert(map.getNumResults() == op.getRank(&t)); + for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { unsigned tensor = t.getOperandNumber(); AffineExpr a = map.getResult(toOrigDim(enc, d)); - if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d))) + if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d), filterLdx)) return false; // inadmissible affine expression } } + assert(filterLdx == merger.getNumLoops()); return annotated; } @@ -180,34 +308,58 @@ /// latest possible index. static bool topSortOptimal(unsigned n, ArrayRef iteratorTypes, - std::vector &topSort, + const Merger &merger, std::vector &topSort, std::vector &inDegree, std::vector> &adjM) { - std::vector redIt; // reduce iterator with 0 degree - std::vector parIt; // parallel iterator with 0 degree + std::vector redIt; // reduce iterator with 0 degree + std::vector parIt; // parallel iterator with 0 degree + std::vector filterIt; // filter loop with 0 degree for (unsigned i = 0; i < n; i++) { if (inDegree[i] == 0) { - if (linalg::isReductionIterator(iteratorTypes[i])) + if (merger.isFilterLoop(i)) + filterIt.push_back(i); + else if (linalg::isReductionIterator(iteratorTypes[i])) redIt.push_back(i); else parIt.push_back(i); } } - while (!redIt.empty() || !parIt.empty()) { - // We always choose parallel iterator if there is any. - auto &it = !parIt.empty() ? parIt : redIt; + while (!redIt.empty() || !parIt.empty() || !filterIt.empty()) { + // We always choose in order of filter loop -> parallel loop -> reduction + // loop because + // 1. Putting reduction loop early might make the loop sequence + // inadmissible. + // 2. 
Filter Loop should be put as early as possible for better performance, + // since only one (if any) iteration will carry the computation. E.g., + // for (1 to N) + // for (1 to M) + // for (1 to K) + // if (xxx) + // O(X) computation => O(NMK+NMX) time complexity + // + // By putting the filter loop one level up, we got + // + // for (1 to N) + // for (1 to K) + // if (xxx) + // for (1 to M) + // O(X) computation => O(NK+NMX) time complexity + auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt); auto src = it.back(); topSort.push_back(src); it.pop_back(); // Update in-degree, and push 0-degree node into worklist. - for (unsigned dst = 0; dst < n; dst++) + for (unsigned dst = 0; dst < n; dst++) { if (adjM[src][dst] && --inDegree[dst] == 0) { - if (linalg::isReductionIterator(iteratorTypes[dst])) + if (merger.isFilterLoop(dst)) + filterIt.push_back(dst); + else if (linalg::isReductionIterator(iteratorTypes[dst])) redIt.push_back(dst); else parIt.push_back(dst); } + } } return topSort.size() == n; } @@ -217,23 +369,38 @@ /// example i0+i1 < i2+i3+1 yields i0> &adjM, std::vector &inDegree, AffineExpr a, - AffineExpr b, unsigned fidx) { - switch (a.getKind()) { - case AffineExprKind::DimId: { - unsigned idx = a.cast().getPosition(); - if (b) - addAffineOrderings(adjM, inDegree, b, AffineExpr(), idx); - else if (!adjM[fidx][idx]) { - adjM[fidx][idx] = true; - inDegree[idx]++; + AffineExpr b, Optional fidx, + Optional tidx) { + if (!a && !b) { + // Recursion leaf. + assert(fidx && tidx); + unsigned f = *fidx, t = *tidx; + if (!adjM[f][t]) { + adjM[f][t] = true; + inDegree[t]++; } + return; + } + auto toExpand = a ? 
a : b; + switch (toExpand.getKind()) { + case AffineExprKind::DimId: { + auto idx = toExpand.cast().getPosition(); + if (toExpand == a) + addAffineOrderings(adjM, inDegree, AffineExpr(), b, idx, tidx); + else // toExpand == b + addAffineOrderings(adjM, inDegree, a, AffineExpr(), fidx, idx); break; } case AffineExprKind::Add: case AffineExprKind::Mul: { - auto binOp = a.cast(); - addAffineOrderings(adjM, inDegree, binOp.getLHS(), b, fidx); - addAffineOrderings(adjM, inDegree, binOp.getRHS(), b, fidx); + auto binOp = toExpand.cast(); + if (toExpand == a) { + addAffineOrderings(adjM, inDegree, binOp.getLHS(), b, fidx, tidx); + addAffineOrderings(adjM, inDegree, binOp.getRHS(), b, fidx, tidx); + } else { + addAffineOrderings(adjM, inDegree, a, binOp.getLHS(), fidx, tidx); + addAffineOrderings(adjM, inDegree, a, binOp.getRHS(), fidx, tidx); + } break; } default: @@ -251,19 +418,16 @@ OpOperand *skip = nullptr) { // Set up an n x n from/to adjacency matrix of the iteration graph // for the implicit loop indices i_0 .. i_n-1. - unsigned n = op.getNumLoops(); + unsigned n = merger.getNumLoops(); std::vector> adjM(n, std::vector(n, false)); std::vector inDegree(n, 0); // in-degree of each node. auto iteratorTypes = op.getIteratorTypesArray(); // Iterate over the indexing maps of every tensor in the tensor expression. for (OpOperand &t : op->getOpOperands()) { - // Skip tensor during cycle resolution. - if (&t == skip) - continue; // Get map and encoding. auto map = op.getMatchingIndexingMap(&t); auto enc = getSparseTensorEncoding(t.get().getType()); - assert(map.getNumDims() == n); + assert(map.getNumDims() + getNumCompoundAffineOnSparseDims(op) == n); // Skip dense tensor constraints when not requested. if (!(mask & SortMask::kIncludeDense) && !enc) continue; @@ -271,10 +435,84 @@ // by default) puts an ordering constraint on the loop indices. 
For // example, the tensor expresion A_ijk forces the ordering i < j < k // on the loop indices if no explicit dimension ordering is given. - for (unsigned d = 1, rank = map.getNumResults(); d < rank; d++) { - AffineExpr f = map.getResult(toOrigDim(enc, d - 1)); - AffineExpr t = map.getResult(toOrigDim(enc, d)); - addAffineOrderings(adjM, inDegree, f, t, 0); + for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { + AffineExpr ta = map.getResult(toOrigDim(enc, d)); + Optional tldx = merger.getLoopIdx(t.getOperandNumber(), d); + + // Filter loops should be constructed after all the dependent loops, + // i.e., d0 + d1 < filter_loop(d0 + d1) + if (tldx && merger.isFilterLoop(tldx.value())) { + assert(!ta.isa() && + !isDenseDLT(getDimLevelType(enc, d))); + addAffineOrderings(adjM, inDegree, ta, AffineExpr(), llvm::None, tldx); + // Now that the ordering of affine expression is captured by filter + // loop idx, we only need to ensure the affine ordering against filter + // loop. Thus, we reset the affine express to nil here to mark it as + // resolved. + ta = AffineExpr(); + } + + // Skip tensor during cycle resolution, though order between filter loop + // and dependent loops need to be guaranteed unconditionally. + if (&t == skip) + continue; + + if (d > 0) { + AffineExpr fa = map.getResult(toOrigDim(enc, d - 1)); + Optional fldx = + merger.getLoopIdx(t.getOperandNumber(), d - 1); + + if (fldx && merger.isFilterLoop(fldx.value())) { + // This must be a compound affine expression on sparse dimension. + assert(!fa.isa() && + !isDenseDLT(getDimLevelType(enc, d - 1))); + // For the same reason above. + fa = AffineExpr(); + } + + if (!(mask & SortMask::kIncludeDense) && !tldx) { + ParallelAffineDimFinder finder(op); + // e.g, for [dense, dense] -> (d0 + d1, d2 + d3) + // It is totally fine to have loop sequence d0->d2->d1->d3 instead of + // requiring d0 < d2, d1 < d2, d0 < d3, d1 < d3. 
+ // We use a heuristic here to only pick one dim expression from each + // compound affine expression to establish the order between two dense + // dimensions. + // NOTE: The ordering can only be loosened when the destination level is + // dense, for [dense, sparse] -> (d0 + d1, d2), we still require both + // d0 < d2 and d1 < d2 to ensure correct ordering (i.e., no ordering + // like d0->d2->d1). + // TODO: this is obviously a suboptimal solution. + if (!fldx && fa.isa()) { + assert(isDenseDLT(getDimLevelType(enc, d - 1)) && + !fa.isa()); + // Heuristic: we prefer parallel loop for lhs to reduce the chance + // we add reduce < parallel ordering. + finder.setPickedIterType(utils::IteratorType::parallel); + finder.walkPostOrder(fa); + fa = finder.getDimExpr(); + fldx = finder.getDimExpr().getPosition(); + } + if (!ta.isa()) { + // Dense compound affine + assert(isDenseDLT(getDimLevelType(enc, d)) && + !ta.isa()); + // Heuristic: we prefer reduction loop for rhs to reduce the chance + // of adding reduce < parallel ordering. + finder.setPickedIterType(utils::IteratorType::reduction); + finder.walkPostOrder(ta); + ta = finder.getDimExpr(); + tldx = finder.getDimExpr().getPosition(); + } + } + + // (d0 + d1) < (d2 + d3), or + // filter_loop_d-1 < (d2 + d3), or + // (d0 + d1) < filter_loop_d, or + // filter_loop_d-1 < filter_loop_d depending on whether fa/ta is reset + // above. + addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx); + } } // Push unrelated loops into sparse iteration space, so these // will be skipped more often. @@ -298,7 +536,7 @@ // Report failure for a cyclic iteration graph. topSort.clear(); topSort.reserve(n); - return topSortOptimal(n, iteratorTypes, topSort, inDegree, adjM); + return topSortOptimal(n, iteratorTypes, merger, topSort, inDegree, adjM); } /// Returns true if tensor materializes uninitialized into the computation. @@ -326,9 +564,8 @@ // An all-dense annotated "sparse" output tensor becomes a linearized random // access 1-dim memref. 
Also admissible since insertions cannot occur. bool allDense = true; - auto iteratorTypes = op.getIteratorTypesArray(); - unsigned numLoops = iteratorTypes.size(); - for (unsigned i = 0; i < numLoops; i++) + unsigned numLoops = merger.getNumLoops(); // numNativeLoops + numFilterLoops + for (unsigned i = 0; i < merger.getNumLoops(); i++) if (isCompressedDLT(merger.getDimLevelType(tensor, i)) || isSingletonDLT(merger.getDimLevelType(tensor, i))) { allDense = false; @@ -339,19 +576,31 @@ } if (allDense) return true; + + // TODO: support compound affine expression on sparse output. + if (getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(lhs), + lhs->get()) != 0) + return false; + // A tensor expression with a sparse output tensor that changes its values // but not its nonzero structure, an operation called "simply dynamic" in // [Bik96,Ch9], is also admissible without special codegen. if (merger.isSingleCondition(tensor, exp)) return true; + // Accept "truly dynamic" if the output tensor materializes uninitialized // into the computation and insertions occur in lexicographic index order. if (isMaterializing(lhs->get())) { + auto iteratorTypes = op.getIteratorTypesArray(); unsigned nest = 0; for (unsigned i = 0; i < numLoops; i++) { - if (linalg::isReductionIterator(iteratorTypes[topSort[i]])) - break; // terminate at first reduction - nest++; + if (!merger.isFilterLoop(topSort[i])) { + // We only count non-filter loops as filter loops should be considered + // as a special type of parallel loops. + if (linalg::isReductionIterator(iteratorTypes[topSort[i]])) + break; // terminate at first reduction + nest++; + } } // Determine admissible dynamic insertion situations: // (1) fully injective, since there are no reductions, @@ -448,12 +697,12 @@ codegen.loopEmitter.initializeLoopEmit( builder, loc, /// Generates buffer for the output tensor. - /// Note that all sparse kernels assume that when all elements are written - /// to (viz. 
x(i) = y(i) * z(i)), the output buffer is already initialized - /// to all zeroes and only nonzeroes values are computed and written out. - /// For updates (viz. x(i) += y(i) * z(i)), only nonzeroes values are used - /// for the updates and no assumption on the original contents of the - /// output buffer is necessary. + /// Note that all sparse kernels assume that when all elements are + /// written to (viz. x(i) = y(i) * z(i)), the output buffer is already + /// initialized to all zeroes and only nonzeroes values are computed and + /// written out. For updates (viz. x(i) += y(i) * z(i)), only nonzeroes + /// values are used for the updates and no assumption on the original + /// contents of the output buffer is necessary. [&op](OpBuilder &builder, Location loc, Value memref, Value tensor) -> Value { // Must not be a sparse tensor. @@ -462,13 +711,13 @@ // Two output tensors references should pointed to the same object. assert(lhs->get() == tensor); bool isInit = op.isInitTensor(lhs); - // An output tensor can simply materialize from the buffer of the tensor - // that appears in the outs() clause. For updates, this has the + // An output tensor can simply materialize from the buffer of the + // tensor that appears in the outs() clause. For updates, this has the // advantage that only the nonzero value are involved in the - // computation, keeping the operation O(nnz). In all other cases, we are - // forced to zero out the buffer to enforce the assumption above, which - // may negatively impact running complexity (viz. O(n^2 + nnz) vs. - // O(nnz) for matrices). + // computation, keeping the operation O(nnz). In all other cases, we + // are forced to zero out the buffer to enforce the assumption above, + // which may negatively impact running complexity (viz. O(n^2 + nnz) + // vs. O(nnz) for matrices). // TODO: use better analysis to avoid zeroing out the buffer? Value init = memref; if (!isInit) { @@ -481,38 +730,6 @@ }); } -/// Generates an affine expression. 
-// -// TODO: generalize for sparse tensor subscripts -// -static Value genAffine(CodeGen &codegen, OpBuilder &builder, AffineExpr a, - Location loc) { - switch (a.getKind()) { - case AffineExprKind::DimId: { - unsigned idx = a.cast().getPosition(); - return codegen.getLoopIdxValue(idx); // universal dense index - } - case AffineExprKind::Add: { - auto binOp = a.cast(); - return builder.create( - loc, genAffine(codegen, builder, binOp.getLHS(), loc), - genAffine(codegen, builder, binOp.getRHS(), loc)); - } - case AffineExprKind::Mul: { - auto binOp = a.cast(); - return builder.create( - loc, genAffine(codegen, builder, binOp.getLHS(), loc), - genAffine(codegen, builder, binOp.getRHS(), loc)); - } - case AffineExprKind::Constant: { - int64_t c = a.cast().getValue(); - return constantIndex(builder, loc, c); - } - default: - llvm_unreachable("unexpected affine subscript"); - } -} - /// Generates index for load/store on sparse tensor. static Value genIndex(CodeGen &codegen, linalg::GenericOp op, OpOperand *t) { auto map = op.getMatchingIndexingMap(t); @@ -533,16 +750,13 @@ unsigned rank = map.getNumResults(); if (enc) { // Note that currently, all sparse subscripts are simple. - // TODO: accept affine too? - assert(map.getResult(toOrigDim(enc, rank - 1)).getKind() == - AffineExprKind::DimId); Value pidx = codegen.loopEmitter.getPidxs()[tensor].back(); assert(pidx); args.push_back(pidx); // position index } else { for (unsigned d = 0; d < rank; d++) { AffineExpr a = map.getResult(d); - args.push_back(genAffine(codegen, builder, a, op.getLoc())); + args.push_back(codegen.loopEmitter.genAffine(builder, a, op.getLoc())); } } return codegen.loopEmitter.getValBuffer()[tensor]; @@ -774,27 +988,6 @@ return ee; } -/// Determines if affine expression is invariant. 
-static bool isInvariantAffine(const CodeGen &codegen, AffineExpr a, - unsigned ldx, bool &atLevel) { - switch (a.getKind()) { - case AffineExprKind::DimId: { - unsigned idx = a.cast().getPosition(); - if (idx == ldx) - atLevel = true; - return codegen.getLoopIdxValue(idx) != nullptr; // no longer in play? - } - case AffineExprKind::Add: - case AffineExprKind::Mul: { - auto binOp = a.cast(); - return isInvariantAffine(codegen, binOp.getLHS(), ldx, atLevel) && - isInvariantAffine(codegen, binOp.getRHS(), ldx, atLevel); - } - default: - return true; - } -} - /// Hoists loop invariant tensor loads for which indices have been exhausted. static void genInvariants(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, unsigned exp, unsigned ldx, @@ -809,7 +1002,14 @@ auto enc = getSparseTensorEncoding(t.get().getType()); for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { AffineExpr a = map.getResult(toOrigDim(enc, d)); - if (!isInvariantAffine(codegen, a, ldx, atLevel)) + Optional sldx = merger.getLoopIdx(t.getOperandNumber(), d); + if (sldx && merger.isFilterLoop(sldx.value())) { + if (!codegen.getLoopIdxValue(sldx.value())) + // The filter loops has not been constructed. + return; + if (sldx.value() == ldx) + atLevel = true; + } else if (!isInvariantAffine(codegen, a, ldx, atLevel)) return; // still in play } // All exhausted at this level (atLevel denotes exactly at this level). 
@@ -928,13 +1128,22 @@ ArrayRef extraTids, ArrayRef extraDims) { Location loc = op.getLoc(); - auto iteratorTypes = op.getIteratorTypesArray(); bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) || isSingletonDLT(merger.getDimLevelType(tid, idx)); bool isParallel = isParallelFor(codegen, isOuter, isSparse); Operation *loop = genLoopBoundary(codegen, merger, [&](MutableArrayRef reduc) { + if (merger.isFilterLoop(idx)) { + assert(isSparse); + OpOperand *t = &op->getOpOperand(tid); + auto enc = getSparseTensorEncoding(t->get().getType()); + // Retrieves the affine expression for the filter loop. + AffineExpr a = + op.getMatchingIndexingMap(t).getResult(toOrigDim(enc, dim)); + return codegen.loopEmitter.enterFilterLoopOverTensorAtDim( + builder, loc, tid, dim, a, reduc); + } return codegen.loopEmitter.enterLoopOverTensorAtDim( builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims); }).value(); @@ -1120,12 +1329,13 @@ return false; } -static void translateBitsToTidDimPairs(Merger &merger, CodeGen &codegen, - unsigned li, unsigned idx, - SmallVectorImpl &condTids, - SmallVectorImpl &condDims, - SmallVectorImpl &extraTids, - SmallVectorImpl &extraDims) { +static void translateBitsToTidDimPairs( + Merger &merger, CodeGen &codegen, linalg::GenericOp op, unsigned li, + unsigned idx, SmallVectorImpl &condTids, + SmallVectorImpl &condDims, SmallVectorImpl &extraTids, + SmallVectorImpl &extraDims, SmallVectorImpl &affineTids, + SmallVectorImpl &affineDims, SmallVectorImpl &exps) { + const BitVector &all = merger.lat(li).bits; const BitVector &simple = merger.lat(li).simple; @@ -1153,6 +1363,60 @@ // TODO: get rid of extraTids and extraDims. extraTids.push_back(tid); extraDims.push_back(dim.value()); + } else { + assert(isUndefDLT(dlt)); + if (tid >= op.getNumDpsInputs()) + // We only handle affine expression on input tensors (for now). 
+ return; + OpOperand *operand = &op->getOpOperand(tid); + auto enc = getSparseTensorEncoding(operand->get().getType()); + if (!enc) + // Non-annotated dense tensors requires no special handling. + return; + + ArrayRef affines = + op.getMatchingIndexingMap(operand).getResults(); + assert(affines.size() == enc.getDimLevelType().size()); + for (unsigned i = 0, e = affines.size(); i < e; i++) { + AffineExpr exp = affines[toOrigDim(enc, i)]; + if (exp.isa() || !isDenseDLT(getDimLevelType(enc, i))) + // Skip simple affine expression and non dense dimensions (which has + // it own filter loop). + continue; + + // Constant affine expressions on dense level required to be generated + // when + // 1. The previous level is an (at-level) invariant compound dense + // affine (with no corresponding loop idx); or + // 2. The previous level is being generated right now. + if (exp.isa()) { + // TODO: Should we come up with a more adhersive way to handle + // constant expression? We now requires two (somehow ad-hoc) code for + // it. + if (i != 0 && // i == 0 cases are handled in genConstantDenseAddress + ((!affineTids.empty() && affineTids.back() == tid && + affineDims.back() == i - 1) || // Condition 1 + merger.getLoopIdx(tid, i - 1) == idx)) { // Condition 2 + affineTids.push_back(tid); + affineDims.push_back(i); + exps.push_back(exp); + } + } else { + bool atLevel = false; + if (isInvariantAffine(codegen, exp, idx, atLevel) && atLevel) { + // If the compound affine is invariant and we are right at the + // level. We need to generate the address according to the affine + // expression. This is also the best place we can do it to avoid + // putting it inside inner loops. + // NOTE: It assumes that the levels of the input tensor are + // initialized in order, another more admissible approach might be + // accepting out-of-order access between consecutive dense levels. 
+ affineTids.push_back(tid); + affineDims.push_back(i); + exps.push_back(exp); + } + } + } } }); @@ -1160,7 +1424,6 @@ // Note that we generate dense indices of the output tensor // unconditionally, since they may not appear in the lattice, but may be // needed for linearized codegen. - // Only dense dimensions should be optimized from conditions. auto dim = merger.getDimNum(merger.getOutTensorID(), idx).value(); extraTids.push_back(merger.getOutTensorID()); extraDims.push_back(dim); @@ -1176,12 +1439,23 @@ // The set of (dense) tensors that is optimized from condition, yet still // need extra locals to iterate on them. SmallVector extraTids, extraDims; - - translateBitsToTidDimPairs(merger, codegen, li, codegen.topSort[at], condTids, - condDims, extraTids, extraDims); + // The set of dense tensors with non-trivial affine expression that just + // becomes invariant and the address shall now be generated at the current + // level. + SmallVector affineTids, affineDims; + SmallVector affines; + + translateBitsToTidDimPairs(merger, codegen, op, li, codegen.topSort[at], + condTids, condDims, extraTids, extraDims, + affineTids, affineDims, affines); // Emit the for/while-loop control. Operation *loop = genLoop(merger, codegen, builder, op, at, needsUniv, condTids, condDims, extraTids, extraDims); + + for (auto [tid, dim, exp] : llvm::zip(affineTids, affineDims, affines)) { + codegen.loopEmitter.genDenseAffineAddressAtCurLevel(builder, op.getLoc(), + tid, dim, exp); + } return loop; } @@ -1217,6 +1491,32 @@ genExpansion(merger, codegen, builder, op, at, /*atStart=*/false); } +static void genConstantDenseAddress(CodeGen &codegen, RewriterBase &rewriter, + linalg::GenericOp op) { + // We can generates address for constant affine expression before any loops + // starting from the first level as they do not depend on any thing. + // E.g., [Dense, Dense, Sparse] -> (1, 2, d0), the addresses for the first two + // levels can be determined before loops. 
+ for (OpOperand *input : op.getDpsInputOperands()) { + ArrayRef affines = + op.getMatchingIndexingMap(input).getResults(); + auto enc = getSparseTensorEncoding(input->get().getType()); + if (enc) { + for (unsigned i = 0, e = affines.size(); i < e; i++) { + AffineExpr affine = affines[toOrigDim(enc, i)]; + if (isDenseDLT(getDimLevelType(enc, i)) && + affine.isa()) { + codegen.loopEmitter.genDenseAffineAddressAtCurLevel( + rewriter, op.getLoc(), input->getOperandNumber(), i, affine); + } else { + // Breaks on first non-dense non-constant level. + break; + } + } + } + } +} + /// Recursively generates code while computing iteration lattices in order /// to manage the complexity of implementing co-iteration over unions /// and intersections of sparse iterations spaces. @@ -1309,7 +1609,6 @@ //===----------------------------------------------------------------------===// namespace { - /// Sparse rewriting rule for generic Lingalg operation. struct GenericOpSparsifier : public OpRewritePattern { public: @@ -1324,7 +1623,8 @@ return failure(); unsigned numTensors = op->getNumOperands(); unsigned numLoops = op.getNumLoops(); - Merger merger(numTensors, numLoops); + unsigned numFilterLoops = getNumCompoundAffineOnSparseDims(op); + Merger merger(numTensors, numLoops, numFilterLoops); if (!findSparseAnnotations(merger, op)) return failure(); @@ -1338,9 +1638,9 @@ unsigned outerParNest = 0; // Computes a topologically sorted iteration graph to ensure tensors // are visited in natural index order. Gradually relaxes the considered - // constraints until an acyclic iteration graph results, such that sparse - // code generation can proceed. As a last resort, an attempt is made - // to resolve cycles by inserting a conversion. + // constraints until an acyclic iteration graph results, such that + // sparse code generation can proceed. As a last resort, an attempt is + // made to resolve cycles by inserting a conversion. 
std::vector topSort; // Whether the current GenericOp is admissible. bool isAdmissible = false; @@ -1377,6 +1677,7 @@ CodeGen codegen(options, tensors, numTensors, numLoops, sparseOut, outerParNest, topSort); genBuffers(merger, codegen, rewriter, op); + genConstantDenseAddress(codegen, rewriter, op); genStmt(merger, codegen, rewriter, op, exp, 0); genResult(merger, codegen, rewriter, op); return success(); @@ -1407,7 +1708,7 @@ auto srcTp = tval.getType().cast(); auto dstEnc = SparseTensorEncodingAttr::get( op->getContext(), srcEnc.getDimLevelType(), - permute(getContext(), op.getMatchingIndexingMap(t), + permute(merger, getContext(), op.getMatchingIndexingMap(t), topSort), // new order srcEnc.getHigherOrdering(), srcEnc.getPointerBitWidth(), srcEnc.getIndexBitWidth()); diff --git a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir @@ -2,6 +2,7 @@ // RUN: mlir-opt %s -sparsification | FileCheck %s #SpVec = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }> +#EncDenseVec = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }> #CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }> #trait1 = { @@ -54,6 +55,94 @@ return %0 : tensor<32xf32> } +// CHECK-LABEL: func.func @mul_inv_sparse1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>>) +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 3 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_6:.*]] = bufferization.alloc_tensor() : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, 
#sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 0 : index} : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_6]]) -> (tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref +// CHECK: %[[VAL_18:.*]] = arith.cmpi eq, %[[VAL_17]], %[[VAL_4]] : index +// CHECK: %[[VAL_19:.*]] = scf.if %[[VAL_18]] -> (tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_15]]] : memref +// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_23:.*]] = scf.for %[[VAL_24:.*]] = %[[VAL_21]] to %[[VAL_22]] step %[[VAL_3]] iter_args(%[[VAL_25:.*]] = %[[VAL_16]]) -> (tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_24]]] : memref +// CHECK: %[[VAL_27:.*]] = arith.mulf %[[VAL_26]], %[[VAL_20]] : f32 +// CHECK: %[[VAL_28:.*]] = arith.addf %[[VAL_27]], %[[VAL_5]] : f32 +// CHECK: %[[VAL_29:.*]] = sparse_tensor.insert %[[VAL_28]] into %[[VAL_25]]{{\[}}%[[VAL_17]]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: scf.yield 
%[[VAL_29]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_30:.*]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_16]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_31:.*]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: %[[VAL_32:.*]] = sparse_tensor.load %[[VAL_33:.*]] hasInserts : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_32]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +func.func @mul_inv_sparse1d(%arga: tensor<32xf32, #SpVec>, + %argb: tensor<4xf32, #SpVec>) -> tensor<32xf32, #SpVec> { + %argx = bufferization.alloc_tensor() : tensor<32xf32, #SpVec> + %0 = linalg.generic #trait1 + ins(%arga, %argb: tensor<32xf32, #SpVec>, tensor<4xf32, #SpVec>) + outs(%argx: tensor<32xf32, #SpVec>) { + ^bb(%a: f32, %b: f32, %x: f32): + %0 = arith.mulf %a, %b : f32 + %1 = arith.addf %x, %0 : f32 + linalg.yield %1 : f32 + } -> tensor<32xf32, #SpVec> + return %0 : tensor<32xf32, #SpVec> +} + + +// CHECK-LABEL: func.func @mul_inv_enc_dense1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> { +// CHECK: %[[VAL_2:.*]] = arith.constant 32 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 3 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_6:.*]] = bufferization.alloc_tensor() : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_6]] : 
tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref +// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_4]] to %[[VAL_2]] step %[[VAL_5]] { +// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_11]]] : memref +// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_11]]] : memref +// CHECK: %[[VAL_14:.*]] = arith.mulf %[[VAL_13]], %[[VAL_10]] : f32 +// CHECK: %[[VAL_15:.*]] = arith.addf %[[VAL_12]], %[[VAL_14]] : f32 +// CHECK: memref.store %[[VAL_15]], %[[VAL_9]]{{\[}}%[[VAL_11]]] : memref +// CHECK: } +// CHECK: %[[VAL_16:.*]] = sparse_tensor.load %[[VAL_6]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_16]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +func.func @mul_inv_enc_dense1d(%arga: tensor<32xf32, #EncDenseVec>, + %argb: tensor<4xf32, #EncDenseVec>) -> tensor<32xf32, #EncDenseVec> { + %argx = bufferization.alloc_tensor() : tensor<32xf32, #EncDenseVec> + %0 = linalg.generic #trait1 + ins(%arga, %argb: tensor<32xf32, #EncDenseVec>, tensor<4xf32, #EncDenseVec>) + outs(%argx: tensor<32xf32, #EncDenseVec>) { + ^bb(%a: f32, %b: f32, %x: f32): + %0 = arith.mulf %a, %b : f32 + %1 = arith.addf %x, %0 : f32 + linalg.yield %1 : f32 + } -> tensor<32xf32, #EncDenseVec> + return %0 : tensor<32xf32, #EncDenseVec> +} + #trait2 = { indexing_maps = [ affine_map<(i) -> (i)>, // a @@ -104,6 +193,58 @@ return %0 : tensor<32xi32> } +// CHECK-LABEL: func.func @and_affine_sparse1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>>) +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 2 : index +// CHECK: %[[VAL_5:.*]] = bufferization.alloc_tensor() : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_6:.*]] = 
sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 0 : index} : tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_5]]) -> (tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref +// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref +// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_19]] to %[[VAL_20]] step %[[VAL_3]] iter_args(%[[VAL_23:.*]] = %[[VAL_16]]) -> (tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_24:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_22]]] : memref +// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_17]], %[[VAL_4]] : index +// CHECK: %[[VAL_26:.*]] = arith.cmpi eq, %[[VAL_24]], %[[VAL_25]] : index +// CHECK: %[[VAL_27:.*]] = scf.if %[[VAL_26]] -> (tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: 
%[[VAL_28:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_22]]] : memref +// CHECK: %[[VAL_29:.*]] = arith.andi %[[VAL_18]], %[[VAL_28]] : i32 +// CHECK: %[[VAL_30:.*]] = sparse_tensor.insert %[[VAL_29]] into %[[VAL_23]]{{\[}}%[[VAL_17]]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: scf.yield %[[VAL_30]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_23]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_31:.*]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_32:.*]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: %[[VAL_33:.*]] = sparse_tensor.load %[[VAL_34:.*]] hasInserts : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_33]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +func.func @and_affine_sparse1d(%arga: tensor<32xi32, #SpVec>, + %argb: tensor<34xi32, #SpVec>) -> tensor<32xi32, #SpVec> { + %argx = bufferization.alloc_tensor() : tensor<32xi32, #SpVec> + %0 = linalg.generic #trait2 + ins(%arga, %argb: tensor<32xi32, #SpVec>, tensor<34xi32, #SpVec>) + outs(%argx: tensor<32xi32, #SpVec>) { + ^bb(%a: i32, %b: i32, %x: i32): + %0 = arith.andi %a, %b : i32 + linalg.yield %0 : i32 + } -> tensor<32xi32, #SpVec> + return %0 : tensor<32xi32, #SpVec> +} + + #trait3 = { indexing_maps = [ affine_map<(i,j) -> (i,j)>, // a @@ -160,3 +301,66 @@ } -> tensor<32x16xf64> return %0 : tensor<32x16xf64> } + +// CHECK-LABEL: func.func @mul_affine_sparse2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> { +// CHECK: %[[VAL_2:.*]] = arith.constant 32 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = 
arith.constant 2 : index +// CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f64 +// CHECK: %[[VAL_7:.*]] = arith.constant 3 : index +// CHECK: %[[VAL_8:.*]] = bufferization.alloc_tensor() : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 1 : index} : tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 1 : index} : tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_15:.*]] = scf.for %[[VAL_16:.*]] = %[[VAL_3]] to %[[VAL_2]] step %[[VAL_4]] iter_args(%[[VAL_17:.*]] = %[[VAL_8]]) -> (tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_16]], %[[VAL_5]] : index +// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_16]]] : memref +// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_16]], %[[VAL_4]] : index +// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_20]]] : memref +// CHECK: %[[VAL_22:.*]] = scf.for %[[VAL_23:.*]] = %[[VAL_19]] to %[[VAL_21]] step %[[VAL_4]] iter_args(%[[VAL_24:.*]] = %[[VAL_17]]) -> (tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_23]]] : memref +// CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_23]]] : memref +// CHECK: %[[VAL_27:.*]] = memref.load 
%[[VAL_12]]{{\[}}%[[VAL_18]]] : memref +// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_18]], %[[VAL_4]] : index +// CHECK: %[[VAL_29:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_28]]] : memref +// CHECK: %[[VAL_30:.*]]:2 = scf.for %[[VAL_31:.*]] = %[[VAL_27]] to %[[VAL_29]] step %[[VAL_4]] iter_args(%[[VAL_32:.*]] = %[[VAL_6]], %[[VAL_33:.*]] = %[[VAL_24]]) -> (f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_34:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_31]]] : memref +// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_25]], %[[VAL_7]] : index +// CHECK: %[[VAL_36:.*]] = arith.cmpi eq, %[[VAL_34]], %[[VAL_35]] : index +// CHECK: %[[VAL_37:.*]]:2 = scf.if %[[VAL_36]] -> (f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>) { +// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_31]]] : memref +// CHECK: %[[VAL_39:.*]] = arith.mulf %[[VAL_26]], %[[VAL_38]] : f64 +// CHECK: %[[VAL_40:.*]] = arith.addf %[[VAL_32]], %[[VAL_39]] : f64 +// CHECK: scf.yield %[[VAL_40]], %[[VAL_33]] : f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_32]], %[[VAL_33]] : f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_41:.*]]#0, %[[VAL_41]]#1 : f64, tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: %[[VAL_42:.*]] = sparse_tensor.insert %[[VAL_43:.*]]#0 into %[[VAL_43]]#1{{\[}}%[[VAL_16]], %[[VAL_25]]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: scf.yield %[[VAL_42]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: scf.yield %[[VAL_44:.*]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: } +// CHECK: %[[VAL_45:.*]] = sparse_tensor.load %[[VAL_46:.*]] hasInserts : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_45]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +func.func @mul_affine_sparse2d(%arga: tensor<32x16xf64, 
#CSR>, + %argb: tensor<34x19xf64, #CSR>) -> tensor<32x16xf64, #CSR> { + %argx = bufferization.alloc_tensor() : tensor<32x16xf64, #CSR> + %0 = linalg.generic #trait3 + ins(%arga, %argb: tensor<32x16xf64, #CSR>, tensor<34x19xf64, #CSR>) + outs(%argx: tensor<32x16xf64, #CSR>) { + ^bb(%a: f64, %b: f64, %x: f64): + %0 = arith.mulf %a, %b : f64 + %1 = arith.addf %x, %0 : f64 + linalg.yield %1 : f64 + } -> tensor<32x16xf64, #CSR> + return %0 : tensor<32x16xf64, #CSR> +} + diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir @@ -0,0 +1,115 @@ +// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#CCC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed", "compressed" ] }> + +#CDC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "dense", "compressed" ] + // FIXME: Still inadmissible might need investigation + // dimOrdering = affine_map<(i,j,k) -> (j,k,i)> +}> + +// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f +func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_1d_nwc_wcf(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret 
= linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>, + strides = dense<1> : tensor<1xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_1d_nwc_wcf_CCC(%arg0: tensor, %arg1: tensor) -> tensor { + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c3, %c6, %c1) : tensor + %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>, + strides = dense<1> : tensor<1xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_1d_nwc_wcf_CDC(%arg0: tensor, %arg1: tensor) -> tensor { + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c3, %c6, %c1) : tensor + %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>, + strides = dense<1> : tensor<1xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %in1D_tmp = call @alloc_3d_filled_f32(%c3, %c8, %c1, %val) : (index, index, index, f32) -> (tensor) + %in1D_nwc = tensor.insert %f10 into %in1D_tmp[%c0, %c3, %c0] : tensor + %filter1D_nwc = call @alloc_3d_filled_f32(%c3, %c1, %c1, %val) : (index, index, index, f32) -> (tensor) + %out1D_nwc = call @alloc_3d_filled_f32(%c3, %c6, %c1, %zero) : (index, index, index, f32) -> (tensor) + + %in1D_nwc_CCC = sparse_tensor.convert %in1D_nwc + : tensor to tensor + %filter1D_nwc_CCC = sparse_tensor.convert %filter1D_nwc + : tensor to tensor + + %in1D_nwc_CDC = sparse_tensor.convert %in1D_nwc + : tensor to tensor + 
%filter1D_nwc_CDC = sparse_tensor.convert %filter1D_nwc + : tensor to tensor + + %dense_ret = call @conv_1d_nwc_wcf(%in1D_nwc, %filter1D_nwc, %out1D_nwc) : (tensor, tensor, tensor) -> (tensor) + %CCC_ret = call @conv_1d_nwc_wcf_CCC(%in1D_nwc_CCC, %filter1D_nwc_CCC) : (tensor, tensor) -> (tensor) + %CDC_ret = call @conv_1d_nwc_wcf_CDC(%in1D_nwc_CDC, %filter1D_nwc_CDC) : (tensor, tensor) -> (tensor) + + // CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) ) + %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0], %zero + : tensor, vector<3x6x1xf32> + vector.print %dense_v : vector<3x6x1xf32> + + // CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) ) + %1 = sparse_tensor.convert %CCC_ret + : tensor to tensor + %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero + : tensor, vector<3x6x1xf32> + vector.print %v1 : vector<3x6x1xf32> + + // CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ), + // CHECK-SAME: ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) ) + %2 = sparse_tensor.convert %CDC_ret + : tensor to tensor + %v2 = vector.transfer_read %2[%c0, %c0, %c0], %zero + : tensor, vector<3x6x1xf32> + vector.print %v2 : vector<3x6x1xf32> + + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir @@ -0,0 +1,209 @@ +// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: 
-shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }> +#CSR = #sparse_tensor.encoding<{dimLevelType = ["dense", "compressed"]}> +#CSC = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "compressed" ], + dimOrdering = affine_map<(i,j) -> (j,i)> +}> + +// An example of a 2D convolution with a sparse filter. +module { + + func.func @conv2d(%input: tensor<8x8xi32>, + %filter: tensor<3x3xi32, #DCSR>, + %output: tensor<6x6xi32>) -> tensor<6x6xi32> { + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) + outs (%output: tensor<6x6xi32>) -> tensor<6x6xi32> + return %0 : tensor<6x6xi32> + } + + func.func @conv2d_sparse_out(%input: tensor<8x8xi32>, + %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> { + %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) + outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> + return %0 : tensor<6x6xi32, #DCSR> + } + + func.func @conv2d_all_sparse_DCSR(%input: tensor<8x8xi32, #DCSR>, + %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> { + %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32, #DCSR>, tensor<3x3xi32, #DCSR>) + outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> + return %0 : tensor<6x6xi32, #DCSR> + } + + func.func @conv2d_all_sparse_CSR(%input: tensor<8x8xi32, #CSR>, + %filter: tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR> { + %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSR> + %0 = linalg.conv_2d + ins 
(%input, %filter: tensor<8x8xi32, #CSR>, tensor<3x3xi32, #CSR>) + outs (%s: tensor<6x6xi32, #CSR>) -> tensor<6x6xi32, #CSR> + return %0 : tensor<6x6xi32, #CSR> + } + + func.func @conv2d_all_sparse_CSC(%input: tensor<8x8xi32, #CSC>, + %filter: tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC> { + %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSC> + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32, #CSC>, tensor<3x3xi32, #CSC>) + outs (%s: tensor<6x6xi32, #CSC>) -> tensor<6x6xi32, #CSC> + return %0 : tensor<6x6xi32, #CSC> + } + + func.func @entry() { + %c0 = arith.constant 0 : index + %i0 = arith.constant 0 : i32 + + // A typical edge detection filter. + %filter = arith.constant dense<[ + [ 1, 0, -1 ], + [ 0, 0, 0 ], + [ -1, 0, 1 ] + ]> : tensor<3x3xi32> + %sparse_filter_DCSR = sparse_tensor.convert %filter + : tensor<3x3xi32> to tensor<3x3xi32, #DCSR> + %sparse_filter_CSR = sparse_tensor.convert %filter + : tensor<3x3xi32> to tensor<3x3xi32, #CSR> + %sparse_filter_CSC = sparse_tensor.convert %filter + : tensor<3x3xi32> to tensor<3x3xi32, #CSC> + + + %input = arith.constant dense<[ + [ 1, 2, 3, 4, 0, 6, 7, 8 ], + [ 2, 2, 4, 4, 0, 0, 6, 8 ], + [ 2, 2, 4, 4, 0, 0, 6, 8 ], + [ 2, 2, 3, 4, 0, 0, 7, 8 ], + [ 1, 3, 3, 4, 0, 0, 6, 8 ], + [ 3, 2, 3, 4, 0, 0, 7, 8 ], + [ 1, 3, 3, 4, 3, 6, 6, 8 ], + [ 1, 3, 3, 4, 3, 0, 7, 8 ] + ]> : tensor<8x8xi32> + %sparse_input_DCSR = sparse_tensor.convert %input + : tensor<8x8xi32> to tensor<8x8xi32, #DCSR> + %sparse_input_CSR = sparse_tensor.convert %input + : tensor<8x8xi32> to tensor<8x8xi32, #CSR> + %sparse_input_CSC = sparse_tensor.convert %input + : tensor<8x8xi32> to tensor<8x8xi32, #CSC> + + // Call the kernel. 
+ %output = arith.constant dense<0> : tensor<6x6xi32> + %0 = call @conv2d(%input, %sparse_filter_DCSR, %output) + : (tensor<8x8xi32>, + tensor<3x3xi32, #DCSR>, tensor<6x6xi32>) -> tensor<6x6xi32> + %1 = call @conv2d_sparse_out(%input, %sparse_filter_DCSR) + : (tensor<8x8xi32>, + tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> + %2 = call @conv2d_all_sparse_DCSR(%sparse_input_DCSR, %sparse_filter_DCSR) + : (tensor<8x8xi32, #DCSR>, + tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> + %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %sparse_filter_CSR) + : (tensor<8x8xi32, #CSR>, + tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR> + %4 = call @conv2d_all_sparse_CSC(%sparse_input_CSC, %sparse_filter_CSC) + : (tensor<8x8xi32, #CSC>, + tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC> + + + // Verify the output. + // + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %v = vector.transfer_read %0[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v : vector<6x6xi32> + + // + // Should be the same as dense output + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %sparse_ret = sparse_tensor.convert %1 + : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> + %v1 = vector.transfer_read %sparse_ret[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v1 : vector<6x6xi32> + + // + // Should be the same as dense output + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + 
%all_sparse_DCSR = sparse_tensor.convert %2 + : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> + %v2 = vector.transfer_read %all_sparse_DCSR[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v2 : vector<6x6xi32> + + // + // Should be the same as dense output + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %all_sparse_CSR = sparse_tensor.convert %3 + : tensor<6x6xi32, #CSR> to tensor<6x6xi32> + %v3 = vector.transfer_read %all_sparse_CSR[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v3 : vector<6x6xi32> + + // + // Should be the same as dense output + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %all_sparse_CSC = sparse_tensor.convert %4 + : tensor<6x6xi32, #CSC> to tensor<6x6xi32> + %v4 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v4 : vector<6x6xi32> + + // Release the resources. 
+ bufferization.dealloc_tensor %sparse_filter_DCSR : tensor<3x3xi32, #DCSR> + bufferization.dealloc_tensor %sparse_filter_CSR : tensor<3x3xi32, #CSR> + bufferization.dealloc_tensor %sparse_filter_CSC : tensor<3x3xi32, #CSC> + + bufferization.dealloc_tensor %sparse_input_DCSR : tensor<8x8xi32, #DCSR> + bufferization.dealloc_tensor %sparse_input_CSR : tensor<8x8xi32, #CSR> + bufferization.dealloc_tensor %sparse_input_CSC : tensor<8x8xi32, #CSC> + + bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR> + bufferization.dealloc_tensor %2 : tensor<6x6xi32, #DCSR> + bufferization.dealloc_tensor %3 : tensor<6x6xi32, #CSR> + bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CSC> + return + } +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir @@ -0,0 +1,158 @@ +// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#CCCC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed", "compressed", "compressed" ] +}> + +#CDCD = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "dense", "compressed", "dense" ] +}> + +// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f +func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor + %ret 
= linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nhwc_hwcf(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nhwc_hwcf_CCCC(%arg0: tensor, %arg1: tensor) -> tensor { + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c3, %c6, %c6, %c1) : tensor + %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nhwc_hwcf_CDCD(%arg0: tensor, %arg1: tensor) -> tensor { + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c3, %c6, %c6, %c1) : tensor + %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %filter2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %val) :(index, index, index, index, f32) -> (tensor) + %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c8, %c8, %c3, %val) : (index, index, index, index, f32) -> (tensor) + %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c0, %c3, %c0] : tensor + %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c6, %c6, %c1, %zero) : (index, index, index, index, 
f32) -> (tensor) + + %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + %filter2D_nhwc_CCCC = sparse_tensor.convert %filter2D_nhwc + : tensor to tensor + + %in2D_nhwc_CDCD = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + %filter2D_nhwc_CDCD = sparse_tensor.convert %filter2D_nhwc + : tensor to tensor + + %dense_ret = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor, tensor, tensor) -> (tensor) + %CCCC_ret = call @conv_2d_nhwc_hwcf_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc_CCCC) : (tensor, tensor) -> (tensor) + %CDCD_ret = call @conv_2d_nhwc_hwcf_CDCD(%in2D_nhwc_CDCD, %filter2D_nhwc_CDCD) : (tensor, tensor) -> (tensor) + + // CHECK: ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), 
( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) + %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero + : tensor, vector<3x6x6x1xf32> + vector.print %dense_v : vector<3x6x6x1xf32> + + // CHECK: ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) + %1 = sparse_tensor.convert %CCCC_ret + : tensor to tensor + %v1 = vector.transfer_read %1[%c0, %c0, %c0, %c0], %zero + : tensor, vector<3x6x6x1xf32> + vector.print %v1 : vector<3x6x6x1xf32> + + // CHECK: ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), 
( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) + %2 = sparse_tensor.convert %CDCD_ret + : tensor to tensor + %v2 = vector.transfer_read %2[%c0, %c0, %c0, %c0], %zero + : tensor, vector<3x6x6x1xf32> + vector.print %v2 : vector<3x6x6x1xf32> + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir @@ -0,0 +1,206 @@ +// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// 
RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#CCC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed", "compressed" ] +}> + +#CDC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "dense", "compressed" ] +}> + +// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f +func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_3d + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_CCC(%arg0: tensor, %arg1: tensor) -> tensor { + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor + %ret = linalg.conv_3d + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_CDC(%arg0: tensor, %arg1: tensor) -> tensor { + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor + %ret = linalg.conv_3d + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %filter3D = call @alloc_3d_filled_f32(%c3, %c3, %c3, %val) : (index, index, index, f32) -> (tensor) + %in3D_tmp = call @alloc_3d_filled_f32(%c8, %c8, %c8, %val) : (index, index, index, f32) -> (tensor) + %in3D = tensor.insert %f10 into %in3D_tmp[%c0, %c3, %c0] : tensor + %out3D = call 
@alloc_3d_filled_f32(%c6, %c6, %c6, %zero) : (index, index, index, f32) -> (tensor) + + %in3D_CCC = sparse_tensor.convert %in3D + : tensor to tensor + %filter3D_CCC = sparse_tensor.convert %filter3D + : tensor to tensor + + %in3D_CDC = sparse_tensor.convert %in3D + : tensor to tensor + %filter3D_CDC = sparse_tensor.convert %filter3D + : tensor to tensor + + %dense_ret = call @conv_3d(%in3D, %filter3D, %out3D) : (tensor, tensor, tensor) -> (tensor) + %CCC_ret = call @conv_3d_CCC(%in3D_CCC, %filter3D_CCC) : (tensor, tensor) -> (tensor) + %CDC_ret = call @conv_3d_CDC(%in3D_CDC, %filter3D_CDC) : (tensor, tensor) -> (tensor) + + // CHECK:( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 
108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) + %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0], %zero + : tensor, vector<6x6x6xf32> + vector.print %dense_v : vector<6x6x6xf32> + + // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 
108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) + %1 = sparse_tensor.convert %CCC_ret + : tensor to tensor + %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero + : tensor, vector<6x6x6xf32> + vector.print %v1 : vector<6x6x6xf32> + + // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 
108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) + %2 = sparse_tensor.convert %CDC_ret + : tensor to tensor + %v2 = vector.transfer_read %2[%c0, %c0, %c0], %zero + : tensor, vector<6x6x6xf32> + vector.print %v2 : vector<6x6x6xf32> + + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir @@ -0,0 +1,226 @@ +// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=true | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// RUN: mlir-opt %s --sparse-compiler="enable-runtime-library=false enable-buffer-initialization=true" | \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#CCCCC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed", "compressed", "compressed", "compressed" ] +}> + +#CDCDC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "dense", "compressed", "dense", "compressed"] +}> + +// Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f +func.func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : 
f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4, %s5) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_ndhwc_dhwcf(%arg0: tensor, + %arg1: tensor, + %arg2: tensor) -> tensor { + %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>, + strides = dense<1> : tensor<3xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_ndhwc_dhwcf_CCCCC(%arg0: tensor, + %arg1: tensor) + -> tensor { + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c1, %c6, %c6, %c6, %c1) + : tensor + %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>, + strides = dense<1> : tensor<3xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_ndhwc_dhwcf_CDCDC(%arg0: tensor, + %arg1: tensor) + -> tensor { + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c1, %c6, %c6, %c6, %c1) + : tensor + %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>, + strides = dense<1> : tensor<3xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %in3D_tmp = call @alloc_5d_filled_f32(%c1, %c8, %c8, %c8, %c1, %val) : (index, index, index, index, index, f32) -> (tensor) + %in3D_ndhwc = tensor.insert %f10 into %in3D_tmp[%c0, %c0, %c0, %c3, %c0] : tensor + + %filter3D_ndhwc = call @alloc_5d_filled_f32(%c3, %c3, %c3, %c1, %c1, %val) : (index, index, index, index, index, f32) -> 
(tensor) + %out3D_ndhwc = call @alloc_5d_filled_f32(%c1, %c6, %c6, %c6, %c1, %zero) : (index, index, index, index, index, f32) -> (tensor) + + %in3D_ndhwc_CCCCC = sparse_tensor.convert %in3D_ndhwc + : tensor to tensor + %filter3D_ndhwc_CCCCC = sparse_tensor.convert %filter3D_ndhwc + : tensor to tensor + + %in3D_ndhwc_CDCDC = sparse_tensor.convert %in3D_ndhwc + : tensor to tensor + %filter3D_ndhwc_CDCDC = sparse_tensor.convert %filter3D_ndhwc + : tensor to tensor + + // CHECK:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( 
( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) ) + %dense_ret = call @conv_3d_ndhwc_dhwcf(%in3D_ndhwc, %filter3D_ndhwc, %out3D_ndhwc) + : (tensor, tensor, tensor) -> (tensor) + %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0, %c0], %zero + : tensor, vector<1x6x6x6x1xf32> + vector.print %dense_v : vector<1x6x6x6x1xf32> + + %CCCCC_ret = call @conv_3d_ndhwc_dhwcf_CCCCC(%in3D_ndhwc_CCCCC, %filter3D_ndhwc_CCCCC) + : (tensor, + tensor) -> (tensor) + + // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // 
CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 
), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) ) + %1 = sparse_tensor.convert %CCCCC_ret + : tensor to tensor + %v1 = vector.transfer_read %1[%c0, %c0, %c0, %c0, %c0], %zero + : tensor, vector<1x6x6x6x1xf32> + vector.print %v1 : vector<1x6x6x6x1xf32> + + %CDCDC_ret = call @conv_3d_ndhwc_dhwcf_CDCDC(%in3D_ndhwc_CDCDC, %filter3D_ndhwc_CDCDC) + : (tensor, + tensor) -> (tensor) + + // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 
), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ), + // CHECK-SAME: ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ), + // CHECK-SAME: ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) ) + %2 = sparse_tensor.convert %CDCDC_ret + : tensor to tensor + %v2 = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0, %c0], %zero + : tensor, vector<1x6x6x6x1xf32> + vector.print %v2 : vector<1x6x6x6x1xf32> + + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir deleted file mode 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir +++ /dev/null @@ -1,95 +0,0 @@ -// RUN: mlir-opt %s --sparse-compiler | \ -// RUN: mlir-cpu-runner -e 
entry -entry-point-result=void \ -// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ -// RUN: FileCheck %s - -#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }> - -// An example of a 2D convolution with a sparse filter. -module { - - func.func @conv2d(%input: tensor<8x8xi32>, - %filter: tensor<3x3xi32, #DCSR>, - %output: tensor<6x6xi32>) -> tensor<6x6xi32> { - %0 = linalg.conv_2d - ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) - outs (%output: tensor<6x6xi32>) -> tensor<6x6xi32> - return %0 : tensor<6x6xi32> - } - - func.func @conv2d_sparse_out(%input: tensor<8x8xi32>, - %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> { - %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> - %0 = linalg.conv_2d - ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) - outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> - return %0 : tensor<6x6xi32, #DCSR> - } - - func.func @entry() { - %c0 = arith.constant 0 : index - %i0 = arith.constant 0 : i32 - - // A typical edge detection filter. - %filter = arith.constant dense<[ - [ 1, 0, -1 ], - [ 0, 0, 0 ], - [ -1, 0, 1 ] - ]> : tensor<3x3xi32> - %sparse_filter = sparse_tensor.convert %filter - : tensor<3x3xi32> to tensor<3x3xi32, #DCSR> - - %input = arith.constant dense<[ - [ 1, 2, 3, 4, 0, 6, 7, 8 ], - [ 2, 2, 4, 4, 0, 0, 6, 8 ], - [ 2, 2, 4, 4, 0, 0, 6, 8 ], - [ 2, 2, 3, 4, 0, 0, 7, 8 ], - [ 1, 3, 3, 4, 0, 0, 6, 8 ], - [ 3, 2, 3, 4, 0, 0, 7, 8 ], - [ 1, 3, 3, 4, 3, 6, 6, 8 ], - [ 1, 3, 3, 4, 3, 0, 7, 8 ] - ]> : tensor<8x8xi32> - - // Call the kernel. - %output = arith.constant dense<0> : tensor<6x6xi32> - %0 = call @conv2d(%input, %sparse_filter, %output) - : (tensor<8x8xi32>, - tensor<3x3xi32, #DCSR>, tensor<6x6xi32>) -> tensor<6x6xi32> - %1 = call @conv2d_sparse_out(%input, %sparse_filter) - : (tensor<8x8xi32>, - tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> - - // Verify the output. 
- // - // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), - // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), - // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), - // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), - // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), - // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) - // - %v = vector.transfer_read %0[%c0, %c0], %i0 - : tensor<6x6xi32>, vector<6x6xi32> - vector.print %v : vector<6x6xi32> - - // - // Should be the same as dense output - // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), - // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), - // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), - // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), - // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), - // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) - // - %sparse_ret = sparse_tensor.convert %1 - : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> - %v1 = vector.transfer_read %sparse_ret[%c0, %c0], %i0 - : tensor<6x6xi32>, vector<6x6xi32> - vector.print %v1 : vector<6x6xi32> - - // Release the resources. - bufferization.dealloc_tensor %sparse_filter : tensor<3x3xi32, #DCSR> - bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR> - return - } -} diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp --- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp +++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp @@ -127,7 +127,7 @@ protected: MergerTestBase(unsigned numTensors, unsigned numLoops) : numTensors(numTensors), numLoops(numLoops), - merger(numTensors, numLoops) {} + merger(numTensors, numLoops, /*numFilterLoops=*/0) {} /// /// Expression construction helpers.