diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h --- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h @@ -409,6 +409,9 @@ lvlTypes[t][i] = dlt; loopToLvl[t][i] = lvl; lvlToLoop[t][lvl] = i; + // Maybe we should favor a constant loop bound when there are multiple + // choices. + loopBounds[i] = std::make_pair(t, lvl); } using ForeachTensorLoopIdCallback = function_ref . - void setLoopDependentTensorLevel(LoopId i, TensorId t, Level lvl) { - assert(isValidLoopId(i) && isValidLevel(t, lvl)); - loopToDependencies[i][t] = lvl; - levelToDependentIdx[t][lvl].push_back(i); + void setLoopDependentTensorLevel(LoopId i, TensorId t, Level lvl, + DimLevelType dlt) { + assert(isValidLoopId(i) && isValidLevel(t, lvl) && + lvl < levelToDependentLoop[t].size()); + // Must be the first time we define it. + assert(!loopToDependencies[i][t].has_value()); + loopToDependencies[i][t] = std::make_pair(lvl, dlt); + levelToDependentLoop[t][lvl].push_back(i); } /// Whether the loop has dependent slice. @@ -464,7 +471,7 @@ /// expression on t_l, e.g., A[i+j] => {i, j} std::vector &getDependentLoops(TensorId t, Level lvl) { assert(isValidLevel(t, lvl)); - return levelToDependentIdx[t][lvl]; + return levelToDependentLoop[t][lvl]; } /// Returns the defining [tid, lvl] for the loop. @@ -482,6 +489,26 @@ return loopToDependencies[i][t].has_value(); } + Level getLoopDependentLevel(TensorLoopId b) const { + assert(isLvlWithNonTrivialIdxExp(b)); + return loopToDependencies[loop(b)][tensor(b)]->first; + } + + DimLevelType getLoopDependentLevelType(TensorLoopId b) const { + assert(isLvlWithNonTrivialIdxExp(b)); + return loopToDependencies[loop(b)][tensor(b)]->second; + } + + /// Checks whether the TensorLoopId represents a tensor level with + /// non-trivial index expression on it. + bool isSparseLvlWithNonTrivialIdxExp(TensorLoopId b) const { + if (isLvlWithNonTrivialIdxExp(b)) { + auto dlt = getLoopDependentLevelType(b); + return isCompressedDLT(dlt) || isSingletonDLT(dlt); + } + return false; + } + /// Convenience getters to immediately access the stored nodes. /// These methods return `const&` because the underlying objects must /// not be mutated by client code. The only exception is for mutating @@ -626,13 +653,15 @@ // Map from a loop to its dependencies if any. // The dependencies of a loop is a set of (tensor, level) pairs. // It is currently only set for non-trivial index expressions. - // E.g., A[i+j] => i and j will have dependencies {A0} to indicate that - // i and j are used in the non-trivial index expression on A0. - std::vector>> loopToDependencies; + // E.g., A[i+j] => i and j will have dependencies {A0, dlt(A0)} to indicate + // that i and j are used in the non-trivial index expression on A0. + std::vector>>> + loopToDependencies; + // The inverse map of ldxToDependencies from tensor level -> dependent loop // E.g., A[i+j], we have A0 => {i, j}, to indicate that A0 uses both {i, j} // to compute its indices. - std::vector>> levelToDependentIdx; + std::vector>> levelToDependentLoop; // Map from a loop to the [tid, lvl] pair that defines the loop boundary. std::vector> loopBounds; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h @@ -82,6 +82,8 @@ // d0 and d1 (for affine expression reduction). // If the list is empty, it means that there is no affine expression on the // input [tid, dim]. + // NOTE: the order of the returned list should be consistent with the + // topological order of the iteration graph. using DependentLvlGetter = function_ref>(TensorId, Level)>; @@ -133,10 +135,7 @@ ArrayRef tids, ArrayRef lvls); /// Exits the current loop sequence, this will reset universal index to 0. - void exitCurrentLoopSeq() { - assert(loopSeqStack.size() == loopStack.size() + 1); - loopSeqStack.pop_back(); - } + void exitCurrentLoopSeq(OpBuilder &builder, Location loc); // TODO: Get rid of `lvls` in the argument list? Track the level we // are currently at internally. Then it would be enterNextLvlForTensor. @@ -208,10 +207,14 @@ } private: - struct LoopInfo { - LoopInfo(ArrayRef tids, ArrayRef lvls, Operation *loop, - Block *userBlock, Value iv, StringAttr loopTag) - : tids(tids), lvls(lvls), loop(loop), userCodeBlock(userBlock), iv(iv) { + struct LoopInfo final { + LoopInfo(ArrayRef tids, ArrayRef lvls, + ArrayRef slicedTids, ArrayRef slicedLvls, + ArrayRef sliceReduced, Operation *loop, Block *userBlock, + Value iv, StringAttr loopTag) + : tids(tids), lvls(lvls), slicedTids(slicedTids), + slicedLvls(slicedLvls), sliceReduced(sliceReduced), loop(loop), + userCodeBlock(userBlock), iv(iv) { // Attached a special tag to loop emitter generated loop. if (loopTag) loop->setAttr(LoopEmitter::getLoopEmitterLoopAttrName(), loopTag); @@ -222,12 +225,39 @@ const llvm::SmallVector tids; // The corresponding levels for the tensors const llvm::SmallVector lvls; + // The set of tensors for slice-driven loop conditions. + const llvm::SmallVector slicedTids; + // The corresponding level for slice-driven tensors. + const llvm::SmallVector slicedLvls; + // Whether the tensor is fully reduced (e.g., i + j => j). + const llvm::SmallVector sliceReduced; const Operation *loop; // the loop operation Block *const userCodeBlock; // the block holding users' generated code. const Value iv; // the induction variable for the loop }; - /// Linearizes address for dense level (i.e., p = (i * d0) + j). + struct SliceInfo final { + // Note that we do not need to create a actual sparse tensor slice but + // instead only need to maintain the metadata of the slice. + SliceInfo(Value minCrd, Value offset, Value isNonEmpty, + std::optional slicedOnLvl, unsigned depth) + : minCrd(minCrd), offset(offset), isNonEmpty(isNonEmpty), + slicedOnLvl(slicedOnLvl), depth(depth) { + // TODO: use std::optional> + assert(!slicedOnLvl || minCrd); + } + + // Whether this is the tensor that has not yet been sliced. + bool isInitialTensor() const { return !slicedOnLvl.has_value(); } + + Value minCrd; // the minimal coordinate of the slice. + Value offset; // the offset of the current slice. + Value isNonEmpty; // whether the slice is empty. + std::optional slicedOnLvl; // the level on which the slice is done + unsigned depth; // the depth (relative to dependentDimMap[tid][lvl]). + }; + + /// Linearizes address for dense dimension (i.e., p = (i * d0) + j). Value genAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl, Value iv); @@ -281,6 +311,11 @@ ArrayRef tids, ArrayRef lvls); + Operation *emitForLoopOverTensorAtLvl(OpBuilder &builder, Location loc, + TensorId tid, Level lvl, + MutableArrayRef reduc, + bool isParallel); + /// Exits a for loop, returns the reduction results, e.g., /// For sequential for loops: /// %ret = for () { @@ -344,6 +379,78 @@ return {dstLvl}; } + // + // Slice-driven loop related methods. + // + + /// Retrieves the most recent slices on lvl. To reduce affine expression like + /// d0 + d1 + d2, we need two slices (one of size d1 + d2, and the other of + /// size d2). This methods returns the latter slice (of size d2), which is + /// also the final slice on the level. + SliceInfo &getFinalSliceOnLvl(TensorId tid, Level lvl); + + /// Get the total number of constraints needed to fully *resolve* + /// dependent levels on tensor[tid]. + size_t remDepOnLevel(TensorId tid, Level lvl) const; + + /// Whether the tid, lvl is fully *reduced*, i.e., the non-trivial index + /// expression has been reduced to a trivial one. + /// E.g., A[i+j] => A[2 + i] (j is reduced) + bool depFullyReduced(TensorId tid, Level lvl) const { + return remDepOnLevel(tid, lvl) == 1; + } + + /// Whether the tid, lvl is fully resolved, i.e., we entered the level already + /// (the index on that level is determined). + /// E.g., A[i+j] => A[2 + 3] (both i and j become invariants for inner loops). + bool lvlFullyResolved(TensorId tid, Level lvl) const { + return remDepOnLevel(tid, lvl) == 0; + } + + /// Generates a whileOp to iterate over a subset of coordinates on tid on lvl + /// using the pHi and pLo provided, the loop break on the first coordinate + /// that exceeds the slice boundary (i.e., coord >= slice.offset + + /// slice.size). + std::pair + genSliceLvlTraverseLoop(OpBuilder &builder, Location loc, Value pLo, + Value pHi, Value offset, TensorId tid, Level lvl, + size_t depth, ValueRange userReduc, bool genYield, + /*bodyBuilder=*/ + llvm::function_ref)>); + + /// Generates a nested loop that iterates over tid on all the coordinates on + /// lvl. + ValueRange genSliceAllLvlTraverseLoop( + OpBuilder &builder, Location loc, Value offset, TensorId tid, Level lvl, + size_t depth, ValueRange userReduc, + /*bodyBody=*/ + llvm::function_ref)>); + + /// Generates code to get the first non-empty slice of tid on lvl. + /// return true if has already been resolved. + bool genSliceBegin(OpBuilder &builder, Location loc, TensorId tid, Level lvl); + + /// Generates code to get the next non-empty slices of tid on lvl. + void genSliceNextInduction(OpBuilder &builder, Location loc, + const Operation *whileOp, TensorId tid, Level lvl, + SmallVectorImpl &operands, + unsigned &retIdx); + + /// Generates a slice-driven while loop like follows. + /// + /// curSlice = getFirstNonEmptySlice(tensor). + /// + /// while(isNonEmpty) { + /// ..user code.. + /// isNonEmpty, curSlice = getNextNonEmptySlice(curSlice) + /// } + Operation *emitSliceDrivenLoopOverTensorAtLvl(OpBuilder &builder, + Location loc, TensorId tid, + Level lvl, + MutableArrayRef reduc); + /// A optional string attribute that should be attached to the loop /// generated by loop emitter, it might help following passes to identify /// loops that operates on sparse tensors more easily. @@ -353,6 +460,9 @@ bool hasOutput; bool isSparseOut; + /// The insertion point to allocate top level local + Operation *localInsertPos; + // // Fields which have `numTensor` many entries. // @@ -388,6 +498,10 @@ std::vector> coordinatesBuffers; // to_coordinates std::vector valBuffer; // to_value + // + // Slice-driven loops related fields. + // + /// Whether the sparse input is a slice. std::vector isSparseSlices; /// Values related to slices. @@ -399,6 +513,21 @@ std::vector>>> dependentLvlMap; + // The cached position buffer for the slices, they serve the same purpose as + // ptrBuffer for compressed dimensions. + // But they always starts with the first pidx pointing to coord > slice.offset + // to avoid iteration from the beginning. + std::vector>> slicePosBuffer; + + // The cached size for each slices. + std::vector>> sliceSizes; + + // The number of reduced dependencies on a tensor level so far. + std::vector> levelReducedDep; + + // sliceStack[tid] holds the generated slice stack on tid. + std::vector> sliceStack; + // // View based reshape related-fields and methods // @@ -419,9 +548,11 @@ /// alive. std::vector loopStack; - /// Loop Sequence Stack, stores the universal index for the current loop - /// sequence. - std::vector loopSeqStack; + // Loop Sequence Stack, stores the unversial index for the current loop + // sequence. and a list of tids which was taken sliced. + // TODO: maybe we should have a LoopSeqInfo + std::vector>>> + loopSeqStack; /// Maps `LoopId` (used by `AffineDimExpr`) to `LoopOrd` (in the `loopStack`). /// TODO: We should probably use a callback function here to make it more diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp @@ -25,9 +25,15 @@ // File local helper functions. //===----------------------------------------------------------------------===// -/// Generates a position/coordinate load from the sparse storage scheme. -/// Narrower data types need to be zero extended before casting the -/// value into the `Index` type used for looping and indexing. +#define CMPI(p, l, r) \ + (builder.create(loc, arith::CmpIPredicate::p, l, r) \ + .getResult()) + +#define C_IDX(v) (constantIndex(builder, loc, v)) + +/// Generates a pointer/index load from the sparse storage scheme. Narrower +/// data types need to be zero extended before casting the value into the +/// index type used for looping and indexing. static Value genIndexLoad(OpBuilder &builder, Location loc, Value mem, Value s) { // For the scalar case, we simply zero extend narrower indices into 64-bit @@ -102,21 +108,18 @@ // First, coord >= offset (skip the check if offset is known to be 0). if (auto staticOffset = enc.getStaticLvlSliceOffset(lvl); !(staticOffset.has_value() && *staticOffset == 0)) { - auto geOffset = builder.create( - loc, arith::CmpIPredicate::uge, crd, offset); + auto geOffset = CMPI(uge, crd, offset); conds.push_back(geOffset); } // Second, coord_in_slice < length - auto ltLength = builder.create(loc, arith::CmpIPredicate::ult, - newCrd, lvlSizes[tid][lvl]); + auto ltLength = CMPI(ult, newCrd, lvlSizes[tid][lvl]); conds.push_back(ltLength); // Third, rem == 0 (skip the check if stride is known to be 1). if (auto staticStride = enc.getStaticLvlSliceStride(lvl); !(staticStride.has_value() && *staticStride == 1)) { - auto fitStride = builder.create( - loc, arith::CmpIPredicate::eq, crdRem, constantIndex(builder, loc, 0)); + auto fitStride = CMPI(eq, crdRem, C_IDX(0)); conds.push_back(fitStride); } @@ -134,7 +137,7 @@ Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl, Value crd) { - Value pos = lvl == 0 ? constantIndex(builder, loc, 0) : posits[tid][lvl - 1]; + Value pos = lvl == 0 ? C_IDX(0) : posits[tid][lvl - 1]; Value mul = builder.create(loc, highs[tid][lvl], pos); if (isSparseSlices[tid]) crd = toSliceCrd(builder, loc, crd, sliceOffsets[tid][lvl], @@ -177,8 +180,7 @@ /*afterBuilder=*/ [](OpBuilder &builder, Location loc, ValueRange ivs) { // pos ++ - Value nextPos = builder.create( - loc, ivs[0], constantIndex(builder, loc, 1)); + Value nextPos = builder.create(loc, ivs[0], C_IDX(1)); builder.create(loc, nextPos); }); // Return the segment high. @@ -187,7 +189,7 @@ Value LoopEmitter::genSparseCrd(OpBuilder &builder, Location loc, TensorId tid, Level dstLvl) { - Value crd = constantIndex(builder, loc, 0); + Value crd = C_IDX(0); const auto reassoc = getCollapseReassociation(tid, dstLvl); const unsigned reassocSize = reassoc.size(); for (unsigned i = 0; i < reassocSize; i++) { @@ -246,6 +248,11 @@ this->dependentLvlMap.assign( numTensors, std::vector>>()); + this->slicePosBuffer.assign(numTensors, std::vector>()); + this->sliceSizes.assign(numTensors, std::vector>()); + this->sliceStack.assign(numTensors, std::vector()); + + this->levelReducedDep.assign(numTensors, std::vector()); // Initialize nested types of `TensorId`-indexed fields. for (TensorId tid = 0; tid < numTensors; tid++) { @@ -288,16 +295,30 @@ coordinatesBuffers[tid].assign(lvlRank, Value()); sliceOffsets[tid].assign(lvlRank, Value()); sliceStrides[tid].assign(lvlRank, Value()); + + // Slice-driven loops related initialization. + levelReducedDep[tid].assign(lvlRank, 0); dependentLvlMap[tid].assign(lvlRank, std::vector>()); + slicePosBuffer[tid].assign(lvlRank, std::vector()); + sliceSizes[tid].assign(lvlRank, std::vector()); + sliceStack[tid].emplace_back(/*minCrd=*/Value(), + /*offset=*/Value(), /*isNonEmpty*/ Value(), + std::nullopt, 0); if (dimGetter) { auto reassoc = collapseReassoc[tid]; Level dstRank = reassoc ? reassoc.size() : lvlRank; for (Level l = 0; l < dstRank; l++) { dependentLvlMap[tid][l] = dimGetter(tid, l); + unsigned depends = dependentLvlMap[tid][l].size(); + if (depends == 0) + continue; // TODO: View-base collapse and dependent index reduction are not // compatible right now. - assert(!reassoc || dependentLvlMap[tid][l].empty()); + assert(!reassoc); + // We need depends - 1 slices to fully resolve the affine expression. + sliceSizes[tid][l].assign(depends - 1, nullptr); + slicePosBuffer[tid][l].assign(depends - 1, nullptr); } } } @@ -398,6 +419,31 @@ // some loop preparation from tensor iteration, but will also (undesirably) // hoist the code ouside if-conditions. } + + Type indexType = builder.getIndexType(); + Value c0 = constantZero(builder, loc, indexType); + for (TensorId t = 0, e = tensors.size(); t < e; t++) { + auto rtp = tensors[t].getType().dyn_cast(); + if (!rtp) + continue; + + Level lvlRank = SparseTensorType(rtp).getLvlRank(); + for (Level lvl = 0; lvl < lvlRank; lvl++) { + if (!dependentLvlMap[t][lvl].empty()) { + // Needs at least two operands to form a non-trivial affine expression. + ArrayRef> depLvls = dependentLvlMap[t][lvl]; + assert(depLvls.size() > 1); + + Value size = c0; + for (unsigned e = depLvls.size() - 1; e >= 1; e--) { + auto [dt, dd] = depLvls[e]; + size = builder.create(loc, size, lvlSizes[dt][dd]); + sliceSizes[t][lvl][e - 1] = size; + } + } + } + } + localInsertPos = builder.getInsertionPoint()->getPrevNode(); } void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc, @@ -405,12 +451,47 @@ ArrayRef lvls) { // TODO: sort assert(loopSeqStack.size() == loopStack.size()); - // Universal Index starts from 0. - loopSeqStack.emplace_back(constantIndex(builder, loc, 0)); // Prepares for all the tensors used in the current loop sequence. - assert(tids.size() == lvls.size()); - for (auto [tid, lvl] : llvm::zip(tids, lvls)) - prepareLoopOverTensorAtLvl(builder, loc, tid, lvl); + std::vector> slicedTids; + for (auto [tid, lvl] : llvm::zip(tids, lvls)) { + if (!dependentLvlMap[tid][lvl].empty()) { + bool fullyRed = genSliceBegin(builder, loc, tid, lvl); + slicedTids.emplace_back(tid, lvl, fullyRed); + } else { + prepareLoopOverTensorAtLvl(builder, loc, tid, lvl); + } + } + + // Universal Index starts from 0. + loopSeqStack.emplace_back(C_IDX(0), std::move(slicedTids)); +} + +void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) { + assert(loopSeqStack.size() == loopStack.size() + 1); + + const auto &slicedTids = loopSeqStack.back().second; + + // Pop out outdated slices. + for (auto [tid, lvl, res] : slicedTids) { + if (!res) { + assert(sliceStack[tid].back().slicedOnLvl == lvl); + sliceStack[tid].pop_back(); + // There is an additional item in sliceStack for the input tensor. + // assert(sliceResolvedConstraints[tid] + 1 == sliceStack[tid].size()); + } else { + Value c1 = C_IDX(1); + Value c2 = C_IDX(2); + + // pIdx += 2, we finished the current lvl, advance the pointer index of + // the previous level by two to skip the [pLo, pHi] for current level. + // TODO: we could probably use an SSA value for it. + Value sPtrBuf = slicePosBuffer[tid][lvl].back(); + Value curP = genIndexLoad(builder, loc, sPtrBuf, c1); + Value nexP = builder.create(loc, curP, c2); + builder.create(loc, nexP, sPtrBuf, c1); + } + } + loopSeqStack.pop_back(); } Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) { @@ -438,51 +519,30 @@ } case AffineExprKind::Constant: { int64_t c = a.cast().getValue(); - return constantIndex(builder, loc, c); + return C_IDX(c); } default: llvm_unreachable("unexpected affine subscript"); } } -Operation *LoopEmitter::enterLoopOverTensorAtLvl( - OpBuilder &builder, Location loc, ArrayRef tids, - ArrayRef lvls, MutableArrayRef reduc, bool isParallel) { - // TODO: support multiple return on parallel for? - assert(!isParallel || reduc.size() <= 1); - bool isSparseInput = false; - TensorId tid = tids.front(); - Level dstLvl = lvls.front(); - assert(tids.size() == lvls.size()); - for (auto [t, l] : llvm::zip(tids, lvls)) { - // TODO: this check for validity of the (t,l) pairs should be - // checked/enforced at the callsites, if possible. - assert(isValidLevel(t, l)); - assert(!coords[t][l]); // We cannot re-enter the same level - const auto lvlTp = lvlTypes[t][l]; - const bool isSparse = isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp); - // Must be a recognizable level-type. - assert(isSparse || isDenseDLT(lvlTp)); - // We can at most have one sparse input, otherwise, a while loop is required - // to co-iterate multiple sparse tensors. - assert(!isSparseInput || !isSparse); - if (isSparse) { - tid = t; - dstLvl = l; - } - isSparseInput = isSparseInput || isSparse; - } +Operation *LoopEmitter::emitForLoopOverTensorAtLvl(OpBuilder &builder, + Location loc, TensorId tid, + Level dstLvl, + MutableArrayRef reduc, + bool isParallel) { + bool isSparseCond = isCompressedDLT(lvlTypes[tid][dstLvl]) || + isSingletonDLT(lvlTypes[tid][dstLvl]); const auto reassoc = getCollapseReassociation(tid, dstLvl); // TODO: support dynamic slices. - // Use the first source-level here to build the loop bound (which is - // also the biggest range). + // Uses the first dimension here to build the loop bound (which is also the + // biggest range). const Level srcLvl = reassoc.front(); - const Value step = constantIndex(builder, loc, 1); - /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header. - const Value lo = isSparseInput ? posits[tid][srcLvl] // current position - : loopSeqStack.back(); // universal index - const Value hi = highs[tid][srcLvl]; + Value step = C_IDX(1); + Value lo = isSparseCond ? posits[tid][srcLvl] // current offset + : loopSeqStack.back().first; // universal index + Value hi = highs[tid][srcLvl]; Operation *loop = nullptr; Value iv; @@ -518,7 +578,7 @@ assert(loop && iv); Value crd; - if (isSparseInput) { + if (isSparseCond) { assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType())); // For COO, the position is the same across consecutive levels. /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header. @@ -530,7 +590,7 @@ crd = iv; } - if (isSparseSlices[tid] && isSparseInput) { + if (isSparseSlices[tid] && isSparseCond) { // For sparse level slices, we need to filter out invalid coordinates that // are not included in the slice. SmallVector types; @@ -559,15 +619,108 @@ } assert(crd); - coords[tid][srcLvl] = crd; - // NOTE: we can also prepare for next level here in advance - // Push the loop into stack - loopStack.emplace_back(ArrayRef(tid), ArrayRef(srcLvl), loop, - builder.getInsertionBlock(), crd, loopTag); + coords[tid][dstLvl] = crd; + return loop; +} + +Operation *LoopEmitter::enterLoopOverTensorAtLvl( + OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef lvls, MutableArrayRef reduc, bool isParallel) { + // TODO: support multiple return on parallel for? + assert(!isParallel || reduc.size() <= 1); + bool isSparseCond = false, isSliceCond = false; + size_t tid = tids.front(), lvl = lvls.front(); + + for (auto [t, l] : llvm::zip(tids, lvls)) { + assert(lvlTypes[t].size() > l); // Must be a valid tid, dim pair + assert(!coords[t][l] || // We cannot re-enter the same level + !dependentLvlMap[t][l].empty()); // unless it is a slice-driver loop + auto dimType = lvlTypes[t][l]; + // Must be a recognizable DLT. + assert(isDenseDLT(dimType) || isCompressedDLT(dimType) || + isSingletonDLT(dimType)); + + // This is a slice-driven loop. + if (!dependentLvlMap[t][l].empty()) { + assert(!isSliceCond && !isSparseCond); + isSliceCond = true; + tid = t; + lvl = l; + continue; + } + + bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType); + // We can at most have one sparse input, otherwise, a while loop is + // required to co-iterate multiple sparse tensors. + assert(!isSparseCond || !isSparse); + assert(!isSliceCond || !isSparseCond); + if (isSparse) { + tid = t; + lvl = l; + } + isSparseCond = isSparseCond || isSparse; + } + + // if the slice is fully reduced, we can now use TACO-based algorithm to + // iterate it. + Operation *l = nullptr; + if (isSliceCond) { + bool fullyReduced = depFullyReduced(tid, lvl); + if (!fullyReduced) { + l = emitSliceDrivenLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc); + } else { + const SliceInfo &info = getFinalSliceOnLvl(tid, lvl); + Value offset = info.offset; + unsigned depth = info.depth - 1; + Operation *insertPoint = nullptr; + // TODO: we should generalize the method to support iteration over for + // normal slices as well to allow early break. + l = genSliceLvlTraverseLoop( + builder, loc, posits[tid][lvl], highs[tid][lvl], offset, tid, lvl, + depth, reduc, + /*genYield=*/false, // unaware of the yield values from user yet + [this, tid, lvl, reduc, offset, + &insertPoint](OpBuilder &builder, Location loc, Value iv, + MutableArrayRef innerReduc) { + assert(innerReduc.size() == reduc.size()); + // Updates users' reduction variable inplace + for (unsigned i = 0, e = reduc.size(); i < e; i++) + reduc[i] = innerReduc[i]; + // Loads the coordinates. + Value absC = genIndexLoad(builder, loc, + coordinatesBuffers[tid][lvl], iv); + + // We need to substract the offset to get relative coordinates. + // TODO: how to assert relC >=0 during runtime? + insertPoint = builder.create(loc, absC, offset); + posits[tid][lvl] = iv; + coords[tid][lvl] = insertPoint->getResult(0); + }) + .first; + // We did not finish the loop body, reset the insertion point and delegate + // to user. + builder.setInsertionPointAfter(insertPoint); + } + levelReducedDep[tid][lvl]++; + // NOTE: we can also prepare for next dim here in advance + // Pushes the loop into stack. + loopStack.emplace_back( + ArrayRef(), ArrayRef(), ArrayRef(tid), + ArrayRef(lvl), ArrayRef(fullyReduced), l, + builder.getInsertionBlock(), coords[tid][lvl], loopTag); + } else { + l = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc, isParallel); + // NOTE: we can also prepare for next dim here in advance + // Pushes the loop into stack. + loopStack.emplace_back(ArrayRef(tid), ArrayRef(lvl), + ArrayRef(), ArrayRef(), + ArrayRef(), l, builder.getInsertionBlock(), + coords[tid][lvl], loopTag); + } + // Emit extra locals. emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tids, lvls); - - return loop; + return l; } Operation *LoopEmitter::enterFilterLoopOverTensorAtLvl( @@ -582,7 +735,7 @@ // break when exceeding (for ordered levels). // TODO: There are many other potiential opportunities that we might apply in // the future. E.g., we could use binary search to locate positions. - const Value step = constantIndex(builder, loc, 1); + const Value step = C_IDX(1); const Value pLo = posits[tid][lvl]; const Value pHi = highs[tid][lvl]; scf::ForOp forOp = builder.create(loc, pLo, pHi, step, reduc); @@ -604,8 +757,7 @@ // Generate an if-condition to filter out coordinates that are not // equal to the result of the affine expression. Value expected = genAffine(builder, loc, affine); - auto pred = builder.create(loc, arith::CmpIPredicate::eq, crd, - expected); + auto pred = CMPI(eq, coords[tid][lvl], expected); SmallVector types; for (Value red : reduc) { types.push_back(red.getType()); @@ -631,8 +783,10 @@ // NOTE: we can also prepare for next lvl here in advance // Push the loop into stack - loopStack.emplace_back(ArrayRef(tid), ArrayRef(lvl), forOp, - builder.getInsertionBlock(), crd, nullptr); + loopStack.emplace_back(ArrayRef(tid), ArrayRef(lvl), + ArrayRef(), ArrayRef(), + ArrayRef(), forOp, builder.getInsertionBlock(), + coords[tid][lvl], nullptr); return forOp; } @@ -648,20 +802,24 @@ Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls( OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef lvls, bool needsUniv, MutableArrayRef reduc) { + // NOTE: make sure that the slice driven tensor-related reduction variable + // appears first than normal tensors. assert(tids.size() == lvls.size()); SmallVector types; SmallVector operands; // Construct the while-loop with a parameter for each coordinate. const Type indexType = builder.getIndexType(); for (auto [tid, lvl] : llvm::zip(tids, lvls)) { + // TODO: support coiteration with slice driven tensors. const auto lvlTp = lvlTypes[tid][lvl]; + assert(dependentLvlMap[tid][lvl].empty() && "TODO: not yet implemented"); if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) { const auto reassoc = getCollapseReassociation(tid, lvl); for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) { if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) { // This is the segment high for each non-unique levels. types.push_back(indexType); - operands.push_back(constantIndex(builder, loc, 0)); + operands.push_back(C_IDX(0)); } } const auto pos = posits[tid][reassoc.front()]; @@ -678,7 +836,7 @@ if (needsUniv) { types.push_back(indexType); // Update universal index. - operands.push_back(loopSeqStack.back()); + operands.push_back(loopSeqStack.back().first); } assert(types.size() == operands.size()); scf::WhileOp whileOp = builder.create(loc, types, operands); @@ -707,8 +865,7 @@ Value op1 = before->getArgument(o); // We used the first level bound as the bound the collapsed set of levels. Value op2 = highs[tid][reassoc.front()]; - Value opc = builder.create(loc, arith::CmpIPredicate::ult, - op1, op2); + Value opc = CMPI(ult, op1, op2); cond = cond ? builder.create(loc, cond, opc) : opc; // Update positions Value pos = after->getArgument(o++); @@ -752,8 +909,7 @@ // // This "idx" is the index into `llvm::zip(tids, lvls)` for (auto [pred, idx] : slicesPreds) { - Value nextPos = builder.create( - loc, yields[idx], constantIndex(builder, loc, 1)); + Value nextPos = builder.create(loc, yields[idx], C_IDX(1)); yields[idx] = builder.create(loc, pred, yields[idx], nextPos); } @@ -783,9 +939,9 @@ if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) { const auto crd = coords[tid][lvl]; if (min) { - Value cmp = builder.create( - loc, arith::CmpIPredicate::ult, crd, min); - min = builder.create(loc, cmp, crd, min); + Value cmp = CMPI(ult, coords[tid][lvl], min); + min = + builder.create(loc, cmp, coords[tid][lvl], min); } else { min = crd; } @@ -798,8 +954,9 @@ } // Sets up the loop stack. - loopStack.emplace_back(tids, lvls, whileOp, builder.getInsertionBlock(), min, - loopTag); + loopStack.emplace_back(tids, lvls, ArrayRef(), ArrayRef(), + ArrayRef(), whileOp, builder.getInsertionBlock(), + min, loopTag); assert(loopStack.size() == loopSeqStack.size()); for (auto [tid, dstLvl] : llvm::zip(tids, lvls)) { @@ -869,8 +1026,8 @@ if (isDenseDLT(lvlTp)) return; - const Value c0 = constantIndex(builder, loc, 0); - const Value c1 = constantIndex(builder, loc, 1); + const Value c0 = C_IDX(0); + const Value c1 = C_IDX(1); for (const Level srcLvl : getCollapseReassociation(tid, dstLvl)) { // Either the first level, or the previous level has been set. /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header. @@ -1022,6 +1179,7 @@ auto whileOp = llvm::cast(loopInfo.loop); builder.setInsertionPointToEnd(loopInfo.userCodeBlock); Value iv = loopInfo.iv; + // Finalize the induction. Note that the induction could be performed // in the individual if-branches to avoid re-evaluating the conditions. // However, that would result in a rather elaborate forest of yield @@ -1029,7 +1187,35 @@ // after the if-statements more closely resembles code generated by TACO. unsigned o = 0; SmallVector operands; - Value one = constantIndex(builder, loc, 1); + unsigned delta = 0; + for (auto [tid, lvl, resolved] : llvm::zip( + loopInfo.slicedTids, loopInfo.slicedLvls, loopInfo.sliceReduced)) { + levelReducedDep[tid][lvl]--; + if (!resolved) { + genSliceNextInduction(builder, loc, whileOp, tid, lvl, operands, o); + } else { + // TODO: We need to distinguish coiterate loop with slice-driven loop and + // fully reduced while op for iterating one slices. + // FIXME: since we didn't implement coiteration, this must be iteration + // just on fully resolved slice. + assert(loopInfo.slicedTids.size() == 1 && loopInfo.tids.empty()); + // The if guard to filter out out-range coordinates. + assert(llvm::isa(builder.getInsertionBlock()->getParentOp())); + posits[tid][lvl] = whileOp->getResult(o++); + // FIXME: we are not using continue here since we do not support + // coiteration on slices. But it need to be treated similarly as the + // universal index. + o++; // skip continue flag. + // Since we did not push two results from whileOp. The size of the + // operands vector is smaller than the actual number of return values from + // the whileOp. + // It is because we are actually generate yield in the IfOp inside the + // whileOp to only iterates over inbound coordinates within the slices. + delta += 2; + } + }; + + Value one = C_IDX(1); for (auto [tid, dstLvl] : llvm::zip(loopInfo.tids, loopInfo.lvls)) { const auto lvlTp = lvlTypes[tid][dstLvl]; if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) { @@ -1044,8 +1230,7 @@ } const Value crd = coords[tid][dstLvl]; const Value pos = posits[tid][dstLvl]; - Value cmp = - builder.create(loc, arith::CmpIPredicate::eq, crd, iv); + Value cmp = CMPI(eq, crd, iv); // If the loop contains a coiteration with non-unique level, we fast // forward all the duplicated coords by setting the position to the // segment high. @@ -1080,15 +1265,15 @@ } // An (optional) universal index. - if (operands.size() < whileOp.getNumResults()) { - assert(operands.size() + 1 == whileOp.getNumResults()); + if (operands.size() + delta < whileOp.getNumResults()) { + assert(operands.size() + delta + 1 == whileOp.getNumResults()); // The last one is the universial index. operands.push_back(builder.create(loc, iv, one)); // update the loop starting point of current loop sequence - loopSeqStack.back() = whileOp->getResult(o++); + loopSeqStack.back().first = whileOp->getResult(o++); } - assert(o == operands.size()); + assert(o == operands.size() + delta); builder.create(loc, operands); builder.setInsertionPointAfter(whileOp); } @@ -1109,3 +1294,573 @@ assert(loopStack.size() == loopSeqStack.size()); loopStack.pop_back(); } + +//===----------------------------------------------------------------------===// +// Slice-driven loop related methods. +//===----------------------------------------------------------------------===// + +LoopEmitter::SliceInfo &LoopEmitter::getFinalSliceOnLvl(TensorId tid, + Level lvl) { + for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie; + it++) { + if (it->slicedOnLvl == lvl) { + assert(it->depth == dependentLvlMap[tid][lvl].size() - 1); + return *it; + } + } + + llvm_unreachable("Failed to find sliceInfo"); +} + +size_t LoopEmitter::remDepOnLevel(TensorId tid, Level lvl) const { + size_t totalDependencies = dependentLvlMap[tid][lvl].size(); + if (totalDependencies != 0) { + assert(totalDependencies >= 2); + return totalDependencies - levelReducedDep[tid][lvl]; + } + return totalDependencies; +} + +std::pair LoopEmitter::genSliceLvlTraverseLoop( + OpBuilder &builder, Location loc, Value loopLo, Value loopHi, Value offset, + TensorId tid, Level lvl, size_t depth, ValueRange userReduc, bool genYield, + llvm::function_ref)> + bodyBuilder) { + Value c1 = C_IDX(1); + Value sliceHi = + builder.create(loc, offset, sliceSizes[tid][lvl].back()); + + SmallVector reduc = { + loopLo, // loop lower bounds + constantI1(builder, loc, true), // continue + }; + // Append user required reduction value. + reduc.append(userReduc.begin(), userReduc.end()); + scf::WhileOp whileOp = builder.create( + loc, ValueRange(reduc).getTypes(), reduc, + /*beforeBuilder=*/ + [loopHi](OpBuilder &builder, Location loc, ValueRange args) { + Value lo = args[0]; + Value cont = args[1]; + Value inBound = CMPI(ult, lo, loopHi); + Value cond = builder.create(loc, cont, inBound); + // continue if not yet break nor out of bound. + builder.create(loc, cond, args); + }, + /*afterBuilder=*/ + [this, c1, tid, lvl, sliceHi, genYield, + bodyBuilder](OpBuilder &builder, Location loc, ValueRange args) { + Value iv = args[0]; + Value coord = + genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv); + // If coord < sliceHi + Value cont = CMPI(ult, coord, sliceHi); + + TypeRange types = args.drop_front(2).getTypes(); + auto ifOp = builder.create(loc, types, cont, true); + { + // 2 reduction variable maintained by us. + SmallVector ifRet = args.drop_front(2); + assert(ifRet.size() == args.size() - 2); + + OpBuilder::InsertionGuard guard(builder); + // If not in slice. + // Break the while loop (by setting continue to false) + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + builder.create(loc, ifRet); + + // If this is a legit coordinates in slice + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + bodyBuilder(builder, loc, iv, ifRet); + if (genYield) { + builder.setInsertionPointToEnd(&ifOp.getThenRegion().front()); + builder.create(loc, ifRet); + } + } + // Marks this speical ifOp to avoid sparisification finalizing it. + ifOp->setAttr(getLoopEmitterLoopAttrName(), + StringAttr::get(builder.getContext(), "slice")); + // Insertion point restored to after ifOp. + SmallVector yields; + // Increase induction variable. + yields.push_back(builder.create(loc, iv, c1)); + yields.push_back(cont); + yields.append(ifOp.getResults().begin(), ifOp.getResults().end()); + builder.create(loc, yields); + }); + + builder.setInsertionPointAfter(whileOp); + return std::make_pair(whileOp, whileOp.getResults().drop_front(2)); +} + +ValueRange LoopEmitter::genSliceAllLvlTraverseLoop( + OpBuilder &builder, Location loc, Value offset, TensorId tid, Level lvl, + size_t depth, ValueRange userReduc, + llvm::function_ref)> + bodyBuilder) { + + Value c0 = C_IDX(0); + Value c1 = C_IDX(1); + Value c2 = C_IDX(2); + + // TODO: it only works on all compressed tensor. + Value sPtrBuf = slicePosBuffer[tid][lvl][depth]; + Value pSt = c2; // pointer starting index + Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize + + auto forOp = + scf::buildLoopNest( + builder, loc, pSt, mSz, c2, userReduc, + [this, c1, depth, tid, lvl, offset, sPtrBuf, + bodyBuilder](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange iterArgs) -> scf::ValueVector { + // generate traversal for each level. + Value loopLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front()); + Value loopHi = genIndexLoad( + builder, loc, sPtrBuf, + builder.create(loc, ivs.front(), c1)); + return genSliceLvlTraverseLoop(builder, loc, loopLo, loopHi, offset, + tid, lvl, depth, iterArgs, true, + bodyBuilder) + .second; + }) + .loops.front(); + + // Insert after current while operation. + builder.setInsertionPointAfter(forOp); + return forOp.getResults(); +} + +bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid, + Level lvl) { + Value c0 = C_IDX(0); + Value c1 = C_IDX(1); + Value c2 = C_IDX(2); + Value c3 = C_IDX(3); + Value c4 = C_IDX(4); + + if (depFullyReduced(tid, lvl)) { + // If constraints on the tensor is fully resolved. We do not need to + // generates slice begin any more, instead we fall back to TACO-based + // algorithm to (co)iterates over the slice. + Value pLoPtr = + genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), c1); + pLoPtr = builder.create(loc, pLoPtr, c2); + Value pHiPtr = builder.create(loc, pLoPtr, c1); + posits[tid][lvl] = + genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pLoPtr); + highs[tid][lvl] = + genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pHiPtr); + return true; + } + + // Only when the level is sorted, the next-non-empty slice can be computed + // efficiently. + assert(isOrderedDLT(lvlTypes[tid][lvl])); + if (isDenseDLT(lvlTypes[tid][lvl]) || isSingletonDLT(lvlTypes[tid][lvl])) + llvm_unreachable("TODO: dense level should be easy to support, while " + "singleton level requres more efforts"); + + assert(!dependentLvlMap[tid][lvl].empty()); + assert(!sliceStack[tid].empty()); + + const SliceInfo &sliceInfo = sliceStack[tid].back(); + auto baseEnc = getSparseTensorEncoding(tensors[tid].getType()); + // Generate caches required to fast compute next-non-empty slices with + // increasing offset for slice-base loop. + if (slicePosBuffer[tid][lvl][0] == nullptr) { + OpBuilder::InsertionGuard guard(builder); + // The buffer can be reused, and the size is loop invariant: it only depends + // on the iteration graph's toposort. + builder.setInsertionPointAfter(localInsertPos); + Value bufSize = C_IDX(1); + Value c2 = C_IDX(2); + // Accumlates the size required to cache the pLo for the slice. + // E.g., if we want to cache the pIdx for slice on the second + // level. We at most need to a memref. + // NOTE: this is apperantly an over-approximation when the previous + // level is compressed, and we can compute a precise memory size + // inside the loops. But that would also requires us to allocate/free + // memorys in loops. + // TODO: Maybe using allocaScopOp inside the loop to resolve the issue? + for (Level curLevel = lvl; + curLevel >= 1 && !lvlFullyResolved(tid, curLevel - 1); curLevel--) { + auto depth = remDepOnLevel(tid, curLevel - 1); + assert(sliceSizes[tid][lvl].size() >= depth); + Value sz = *(sliceSizes[tid][lvl].rbegin() + depth - 1); + bufSize = builder.create(loc, bufSize, sz); + } + // For a pair of [pLo, pHi]. Note that we can not compress pHi because slice + // creates segments in the index buffer so that the pHi for the current + // level is no longer the pLo for the next level. + bufSize = builder.create(loc, bufSize, c2); + // Additional two metadata {memSize, idx} at head. + bufSize = builder.create(loc, bufSize, c2); + llvm::for_each( + slicePosBuffer[tid][lvl], [bufSize, loc, &builder](Value &cache) { + cache = genAlloca(builder, loc, bufSize, builder.getIndexType()); + }); + } + + Value size, minCrd, isNonEmpty; + unsigned depth = 0; + if (sliceInfo.isInitialTensor() || + (lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) { + // The input tensor is slices, not yet handled. + if (baseEnc.isSlice()) + llvm_unreachable("TODO: not yet implemented"); + + assert(lvl == 0); // must be reduing the affine expression on the first lvl. + assert(!(lvl >= 1 && lvlFullyResolved(tid, lvl - 1)) && "TODO"); + // Fills out pIdxBuffer[tid][lvl][0] with [/*memSize =*/4, 0, 0, pHi] + Value sPtrBuf = slicePosBuffer[tid][0][0]; + Value pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1); + builder.create(loc, c4, sPtrBuf, c0); // memSize = 4 + builder.create(loc, c0, sPtrBuf, c1); // index = 0 + builder.create(loc, c0, sPtrBuf, c2); // pLo = 0; + builder.create(loc, pHi, sPtrBuf, c3); // loaded pHi. + + size = sliceSizes[tid][0][0]; + // This is an non empty tensor if 0 < pHi. + isNonEmpty = CMPI(ult, c0, pHi); + // The minimal coord must be at the first on ordered level. + // FIXME: Technically we should load the coord only when the slice is + // nonempty. though we assume that even on empty sparse tensors, a non-empty + // ptr/idx buffer is allocated for each level so it would not cause OOB to + // avoid generating a ifOp here. + minCrd = genIndexLoad(builder, loc, coordinatesBuffers[tid][0], c0); + depth = 1; + } else { // The previous level has not been full resolved. + unsigned prevLvl = *sliceInfo.slicedOnLvl; + assert(lvl >= prevLvl); + if (lvl != prevLvl + 1) { + // Either lvl = prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one + // variable need to be reduced on the same level). + // Or lvl > prevSliceLvl + 1, i.e., t[..., d2, d3 + d4] (having a + // simple dim expression in between). + llvm_unreachable("TODO: not yet implemented"); + } else { + assert(slicePosBuffer[tid][prevLvl].size() == sliceInfo.depth); + Value sPtrBuf = slicePosBuffer[tid][lvl][0]; + + SmallVector reduc = { + constantI1(builder, loc, false), // isNonEmpty + lvlSizes[tid][lvl], // minCrd + c2, // memSize + }; + ValueRange result = genSliceAllLvlTraverseLoop( + builder, loc, sliceInfo.offset, tid, prevLvl, sliceInfo.depth - 1, + reduc, + [this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc, + Value iv, + MutableArrayRef reduc) { + Value &isNonEmpty = reduc[0]; + Value &minCrd = reduc[1]; + Value &curMemSize = reduc[2]; + + Value pHi = builder.create(loc, iv, c1); + Value sPLo = + genIndexLoad(builder, loc, positionsBuffers[tid][lvl], iv); + Value sPHi = + genIndexLoad(builder, loc, positionsBuffers[tid][lvl], pHi); + + // isNonEmpty = isNonEmpty || lvlNonEmpty + Value lvlNonEmpty = CMPI(ult, sPLo, sPHi); + isNonEmpty = + builder.create(loc, lvlNonEmpty, isNonEmpty); + + // Update minimal coordinate. + auto ifNonEmpty = builder.create( + loc, builder.getIndexType(), lvlNonEmpty, true); + { + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(ifNonEmpty.thenBlock()); + Value curC = genIndexLoad(builder, loc, + coordinatesBuffers[tid][lvl], sPLo); + Value isCurSmaller = CMPI(ult, curC, minCrd); + Value newMin = builder.create(loc, isCurSmaller, + curC, minCrd); + builder.create(loc, newMin); + builder.setInsertionPointToStart(ifNonEmpty.elseBlock()); + builder.create(loc, minCrd); + } + minCrd = ifNonEmpty.getResult(0); + + // filles in + builder.create(loc, sPLo, sPtrBuf, curMemSize); + Value nxtMemSize = + builder.create(loc, curMemSize, c1); + builder.create(loc, sPHi, sPtrBuf, nxtMemSize); + + // curMemSize += 2 + curMemSize = builder.create(loc, curMemSize, c2); + }); + + size = sliceSizes[tid][lvl][0]; + isNonEmpty = result[0]; + minCrd = result[1]; + depth = 1; + + // Two metadata [memSize, idx]. + // TODO: we might be able to use an SSA value for memSize here to avoid + // memory operation. + builder.create(loc, result[2], sPtrBuf, c0); + builder.create(loc, c0, sPtrBuf, c1); + } + } + + assert(depth > 0 && size && isNonEmpty && minCrd && depth); + // Compute the minimal offsets viable for a non empty tensor. + // offset = isNonEmpty && minCrd >= size ? minCrd - size + 1 : 0; + // NOTE: that minCrd is invalid when isNonEmpty = false, in which case + // the computed slices are meaningless. + // FIXME: support relative offset compute. + Value geSize = CMPI(uge, minCrd, size); + Value pred = builder.create(loc, isNonEmpty, geSize); + + Value mp1 = builder.create(loc, minCrd, c1); + Value mms = builder.create(loc, mp1, size); + // This is the absolute offset related to the underly tensor. + Value absOffset = builder.create(loc, pred, mms, c0); + // This is the relative offset related to the base slice. + // Value relOffset = absOffset; + sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, lvl, depth); + return false; +} + +void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc, + const Operation *op, TensorId tid, + Level lvl, + SmallVectorImpl &operands, + unsigned &retIdx) { + if (!isCompressedDLT(lvlTypes[tid][lvl])) + llvm_unreachable("TODO"); + + // else generate code to compute next non empty slice. + Value c0 = C_IDX(0); + Value c1 = C_IDX(1); + Value c2 = C_IDX(2); + + auto whileOp = llvm::cast(op); + SliceInfo &info = sliceStack[tid].back(); + assert(info.slicedOnLvl == lvl); + + // + // We forward to the next non empty slice by + // if (minCrd > offset) { + // offset += 1 + // } else { + // minCrd = nextMinInSlice(); + // offset = minCrd - size + 1; + // } + // + // if (offset + size > parents.size) + // isNonEmpty = false; + // + Value absOffset = info.offset; + // Resets slices pointers as the resolved slices are invalidated after we + // moves forward to the next slice. + for (unsigned i = 0; i <= lvl; i++) + builder.create(loc, c0, slicePosBuffer[tid][i].back(), c1); + + SmallVector reduc = {info.minCrd, info.isNonEmpty, absOffset}; + Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1]; + Value fastPathP = CMPI(ugt, info.minCrd, absOffset); + auto ifOp = builder.create(loc, ValueRange(reduc).getTypes(), + fastPathP, true); + { + OpBuilder::InsertionGuard guard(builder); + // Take the fast path if minCrd > offset + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + reduc[2] = builder.create(loc, absOffset, c1); + // Yield offset + 1. + builder.create(loc, reduc); + + // Else, take the slow path. + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + reduc[2] = absOffset; // restore value. + Value pSt = c2; // pointer starting index + Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize + reduc[0] = lvlSizes[tid][lvl]; // next min coord + reduc[1] = constantI1(builder, loc, false); // isNonEmpty + auto loopArgs = static_cast(reduc).drop_back(); + auto forOp = scf::buildLoopNest( + builder, loc, pSt, mSz, c2, loopArgs, + [this, tid, lvl, c1, sPtrBuf, + &info](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange iterArgs) -> scf::ValueVector { + Value curMinCrd = iterArgs[0]; + Value isNonEmpty = iterArgs[1]; + + Type idxTp = builder.getIndexType(); + Value pLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front()); + Value pHi = + genIndexLoad(builder, loc, sPtrBuf, + builder.create(loc, ivs.front(), c1)); + // + // if pLo < pHi + // coord = load[pLo] + // if coord == minCrd + // pLo += 1 + // + // if pLo < pHi + // curMinCrd = min(curMinCrd, load[pLo]) + // + Value pred = CMPI(ult, pLo, pHi); + auto advPLo = builder.create(loc, idxTp, pred, true); + /* if pLo < pHi */ { + builder.setInsertionPointToStart(&advPLo.getThenRegion().front()); + // coord = load[pLo] + Value coord = + genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo); + Value pred = CMPI(eq, coord, info.minCrd); + auto ifEqual = builder.create(loc, idxTp, pred, true); + /* if coord == minCrd */ { + builder.setInsertionPointToStart( + &ifEqual.getThenRegion().front()); + Value newPlo = builder.create(loc, pLo, c1); + // Updates the cache. + builder.create(loc, newPlo, sPtrBuf, + ivs.front()); + builder.create(loc, newPlo); + } + /* else coord != minCrd */ { + builder.setInsertionPointToStart( + &ifEqual.getElseRegion().front()); + builder.create(loc, pLo); + } + builder.setInsertionPointAfter(ifEqual); + builder.create(loc, ifEqual.getResults()); + } + /* else pLo >= pHi */ { + builder.setInsertionPointToStart(&advPLo.getElseRegion().front()); + builder.create(loc, pLo); + } + + builder.setInsertionPointAfter(advPLo); + pLo = advPLo.getResult(0); + Value lvlNonEmpty = CMPI(ult, pLo, pHi); + // Update minCrds + auto newMin = + builder.create(loc, idxTp, lvlNonEmpty, true); + builder.setInsertionPointToStart(&newMin.getThenRegion().front()); + builder.create( + loc, + genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo)); + + builder.setInsertionPointToStart(&newMin.getElseRegion().front()); + builder.create(loc, curMinCrd); + builder.setInsertionPointAfter(newMin); + + // isNonEmpty = isNonEmpty || lvlNonEmpty + isNonEmpty = + builder.create(loc, lvlNonEmpty, isNonEmpty); + curMinCrd = builder.create( + loc, CMPI(ult, newMin.getResult(0), curMinCrd), + newMin.getResult(0), curMinCrd); + return {curMinCrd, isNonEmpty}; + }); + + builder.setInsertionPointAfter(forOp.loops.front()); + // minOffset = minCrd + 1 >= size ? minCrd + 1 - size : c0 + Value tmp = builder.create(loc, forOp.results.front(), c1); + Value minOffset = builder.create( + loc, tmp, sliceSizes[tid][lvl][info.depth - 1]); + Value p = CMPI(uge, tmp, sliceSizes[tid][lvl][info.depth - 1]); + minOffset = builder.create(loc, p, minOffset, c0); + SmallVector yields; + yields.assign(forOp.results.begin(), forOp.results.end()); + yields.push_back(minOffset); + builder.create(loc, yields); + } + + Value nextMinCrd = ifOp.getResults()[0]; + Value nextNonEmpty = ifOp.getResults()[1]; + + // the next offset should at least be offset + 1; + Value minOffset = ifOp.getResults()[2]; + Value nxOffset = builder.create(loc, info.offset, c1); + Value maxPred = CMPI(ugt, minOffset, nxOffset); + Value nextAbsOffset = + builder.create(loc, maxPred, minOffset, nxOffset); + + Value sliceUB = builder.create( + loc, nextAbsOffset, sliceSizes[tid][lvl][info.depth - 1]); + + // FIXME: this only works if the parsent is the tensor, we should use the + // parents slice size + parent offset. + assert(info.depth - 1 == 0); + // nextNonEmpty = nextNonEmpty && slice upper bound <= parent upperbound. + nextNonEmpty = builder.create( + loc, nextNonEmpty, CMPI(ule, sliceUB, lvlSizes[tid][lvl])); + + // FIXME: compute relative offset. + assert(info.depth - 1 == 0); + Value nextRelOffset = nextAbsOffset; + nextRelOffset = + builder.create(loc, nextNonEmpty, nextRelOffset, c0); + + operands.push_back(nextNonEmpty); + operands.push_back(nextMinCrd); + operands.push_back(nextAbsOffset); // we push the absolute offset. + + // Update the slice stack. + info.isNonEmpty = whileOp.getResult(retIdx++); + info.minCrd = whileOp.getResult(retIdx++); + info.offset = whileOp.getResult(retIdx++); +} + +Operation *LoopEmitter::emitSliceDrivenLoopOverTensorAtLvl( + OpBuilder &builder, Location loc, TensorId tid, Level lvl, + MutableArrayRef reduc) { + assert(!depFullyReduced(tid, lvl)); + SliceInfo &sliceInfo = sliceStack[tid].back(); + assert(sliceInfo.slicedOnLvl == lvl); + + // NOTE: The order matters! + SmallVector operands{sliceInfo.isNonEmpty, sliceInfo.minCrd, + sliceInfo.offset}; + // number of reduction maintained by us. + size_t numMetaReduc = operands.size(); + + // Append user-required reduction values. + operands.append(reduc.begin(), reduc.end()); + assert(operands.size() == numMetaReduc + reduc.size()); + + auto whileOp = builder.create( + loc, ValueRange(operands).getTypes(), operands, + /*beforeBuilder=*/ + [](OpBuilder &builder, Location loc, ValueRange args) { + builder.create(loc, /*isNonEmpty*/ args[0], args); + }, + /*afterBuilder=*/ + [this, tid, lvl, reduc, numMetaReduc, + &sliceInfo](OpBuilder &builder, Location loc, ValueRange args) { + assert(args.size() == reduc.size() + numMetaReduc); + sliceInfo.isNonEmpty = args[0]; + sliceInfo.minCrd = args[1]; + sliceInfo.offset = args[2]; + // The slice offset is the coordinate. + Value c = sliceInfo.offset; + if (sliceInfo.depth > 1) { + // Coord is the relative offset related to its parents. + // Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1] + llvm_unreachable("TODO: not yet implement"); + } + coords[tid][lvl] = c; + + for (unsigned i = 0, e = reduc.size(); i < e; i++) + reduc[i] = args[i + numMetaReduc]; + }); + + // Increments the number of resolved constraints on tid. + // levelReducedDep[tid][lvl]++; + // Set the insertion point to while loop body. + builder.setInsertionPointToEnd(&whileOp.getAfter().front()); + return whileOp; +} + +#undef CMPI +#undef C_IDX diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -1018,7 +1018,7 @@ // Link the reduction chain. Note that loop emitter update the reducValue // in place. loopEmitter.exitCurrentLoop(rewriter, loc, reducValue); - loopEmitter.exitCurrentLoopSeq(); + loopEmitter.exitCurrentLoopSeq(rewriter, loc); } // Replace the foreach operator with the value returned by the outtermost diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -83,9 +83,12 @@ class AffineDimFinder : public AffineExprVisitor { public: explicit AffineDimFinder(linalg::GenericOp op) - : iterTypes(op.getIteratorTypesArray()) {} + : iterTypes(op.getIteratorTypes()) {} void visitDimExpr(AffineDimExpr expr) { - if (pickedDim == nullptr || pickIterType == iterTypes[expr.getPosition()]) { + if (pickedDim == nullptr || + pickIterType == iterTypes[expr.getPosition()] + .cast() + .getValue()) { pickedDim = expr; } } @@ -106,7 +109,7 @@ /// The iterator type that we want. utils::IteratorType pickIterType; /// The mapping between dim=>iterator type. - SmallVector iterTypes; + ArrayAttr iterTypes; }; // Flattens an affine expression into a list of AffineDimExprs. @@ -306,7 +309,7 @@ // else increase min(d0_1, d0_2). return false; } - merger.setLoopDependentTensorLevel(ldx, tensor, lvl); + merger.setLoopDependentTensorLevel(ldx, tensor, lvl, dlt); } return true; } @@ -774,10 +777,6 @@ for (OpOperand &t : env.op()->getOpOperands()) { // Get map and encoding. const auto enc = getSparseTensorEncoding(t.get().getType()); - assert(env.op().getMatchingIndexingMap(&t).getNumDims() + - getNumNonTrivialIdxExpOnSparseLvls(env.op()) == - numLoops); - // Skips dense inputs/outputs when not requested. const bool isDenseInput = !enc && env.op().isDpsInput(&t); const bool isDenseOutput = !enc && !isDenseInput; @@ -1498,8 +1497,7 @@ // consumed by a subsequent lattice point. if (needsUniv) { for (const LatPointId li : env.set(lts).drop_front()) - if (!env.merger().hasAnySparse(env.lat(li).simple) && - !env.merger().hasSparseIdxReduction(env.lat(li).simple)) + if (!env.merger().hasAnySparse(env.lat(li).simple)) return true; } return false; @@ -1680,14 +1678,17 @@ /// Ends a single loop in current sequence. Returns new values for needsUniv. static bool endLoop(CodegenEnv &env, RewriterBase &rewriter, Operation *loop, - LoopId idx, LatPointId li, bool needsUniv) { - // End a while-loop. - if (auto whileOp = dyn_cast(loop)) { - finalizeWhileOp(env, rewriter, idx, needsUniv, whileOp); - } else if (auto forOp = dyn_cast(loop)) { - // Any iteration of a reduction for-loop creates a valid lex insert. + LoopId idx, LatPointId li, bool needsUniv, + bool isSingleCond) { + + if (isSingleCond) { + // Could be a for-loop or a while-loop for iterating over slice. + // Any iteration creates a valid lex insert. if (env.isReduc() && env.getValidLexInsert()) env.setValidLexInsert(constantI1(rewriter, env.op().getLoc(), true)); + } else if (auto whileOp = dyn_cast(loop)) { + // End a while-loop. + finalizeWhileOp(env, rewriter, idx, needsUniv, whileOp); } else { needsUniv = false; } @@ -1701,10 +1702,10 @@ } /// Ends a loop sequence at given level. -static void endLoopSeq(CodegenEnv &env, OpBuilder &builder, ExprId exp, - LoopOrd at, LoopId idx, LoopId ldx) { +static void endLoopSeq(CodegenEnv &env, OpBuilder &builder, unsigned exp, + unsigned at, unsigned idx, unsigned ldx) { assert(!env.getLoopVar(idx)); - env.emitter().exitCurrentLoopSeq(); + env.emitter().exitCurrentLoopSeq(builder, env.op().getLoc()); // Unmark bookkeeping of invariants and loop index. genInvariants(env, builder, exp, ldx, /*atStart=*/false); // Finalize access pattern expansion for sparse tensor output. @@ -1769,7 +1770,7 @@ } // End a loop. - needsUniv = endLoop(env, rewriter, loop, idx, li, needsUniv); + needsUniv = endLoop(env, rewriter, loop, idx, li, needsUniv, isSingleCond); } // End a loop sequence. diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp --- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp +++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp @@ -213,10 +213,11 @@ std::vector>(numLoops, std::nullopt)), lvlToLoop(numTensors, std::vector>(maxLvlRank, std::nullopt)), - loopToDependencies(numLoops, std::vector>( - numTensors, std::nullopt)), - levelToDependentIdx(numTensors, std::vector>( - maxLvlRank, std::vector())), + loopToDependencies( + numLoops, std::vector>>( + numTensors, std::nullopt)), + levelToDependentLoop(numTensors, std::vector>( + maxLvlRank, std::vector())), loopBounds(numLoops, std::make_pair(numTensors, numLoops)) {} //===----------------------------------------------------------------------===// @@ -396,15 +397,10 @@ } } - BitVector simple(lat(p0).bits); - bool reset = - isSingleton && (hasAnySparse(simple) || hasSparseIdxReduction(simple)); - // `be`, `b`, and `offset` are `TensorLoopId` in spirit; but we avoid - // using that class in this function because we need to do a bunch of - // arithmetic on them, so using the newtype would introduce too much - // boilerplate. - const unsigned be = simple.size(); - unsigned offset = 0; // relative to the end + BitVector simple(latPoints[p0].bits); + bool reset = isSingleton && hasAnySparse(simple); + const TensorLoopId be = simple.size(); + TensorLoopId offset = 0; // relative to the end if (!reset) // Starts resetting from a dense level, so that the first bit (if kept) // is not undefined level-type. @@ -419,10 +415,9 @@ // keep the rightmost bit (which could possibly be a synthetic tensor). for (unsigned b = be - 1 - offset, i = 0; i < be; b = b == 0 ? be - 1 : b - 1, i++) { - // FIXME: better name? also slice on dense level has locate property as - // well. Handle it correctly! - if (simple[b] && !isLvlWithNonTrivialIdxExp(TensorLoopId{b})) { - const auto dlt = getDimLevelType(TensorLoopId{b}); + // Slice on dense level has locate property as well, and can be optimized. + if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) { + const auto dlt = getDimLevelType(b); if (!isCompressedDLT(dlt) && !isSingletonDLT(dlt)) { if (reset) simple.reset(b); @@ -447,9 +442,9 @@ } bool Merger::onlyDenseDiff(LatPointId i, LatPointId j) const { - BitVector tmp(lat(j).bits); - tmp ^= lat(i).bits; - return !hasAnySparse(tmp) && !hasSparseIdxReduction(tmp); + BitVector tmp(latPoints[j].bits); + tmp ^= latPoints[i].bits; + return !hasAnySparse(tmp); } bool Merger::expContainsTensor(ExprId e, TensorId t) const { @@ -591,19 +586,17 @@ } bool Merger::hasAnySparse(const BitVector &bits) const { - for (TensorLoopId b = 0, be = bits.size(); b < be; b++) - if (bits[b]) { - const auto dlt = getDimLevelType(b); - if (isCompressedDLT(dlt) || isSingletonDLT(dlt)) - return true; - } - return false; + for (TensorLoopId b : bits.set_bits()) { + const auto dlt = getDimLevelType(b); + if (isCompressedDLT(dlt) || isSingletonDLT(dlt)) + return true; + } + return hasSparseIdxReduction(bits); } bool Merger::hasSparseIdxReduction(const BitVector &bits) const { - // TODO: return false on dense levels. - for (unsigned b = 0, be = bits.size(); b < be; b++) - if (bits[b] && isLvlWithNonTrivialIdxExp(b)) + for (TensorLoopId b : bits.set_bits()) + if (isSparseLvlWithNonTrivialIdxExp(b)) return true; return false; } diff --git a/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir b/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir @@ -0,0 +1,284 @@ +// RUN: mlir-opt %s --sparsification="enable-index-reduction=true" --cse | FileCheck %s + +#map = affine_map<(d0, d1, d2, d3) -> (d0 + d2, d1 + d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d2, d3)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> + +#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }> + +// CHECK-LABEL: func.func @conv2d_all_sparse_CSR( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x8xi32, #{{.*}}>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3xi32>) -> tensor<6x6xi32, #{{.*}}> { +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[VAL_9:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_10:.*]] = arith.constant false +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.alloc_tensor() : tensor<6x6xi32, #{{.*}}> +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xi32, #{{.*}}> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xi32, #{{.*}}> to memref +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xi32, #{{.*}}> to memref +// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xi32, #{{.*}}> to memref +// CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xi32, #{{.*}}> to memref +// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_1]] : memref<3x3xi32> +// CHECK-DAG: %[[VAL_18:.*]] = memref.alloca(%[[VAL_2]]) : memref +// CHECK-DAG: %[[VAL_19:.*]] = memref.alloca(%[[VAL_7]]) : memref +// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref +// CHECK: memref.store %[[VAL_7]], %[[VAL_19]]{{\[}}%[[VAL_4]]] : memref +// CHECK: memref.store %[[VAL_4]], %[[VAL_19]]{{\[}}%[[VAL_5]]] : memref +// CHECK: memref.store %[[VAL_4]], %[[VAL_19]]{{\[}}%[[VAL_6]]] : memref +// CHECK: memref.store %[[VAL_20]], %[[VAL_19]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_21:.*]] = arith.cmpi ugt, %[[VAL_20]], %[[VAL_4]] : index +// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_4]]] : memref +// CHECK: %[[VAL_23:.*]] = arith.cmpi uge, %[[VAL_22]], %[[VAL_3]] : index +// CHECK: %[[VAL_24:.*]] = arith.andi %[[VAL_21]], %[[VAL_23]] : i1 +// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_22]], %[[VAL_5]] : index +// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_25]], %[[VAL_3]] : index +// CHECK: %[[VAL_27:.*]] = arith.select %[[VAL_24]], %[[VAL_26]], %[[VAL_4]] : index +// CHECK: %[[VAL_28:.*]]:4 = scf.while (%[[VAL_29:.*]] = %[[VAL_21]], %[[VAL_30:.*]] = %[[VAL_22]], %[[VAL_31:.*]] = %[[VAL_27]], %[[VAL_32:.*]] = %[[VAL_11]]) : (i1, index, index, tensor<6x6xi32, #{{.*}}>) -> (i1, index, index, tensor<6x6xi32, #{{.*}}>) { +// CHECK: scf.condition(%[[VAL_29]]) %[[VAL_29]], %[[VAL_30]], %[[VAL_31]], %[[VAL_32]] : i1, index, index, tensor<6x6xi32, #{{.*}}> +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_33:.*]]: i1, %[[VAL_34:.*]]: index, %[[VAL_35:.*]]: index, %[[VAL_36:.*]]: tensor<6x6xi32, #{{.*}}>): +// CHECK: %[[VAL_37:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_4]]] : memref +// CHECK: %[[VAL_38:.*]]:3 = scf.for %[[VAL_39:.*]] = %[[VAL_6]] to %[[VAL_37]] step %[[VAL_6]] iter_args(%[[VAL_40:.*]] = %[[VAL_10]], %[[VAL_41:.*]] = %[[VAL_2]], %[[VAL_42:.*]] = %[[VAL_6]]) -> (i1, index, index) { +// CHECK: %[[VAL_43:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_39]]] : memref +// CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_39]], %[[VAL_5]] : index +// CHECK: %[[VAL_45:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_44]]] : memref +// CHECK: %[[VAL_46:.*]] = arith.addi %[[VAL_35]], %[[VAL_3]] : index +// CHECK: %[[VAL_47:.*]]:5 = scf.while (%[[VAL_48:.*]] = %[[VAL_43]], %[[VAL_49:.*]] = %[[VAL_9]], %[[VAL_50:.*]] = %[[VAL_40]], %[[VAL_51:.*]] = %[[VAL_41]], %[[VAL_52:.*]] = %[[VAL_42]]) : (index, i1, i1, index, index) -> (index, i1, i1, index, index) { +// CHECK: %[[VAL_53:.*]] = arith.cmpi ult, %[[VAL_48]], %[[VAL_45]] : index +// CHECK: %[[VAL_54:.*]] = arith.andi %[[VAL_49]], %[[VAL_53]] : i1 +// CHECK: scf.condition(%[[VAL_54]]) %[[VAL_48]], %[[VAL_49]], %[[VAL_50]], %[[VAL_51]], %[[VAL_52]] : index, i1, i1, index, index +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_55:.*]]: index, %[[VAL_56:.*]]: i1, %[[VAL_57:.*]]: i1, %[[VAL_58:.*]]: index, %[[VAL_59:.*]]: index): +// CHECK: %[[VAL_60:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_55]]] : memref +// CHECK: %[[VAL_61:.*]] = arith.cmpi ult, %[[VAL_60]], %[[VAL_46]] : index +// CHECK: %[[VAL_62:.*]]:3 = scf.if %[[VAL_61]] -> (i1, index, index) { +// CHECK: %[[VAL_63:.*]] = arith.addi %[[VAL_55]], %[[VAL_5]] : index +// CHECK: %[[VAL_64:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_55]]] : memref +// CHECK: %[[VAL_65:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_63]]] : memref +// CHECK: %[[VAL_66:.*]] = arith.cmpi ult, %[[VAL_64]], %[[VAL_65]] : index +// CHECK: %[[VAL_67:.*]] = arith.ori %[[VAL_66]], %[[VAL_57]] : i1 +// CHECK: %[[VAL_68:.*]] = scf.if %[[VAL_66]] -> (index) { +// CHECK: %[[VAL_69:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_64]]] : memref +// CHECK: %[[VAL_70:.*]] = arith.cmpi ult, %[[VAL_69]], %[[VAL_58]] : index +// CHECK: %[[VAL_71:.*]] = arith.select %[[VAL_70]], %[[VAL_69]], %[[VAL_58]] : index +// CHECK: scf.yield %[[VAL_71]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_58]] : index +// CHECK: } +// CHECK: memref.store %[[VAL_64]], %[[VAL_18]]{{\[}}%[[VAL_59]]] : memref +// CHECK: %[[VAL_72:.*]] = arith.addi %[[VAL_59]], %[[VAL_5]] : index +// CHECK: memref.store %[[VAL_65]], %[[VAL_18]]{{\[}}%[[VAL_72]]] : memref +// CHECK: %[[VAL_73:.*]] = arith.addi %[[VAL_59]], %[[VAL_6]] : index +// CHECK: scf.yield %[[VAL_67]], %[[VAL_74:.*]], %[[VAL_73]] : i1, index, index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_57]], %[[VAL_58]], %[[VAL_59]] : i1, index, index +// CHECK: } {"Emitted from" = "slice"} +// CHECK: %[[VAL_75:.*]] = arith.addi %[[VAL_55]], %[[VAL_5]] : index +// CHECK: scf.yield %[[VAL_75]], %[[VAL_61]], %[[VAL_76:.*]]#0, %[[VAL_76]]#1, %[[VAL_76]]#2 : index, i1, i1, index, index +// CHECK: } +// CHECK: scf.yield %[[VAL_77:.*]]#2, %[[VAL_77]]#3, %[[VAL_77]]#4 : i1, index, index +// CHECK: } +// CHECK: memref.store %[[VAL_78:.*]]#2, %[[VAL_18]]{{\[}}%[[VAL_4]]] : memref +// CHECK: memref.store %[[VAL_4]], %[[VAL_18]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_79:.*]] = arith.cmpi uge, %[[VAL_78]]#1, %[[VAL_3]] : index +// CHECK: %[[VAL_80:.*]] = arith.andi %[[VAL_78]]#0, %[[VAL_79]] : i1 +// CHECK: %[[VAL_81:.*]] = arith.addi %[[VAL_78]]#1, %[[VAL_5]] : index +// CHECK: %[[VAL_82:.*]] = arith.subi %[[VAL_81]], %[[VAL_3]] : index +// CHECK: %[[VAL_83:.*]] = arith.select %[[VAL_80]], %[[VAL_82]], %[[VAL_4]] : index +// CHECK: %[[VAL_84:.*]]:4 = scf.while (%[[VAL_85:.*]] = %[[VAL_78]]#0, %[[VAL_86:.*]] = %[[VAL_78]]#1, %[[VAL_87:.*]] = %[[VAL_83]], %[[VAL_88:.*]] = %[[VAL_36]]) : (i1, index, index, tensor<6x6xi32, #{{.*}}>) -> (i1, index, index, tensor<6x6xi32, #{{.*}}>) { +// CHECK: scf.condition(%[[VAL_85]]) %[[VAL_85]], %[[VAL_86]], %[[VAL_87]], %[[VAL_88]] : i1, index, index, tensor<6x6xi32, #{{.*}}> +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_89:.*]]: i1, %[[VAL_90:.*]]: index, %[[VAL_91:.*]]: index, %[[VAL_92:.*]]: tensor<6x6xi32, #{{.*}}>): +// CHECK: %[[VAL_93:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_94:.*]] = arith.addi %[[VAL_93]], %[[VAL_6]] : index +// CHECK: %[[VAL_95:.*]] = arith.addi %[[VAL_94]], %[[VAL_5]] : index +// CHECK: %[[VAL_96:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_94]]] : memref +// CHECK: %[[VAL_97:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_95]]] : memref +// CHECK: %[[VAL_98:.*]] = arith.addi %[[VAL_35]], %[[VAL_3]] : index +// CHECK: %[[VAL_99:.*]]:5 = scf.while (%[[VAL_100:.*]] = %[[VAL_96]], %[[VAL_101:.*]] = %[[VAL_9]], %[[VAL_102:.*]] = %[[VAL_8]], %[[VAL_103:.*]] = %[[VAL_10]], %[[VAL_104:.*]] = %[[VAL_92]]) : (index, i1, i32, i1, tensor<6x6xi32, #{{.*}}>) -> (index, i1, i32, i1, tensor<6x6xi32, #{{.*}}>) { +// CHECK: %[[VAL_105:.*]] = arith.cmpi ult, %[[VAL_100]], %[[VAL_97]] : index +// CHECK: %[[VAL_106:.*]] = arith.andi %[[VAL_101]], %[[VAL_105]] : i1 +// CHECK: scf.condition(%[[VAL_106]]) %[[VAL_100]], %[[VAL_101]], %[[VAL_102]], %[[VAL_103]], %[[VAL_104]] : index, i1, i32, i1, tensor<6x6xi32, #{{.*}}> +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_107:.*]]: index, %[[VAL_108:.*]]: i1, %[[VAL_109:.*]]: i32, %[[VAL_110:.*]]: i1, %[[VAL_111:.*]]: tensor<6x6xi32, #{{.*}}>): +// CHECK: %[[VAL_112:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_107]]] : memref +// CHECK: %[[VAL_113:.*]] = arith.cmpi ult, %[[VAL_112]], %[[VAL_98]] : index +// CHECK: %[[VAL_114:.*]]:3 = scf.if %[[VAL_113]] -> (i32, i1, tensor<6x6xi32, #{{.*}}>) { +// CHECK: %[[VAL_115:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_107]]] : memref +// CHECK: %[[VAL_116:.*]] = arith.subi %[[VAL_115]], %[[VAL_35]] : index +// CHECK: %[[VAL_117:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_118:.*]] = arith.addi %[[VAL_117]], %[[VAL_6]] : index +// CHECK: %[[VAL_119:.*]] = arith.addi %[[VAL_118]], %[[VAL_5]] : index +// CHECK: %[[VAL_120:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_118]]] : memref +// CHECK: %[[VAL_121:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_119]]] : memref +// CHECK: %[[VAL_122:.*]] = arith.addi %[[VAL_91]], %[[VAL_3]] : index +// CHECK: %[[VAL_123:.*]]:5 = scf.while (%[[VAL_124:.*]] = %[[VAL_120]], %[[VAL_125:.*]] = %[[VAL_9]], %[[VAL_126:.*]] = %[[VAL_109]], %[[VAL_127:.*]] = %[[VAL_110]], %[[VAL_128:.*]] = %[[VAL_111]]) : (index, i1, i32, i1, tensor<6x6xi32, #{{.*}}>) -> (index, i1, i32, i1, tensor<6x6xi32, #{{.*}}>) { +// CHECK: %[[VAL_129:.*]] = arith.cmpi ult, %[[VAL_124]], %[[VAL_121]] : index +// CHECK: %[[VAL_130:.*]] = arith.andi %[[VAL_125]], %[[VAL_129]] : i1 +// CHECK: scf.condition(%[[VAL_130]]) %[[VAL_124]], %[[VAL_125]], %[[VAL_126]], %[[VAL_127]], %[[VAL_128]] : index, i1, i32, i1, tensor<6x6xi32, #{{.*}}> +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_131:.*]]: index, %[[VAL_132:.*]]: i1, %[[VAL_133:.*]]: i32, %[[VAL_134:.*]]: i1, %[[VAL_135:.*]]: tensor<6x6xi32, #{{.*}}>): +// CHECK: %[[VAL_136:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_131]]] : memref +// CHECK: %[[VAL_137:.*]] = arith.cmpi ult, %[[VAL_136]], %[[VAL_122]] : index +// CHECK: %[[VAL_138:.*]]:3 = scf.if %[[VAL_137]] -> (i32, i1, tensor<6x6xi32, #{{.*}}>) { +// CHECK: %[[VAL_139:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_131]]] : memref +// CHECK: %[[VAL_140:.*]] = arith.subi %[[VAL_139]], %[[VAL_91]] : index +// CHECK: %[[VAL_141:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_131]]] : memref +// CHECK: %[[VAL_142:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_116]], %[[VAL_140]]] : memref<3x3xi32> +// CHECK: %[[VAL_143:.*]] = arith.muli %[[VAL_141]], %[[VAL_142]] : i32 +// CHECK: %[[VAL_144:.*]] = arith.addi %[[VAL_133]], %[[VAL_143]] : i32 +// CHECK: scf.yield %[[VAL_144]], %[[VAL_9]], %[[VAL_135]] : i32, i1, tensor<6x6xi32, #{{.*}}> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_133]], %[[VAL_134]], %[[VAL_135]] : i32, i1, tensor<6x6xi32, #{{.*}}> +// CHECK: } {"Emitted from" = "slice"} +// CHECK: %[[VAL_145:.*]] = arith.addi %[[VAL_131]], %[[VAL_5]] : index +// CHECK: scf.yield %[[VAL_145]], %[[VAL_137]], %[[VAL_146:.*]]#0, %[[VAL_146]]#1, %[[VAL_146]]#2 : index, i1, i32, i1, tensor<6x6xi32, #{{.*}}> +// CHECK: } attributes {"Emitted from" = "linalg.generic"} +// CHECK: %[[VAL_147:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_148:.*]] = arith.addi %[[VAL_147]], %[[VAL_6]] : index +// CHECK: memref.store %[[VAL_148]], %[[VAL_18]]{{\[}}%[[VAL_5]]] : memref +// CHECK: scf.yield %[[VAL_149:.*]]#2, %[[VAL_9]], %[[VAL_149]]#4 : i32, i1, tensor<6x6xi32, #{{.*}}> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_109]], %[[VAL_110]], %[[VAL_111]] : i32, i1, tensor<6x6xi32, #{{.*}}> +// CHECK: } {"Emitted from" = "slice"} +// CHECK: %[[VAL_150:.*]] = arith.addi %[[VAL_107]], %[[VAL_5]] : index +// CHECK: scf.yield %[[VAL_150]], %[[VAL_113]], %[[VAL_151:.*]]#0, %[[VAL_151]]#1, %[[VAL_151]]#2 : index, i1, i32, i1, tensor<6x6xi32, #{{.*}}> +// CHECK: } attributes {"Emitted from" = "linalg.generic"} +// CHECK: %[[VAL_152:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_153:.*]] = arith.addi %[[VAL_152]], %[[VAL_6]] : index +// CHECK: memref.store %[[VAL_153]], %[[VAL_19]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_154:.*]] = scf.if %[[VAL_155:.*]]#3 -> (tensor<6x6xi32, #{{.*}}>) { +// CHECK: %[[VAL_156:.*]] = sparse_tensor.insert %[[VAL_155]]#2 into %[[VAL_155]]#4{{\[}}%[[VAL_35]], %[[VAL_91]]] : tensor<6x6xi32, #{{.*}}> +// CHECK: scf.yield %[[VAL_156]] : tensor<6x6xi32, #{{.*}}> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_157:.*]]#4 : tensor<6x6xi32, #{{.*}}> +// CHECK: } +// CHECK: memref.store %[[VAL_4]], %[[VAL_19]]{{\[}}%[[VAL_5]]] : memref +// CHECK: memref.store %[[VAL_4]], %[[VAL_18]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_158:.*]] = arith.cmpi ugt, %[[VAL_90]], %[[VAL_91]] : index +// CHECK: %[[VAL_159:.*]]:3 = scf.if %[[VAL_158]] -> (index, i1, index) { +// CHECK: %[[VAL_160:.*]] = arith.addi %[[VAL_91]], %[[VAL_5]] : index +// CHECK: scf.yield %[[VAL_90]], %[[VAL_89]], %[[VAL_160]] : index, i1, index +// CHECK: } else { +// CHECK: %[[VAL_161:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_4]]] : memref +// CHECK: %[[VAL_162:.*]]:2 = scf.for %[[VAL_163:.*]] = %[[VAL_6]] to %[[VAL_161]] step %[[VAL_6]] iter_args(%[[VAL_164:.*]] = %[[VAL_2]], %[[VAL_165:.*]] = %[[VAL_10]]) -> (index, i1) { +// CHECK: %[[VAL_166:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_163]]] : memref +// CHECK: %[[VAL_167:.*]] = arith.addi %[[VAL_163]], %[[VAL_5]] : index +// CHECK: %[[VAL_168:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_167]]] : memref +// CHECK: %[[VAL_169:.*]] = arith.cmpi ult, %[[VAL_166]], %[[VAL_168]] : index +// CHECK: %[[VAL_170:.*]] = scf.if %[[VAL_169]] -> (index) { +// CHECK: %[[VAL_171:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_166]]] : memref +// CHECK: %[[VAL_172:.*]] = arith.cmpi eq, %[[VAL_171]], %[[VAL_90]] : index +// CHECK: %[[VAL_173:.*]] = scf.if %[[VAL_172]] -> (index) { +// CHECK: %[[VAL_174:.*]] = arith.addi %[[VAL_166]], %[[VAL_5]] : index +// CHECK: memref.store %[[VAL_174]], %[[VAL_18]]{{\[}}%[[VAL_163]]] : memref +// CHECK: scf.yield %[[VAL_174]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_166]] : index +// CHECK: } +// CHECK: scf.yield %[[VAL_175:.*]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_166]] : index +// CHECK: } +// CHECK: %[[VAL_176:.*]] = arith.cmpi ult, %[[VAL_177:.*]], %[[VAL_168]] : index +// CHECK: %[[VAL_178:.*]] = scf.if %[[VAL_176]] -> (index) { +// CHECK: %[[VAL_179:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_177]]] : memref +// CHECK: scf.yield %[[VAL_179]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_164]] : index +// CHECK: } +// CHECK: %[[VAL_180:.*]] = arith.ori %[[VAL_176]], %[[VAL_165]] : i1 +// CHECK: %[[VAL_181:.*]] = arith.cmpi ult, %[[VAL_182:.*]], %[[VAL_164]] : index +// CHECK: %[[VAL_183:.*]] = arith.select %[[VAL_181]], %[[VAL_182]], %[[VAL_164]] : index +// CHECK: scf.yield %[[VAL_183]], %[[VAL_180]] : index, i1 +// CHECK: } +// CHECK: %[[VAL_184:.*]] = arith.addi %[[VAL_185:.*]]#0, %[[VAL_5]] : index +// CHECK: %[[VAL_186:.*]] = arith.subi %[[VAL_184]], %[[VAL_3]] : index +// CHECK: %[[VAL_187:.*]] = arith.cmpi uge, %[[VAL_184]], %[[VAL_3]] : index +// CHECK: %[[VAL_188:.*]] = arith.select %[[VAL_187]], %[[VAL_186]], %[[VAL_4]] : index +// CHECK: scf.yield %[[VAL_185]]#0, %[[VAL_185]]#1, %[[VAL_188]] : index, i1, index +// CHECK: } +// CHECK: %[[VAL_189:.*]] = arith.addi %[[VAL_91]], %[[VAL_5]] : index +// CHECK: %[[VAL_190:.*]] = arith.cmpi ugt, %[[VAL_191:.*]]#2, %[[VAL_189]] : index +// CHECK: %[[VAL_192:.*]] = arith.select %[[VAL_190]], %[[VAL_191]]#2, %[[VAL_189]] : index +// CHECK: %[[VAL_193:.*]] = arith.addi %[[VAL_192]], %[[VAL_3]] : index +// CHECK: %[[VAL_194:.*]] = arith.cmpi ule, %[[VAL_193]], %[[VAL_2]] : index +// CHECK: %[[VAL_195:.*]] = arith.andi %[[VAL_191]]#1, %[[VAL_194]] : i1 +// CHECK: scf.yield %[[VAL_195]], %[[VAL_191]]#0, %[[VAL_192]], %[[VAL_196:.*]] : i1, index, index, tensor<6x6xi32, #{{.*}}> +// CHECK: } attributes {"Emitted from" = "linalg.generic"} +// CHECK: memref.store %[[VAL_4]], %[[VAL_19]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_197:.*]] = arith.cmpi ugt, %[[VAL_34]], %[[VAL_35]] : index +// CHECK: %[[VAL_198:.*]]:3 = scf.if %[[VAL_197]] -> (index, i1, index) { +// CHECK: %[[VAL_199:.*]] = arith.addi %[[VAL_35]], %[[VAL_5]] : index +// CHECK: scf.yield %[[VAL_34]], %[[VAL_33]], %[[VAL_199]] : index, i1, index +// CHECK: } else { +// CHECK: %[[VAL_200:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_4]]] : memref +// CHECK: %[[VAL_201:.*]]:2 = scf.for %[[VAL_202:.*]] = %[[VAL_6]] to %[[VAL_200]] step %[[VAL_6]] iter_args(%[[VAL_203:.*]] = %[[VAL_2]], %[[VAL_204:.*]] = %[[VAL_10]]) -> (index, i1) { +// CHECK: %[[VAL_205:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_202]]] : memref +// CHECK: %[[VAL_206:.*]] = arith.addi %[[VAL_202]], %[[VAL_5]] : index +// CHECK: %[[VAL_207:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_206]]] : memref +// CHECK: %[[VAL_208:.*]] = arith.cmpi ult, %[[VAL_205]], %[[VAL_207]] : index +// CHECK: %[[VAL_209:.*]] = scf.if %[[VAL_208]] -> (index) { +// CHECK: %[[VAL_210:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_205]]] : memref +// CHECK: %[[VAL_211:.*]] = arith.cmpi eq, %[[VAL_210]], %[[VAL_34]] : index +// CHECK: %[[VAL_212:.*]] = scf.if %[[VAL_211]] -> (index) { +// CHECK: %[[VAL_213:.*]] = arith.addi %[[VAL_205]], %[[VAL_5]] : index +// CHECK: memref.store %[[VAL_213]], %[[VAL_19]]{{\[}}%[[VAL_202]]] : memref +// CHECK: scf.yield %[[VAL_213]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_205]] : index +// CHECK: } +// CHECK: scf.yield %[[VAL_214:.*]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_205]] : index +// CHECK: } +// CHECK: %[[VAL_215:.*]] = arith.cmpi ult, %[[VAL_216:.*]], %[[VAL_207]] : index +// CHECK: %[[VAL_217:.*]] = scf.if %[[VAL_215]] -> (index) { +// CHECK: %[[VAL_218:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_216]]] : memref +// CHECK: scf.yield %[[VAL_218]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_203]] : index +// CHECK: } +// CHECK: %[[VAL_219:.*]] = arith.ori %[[VAL_215]], %[[VAL_204]] : i1 +// CHECK: %[[VAL_220:.*]] = arith.cmpi ult, %[[VAL_221:.*]], %[[VAL_203]] : index +// CHECK: %[[VAL_222:.*]] = arith.select %[[VAL_220]], %[[VAL_221]], %[[VAL_203]] : index +// CHECK: scf.yield %[[VAL_222]], %[[VAL_219]] : index, i1 +// CHECK: } +// CHECK: %[[VAL_223:.*]] = arith.addi %[[VAL_224:.*]]#0, %[[VAL_5]] : index +// CHECK: %[[VAL_225:.*]] = arith.subi %[[VAL_223]], %[[VAL_3]] : index +// CHECK: %[[VAL_226:.*]] = arith.cmpi uge, %[[VAL_223]], %[[VAL_3]] : index +// CHECK: %[[VAL_227:.*]] = arith.select %[[VAL_226]], %[[VAL_225]], %[[VAL_4]] : index +// CHECK: scf.yield %[[VAL_224]]#0, %[[VAL_224]]#1, %[[VAL_227]] : index, i1, index +// CHECK: } +// CHECK: %[[VAL_228:.*]] = arith.addi %[[VAL_35]], %[[VAL_5]] : index +// CHECK: %[[VAL_229:.*]] = arith.cmpi ugt, %[[VAL_230:.*]]#2, %[[VAL_228]] : index +// CHECK: %[[VAL_231:.*]] = arith.select %[[VAL_229]], %[[VAL_230]]#2, %[[VAL_228]] : index +// CHECK: %[[VAL_232:.*]] = arith.addi %[[VAL_231]], %[[VAL_3]] : index +// CHECK: %[[VAL_233:.*]] = arith.cmpi ule, %[[VAL_232]], %[[VAL_2]] : index +// CHECK: %[[VAL_234:.*]] = arith.andi %[[VAL_230]]#1, %[[VAL_233]] : i1 +// CHECK: scf.yield %[[VAL_234]], %[[VAL_230]]#0, %[[VAL_231]], %[[VAL_235:.*]]#3 : i1, index, index, tensor<6x6xi32, #{{.*}}> +// CHECK: } attributes {"Emitted from" = "linalg.generic"} +// CHECK: %[[VAL_236:.*]] = sparse_tensor.load %[[VAL_237:.*]]#3 hasInserts : tensor<6x6xi32, #{{.*}}> +// CHECK: return %[[VAL_236]] : tensor<6x6xi32, #{{.*}}> +// CHECK: } +func.func @conv2d_all_sparse_CSR(%arg0: tensor<8x8xi32, #DCSR>, + %arg1: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> { + %0 = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> + %1 = linalg.generic { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "parallel", "reduction", "reduction"]} + ins(%arg0, %arg1 : tensor<8x8xi32, #DCSR>, tensor<3x3xi32>) + outs(%0 : tensor<6x6xi32, #DCSR>) { + ^bb0(%in: i32, %in_0: i32, %out: i32): + %2 = arith.muli %in, %in_0 : i32 + %3 = arith.addi %out, %2 : i32 + linalg.yield %3 : i32 + } -> tensor<6x6xi32, #DCSR> + return %1 : tensor<6x6xi32, #DCSR> +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_slice_based.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_slice_based.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_slice_based.mlir @@ -0,0 +1,81 @@ +// DEFINE: %{option} = "enable-index-reduction=true enable-runtime-library=false" +// DEFINE: %{command} = mlir-opt %s --sparse-compiler=%{option} | \ +// DEFINE: mlir-cpu-runner \ +// DEFINE: -e entry -entry-point-result=void \ +// DEFINE: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// DEFINE: FileCheck %s +// +// RUN: %{command} + +#map = affine_map<(d0, d1, d2, d3) -> (d0 + d2, d1 + d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d2, d3)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> + +#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }> + +module { + func.func @conv2d_all_sparse_CSR(%arg0: tensor<8x8xi32, #DCSR>, %arg1: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> { + %0 = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> + %1 = linalg.generic { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "parallel", "reduction", "reduction"]} + ins(%arg0, %arg1 : tensor<8x8xi32, #DCSR>, tensor<3x3xi32>) + outs(%0 : tensor<6x6xi32, #DCSR>) { + ^bb0(%in: i32, %in_0: i32, %out: i32): + %2 = arith.muli %in, %in_0 : i32 + %3 = arith.addi %out, %2 : i32 + linalg.yield %3 : i32 + } -> tensor<6x6xi32, #DCSR> + return %1 : tensor<6x6xi32, #DCSR> + } + + func.func @entry() { + %c0 = arith.constant 0 : index + %i0 = arith.constant 0 : i32 + + // A typical edge detection filter. + %filter = arith.constant dense<[ + [ 1, 0, -1 ], + [ 0, 0, 0 ], + [ -1, 0, 1 ] + ]> : tensor<3x3xi32> + + %input = arith.constant dense<[ + [ 1, 2, 3, 4, 0, 6, 7, 8 ], + [ 2, 2, 4, 4, 0, 0, 6, 8 ], + [ 2, 2, 4, 4, 0, 0, 6, 8 ], + [ 2, 2, 3, 4, 0, 0, 7, 8 ], + [ 1, 3, 3, 4, 0, 0, 6, 8 ], + [ 3, 2, 3, 4, 0, 0, 7, 8 ], + [ 1, 3, 3, 4, 3, 6, 6, 8 ], + [ 1, 3, 3, 4, 3, 0, 7, 8 ] + ]> : tensor<8x8xi32> + + %sparse_filter_CSR = sparse_tensor.convert %filter + : tensor<3x3xi32> to tensor<3x3xi32> + + %sparse_input_CSR = sparse_tensor.convert %input + : tensor<8x8xi32> to tensor<8x8xi32, #DCSR> + + %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %sparse_filter_CSR) + : (tensor<8x8xi32, #DCSR>, + tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> + + %out = sparse_tensor.convert %3 + : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> + // + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %v2 = vector.transfer_read %out[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v2 : vector<6x6xi32> + + return + } + +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_slice_based.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_slice_based.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_slice_based.mlir @@ -0,0 +1,97 @@ +// DEFINE: %{option} = "enable-index-reduction=true enable-runtime-library=false" +// DEFINE: %{command} = mlir-opt %s --sparse-compiler=%{option} | \ +// DEFINE: mlir-cpu-runner \ +// DEFINE: -e entry -entry-point-result=void \ +// DEFINE: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ +// DEFINE: FileCheck %s +// +// RUN: %{command} + +#CCC = #sparse_tensor.encoding<{ + dimLevelType = [ "compressed", "compressed", "compressed" ] +}> + +func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_3d_CCC(%arg0: tensor, %arg1: tensor) -> tensor { + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor + %ret = linalg.conv_3d + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %filter3D = call @alloc_3d_filled_f32(%c3, %c3, %c3, %val) : (index, index, index, f32) -> (tensor) + %in3D_tmp = call @alloc_3d_filled_f32(%c8, %c8, %c8, %val) : (index, index, index, f32) -> (tensor) + %in3D = tensor.insert %f10 into %in3D_tmp[%c0, %c3, %c0] : tensor + %out3D = call @alloc_3d_filled_f32(%c6, %c6, %c6, %zero) : (index, index, index, f32) -> (tensor) + + %in3D_CCC = sparse_tensor.convert %in3D + : tensor to tensor + %CCC_ret = call @conv_3d_CCC(%in3D_CCC, %filter3D) : (tensor, tensor) -> (tensor) + // CHECK: ( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) + %1 = sparse_tensor.convert %CCC_ret + : tensor to tensor + %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero + : tensor, vector<6x6x6xf32> + vector.print %v1 : vector<6x6x6xf32> + + // Free the resources + bufferization.dealloc_tensor %in3D : tensor + bufferization.dealloc_tensor %filter3D : tensor + + bufferization.dealloc_tensor %in3D_CCC : tensor + bufferization.dealloc_tensor %CCC_ret : tensor + + return +}