diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h --- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h @@ -435,6 +435,9 @@ lvlTypes[t][i] = dlt; loopToLvl[t][i] = lvl; lvlToLoop[t][lvl] = i; + // Maybe we should favor constant dimensions when there are multiple + // choices. + loopBounds[i] = std::make_pair(t, lvl); } /// Iterates over a set of `TensorLoopId`s, invoking the callback @@ -490,11 +493,6 @@ return loopToDependencies[loop(b)][tensor(b)].has_value(); } - // Return the defining [tid, dim] for the loop. - std::pair getLoopDefiningDim(unsigned l) const { - return loopBounds[l]; - } - /// Convenience getters to immediately access the stored nodes. /// Typically it is inadvisible to keep the reference around, as in /// `TensorExpr &te = merger.exp(e)`, since insertions into the merger diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h @@ -82,6 +82,8 @@ // d0 and d1 (for affine expression reduction). // If the list is empty, it means that there is no affine expression on the // input [tid, dim]. + // NOTE: the order of the returned list should be consistent with the + // topological order of the iteration graph. using DependentDimGetter = function_ref>(unsigned, unsigned)>; @@ -133,11 +135,8 @@ void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef lvls); - /// Exits the current loop sequence, this will reset universal index to 0. - void exitCurrentLoopSeq() { - assert(loopSeqStack.size() == loopStack.size() + 1); - loopSeqStack.pop_back(); - } + // exit the current loop sequence, this will reset universal index to 0. 
+ void exitCurrentLoopSeq(OpBuilder &builder, Location loc); // TODO: Get rid of `lvls` in the argument list? Track the level we // are currently at internally. Then it would be enterNextLvlForTensor. @@ -210,9 +209,13 @@ private: struct LoopInfo { - LoopInfo(ArrayRef tids, ArrayRef lvls, Operation *loop, - Block *userBlock, Value iv, StringAttr loopTag) - : tids(tids), lvls(lvls), loop(loop), userCodeBlock(userBlock), iv(iv) { + LoopInfo(ArrayRef tids, ArrayRef lvls, + ArrayRef slicedTids, ArrayRef slicedDims, + ArrayRef sliceResolved, Operation *loop, Block *userBlock, + Value iv, StringAttr loopTag) + : tids(tids), lvls(lvls), slicedTids(slicedTids), + slicedDims(slicedDims), sliceResolved(sliceResolved), loop(loop), + userCodeBlock(userBlock), iv(iv) { // Attached a special tag to loop emitter generated loop. if (loopTag) loop->setAttr(LoopEmitter::getLoopEmitterLoopAttrName(), loopTag); @@ -223,12 +226,37 @@ const llvm::SmallVector tids; // The corresponding levels for the tensors const llvm::SmallVector lvls; + // The tensors driven by slices in this loop (empty for non-slice loops) + const llvm::SmallVector slicedTids; + // The corresponding levels of the sliced tensors + const llvm::SmallVector slicedDims; + // Whether each sliced tensor was already fully resolved at loop entry + const llvm::SmallVector sliceResolved; const Operation *loop; // the loop operation Block *const userCodeBlock; // the block holding users' generated code. const Value iv; // the induction variable for the loop }; - /// Linearizes address for dense level (i.e., p = (i * d0) + j). 
+ struct SliceInfo { + SliceInfo(Value baseSlice, Value minCoord, Value offset, Value isNonEmpty, + std::optional slicedOnLvl, unsigned depth) + : baseSlice(baseSlice), minCoord(minCoord), offset(offset), + isNonEmpty(isNonEmpty), slicedOnLvl(slicedOnLvl), depth(depth) { + assert(!slicedOnLvl || minCoord); + } + + // Whether this is the initial tensor, i.e., no level has been sliced yet + bool isInitialTensor() const { return !slicedOnLvl.has_value(); } + + Value baseSlice; // the current slice being reduced + Value minCoord; // the minimal coordinates of the slice on lvl. + Value offset; // the offset of the current slice. + Value isNonEmpty; // whether the slice is non-empty. + std::optional slicedOnLvl; // the level on which the slice is done + unsigned depth; // the depth (relative to dependentDimMap[tid][lvl]). + }; + + /// Linearizes address for dense dimension (i.e., p = (i * d0) + j). Value genAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl, Value iv); @@ -278,6 +306,11 @@ ArrayRef tids, ArrayRef lvls); + Operation *emitForLoopOverTensorAtDim(OpBuilder &builder, Location loc, + size_t tid, size_t dim, + MutableArrayRef reduc, + bool isParallel); + /// Exits a for loop, returns the reduction results, e.g., /// For sequential for loops: /// %ret = for () { @@ -341,6 +374,68 @@ return {dstLvl}; } + // + // Slice-driven loop related methods. + // + + /// Retrieves the most recent slice on lvl. To reduce an affine expression like + /// d0 + d1 + d2, we need two slices (one of size d1 + d2, and the other of + /// size d2). This method returns the latter slice (of size d2), which is + /// also the final slice on the level. + SliceInfo &getFinalSliceOnLvl(size_t tid, size_t lvl); + + /// Gets the total number of constraints that are needed to fully resolve the + /// dependent dimensions on tensor[tid]. + size_t sliceTotalConstraints(size_t tid); + + /// Whether the tid is fully resolved, i.e., all the dependent dimensions are + /// reduced by slices offsets. 
+ bool sliceFullyResolved(size_t tid); + + /// Generates a whileOp to iterate over a subset of coordinates on tid on lvl + /// using the pHi and pLo provided, the loop breaks on the first coordinate + /// that exceeds the slice boundary (i.e., coord >= slice.offset + + /// slice.size). + std::pair + genSliceLvlTraverseLoop(OpBuilder &builder, Location loc, Value pLo, + Value pHi, Value offset, size_t tid, size_t lvl, + size_t depth, ValueRange userReduc, bool genYield, + /*bodyBuilder=*/ + llvm::function_ref)>); + + /// Generates a nested loop that iterates over tid on all the coordinates on + /// lvl. + ValueRange genSliceAllLvlTraverseLoop( + OpBuilder &builder, Location loc, Value offset, size_t tid, size_t lvl, + size_t depth, ValueRange userReduc, + /*bodyBuilder=*/ + llvm::function_ref)>); + + /// Generates code to get the first non-empty slice of tid on lvl. + /// Returns true if the slice has already been fully resolved. + bool genSliceBegin(OpBuilder &builder, Location loc, size_t tid, size_t lvl); + + /// Generates code to get the next non-empty slice of tid on lvl. + void genSliceNextInduction(OpBuilder &builder, Location loc, + const Operation *whileOp, size_t tid, size_t lvl, + SmallVectorImpl &operands, + unsigned &retIdx); + + /// Generates a slice-driven while loop as follows. + /// + /// curSlice = getFirstNonEmptySlice(tensor). + /// + /// while(isNonEmpty) { + /// ..user code.. + /// isNonEmpty, curSlice = getNextNonEmptySlice(curSlice) + /// } + Operation *emitSliceDrivenLoopOverTensorAtDim(OpBuilder &builder, + Location loc, size_t tid, + size_t lvl, + MutableArrayRef reduc); + /// A optional string attribute that should be attached to the loop /// generated by loop emitter, it might help following passes to identify /// loops that operates on sparse tensors more easily. @@ -385,6 +480,10 @@ std::vector> coordinatesBuffers; // to_coordinates std::vector valBuffer; // to_value + // + // Slice-driven loops related fields. 
+ // + /// Whether the sparse input is a slice. std::vector isSparseSlices; /// Values related to slices. @@ -396,6 +495,21 @@ std::vector>>> dependentDimMap; + // The cached pointer buffers for the slices, they serve the same purpose as + // ptrBuffer for compressed dimensions. But they always start with the first + // pidx pointing to coord > slice.offset to avoid iteration from the + // beginning. + std::vector>> slicePtrBuffer; + + // The cached size of each slice. + std::vector>> sliceSizes; + + // The number of resolved constraints so far. + std::vector sliceResolvedConstraints; + + // sliceStack[tid] holds the generated slice stack on tid. + std::vector> sliceStack; + // // View based reshape related-fields and methods // @@ -416,9 +530,12 @@ /// alive. std::vector loopStack; - /// Loop Sequence Stack, stores the universal index for the current loop - /// sequence. - std::vector loopSeqStack; + // Loop Sequence Stack, stores the universal index for the current loop + // sequence, and the (tid, lvl, fullyResolved) entries sliced in that sequence. + // TODO: maybe we should have a LoopSeqInfo + std::vector< + std::pair>>> + loopSeqStack; /// Maps `LoopId` (used by `AffineDimExpr`) to `LoopOrd` (in the `loopStack`). /// TODO: We should probably use a callback function here to make it more diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp @@ -25,16 +25,28 @@ // File local helper functions. //===----------------------------------------------------------------------===// -/// Generates a position/coordinate load from the sparse storage scheme. -/// Narrower data types need to be zero extended before casting the -/// value into the `Index` type used for looping and indexing. 
-static Value genIndexLoad(OpBuilder &builder, Location loc, Value mem, +#define CMPI(p, l, r) \ + (builder.create(loc, arith::CmpIPredicate::p, l, r) \ + .getResult()) + +/// Extracts a corresponding vector of type from a ValueRange. +static SmallVector getTypesFromValues(ValueRange vs) { + SmallVector ret; + for (auto v : vs) + ret.push_back(v.getType()); + return ret; +} + +/// Generates a pointer/index load from the sparse storage scheme. Narrower +/// data types need to be zero extended before casting the value into the +/// index type used for looping and indexing. +static Value genIndexLoad(OpBuilder &builder, Location loc, Value ptr, Value s) { // For the scalar case, we simply zero extend narrower indices into 64-bit // values before casting to index without a performance penalty. Here too, // however, indices that already are 64-bit, in theory, cannot express the // full range as explained above. - Value load = builder.create(loc, mem, s); + Value load = builder.create(loc, ptr, s); if (!load.getType().isa()) { if (load.getType().getIntOrFloatBitWidth() < 64) load = builder.create(loc, builder.getI64Type(), load); @@ -85,6 +97,59 @@ return std::make_pair(crd, rem); } +/// Helper method to generate a tensor.extract_slice operation with the given +/// offset and size on dim; all other dims keep offset 0, stride 1, full size. +static Value genExtractSliceWithOffsetOnDim(OpBuilder &builder, Location loc, + Value src, Value dynOffset, + Value sz, unsigned dim) { + + RankedTensorType srcTp = src.getType().cast(); + int rank = srcTp.getRank(); + + SmallVector offsets(rank, 0); + SmallVector strides(rank, 1); + SmallVector sizes(srcTp.getShape()); + SmallVector dynSizes; + + offsets[dim] = ShapedType::kDynamic; + sizes[dim] = ShapedType::kDynamic; + + auto srcEncoding = getSparseTensorEncoding(srcTp); + SmallVector sliceAttrs; + + for (unsigned i = 0, e = sizes.size(); i < e; i++) { + // Infers the slice attribute array (sets offset/size to dynamic on the + // slicing dimension); 64-bit locals avoid narrowing the shape values. + int64_t offset = i == dim ? 
SparseTensorDimSliceAttr::kDynamic : 0; + int64_t size = (i == dim || ShapedType::isDynamic(sizes[i])) + ? SparseTensorDimSliceAttr::kDynamic + : sizes[i]; + sliceAttrs.push_back(SparseTensorDimSliceAttr::get(srcTp.getContext(), + /*offset=*/offset, + /*size=*/size, + /*stride=*/1)); + if (ShapedType::isDynamic(sizes[i])) { + if (dim == i) + dynSizes.push_back(sz); + else + dynSizes.push_back(linalg::createOrFoldDimOp(builder, loc, src, i)); // size of dim `i` itself, not of the sliced `dim` + } + } + + // Keeps original encodings but attaches slice attribute. + auto encoding = SparseTensorEncodingAttr::get( + srcTp.getContext(), srcEncoding.getDimLevelType(), + srcEncoding.getDimOrdering(), srcEncoding.getHigherOrdering(), + srcEncoding.getPosWidth(), srcEncoding.getCrdWidth(), sliceAttrs); + + auto retTp = RankedTensorType::get(sizes, srcTp.getElementType(), encoding); + return builder + .create(loc, retTp, src, ValueRange{dynOffset}, + dynSizes, ValueRange{}, offsets, sizes, + strides) + .getResult(); +} + std::pair LoopEmitter::genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd, TensorId tid, Level lvl) { @@ -102,21 +167,20 @@ // First, coord >= offset (skip the check if offset is known to be 0). if (auto staticOffset = enc.getStaticLvlSliceOffset(lvl); !(staticOffset.has_value() && *staticOffset == 0)) { - auto geOffset = builder.create( - loc, arith::CmpIPredicate::uge, crd, offset); + // First, coord >= offset (skip the check if offset is known to be 0). + auto geOffset = CMPI(uge, crd, offset); conds.push_back(geOffset); } // Second, coord_in_slice < length - auto ltLength = builder.create(loc, arith::CmpIPredicate::ult, - newCrd, lvlSizes[tid][lvl]); + auto ltLength = CMPI(ult, newCrd, lvlSizes[tid][lvl]); conds.push_back(ltLength); // Third, rem == 0 (skip the check if stride is known to be 1). 
if (auto staticStride = enc.getStaticLvlSliceStride(lvl); !(staticStride.has_value() && *staticStride == 1)) { - auto fitStride = builder.create( - loc, arith::CmpIPredicate::eq, crdRem, constantIndex(builder, loc, 0)); + // Third, rem == 0 (skip the check if stride is known to be 1). + auto fitStride = CMPI(eq, crdRem, constantIndex(builder, loc, 0)); conds.push_back(fitStride); } @@ -247,6 +311,11 @@ // TODO: dim or level? this->dependentDimMap.assign( numTensors, std::vector>>()); + this->slicePtrBuffer.assign(tensors.size(), + std::vector>()); + this->sliceSizes.assign(tensors.size(), std::vector>()); + this->sliceStack.assign(tensors.size(), std::vector()); + this->sliceResolvedConstraints.assign(tensors.size(), 0); // Initialize nested types of `TensorId`-indexed fields. for (TensorId tid = 0; tid < numTensors; tid++) { @@ -290,11 +359,25 @@ sliceOffsets[tid].assign(lvlRank, Value()); sliceStrides[tid].assign(lvlRank, Value()); + // Slice-driven loops related initialization. dependentDimMap[tid].assign(lvlRank, std::vector>()); - if (dimGetter) - for (Level l = 0; l < lvlRank; l++) - dependentDimMap[tid][l] = dimGetter(tid, l); + slicePtrBuffer[tid].assign(lvlRank, std::vector()); + sliceSizes[tid].assign(lvlRank, std::vector()); + sliceStack[tid].emplace_back(tensors[tid], /*minCoord=*/Value(), + /*offset=*/Value(), /*isNonEmpty*/ Value(), + std::nullopt, 0); + if (dimGetter) { + for (unsigned i = 0; i < lvlRank; i++) { + dependentDimMap[tid][i] = dimGetter(tid, i); + unsigned depends = dependentDimMap[tid][i].size(); + if (depends != 0) { + // We need depends - 1 slices to fully resolve the affine expression. + slicePtrBuffer[tid][i].assign(depends - 1, nullptr); + sliceSizes[tid][i].assign(depends - 1, nullptr); + } + } + } } // Construct the inverse of the `topSort` from the sparsifier. @@ -393,6 +476,74 @@ // some loop preparation from tensor iteration, but will also (undesirably) // hoist the code ouside if-conditions. 
} + + Type indexType = builder.getIndexType(); + Value c0 = constantZero(builder, loc, indexType); + Value c2 = constantIndex(builder, loc, 2); + // TODO: We should probably use integer with pointer bitwidth for the cache. + MemRefType cacheTp = MemRefType::get({ShapedType::kDynamic}, indexType); + // Generate caches required to fast compute next-non-empty slices with + // increasing offset for slice-based loop. + // We need to start a separate loop here because the cache size depends on the + // dimension size computed in the above loops. + for (size_t t = 0, e = tensors.size(); t < e; t++) { + auto rtp = tensors[t].getType().dyn_cast(); + if (!rtp) + continue; + + // for a pair of [pLo, pHi]. Note that we can not compress pHi because slice + // creates segments in the index buffer so that the pHi for the current dim + // is no longer the pLo for the next dim. + Value pIdxSize = c2; + auto rank = rtp.getRank(); + for (unsigned lvl = 0; lvl < rank; lvl++) { + if (!dependentDimMap[t][lvl].empty()) { + // Needs at least two operands to form a non-trivial affine expression. + ArrayRef> dependedDim = + dependentDimMap[t][lvl]; + assert(dependedDim.size() > 1); + + Value size = c0; + for (unsigned e = dependedDim.size() - 1; e >= 1; e--) { + auto [dt, dd] = dependedDim[e]; + size = builder.create(loc, size, lvlSizes[dt][dd]); + sliceSizes[t][lvl][e - 1] = size; + } + + // No cache for dense level, they can be simply increased by one. + auto dlt = lvlTypes[t][lvl]; + + if (!isDenseDLT(dlt)) { + llvm::for_each(slicePtrBuffer[t][lvl], [cacheTp, pIdxSize, c2, loc, + &builder](Value &cache) { + cache = builder.create( + loc, cacheTp, + // Additional two metadata {memSize, idx} at head. + builder.create(loc, pIdxSize, c2).getResult()); + }); + } + } + + // Accumulates the size required to cache the pLo for the slice; this runs for every level (sliced or not), so it sits outside the check above. + // E.g., if we want to cache the pIdx for slice on the second + // level. We at most need to a memref. 
+ // NOTE: this is apparently an over-approximation when the previous + // level is compressed, and we can compute a precise memory size + // inside the loops. But that would also require us to allocate/free + // memory in loops. + // TODO: Maybe using AllocaScopeOp inside the loop to resolve the issue? + if (!dependentDimMap[t][lvl].empty()) { + auto [dt, dd] = dependentDimMap[t][lvl].back(); + pIdxSize = + builder.create(loc, pIdxSize, lvlSizes[dt][dd]); + } else { + // This level does not need to be sliced, the final size of the slice + // on the level will be the same as the current size. + pIdxSize = + builder.create(loc, pIdxSize, lvlSizes[t][lvl]); + } + } + } } void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc, @@ -400,12 +551,49 @@ ArrayRef lvls) { // TODO: sort assert(loopSeqStack.size() == loopStack.size()); - // Universal Index starts from 0. - loopSeqStack.emplace_back(constantIndex(builder, loc, 0)); // Prepares for all the tensors used in the current loop sequence. - assert(tids.size() == lvls.size()); - for (auto [tid, lvl] : llvm::zip(tids, lvls)) - prepareLoopOverTensorAtLvl(builder, loc, tid, lvl); + std::vector> slicedTids; + for (auto [tid, lvl] : llvm::zip(tids, lvls)) { + if (!dependentDimMap[tid][lvl].empty()) { + bool fullyRes = genSliceBegin(builder, loc, tid, lvl); + slicedTids.emplace_back(tid, lvl, fullyRes); + } else { + prepareLoopOverTensorAtLvl(builder, loc, tid, lvl); + } + } + + // Universal Index starts from 0. + loopSeqStack.emplace_back(constantIndex(builder, loc, 0), + std::move(slicedTids)); +} + +void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) { + assert(loopSeqStack.size() == loopStack.size() + 1); + + const std::vector> &slicedTids = + loopSeqStack.back().second; + + // Pop out outdated slices. 
+ for (auto [tid, lvl, res] : slicedTids) { + if (!res) { + assert(sliceStack[tid].back().slicedOnLvl == lvl); + sliceStack[tid].pop_back(); + // There is an additional item in sliceStack for the input tensor. + assert(sliceResolvedConstraints[tid] + 1 == sliceStack[tid].size()); + } else { + Value c1 = constantIndex(builder, loc, 1); + Value c2 = constantIndex(builder, loc, 2); + + // pIdx += 2, we finished the current lvl, advance the pointer index of + // the previous level by two to skip the [pLo, pHi] for current level. + // TODO: we could probably use an SSA value for it. + Value sPtrBuf = slicePtrBuffer[tid][lvl].back(); + Value curP = genIndexLoad(builder, loc, sPtrBuf, c1); + Value nexP = builder.create(loc, curP, c2); + builder.create(loc, nexP, sPtrBuf, c1); + } + } + loopSeqStack.pop_back(); } Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) { @@ -439,44 +627,23 @@ } } -Operation *LoopEmitter::enterLoopOverTensorAtLvl( - OpBuilder &builder, Location loc, ArrayRef tids, - ArrayRef lvls, MutableArrayRef reduc, bool isParallel) { - // TODO: support multiple return on parallel for? - assert(!isParallel || reduc.size() <= 1); - bool isSparseInput = false; - TensorId tid = tids.front(); - Level dstLvl = lvls.front(); - assert(tids.size() == lvls.size()); - for (auto [t, l] : llvm::zip(tids, lvls)) { - // TODO: this check for validity of the (t,l) pairs should be - // checked/enforced at the callsites, if possible. - assert(t < lvlTypes.size() && l < lvlTypes[t].size()); - assert(!coords[t][l]); // We cannot re-enter the same level - const auto lvlTp = lvlTypes[t][l]; - const bool isSparse = isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp); - // Must be a recognizable level-type. - assert(isSparse || isDenseDLT(lvlTp)); - // We can at most have one sparse input, otherwise, a while loop is required - // to co-iterate multiple sparse tensors. 
- assert(!isSparseInput || !isSparse); - if (isSparse) { - tid = t; - dstLvl = l; - } - isSparseInput = isSparseInput || isSparse; - } +Operation *LoopEmitter::emitForLoopOverTensorAtDim(OpBuilder &builder, + Location loc, size_t tid, + size_t dstLvl, + MutableArrayRef reduc, + bool isParallel) { + bool isSparseCond = isCompressedDLT(lvlTypes[tid][dstLvl]) || + isSingletonDLT(lvlTypes[tid][dstLvl]); const auto reassoc = getCollapseReassociation(tid, dstLvl); // TODO: support dynamic slices. - // Use the first source-level here to build the loop bound (which is - // also the biggest range). + // Uses the first dimension here to build the loop bound (which is also the + // biggest range). const Level srcLvl = reassoc.front(); - const Value step = constantIndex(builder, loc, 1); - /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header. - const Value lo = isSparseInput ? posits[tid][srcLvl] // current position - : loopSeqStack.back(); // universal index - const Value hi = highs[tid][srcLvl]; + Value step = constantIndex(builder, loc, 1); + Value lo = isSparseCond ? posits[tid][srcLvl] // current offset + : loopSeqStack.back().first; // universal index + Value hi = highs[tid][srcLvl]; Operation *loop = nullptr; Value iv; @@ -512,7 +679,7 @@ assert(loop && iv); Value crd; - if (isSparseInput) { + if (isSparseCond) { assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType())); // For COO, the position is the same across consecutive levels. /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header. @@ -524,7 +691,7 @@ crd = iv; } - if (isSparseSlices[tid] && isSparseInput) { + if (isSparseSlices[tid] && isSparseCond) { // For sparse level slices, we need to filter out invalid coordinates that // are not included in the slice. 
SmallVector types; @@ -553,15 +720,107 @@ } assert(crd); - coords[tid][srcLvl] = crd; - // NOTE: we can also prepare for next level here in advance - // Push the loop into stack - loopStack.emplace_back(ArrayRef(tid), ArrayRef(srcLvl), loop, - builder.getInsertionBlock(), crd, loopTag); + coords[tid][dstLvl] = crd; + return loop; +} + +Operation *LoopEmitter::enterLoopOverTensorAtLvl( + OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef lvls, MutableArrayRef reduc, bool isParallel) { + // TODO: support multiple return on parallel for? + assert(!isParallel || reduc.size() <= 1); + bool isSparseCond = false, isSliceCond = false; + size_t tid = tids.front(), lvl = lvls.front(); + + for (auto [t, d] : llvm::zip(tids, lvls)) { + assert(lvlTypes[t].size() > d); // Must be a valid tid, dim pair + assert(!coords[t][d] || // We cannot re-enter the same level + !dependentDimMap[t][d].empty()); // unless it is a slice-driver loop + auto dimType = lvlTypes[t][d]; + // Must be a recognizable DLT. + assert(isDenseDLT(dimType) || isCompressedDLT(dimType) || + isSingletonDLT(dimType)); + + // This is a slice-driven loop. + if (!dependentDimMap[t][d].empty()) { + assert(!isSliceCond && !isSparseCond); + isSliceCond = true; + tid = t; + lvl = d; + continue; + } + + bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType); + // We can at most have one sparse input, otherwise, a while loop is + // required to co-iterate multiple sparse tensors. + assert(!isSparseCond || !isSparse); + assert(!isSliceCond || !isSparseCond); + if (isSparse) { + tid = t; + lvl = d; + } + isSparseCond = isSparseCond || isSparse; + } + + // if the slice is fully reduced, we can now use TACO-based algorithm to + // iterate it. 
+ Operation *l = nullptr; + if (isSliceCond) { + bool fullyResolved = sliceFullyResolved(tid); + if (!fullyResolved) { + l = emitSliceDrivenLoopOverTensorAtDim(builder, loc, tid, lvl, reduc); + } else { + const SliceInfo &info = getFinalSliceOnLvl(tid, lvl); + Value offset = info.offset; + unsigned depth = info.depth - 1; + Operation *insertPoint = nullptr; + // TODO: we should generalize the method to support iteration over for + // normal slices as well to allow early break. + l = genSliceLvlTraverseLoop( + builder, loc, posits[tid][lvl], highs[tid][lvl], offset, tid, lvl, + depth, reduc, + /*genYield=*/false, // unaware of the yield values from user yet + [this, tid, lvl, reduc, offset, + &insertPoint](OpBuilder &builder, Location loc, Value iv, + MutableArrayRef innerReduc) { + assert(innerReduc.size() == reduc.size()); + // Updates users' reduction variable inplace + for (unsigned i = 0, e = reduc.size(); i < e; i++) + reduc[i] = innerReduc[i]; + // Loads the coordinates. + Value absC = genIndexLoad(builder, loc, + coordinatesBuffers[tid][lvl], iv); + + // We need to substract the offset to get relative coordinates. + // TODO: how to assert relC >=0 during runtime? + insertPoint = builder.create(loc, absC, offset); + posits[tid][lvl] = iv; + coords[tid][lvl] = insertPoint->getResult(0); + }) + .first; + // We did not finish the loop body, reset the insertion point and delegate + // to user. + builder.setInsertionPointAfter(insertPoint); + } + // NOTE: we can also prepare for next dim here in advance + // Pushes the loop into stack. + loopStack.emplace_back( + ArrayRef(), ArrayRef(), ArrayRef(tid), + ArrayRef(lvl), ArrayRef(fullyResolved), l, + builder.getInsertionBlock(), coords[tid][lvl], loopTag); + } else { + l = emitForLoopOverTensorAtDim(builder, loc, tid, lvl, reduc, isParallel); + // NOTE: we can also prepare for next dim here in advance + // Pushes the loop into stack. 
+ loopStack.emplace_back(ArrayRef(tid), ArrayRef(lvl), + ArrayRef(), ArrayRef(), + ArrayRef(), l, builder.getInsertionBlock(), + coords[tid][lvl], loopTag); + } + // Emit extra locals. emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tids, lvls); - - return loop; + return l; } Operation *LoopEmitter::enterFilterLoopOverTensorAtLvl( @@ -598,8 +857,7 @@ // Generate an if-condition to filter out coordinates that are not // equal to the result of the affine expression. Value expected = genAffine(builder, loc, affine); - auto pred = builder.create(loc, arith::CmpIPredicate::eq, crd, - expected); + auto pred = CMPI(eq, coords[tid][lvl], expected); SmallVector types; for (Value red : reduc) { types.push_back(red.getType()); @@ -625,8 +883,10 @@ // NOTE: we can also prepare for next lvl here in advance // Push the loop into stack - loopStack.emplace_back(ArrayRef(tid), ArrayRef(lvl), forOp, - builder.getInsertionBlock(), crd, nullptr); + loopStack.emplace_back(ArrayRef(tid), ArrayRef(lvl), + ArrayRef(), ArrayRef(), + ArrayRef(), forOp, builder.getInsertionBlock(), + coords[tid][lvl], nullptr); return forOp; } @@ -642,14 +902,18 @@ Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls( OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef lvls, bool needsUniv, MutableArrayRef reduc) { + // NOTE: make sure that the slice driven tensor-related reduction variable + // appears first than normal tensors. assert(tids.size() == lvls.size()); SmallVector types; SmallVector operands; // Construct the while-loop with a parameter for each coordinate. - const Type indexType = builder.getIndexType(); + Type indexType = builder.getIndexType(); for (auto [tid, lvl] : llvm::zip(tids, lvls)) { - const auto lvlTp = lvlTypes[tid][lvl]; - if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) { + // TODO: support coiteration with slice driven tensors. 
+ assert(dependentDimMap[tid][lvl].empty() && "TODO: not yet implemented"); + if (isCompressedDLT(lvlTypes[tid][lvl]) || + isSingletonDLT(lvlTypes[tid][lvl])) { const auto reassoc = getCollapseReassociation(tid, lvl); for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) { if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) { @@ -672,7 +936,7 @@ if (needsUniv) { types.push_back(indexType); // Update universal index. - operands.push_back(loopSeqStack.back()); + operands.push_back(loopSeqStack.back().first); } assert(types.size() == operands.size()); scf::WhileOp whileOp = builder.create(loc, types, operands); @@ -701,8 +965,7 @@ Value op1 = before->getArgument(o); // We used the first level bound as the bound the collapsed set of levels. Value op2 = highs[tid][reassoc.front()]; - Value opc = builder.create(loc, arith::CmpIPredicate::ult, - op1, op2); + Value opc = CMPI(ult, op1, op2); cond = cond ? builder.create(loc, cond, opc) : opc; // Update positions Value pos = after->getArgument(o++); @@ -777,9 +1040,9 @@ if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) { const auto crd = coords[tid][lvl]; if (min) { - Value cmp = builder.create( - loc, arith::CmpIPredicate::ult, crd, min); - min = builder.create(loc, cmp, crd, min); + Value cmp = CMPI(ult, coords[tid][lvl], min); + min = + builder.create(loc, cmp, coords[tid][lvl], min); } else { min = crd; } @@ -792,8 +1055,9 @@ } // Sets up the loop stack. - loopStack.emplace_back(tids, lvls, whileOp, builder.getInsertionBlock(), min, - loopTag); + loopStack.emplace_back(tids, lvls, ArrayRef(), ArrayRef(), + ArrayRef(), whileOp, builder.getInsertionBlock(), + min, loopTag); assert(loopStack.size() == loopSeqStack.size()); for (auto [tid, dstLvl] : llvm::zip(tids, lvls)) { @@ -1015,6 +1279,7 @@ auto whileOp = llvm::cast(loopInfo.loop); builder.setInsertionPointToEnd(loopInfo.userCodeBlock); Value iv = loopInfo.iv; + // Finalize the induction. 
Note that the induction could be performed // in the individual if-branches to avoid re-evaluating the conditions. // However, that would result in a rather elaborate forest of yield @@ -1022,6 +1287,34 @@ // after the if-statements more closely resembles code generated by TACO. unsigned o = 0; SmallVector operands; + unsigned delta = 0; + for (auto [tid, dim, resolved] : llvm::zip( + loopInfo.slicedTids, loopInfo.slicedDims, loopInfo.sliceResolved)) { + if (!resolved) { + genSliceNextInduction(builder, loc, whileOp, tid, dim, operands, o); + sliceResolvedConstraints[tid]--; + } else { + // TODO: We need to distinguish coiterate loop with slice-driven loop and + // fully reduced while op for iterating one slices. + // since we didn't implement coiteration, this must be iteration just + // on fully resolved slice. + assert(loopInfo.slicedTids.size() == 1 && loopInfo.tids.empty()); + // The if guard to filter out out-range coordinates. + assert(llvm::isa(builder.getInsertionBlock()->getParentOp())); + posits[tid][dim] = whileOp->getResult(o++); + // FIXME: we are not using continue here since we do not support + // coiteration on slices. But it need to be treated similarly as the + // universal index. + o++; // skip continue flag. + // Since we did not push two results from whileOp. The size of the + // operands vector is smaller than the actual number of return values from + // the whileOp. + // It is because we are actually generate yield in the IfOp inside the + // whileOp to only iterates over inbound coordinates within the slices. 
+ delta += 2; + } + }; + Value one = constantIndex(builder, loc, 1); for (auto [tid, dstLvl] : llvm::zip(loopInfo.tids, loopInfo.lvls)) { const auto lvlTp = lvlTypes[tid][dstLvl]; @@ -1037,8 +1330,7 @@ } const Value crd = coords[tid][dstLvl]; const Value pos = posits[tid][dstLvl]; - Value cmp = - builder.create(loc, arith::CmpIPredicate::eq, crd, iv); + Value cmp = CMPI(eq, crd, iv); // If the loop contains a coiteration with non-unique level, we fast // forward all the duplicated coords by setting the position to the // segment high. @@ -1073,15 +1365,15 @@ } // An (optional) universal index. - if (operands.size() < whileOp.getNumResults()) { - assert(operands.size() + 1 == whileOp.getNumResults()); + if (operands.size() + delta < whileOp.getNumResults()) { + assert(operands.size() + delta + 1 == whileOp.getNumResults()); // The last one is the universial index. operands.push_back(builder.create(loc, iv, one)); // update the loop starting point of current loop sequence - loopSeqStack.back() = whileOp->getResult(o++); + loopSeqStack.back().first = whileOp->getResult(o++); } - assert(o == operands.size()); + assert(o == operands.size() + delta); builder.create(loc, operands); builder.setInsertionPointAfter(whileOp); } @@ -1102,3 +1394,559 @@ assert(loopStack.size() == loopSeqStack.size()); loopStack.pop_back(); } + +//===----------------------------------------------------------------------===// +// Slice-driven loop related methods. 
+//===----------------------------------------------------------------------===// + +LoopEmitter::SliceInfo &LoopEmitter::getFinalSliceOnLvl(size_t tid, + size_t lvl) { + for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie; + it++) { + if (it->slicedOnLvl == lvl) { + assert(it->depth == dependentDimMap[tid][lvl].size() - 1); + return *it; + } + } + + llvm_unreachable("Failed to find sliceInfo"); +} + +size_t LoopEmitter::sliceTotalConstraints(size_t tid) { + size_t numConstraints = 0; + for (const auto &lvlDeps : dependentDimMap[tid]) { + if (!lvlDeps.empty()) { + assert(lvlDeps.size() >= 2); + numConstraints += lvlDeps.size() - 1; + } + } + return numConstraints; +} + +bool LoopEmitter::sliceFullyResolved(size_t tid) { + return sliceTotalConstraints(tid) == sliceResolvedConstraints[tid]; +} + +std::pair LoopEmitter::genSliceLvlTraverseLoop( + OpBuilder &builder, Location loc, Value loopLo, Value loopHi, Value offset, + size_t tid, size_t lvl, size_t depth, ValueRange userReduc, bool genYield, + llvm::function_ref)> + bodyBuilder) { + Value c1 = constantIndex(builder, loc, 1); + Value sliceHi = + builder.create(loc, offset, sliceSizes[tid][lvl].back()); + + SmallVector reduc = { + loopLo, // loop lower bounds + constantI1(builder, loc, true), // continue + }; + // Append user required reduction value. + reduc.append(userReduc.begin(), userReduc.end()); + SmallVector types = getTypesFromValues(reduc); + + scf::WhileOp whileOp = builder.create( + loc, types, reduc, + /*beforeBuilder=*/ + [loopHi](OpBuilder &builder, Location loc, ValueRange args) { + Value lo = args[0]; + Value cont = args[1]; + Value inBound = CMPI(ult, lo, loopHi); + Value cond = builder.create(loc, cont, inBound); + // continue if not yet break nor out of bound. 
+ builder.create(loc, cond, args); + }, + /*afterBuilder=*/ + [this, c1, tid, lvl, sliceHi, genYield, + bodyBuilder](OpBuilder &builder, Location loc, ValueRange args) { + Value iv = args[0]; + Value coord = + genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv); + // If coord < sliceHi + Value cont = CMPI(ult, coord, sliceHi); + + SmallVector types = getTypesFromValues(args.drop_front(2)); + auto ifOp = builder.create(loc, types, cont, true); + { + // 2 reduction variable maintained by us. + SmallVector ifRet = args.drop_front(2); + assert(ifRet.size() == args.size() - 2); + + OpBuilder::InsertionGuard guard(builder); + // If not in slice. + // Break the while loop (by setting continue to false) + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + builder.create(loc, ifRet); + + // If this is a legit coordinates in slice + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + bodyBuilder(builder, loc, iv, ifRet); + if (genYield) { + builder.setInsertionPointToEnd(&ifOp.getThenRegion().front()); + builder.create(loc, ifRet); + } + } + // Marks this speical ifOp to avoid sparisification finalizing it. + ifOp->setAttr(getLoopEmitterLoopAttrName(), + StringAttr::get(builder.getContext(), "slice")); + // Insertion point restored to after ifOp. + SmallVector yields; + // Increase induction variable. 
+ yields.push_back(builder.create(loc, iv, c1)); + yields.push_back(cont); + yields.append(ifOp.getResults().begin(), ifOp.getResults().end()); + builder.create(loc, yields); + }); + + builder.setInsertionPointAfter(whileOp); + return std::make_pair(whileOp, whileOp.getResults().drop_front(2)); +} + +ValueRange LoopEmitter::genSliceAllLvlTraverseLoop( + OpBuilder &builder, Location loc, Value offset, size_t tid, size_t lvl, + size_t depth, ValueRange userReduc, + llvm::function_ref)> + bodyBuilder) { + + Value c0 = constantIndex(builder, loc, 0); + Value c1 = constantIndex(builder, loc, 1); + Value c2 = constantIndex(builder, loc, 2); + + // TODO: it only works on all compressed tensor. + Value sPtrBuf = slicePtrBuffer[tid][lvl][depth]; + Value pSt = c2; // pointer starting index + Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize + + auto forOp = + scf::buildLoopNest( + builder, loc, pSt, mSz, c2, userReduc, + [this, c1, depth, tid, lvl, offset, sPtrBuf, + bodyBuilder](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange iterArgs) -> scf::ValueVector { + // generate traversal for each level. + Value loopLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front()); + Value loopHi = genIndexLoad( + builder, loc, sPtrBuf, + builder.create(loc, ivs.front(), c1)); + return genSliceLvlTraverseLoop(builder, loc, loopLo, loopHi, offset, + tid, lvl, depth, iterArgs, true, + bodyBuilder) + .second; + }) + .loops.front(); + + // Insert after current while operation. + builder.setInsertionPointAfter(forOp); + return forOp.getResults(); +} + +bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, size_t tid, + size_t lvl) { + + Value c0 = constantIndex(builder, loc, 0); + Value c1 = constantIndex(builder, loc, 1); + Value c2 = constantIndex(builder, loc, 2); + Value c3 = constantIndex(builder, loc, 3); + Value c4 = constantIndex(builder, loc, 4); + + if (sliceFullyResolved(tid)) { + // If constraints on the tensor is fully resolved. 
We do not need to + // generates slice begin any more, instead we fall back to TACO-based + // algorithm to (co)iterates over the slice. + Value pLoPtr = + genIndexLoad(builder, loc, slicePtrBuffer[tid][lvl].back(), c1); + pLoPtr = builder.create(loc, pLoPtr, c2); + Value pHiPtr = builder.create(loc, pLoPtr, c1); + posits[tid][lvl] = + genIndexLoad(builder, loc, slicePtrBuffer[tid][lvl].back(), pLoPtr); + highs[tid][lvl] = + genIndexLoad(builder, loc, slicePtrBuffer[tid][lvl].back(), pHiPtr); + return true; + } + + // Only when the level is sorted, the next-non-empty slice can be computed + // efficiently. + assert(isOrderedDLT(lvlTypes[tid][lvl])); + if (isDenseDLT(lvlTypes[tid][lvl]) || isSingletonDLT(lvlTypes[tid][lvl])) + llvm_unreachable("TODO: dense level should be easy to support, while " + "singleton level requres more efforts"); + + assert(!dependentDimMap[tid][lvl].empty()); + assert(!sliceStack[tid].empty()); + + const SliceInfo &sliceInfo = sliceStack[tid].back(); + auto baseEnc = getSparseTensorEncoding(sliceInfo.baseSlice.getType()); + + Value size, minCoord, isNonEmpty; + unsigned depth = 0; + if (sliceInfo.isInitialTensor()) { + // The input tensor is slices, not yet handled. + if (baseEnc.isSlice()) + llvm_unreachable("TODO: not yet implemented"); + + assert(lvl == 0); // must be reduing the affine expression on the first lvl. + // Fills out pIdxBuffer[tid][lvl][0] with [/*memSize =*/4, 0, 0, pHi] + Value sPtrBuf = slicePtrBuffer[tid][0][0]; + Value pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1); + builder.create(loc, c4, sPtrBuf, c0); // memSize = 4 + builder.create(loc, c0, sPtrBuf, c1); // index = 0 + builder.create(loc, c0, sPtrBuf, c2); // pLo = 0; + builder.create(loc, pHi, sPtrBuf, c3); // loaded pHi. + + size = sliceSizes[tid][0][0]; + // This is an non empty tensor if 0 < pHi. + isNonEmpty = CMPI(ult, c0, pHi); + // The minimal coord must be at the first on ordered level. 
+ // FIXME: Technically we should load the coord only when the slice is + // nonempty. though we assume that even on empty sparse tensors, a non-empty + // ptr/idx buffer is allocated for each level so it would not cause OOB to + // avoid generating a ifOp here. + minCoord = genIndexLoad(builder, loc, coordinatesBuffers[tid][0], c0); + depth = 1; + } else { + unsigned prevLvl = *sliceInfo.slicedOnLvl; + assert(lvl >= prevLvl); + if (lvl != prevLvl + 1) { + // Either lvl = prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one + // variable need to be reduced on the same level). + // Or lvl > prevSliceLvl + 1, i.e., t[..., d2, d3 + d4] (having a + // simple dim expression in between). + llvm_unreachable("TODO: not yet implemented"); + } else { + assert(slicePtrBuffer[tid][prevLvl].size() == sliceInfo.depth); + Value sPtrBuf = slicePtrBuffer[tid][lvl][0]; + + SmallVector reduc = { + constantI1(builder, loc, false), // isNonEmpty + lvlSizes[tid][lvl], // minCoord + c2, // memSize + }; + ValueRange result = genSliceAllLvlTraverseLoop( + builder, loc, sliceInfo.offset, tid, prevLvl, sliceInfo.depth - 1, + reduc, + [this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc, + Value iv, + MutableArrayRef reduc) { + Value &isNonEmpty = reduc[0]; + Value &minCoord = reduc[1]; + Value &curMemSize = reduc[2]; + + Value pHi = builder.create(loc, iv, c1); + Value sPLo = + genIndexLoad(builder, loc, positionsBuffers[tid][lvl], iv); + Value sPHi = + genIndexLoad(builder, loc, positionsBuffers[tid][lvl], pHi); + + // isNonEmpty = isNonEmpty || lvlNonEmpty + Value lvlNonEmpty = CMPI(ult, sPLo, sPHi); + isNonEmpty = + builder.create(loc, lvlNonEmpty, isNonEmpty); + + // Update minimal coordinate. 
+ auto ifNonEmpty = builder.create( + loc, builder.getIndexType(), lvlNonEmpty, true); + { + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(ifNonEmpty.thenBlock()); + Value curC = genIndexLoad(builder, loc, + coordinatesBuffers[tid][lvl], sPLo); + Value isCurSmaller = CMPI(ult, curC, minCoord); + Value newMin = builder.create(loc, isCurSmaller, + curC, minCoord); + builder.create(loc, newMin); + builder.setInsertionPointToStart(ifNonEmpty.elseBlock()); + builder.create(loc, minCoord); + } + minCoord = ifNonEmpty.getResult(0); + + // filles in + builder.create(loc, sPLo, sPtrBuf, curMemSize); + Value nxtMemSize = + builder.create(loc, curMemSize, c1); + builder.create(loc, sPHi, sPtrBuf, nxtMemSize); + + // curMemSize += 2 + curMemSize = builder.create(loc, curMemSize, c2); + }); + + size = sliceSizes[tid][lvl][0]; + isNonEmpty = result[0]; + minCoord = result[1]; + depth = 1; + + // Two metadata [memSize, idx]. + // TODO: we might be able to use an SSA value for memSize here to avoid + // memory operation. + builder.create(loc, result[2], sPtrBuf, c0); + builder.create(loc, c0, sPtrBuf, c1); + } + } + + assert(depth > 0 && size && isNonEmpty && minCoord && depth); + // Compute the minimal offsets viable for a non empty tensor. + // offset = isNonEmpty && minCoord >= size ? minCoord - size + 1 : 0; + // NOTE: that minCoord is invalid when isNonEmpty = false, in which case + // the computed slices are meaningless. + // FIXME: support relative offset compute. + Value geSize = CMPI(uge, minCoord, size); + Value pred = builder.create(loc, isNonEmpty, geSize); + + Value mp1 = builder.create(loc, minCoord, c1); + Value mms = builder.create(loc, mp1, size); + // This is the absolute offset related to the underly tensor. + Value absOffset = builder.create(loc, pred, mms, c0); + // This is the relative offset related to the base slice. 
+ Value relOffset = absOffset; + uint64_t dim = toOrigDim(baseEnc, lvl); + Value newSlice = genExtractSliceWithOffsetOnDim( + builder, loc, sliceInfo.baseSlice, relOffset, size, dim); + sliceStack[tid].emplace_back(newSlice, minCoord, absOffset, isNonEmpty, lvl, + depth); + return false; +} + +void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc, + const Operation *op, size_t tid, + size_t lvl, + SmallVectorImpl &operands, + unsigned &retIdx) { + if (!isCompressedDLT(lvlTypes[tid][lvl])) + llvm_unreachable("TODO"); + + // else generate code to compute next non empty slice. + Value c0 = constantIndex(builder, loc, 0); + Value c1 = constantIndex(builder, loc, 1); + Value c2 = constantIndex(builder, loc, 2); + + auto whileOp = llvm::cast(op); + SliceInfo &info = sliceStack[tid].back(); + assert(info.slicedOnLvl == lvl); + + // + // We forward to the next non empty slice by + // if (minCoord > offset) { + // offset += 1 + // } else { + // minCoord = nextMinInSlice(); + // offset = minCoord - size + 1; + // } + // + // if (offset + size > parents.size) + // isNonEmpty = false; + // + Value absOffset = info.offset; + // Resets slices pointers as the resolved slices are invalidated after we + // moves forward to the next slice. + for (unsigned i = 0; i <= lvl; i++) + builder.create(loc, c0, slicePtrBuffer[tid][i].back(), c1); + + SmallVector reduc = {info.minCoord, info.isNonEmpty, absOffset}; + SmallVector types = getTypesFromValues(reduc); + Value sPtrBuf = slicePtrBuffer[tid][lvl][info.depth - 1]; + Value fastPathP = CMPI(ugt, info.minCoord, absOffset); + auto ifOp = builder.create(loc, types, fastPathP, true); + { + OpBuilder::InsertionGuard guard(builder); + // Take the fast path if minCoord > offset + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + reduc[2] = builder.create(loc, absOffset, c1); + // Yield offset + 1. + builder.create(loc, reduc); + + // Else, take the slow path. 
+ builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + reduc[2] = absOffset; // restore value. + Value pSt = c2; // pointer starting index + Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize + reduc[0] = lvlSizes[tid][lvl]; // next min coord + reduc[1] = constantI1(builder, loc, false); // isNonEmpty + auto loopArgs = static_cast(reduc).drop_back(); + auto forOp = scf::buildLoopNest( + builder, loc, pSt, mSz, c2, loopArgs, + [this, tid, lvl, c1, sPtrBuf, + &info](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange iterArgs) -> scf::ValueVector { + Value curMinCoord = iterArgs[0]; + Value isNonEmpty = iterArgs[1]; + + Type idxTp = builder.getIndexType(); + Value pLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front()); + Value pHi = + genIndexLoad(builder, loc, sPtrBuf, + builder.create(loc, ivs.front(), c1)); + // + // if pLo < pHi + // coord = load[pLo] + // if coord == minCoord + // pLo += 1 + // + // if pLo < pHi + // curMinCoord = min(curMinCoord, load[pLo]) + // + Value pred = CMPI(ult, pLo, pHi); + auto advPLo = builder.create(loc, idxTp, pred, true); + /* if pLo < pHi */ { + builder.setInsertionPointToStart(&advPLo.getThenRegion().front()); + // coord = load[pLo] + Value coord = + genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo); + Value pred = CMPI(eq, coord, info.minCoord); + auto ifEqual = builder.create(loc, idxTp, pred, true); + /* if coord == minCoord */ { + builder.setInsertionPointToStart( + &ifEqual.getThenRegion().front()); + Value newPlo = builder.create(loc, pLo, c1); + // Updates the cache. 
+ builder.create(loc, newPlo, sPtrBuf, + ivs.front()); + builder.create(loc, newPlo); + } + /* else coord != minCoord */ { + builder.setInsertionPointToStart( + &ifEqual.getElseRegion().front()); + builder.create(loc, pLo); + } + builder.setInsertionPointAfter(ifEqual); + builder.create(loc, ifEqual.getResults()); + } + /* else pLo >= pHi */ { + builder.setInsertionPointToStart(&advPLo.getElseRegion().front()); + builder.create(loc, pLo); + } + + builder.setInsertionPointAfter(advPLo); + pLo = advPLo.getResult(0); + Value lvlNonEmpty = CMPI(ult, pLo, pHi); + // Update minCoords + auto newMin = + builder.create(loc, idxTp, lvlNonEmpty, true); + builder.setInsertionPointToStart(&newMin.getThenRegion().front()); + builder.create( + loc, + genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo)); + + builder.setInsertionPointToStart(&newMin.getElseRegion().front()); + builder.create(loc, curMinCoord); + builder.setInsertionPointAfter(newMin); + + // isNonEmpty = isNonEmpty || lvlNonEmpty + isNonEmpty = + builder.create(loc, lvlNonEmpty, isNonEmpty); + curMinCoord = builder.create( + loc, CMPI(ult, newMin.getResult(0), curMinCoord), + newMin.getResult(0), curMinCoord); + return {curMinCoord, isNonEmpty}; + }); + + builder.setInsertionPointAfter(forOp.loops.front()); + // minOffset = minCoord + 1 >= size ? 
minCoord + 1 - size : c0 + Value tmp = builder.create(loc, forOp.results.front(), c1); + Value minOffset = builder.create( + loc, tmp, sliceSizes[tid][lvl][info.depth - 1]); + Value p = CMPI(uge, tmp, sliceSizes[tid][lvl][info.depth - 1]); + minOffset = builder.create(loc, p, minOffset, c0); + SmallVector yields; + yields.assign(forOp.results.begin(), forOp.results.end()); + yields.push_back(minOffset); + builder.create(loc, yields); + } + + Value nextMinCoord = ifOp.getResults()[0]; + //// builder.create(loc, nextMinCoord); + Value nextNonEmpty = ifOp.getResults()[1]; + + // the next offset should at least be offset + 1; + Value minOffset = ifOp.getResults()[2]; + Value nxOffset = builder.create(loc, info.offset, c1); + Value maxPred = CMPI(ugt, minOffset, nxOffset); + Value nextAbsOffset = + builder.create(loc, maxPred, minOffset, nxOffset); + + Value sliceUB = builder.create( + loc, nextAbsOffset, sliceSizes[tid][lvl][info.depth - 1]); + + // FIXME: this only works if the parsent is the tensor, we should use the + // parents slice size + parent offset. + assert(info.depth - 1 == 0); + // nextNonEmpty = nextNonEmpty && slice upper bound <= parent upperbound. + nextNonEmpty = builder.create( + loc, nextNonEmpty, CMPI(ule, sliceUB, lvlSizes[tid][lvl])); + + // FIXME: compute relative offset. + assert(info.depth - 1 == 0); + Value nextRelOffset = nextAbsOffset; + nextRelOffset = + builder.create(loc, nextNonEmpty, nextRelOffset, c0); + + uint64_t dim = + toOrigDim(getSparseTensorEncoding(tensors[tid].getType()), lvl); + + Value nextSlice = genExtractSliceWithOffsetOnDim( + builder, loc, sliceStack[tid][sliceStack.size() - 2].baseSlice, + nextRelOffset, sliceSizes[tid][lvl][info.depth - 1], dim); + + operands.push_back(nextNonEmpty); + operands.push_back(nextSlice); + operands.push_back(nextMinCoord); + operands.push_back(nextAbsOffset); // we push the absolute offset. + + // Update the slice stack. 
+ info.isNonEmpty = whileOp.getResult(retIdx++); + info.baseSlice = whileOp.getResult(retIdx++); + info.minCoord = whileOp.getResult(retIdx++); + info.offset = whileOp.getResult(retIdx++); +} + +Operation *LoopEmitter::emitSliceDrivenLoopOverTensorAtDim( + OpBuilder &builder, Location loc, size_t tid, size_t lvl, + MutableArrayRef reduc) { + assert(!sliceFullyResolved(tid)); + SliceInfo &sliceInfo = sliceStack[tid].back(); + assert(sliceInfo.slicedOnLvl == lvl); + + // NOTE: The order matters! + constexpr size_t numMetaReduc = 4; // number of reduction maintained by us. + SmallVector operands{sliceInfo.isNonEmpty, sliceInfo.baseSlice, + sliceInfo.minCoord, sliceInfo.offset}; + // Append user-required reduction values. + operands.append(reduc.begin(), reduc.end()); + assert(operands.size() == numMetaReduc + reduc.size()); + + SmallVector types = getTypesFromValues(operands); + + auto whileOp = builder.create( + loc, types, operands, + /*beforeBuilder=*/ + [](OpBuilder &builder, Location loc, ValueRange args) { + builder.create(loc, /*isNonEmpty*/ args[0], args); + }, + /*afterBuilder=*/ + [this, tid, lvl, reduc, &sliceInfo](OpBuilder &builder, Location loc, + ValueRange args) { + assert(args.size() == reduc.size() + numMetaReduc); + sliceInfo.isNonEmpty = args[0]; + sliceInfo.baseSlice = args[1]; + sliceInfo.minCoord = args[2]; + sliceInfo.offset = args[3]; + // The slice offset is the coordinate. + Value c = sliceInfo.offset; + if (sliceInfo.depth > 1) { + // Coord is the relative offset related to its parents. + // Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1] + llvm_unreachable("TODO: not yet implement"); + } + coords[tid][lvl] = c; + + for (unsigned i = 0, e = reduc.size(); i < e; i++) + reduc[i] = args[i + numMetaReduc]; + }); + + // Increments the number of resolved constraints on tid. + sliceResolvedConstraints[tid]++; + // Set the insertion point to while loop body. 
+ builder.setInsertionPointToEnd(&whileOp.getAfter().front()); + return whileOp; +} + +#undef CMPI diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -1003,7 +1003,7 @@ // Link the reduction chain. Note that loop emitter update the reducValue // in place. loopEmitter.exitCurrentLoop(rewriter, loc, reducValue); - loopEmitter.exitCurrentLoopSeq(); + loopEmitter.exitCurrentLoopSeq(rewriter, loc); } // Replace the foreach operator with the value returned by the outtermost diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -83,9 +83,12 @@ class AffineDimFinder : public AffineExprVisitor { public: explicit AffineDimFinder(linalg::GenericOp op) - : iterTypes(op.getIteratorTypesArray()) {} + : iterTypes(op.getIteratorTypes()) {} void visitDimExpr(AffineDimExpr expr) { - if (pickedDim == nullptr || pickIterType == iterTypes[expr.getPosition()]) { + if (pickedDim == nullptr || + pickIterType == iterTypes[expr.getPosition()] + .cast() + .getValue()) { pickedDim = expr; } } @@ -106,7 +109,7 @@ /// The iterator type that we want. utils::IteratorType pickIterType; /// The mapping between dim=>iterator type. - SmallVector iterTypes; + ArrayAttr iterTypes; }; // Flattens an affine expression into a list of AffineDimExprs. @@ -756,11 +759,7 @@ // Iterate over the indexing maps of every tensor in the tensor expression. for (OpOperand &t : env.op()->getOpOperands()) { // Get map and encoding. 
- const auto map = env.op().getMatchingIndexingMap(&t); const auto enc = getSparseTensorEncoding(t.get().getType()); - assert(map.getNumDims() + getNumNonTrivialIdxExpOnSparseLvls(env.op()) == - n); - // Skips dense inputs/outputs when not requested. const bool isDenseInput = !enc && env.op().isDpsInput(&t); const bool isDenseOutput = !enc && !isDenseInput; @@ -1664,14 +1663,17 @@ /// Ends a single loop in current sequence. Returns new values for needsUniv. static bool endLoop(CodegenEnv &env, RewriterBase &rewriter, Operation *loop, - LoopId idx, LatPointId li, bool needsUniv) { - // End a while-loop. - if (auto whileOp = dyn_cast(loop)) { - finalizeWhileOp(env, rewriter, idx, needsUniv, env.lat(li).bits, whileOp); - } else if (auto forOp = dyn_cast(loop)) { - // Any iteration of a reduction for-loop creates a valid lex insert. + LoopId idx, LatPointId li, bool needsUniv, + bool isSingleCond) { + + if (isSingleCond) { + // Could be a for-loop or a while-loop for iterating over slice. + // Any iteration creates a valid lex insert. if (env.isReduc() && env.getValidLexInsert()) env.setValidLexInsert(constantI1(rewriter, env.op().getLoc(), true)); + } else if (auto whileOp = dyn_cast(loop)) { + // End a while-loop. + finalizeWhileOp(env, rewriter, idx, needsUniv, env.lat(li).bits, whileOp); } else { needsUniv = false; } @@ -1685,10 +1687,10 @@ } /// Ends a loop sequence at given level. -static void endLoopSeq(CodegenEnv &env, OpBuilder &builder, ExprId exp, - LoopOrd at, LoopId idx, LoopId ldx) { +static void endLoopSeq(CodegenEnv &env, OpBuilder &builder, unsigned exp, + unsigned at, unsigned idx, unsigned ldx) { assert(!env.getLoopVar(idx)); - env.emitter().exitCurrentLoopSeq(); + env.emitter().exitCurrentLoopSeq(builder, env.op().getLoc()); // Unmark bookkeeping of invariants and loop index. genInvariants(env, builder, exp, ldx, /*atStart=*/false); // Finalize access pattern expansion for sparse tensor output. @@ -1745,7 +1747,7 @@ } // End a loop. 
- needsUniv = endLoop(env, rewriter, loop, idx, li, needsUniv); + needsUniv = endLoop(env, rewriter, loop, idx, li, needsUniv, isSingleCond); } // End a loop sequence. diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp --- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp +++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp @@ -379,8 +379,8 @@ // keep the rightmost bit (which could possibly be a synthetic tensor). for (TensorLoopId b = be - 1 - offset, i = 0; i < be; b = b == 0 ? be - 1 : b - 1, i++) { - // FIXME: better name? also slice on dense level has locate property as - // well. Handle it correctly! + // FIXME: slice on dense level has locate property as well. Handle it + // correctly! if (simple[b] && !isLvlWithNonTrivialIdxExp(b)) { const auto dlt = getDimLevelType(b); if (!isCompressedDLT(dlt) && !isSingletonDLT(dlt)) { diff --git a/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir b/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir @@ -0,0 +1,290 @@ +// RUN: mlir-opt %s --sparsification="enable-index-reduction=true" --cse | FileCheck %s + +#map = affine_map<(d0, d1, d2, d3) -> (d0 + d2, d1 + d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d2, d3)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> + +#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }> + +// CHECK-LABEL: func.func @conv2d_all_sparse_CSR( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x8xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3xi32>) -> tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> { +// CHECK: %[[VAL_2:.*]] = arith.constant 8 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 3 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 4 : index +// CHECK: 
%[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_7:.*]] = arith.constant 2 : index +// CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_9:.*]] = arith.constant true +// CHECK: %[[VAL_10:.*]] = arith.constant false +// CHECK: %[[VAL_11:.*]] = bufferization.alloc_tensor() : tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref +// CHECK: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref +// CHECK: %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref +// CHECK: %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref +// CHECK: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref +// CHECK: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_1]] : memref<3x3xi32> +// CHECK: %[[VAL_18:.*]] = memref.alloca(%[[VAL_4]]) : memref +// CHECK: %[[VAL_19:.*]] = memref.alloca(%[[VAL_2]]) : memref +// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_6]]] : memref +// CHECK: memref.store %[[VAL_4]], %[[VAL_18]]{{\[}}%[[VAL_5]]] : memref +// CHECK: memref.store %[[VAL_5]], %[[VAL_18]]{{\[}}%[[VAL_6]]] : memref +// CHECK: memref.store %[[VAL_5]], %[[VAL_18]]{{\[}}%[[VAL_7]]] : memref +// CHECK: memref.store %[[VAL_20]], %[[VAL_18]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_21:.*]] = arith.cmpi ugt, %[[VAL_20]], 
%[[VAL_5]] : index +// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_23:.*]] = arith.cmpi uge, %[[VAL_22]], %[[VAL_3]] : index +// CHECK: %[[VAL_24:.*]] = arith.andi %[[VAL_21]], %[[VAL_23]] : i1 +// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_22]], %[[VAL_6]] : index +// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_25]], %[[VAL_3]] : index +// CHECK: %[[VAL_27:.*]] = arith.select %[[VAL_24]], %[[VAL_26]], %[[VAL_5]] : index +// CHECK: %[[VAL_28:.*]] = tensor.extract_slice %[[VAL_0]]{{\[}}%[[VAL_27]], 0] {{\[}}%[[VAL_3]], 8] [1, 1] : tensor<8x8xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to tensor> +// CHECK: %[[VAL_29:.*]]:5 = scf.while (%[[VAL_30:.*]] = %[[VAL_21]], %[[VAL_31:.*]] = %[[VAL_28]], %[[VAL_32:.*]] = %[[VAL_22]], %[[VAL_33:.*]] = %[[VAL_27]], %[[VAL_34:.*]] = %[[VAL_11]]) : (i1, tensor>, index, index, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) -> (i1, tensor>, index, index, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) { +// CHECK: scf.condition(%[[VAL_30]]) %[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_33]], %[[VAL_34]] : i1, tensor>, index, index, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_35:.*]]: i1, %[[VAL_36:.*]]: tensor>, %[[VAL_37:.*]]: index, %[[VAL_38:.*]]: index, %[[VAL_39:.*]]: tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>): +// CHECK: %[[VAL_40:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_41:.*]]:3 = scf.for %[[VAL_42:.*]] = %[[VAL_7]] to %[[VAL_40]] step %[[VAL_7]] iter_args(%[[VAL_43:.*]] = %[[VAL_10]], %[[VAL_44:.*]] = %[[VAL_2]], %[[VAL_45:.*]] = %[[VAL_7]]) -> (i1, index, index) { +// CHECK: %[[VAL_46:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_42]]] : memref +// CHECK: %[[VAL_47:.*]] = arith.addi 
%[[VAL_42]], %[[VAL_6]] : index +// CHECK: %[[VAL_48:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_47]]] : memref +// CHECK: %[[VAL_49:.*]] = arith.addi %[[VAL_38]], %[[VAL_3]] : index +// CHECK: %[[VAL_50:.*]]:5 = scf.while (%[[VAL_51:.*]] = %[[VAL_46]], %[[VAL_52:.*]] = %[[VAL_9]], %[[VAL_53:.*]] = %[[VAL_43]], %[[VAL_54:.*]] = %[[VAL_44]], %[[VAL_55:.*]] = %[[VAL_45]]) : (index, i1, i1, index, index) -> (index, i1, i1, index, index) { +// CHECK: %[[VAL_56:.*]] = arith.cmpi ult, %[[VAL_51]], %[[VAL_48]] : index +// CHECK: %[[VAL_57:.*]] = arith.andi %[[VAL_52]], %[[VAL_56]] : i1 +// CHECK: scf.condition(%[[VAL_57]]) %[[VAL_51]], %[[VAL_52]], %[[VAL_53]], %[[VAL_54]], %[[VAL_55]] : index, i1, i1, index, index +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_58:.*]]: index, %[[VAL_59:.*]]: i1, %[[VAL_60:.*]]: i1, %[[VAL_61:.*]]: index, %[[VAL_62:.*]]: index): +// CHECK: %[[VAL_63:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_58]]] : memref +// CHECK: %[[VAL_64:.*]] = arith.cmpi ult, %[[VAL_63]], %[[VAL_49]] : index +// CHECK: %[[VAL_65:.*]]:3 = scf.if %[[VAL_64]] -> (i1, index, index) { +// CHECK: %[[VAL_66:.*]] = arith.addi %[[VAL_58]], %[[VAL_6]] : index +// CHECK: %[[VAL_67:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_58]]] : memref +// CHECK: %[[VAL_68:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_66]]] : memref +// CHECK: %[[VAL_69:.*]] = arith.cmpi ult, %[[VAL_67]], %[[VAL_68]] : index +// CHECK: %[[VAL_70:.*]] = arith.ori %[[VAL_69]], %[[VAL_60]] : i1 +// CHECK: %[[VAL_71:.*]] = scf.if %[[VAL_69]] -> (index) { +// CHECK: %[[VAL_72:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_67]]] : memref +// CHECK: %[[VAL_73:.*]] = arith.cmpi ult, %[[VAL_72]], %[[VAL_61]] : index +// CHECK: %[[VAL_74:.*]] = arith.select %[[VAL_73]], %[[VAL_72]], %[[VAL_61]] : index +// CHECK: scf.yield %[[VAL_74]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_61]] : index +// CHECK: } +// CHECK: memref.store %[[VAL_67]], %[[VAL_19]]{{\[}}%[[VAL_62]]] : memref +// CHECK: %[[VAL_75:.*]] = arith.addi 
%[[VAL_62]], %[[VAL_6]] : index +// CHECK: memref.store %[[VAL_68]], %[[VAL_19]]{{\[}}%[[VAL_75]]] : memref +// CHECK: %[[VAL_76:.*]] = arith.addi %[[VAL_62]], %[[VAL_7]] : index +// CHECK: scf.yield %[[VAL_70]], %[[VAL_77:.*]], %[[VAL_76]] : i1, index, index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_60]], %[[VAL_61]], %[[VAL_62]] : i1, index, index +// CHECK: } {"Emitted from" = "slice"} +// CHECK: %[[VAL_78:.*]] = arith.addi %[[VAL_58]], %[[VAL_6]] : index +// CHECK: scf.yield %[[VAL_78]], %[[VAL_64]], %[[VAL_79:.*]]#0, %[[VAL_79]]#1, %[[VAL_79]]#2 : index, i1, i1, index, index +// CHECK: } +// CHECK: scf.yield %[[VAL_80:.*]]#2, %[[VAL_80]]#3, %[[VAL_80]]#4 : i1, index, index +// CHECK: } +// CHECK: memref.store %[[VAL_81:.*]]#2, %[[VAL_19]]{{\[}}%[[VAL_5]]] : memref +// CHECK: memref.store %[[VAL_5]], %[[VAL_19]]{{\[}}%[[VAL_6]]] : memref +// CHECK: %[[VAL_82:.*]] = arith.cmpi uge, %[[VAL_81]]#1, %[[VAL_3]] : index +// CHECK: %[[VAL_83:.*]] = arith.andi %[[VAL_81]]#0, %[[VAL_82]] : i1 +// CHECK: %[[VAL_84:.*]] = arith.addi %[[VAL_81]]#1, %[[VAL_6]] : index +// CHECK: %[[VAL_85:.*]] = arith.subi %[[VAL_84]], %[[VAL_3]] : index +// CHECK: %[[VAL_86:.*]] = arith.select %[[VAL_83]], %[[VAL_85]], %[[VAL_5]] : index +// CHECK: %[[VAL_87:.*]] = tensor.extract_slice %[[VAL_36]][0, %[[VAL_86]]] {{\[}}%[[VAL_2]], %[[VAL_3]]] [1, 1] : tensor> to tensor> +// CHECK: %[[VAL_88:.*]]:5 = scf.while (%[[VAL_89:.*]] = %[[VAL_81]]#0, %[[VAL_90:.*]] = %[[VAL_87]], %[[VAL_91:.*]] = %[[VAL_81]]#1, %[[VAL_92:.*]] = %[[VAL_86]], %[[VAL_93:.*]] = %[[VAL_39]]) : (i1, tensor>, index, index, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) -> (i1, tensor>, index, index, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) { +// CHECK: scf.condition(%[[VAL_89]]) %[[VAL_89]], %[[VAL_90]], %[[VAL_91]], %[[VAL_92]], %[[VAL_93]] : i1, tensor>, index, index, tensor<6x6xi32, #sparse_tensor.encoding<{ 
dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_94:.*]]: i1, %[[VAL_95:.*]]: tensor>, %[[VAL_96:.*]]: index, %[[VAL_97:.*]]: index, %[[VAL_98:.*]]: tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>): +// CHECK: %[[VAL_99:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_6]]] : memref +// CHECK: %[[VAL_100:.*]] = arith.addi %[[VAL_99]], %[[VAL_7]] : index +// CHECK: %[[VAL_101:.*]] = arith.addi %[[VAL_100]], %[[VAL_6]] : index +// CHECK: %[[VAL_102:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_100]]] : memref +// CHECK: %[[VAL_103:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_101]]] : memref +// CHECK: %[[VAL_104:.*]] = arith.addi %[[VAL_38]], %[[VAL_3]] : index +// CHECK: %[[VAL_105:.*]]:5 = scf.while (%[[VAL_106:.*]] = %[[VAL_102]], %[[VAL_107:.*]] = %[[VAL_9]], %[[VAL_108:.*]] = %[[VAL_8]], %[[VAL_109:.*]] = %[[VAL_10]], %[[VAL_110:.*]] = %[[VAL_98]]) : (index, i1, i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) -> (index, i1, i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) { +// CHECK: %[[VAL_111:.*]] = arith.cmpi ult, %[[VAL_106]], %[[VAL_103]] : index +// CHECK: %[[VAL_112:.*]] = arith.andi %[[VAL_107]], %[[VAL_111]] : i1 +// CHECK: scf.condition(%[[VAL_112]]) %[[VAL_106]], %[[VAL_107]], %[[VAL_108]], %[[VAL_109]], %[[VAL_110]] : index, i1, i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_113:.*]]: index, %[[VAL_114:.*]]: i1, %[[VAL_115:.*]]: i32, %[[VAL_116:.*]]: i1, %[[VAL_117:.*]]: tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>): +// CHECK: %[[VAL_118:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_113]]] : memref +// CHECK: %[[VAL_119:.*]] = arith.cmpi ult, %[[VAL_118]], %[[VAL_104]] : index +// CHECK: %[[VAL_120:.*]]:3 = scf.if %[[VAL_119]] -> 
(i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) { +// CHECK: %[[VAL_121:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_113]]] : memref +// CHECK: %[[VAL_122:.*]] = arith.subi %[[VAL_121]], %[[VAL_38]] : index +// CHECK: %[[VAL_123:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_6]]] : memref +// CHECK: %[[VAL_124:.*]] = arith.addi %[[VAL_123]], %[[VAL_7]] : index +// CHECK: %[[VAL_125:.*]] = arith.addi %[[VAL_124]], %[[VAL_6]] : index +// CHECK: %[[VAL_126:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_124]]] : memref +// CHECK: %[[VAL_127:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_125]]] : memref +// CHECK: %[[VAL_128:.*]] = arith.addi %[[VAL_97]], %[[VAL_3]] : index +// CHECK: %[[VAL_129:.*]]:5 = scf.while (%[[VAL_130:.*]] = %[[VAL_126]], %[[VAL_131:.*]] = %[[VAL_9]], %[[VAL_132:.*]] = %[[VAL_115]], %[[VAL_133:.*]] = %[[VAL_116]], %[[VAL_134:.*]] = %[[VAL_117]]) : (index, i1, i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) -> (index, i1, i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) { +// CHECK: %[[VAL_135:.*]] = arith.cmpi ult, %[[VAL_130]], %[[VAL_127]] : index +// CHECK: %[[VAL_136:.*]] = arith.andi %[[VAL_131]], %[[VAL_135]] : i1 +// CHECK: scf.condition(%[[VAL_136]]) %[[VAL_130]], %[[VAL_131]], %[[VAL_132]], %[[VAL_133]], %[[VAL_134]] : index, i1, i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_137:.*]]: index, %[[VAL_138:.*]]: i1, %[[VAL_139:.*]]: i32, %[[VAL_140:.*]]: i1, %[[VAL_141:.*]]: tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>): +// CHECK: %[[VAL_142:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_137]]] : memref +// CHECK: %[[VAL_143:.*]] = arith.cmpi ult, %[[VAL_142]], %[[VAL_128]] : index +// CHECK: %[[VAL_144:.*]]:3 = scf.if %[[VAL_143]] -> (i32, i1, 
tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) { +// CHECK: %[[VAL_145:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_137]]] : memref +// CHECK: %[[VAL_146:.*]] = arith.subi %[[VAL_145]], %[[VAL_97]] : index +// CHECK: %[[VAL_147:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_137]]] : memref +// CHECK: %[[VAL_148:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_122]], %[[VAL_146]]] : memref<3x3xi32> +// CHECK: %[[VAL_149:.*]] = arith.muli %[[VAL_147]], %[[VAL_148]] : i32 +// CHECK: %[[VAL_150:.*]] = arith.addi %[[VAL_139]], %[[VAL_149]] : i32 +// CHECK: scf.yield %[[VAL_150]], %[[VAL_9]], %[[VAL_141]] : i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_139]], %[[VAL_140]], %[[VAL_141]] : i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } {"Emitted from" = "slice"} +// CHECK: %[[VAL_151:.*]] = arith.addi %[[VAL_137]], %[[VAL_6]] : index +// CHECK: scf.yield %[[VAL_151]], %[[VAL_143]], %[[VAL_152:.*]]#0, %[[VAL_152]]#1, %[[VAL_152]]#2 : index, i1, i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } attributes {"Emitted from" = "linalg.generic"} +// CHECK: %[[VAL_153:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_6]]] : memref +// CHECK: %[[VAL_154:.*]] = arith.addi %[[VAL_153]], %[[VAL_7]] : index +// CHECK: memref.store %[[VAL_154]], %[[VAL_19]]{{\[}}%[[VAL_6]]] : memref +// CHECK: scf.yield %[[VAL_155:.*]]#2, %[[VAL_9]], %[[VAL_155]]#4 : i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_115]], %[[VAL_116]], %[[VAL_117]] : i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } {"Emitted from" = "slice"} +// CHECK: %[[VAL_156:.*]] = arith.addi %[[VAL_113]], 
%[[VAL_6]] : index +// CHECK: scf.yield %[[VAL_156]], %[[VAL_119]], %[[VAL_157:.*]]#0, %[[VAL_157]]#1, %[[VAL_157]]#2 : index, i1, i32, i1, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } attributes {"Emitted from" = "linalg.generic"} +// CHECK: %[[VAL_158:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_6]]] : memref +// CHECK: %[[VAL_159:.*]] = arith.addi %[[VAL_158]], %[[VAL_7]] : index +// CHECK: memref.store %[[VAL_159]], %[[VAL_18]]{{\[}}%[[VAL_6]]] : memref +// CHECK: %[[VAL_160:.*]] = scf.if %[[VAL_161:.*]]#3 -> (tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>) { +// CHECK: %[[VAL_162:.*]] = sparse_tensor.insert %[[VAL_161]]#2 into %[[VAL_161]]#4{{\[}}%[[VAL_38]], %[[VAL_97]]] : tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: scf.yield %[[VAL_162]] : tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } else { +// CHECK: scf.yield %[[VAL_163:.*]]#4 : tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } +// CHECK: memref.store %[[VAL_5]], %[[VAL_18]]{{\[}}%[[VAL_6]]] : memref +// CHECK: memref.store %[[VAL_5]], %[[VAL_19]]{{\[}}%[[VAL_6]]] : memref +// CHECK: %[[VAL_164:.*]] = arith.cmpi ugt, %[[VAL_96]], %[[VAL_97]] : index +// CHECK: %[[VAL_165:.*]]:3 = scf.if %[[VAL_164]] -> (index, i1, index) { +// CHECK: %[[VAL_166:.*]] = arith.addi %[[VAL_97]], %[[VAL_6]] : index +// CHECK: scf.yield %[[VAL_96]], %[[VAL_94]], %[[VAL_166]] : index, i1, index +// CHECK: } else { +// CHECK: %[[VAL_167:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_168:.*]]:2 = scf.for %[[VAL_169:.*]] = %[[VAL_7]] to %[[VAL_167]] step %[[VAL_7]] iter_args(%[[VAL_170:.*]] = %[[VAL_2]], %[[VAL_171:.*]] = %[[VAL_10]]) -> (index, i1) { +// CHECK: %[[VAL_172:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_169]]] : memref 
+// CHECK: %[[VAL_173:.*]] = arith.addi %[[VAL_169]], %[[VAL_6]] : index +// CHECK: %[[VAL_174:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_173]]] : memref +// CHECK: %[[VAL_175:.*]] = arith.cmpi ult, %[[VAL_172]], %[[VAL_174]] : index +// CHECK: %[[VAL_176:.*]] = scf.if %[[VAL_175]] -> (index) { +// CHECK: %[[VAL_177:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_172]]] : memref +// CHECK: %[[VAL_178:.*]] = arith.cmpi eq, %[[VAL_177]], %[[VAL_96]] : index +// CHECK: %[[VAL_179:.*]] = scf.if %[[VAL_178]] -> (index) { +// CHECK: %[[VAL_180:.*]] = arith.addi %[[VAL_172]], %[[VAL_6]] : index +// CHECK: memref.store %[[VAL_180]], %[[VAL_19]]{{\[}}%[[VAL_169]]] : memref +// CHECK: scf.yield %[[VAL_180]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_172]] : index +// CHECK: } +// CHECK: scf.yield %[[VAL_181:.*]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_172]] : index +// CHECK: } +// CHECK: %[[VAL_182:.*]] = arith.cmpi ult, %[[VAL_183:.*]], %[[VAL_174]] : index +// CHECK: %[[VAL_184:.*]] = scf.if %[[VAL_182]] -> (index) { +// CHECK: %[[VAL_185:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_183]]] : memref +// CHECK: scf.yield %[[VAL_185]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_170]] : index +// CHECK: } +// CHECK: %[[VAL_186:.*]] = arith.ori %[[VAL_182]], %[[VAL_171]] : i1 +// CHECK: %[[VAL_187:.*]] = arith.cmpi ult, %[[VAL_188:.*]], %[[VAL_170]] : index +// CHECK: %[[VAL_189:.*]] = arith.select %[[VAL_187]], %[[VAL_188]], %[[VAL_170]] : index +// CHECK: scf.yield %[[VAL_189]], %[[VAL_186]] : index, i1 +// CHECK: } +// CHECK: %[[VAL_190:.*]] = arith.addi %[[VAL_191:.*]]#0, %[[VAL_6]] : index +// CHECK: %[[VAL_192:.*]] = arith.subi %[[VAL_190]], %[[VAL_3]] : index +// CHECK: %[[VAL_193:.*]] = arith.cmpi uge, %[[VAL_190]], %[[VAL_3]] : index +// CHECK: %[[VAL_194:.*]] = arith.select %[[VAL_193]], %[[VAL_192]], %[[VAL_5]] : index +// CHECK: scf.yield %[[VAL_191]]#0, %[[VAL_191]]#1, %[[VAL_194]] : index, i1, index +// CHECK: } +// CHECK: 
%[[VAL_195:.*]] = arith.addi %[[VAL_97]], %[[VAL_6]] : index +// CHECK: %[[VAL_196:.*]] = arith.cmpi ugt, %[[VAL_197:.*]]#2, %[[VAL_195]] : index +// CHECK: %[[VAL_198:.*]] = arith.select %[[VAL_196]], %[[VAL_197]]#2, %[[VAL_195]] : index +// CHECK: %[[VAL_199:.*]] = arith.addi %[[VAL_198]], %[[VAL_3]] : index +// CHECK: %[[VAL_200:.*]] = arith.cmpi ule, %[[VAL_199]], %[[VAL_2]] : index +// CHECK: %[[VAL_201:.*]] = arith.andi %[[VAL_197]]#1, %[[VAL_200]] : i1 +// CHECK: %[[VAL_202:.*]] = arith.select %[[VAL_201]], %[[VAL_198]], %[[VAL_5]] : index +// CHECK: %[[VAL_203:.*]] = tensor.extract_slice %[[VAL_36]][0, %[[VAL_202]]] {{\[}}%[[VAL_2]], %[[VAL_3]]] [1, 1] : tensor> to tensor> +// CHECK: scf.yield %[[VAL_201]], %[[VAL_203]], %[[VAL_197]]#0, %[[VAL_198]], %[[VAL_204:.*]] : i1, tensor>, index, index, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> +// CHECK: } attributes {"Emitted from" = "linalg.generic"} +// CHECK: memref.store %[[VAL_5]], %[[VAL_18]]{{\[}}%[[VAL_6]]] : memref +// CHECK: %[[VAL_205:.*]] = arith.cmpi ugt, %[[VAL_37]], %[[VAL_38]] : index +// CHECK: %[[VAL_206:.*]]:3 = scf.if %[[VAL_205]] -> (index, i1, index) { +// CHECK: %[[VAL_207:.*]] = arith.addi %[[VAL_38]], %[[VAL_6]] : index +// CHECK: scf.yield %[[VAL_37]], %[[VAL_35]], %[[VAL_207]] : index, i1, index +// CHECK: } else { +// CHECK: %[[VAL_208:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_5]]] : memref +// CHECK: %[[VAL_209:.*]]:2 = scf.for %[[VAL_210:.*]] = %[[VAL_7]] to %[[VAL_208]] step %[[VAL_7]] iter_args(%[[VAL_211:.*]] = %[[VAL_2]], %[[VAL_212:.*]] = %[[VAL_10]]) -> (index, i1) { +// CHECK: %[[VAL_213:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_210]]] : memref +// CHECK: %[[VAL_214:.*]] = arith.addi %[[VAL_210]], %[[VAL_6]] : index +// CHECK: %[[VAL_215:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_214]]] : memref +// CHECK: %[[VAL_216:.*]] = arith.cmpi ult, %[[VAL_213]], %[[VAL_215]] : index +// CHECK: %[[VAL_217:.*]] = scf.if %[[VAL_216]] -> 
(index) { +// CHECK: %[[VAL_218:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_213]]] : memref +// CHECK: %[[VAL_219:.*]] = arith.cmpi eq, %[[VAL_218]], %[[VAL_37]] : index +// CHECK: %[[VAL_220:.*]] = scf.if %[[VAL_219]] -> (index) { +// CHECK: %[[VAL_221:.*]] = arith.addi %[[VAL_213]], %[[VAL_6]] : index +// CHECK: memref.store %[[VAL_221]], %[[VAL_18]]{{\[}}%[[VAL_210]]] : memref +// CHECK: scf.yield %[[VAL_221]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_213]] : index +// CHECK: } +// CHECK: scf.yield %[[VAL_222:.*]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_213]] : index +// CHECK: } +// CHECK: %[[VAL_223:.*]] = arith.cmpi ult, %[[VAL_224:.*]], %[[VAL_215]] : index +// CHECK: %[[VAL_225:.*]] = scf.if %[[VAL_223]] -> (index) { +// CHECK: %[[VAL_226:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_224]]] : memref +// CHECK: scf.yield %[[VAL_226]] : index +// CHECK: } else { +// CHECK: scf.yield %[[VAL_211]] : index +// CHECK: } +// CHECK: %[[VAL_227:.*]] = arith.ori %[[VAL_223]], %[[VAL_212]] : i1 +// CHECK: %[[VAL_228:.*]] = arith.cmpi ult, %[[VAL_229:.*]], %[[VAL_211]] : index +// CHECK: %[[VAL_230:.*]] = arith.select %[[VAL_228]], %[[VAL_229]], %[[VAL_211]] : index +// CHECK: scf.yield %[[VAL_230]], %[[VAL_227]] : index, i1 +// CHECK: } +// CHECK: %[[VAL_231:.*]] = arith.addi %[[VAL_232:.*]]#0, %[[VAL_6]] : index +// CHECK: %[[VAL_233:.*]] = arith.subi %[[VAL_231]], %[[VAL_3]] : index +// CHECK: %[[VAL_234:.*]] = arith.cmpi uge, %[[VAL_231]], %[[VAL_3]] : index +// CHECK: %[[VAL_235:.*]] = arith.select %[[VAL_234]], %[[VAL_233]], %[[VAL_5]] : index +// CHECK: scf.yield %[[VAL_232]]#0, %[[VAL_232]]#1, %[[VAL_235]] : index, i1, index +// CHECK: } +// CHECK: %[[VAL_236:.*]] = arith.addi %[[VAL_38]], %[[VAL_6]] : index +// CHECK: %[[VAL_237:.*]] = arith.cmpi ugt, %[[VAL_238:.*]]#2, %[[VAL_236]] : index +// CHECK: %[[VAL_239:.*]] = arith.select %[[VAL_237]], %[[VAL_238]]#2, %[[VAL_236]] : index +// CHECK: %[[VAL_240:.*]] = arith.addi %[[VAL_239]], 
%[[VAL_3]] : index
+// CHECK: %[[VAL_241:.*]] = arith.cmpi ule, %[[VAL_240]], %[[VAL_2]] : index
+// CHECK: %[[VAL_242:.*]] = arith.andi %[[VAL_238]]#1, %[[VAL_241]] : i1
+// CHECK: %[[VAL_243:.*]] = arith.select %[[VAL_242]], %[[VAL_239]], %[[VAL_5]] : index
+// CHECK: %[[VAL_244:.*]] = tensor.extract_slice %[[VAL_36]]{{\[}}%[[VAL_243]], 0] {{\[}}%[[VAL_3]], 8] [1, 1] : tensor> to tensor>
+// CHECK: scf.yield %[[VAL_242]], %[[VAL_244]], %[[VAL_238]]#0, %[[VAL_239]], %[[VAL_245:.*]]#4 : i1, tensor>, index, index, tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>
+// CHECK: } attributes {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_246:.*]] = sparse_tensor.load %[[VAL_247:.*]]#4 hasInserts : tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>
+// CHECK: return %[[VAL_246]] : tensor<6x6xi32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>
+// CHECK: }
+func.func @conv2d_all_sparse_CSR(%arg0: tensor<8x8xi32, #DCSR>,
+                                 %arg1: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> {  // #DCSR 8x8 input, unannotated (dense) 3x3 kernel, #DCSR 6x6 result
+  %0 = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR>  // output operand handed to the generic op via outs
+  %1 = linalg.generic {
+         indexing_maps = [#map, #map1, #map2],
+         iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
+         ins(%arg0, %arg1 : tensor<8x8xi32, #DCSR>, tensor<3x3xi32>)
+         outs(%0 : tensor<6x6xi32, #DCSR>) {
+    ^bb0(%in: i32, %in_0: i32, %out: i32):
+      %2 = arith.muli %in, %in_0 : i32  // input element times kernel element
+      %3 = arith.addi %out, %2 : i32  // accumulate into the running output value
+      linalg.yield %3 : i32
+    } -> tensor<6x6xi32, #DCSR>
+  return %1 : tensor<6x6xi32, #DCSR>
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_slice_based.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_slice_based.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_slice_based.mlir
@@ -0,0 +1,81 @@
+// DEFINE: %{option} = "enable-index-reduction=true enable-runtime-library=false"
+// DEFINE: %{command} = mlir-opt %s --sparse-compiler=%{option} | \
+// DEFINE: mlir-cpu-runner \
+// DEFINE: -e entry -entry-point-result=void \
+// DEFINE: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// DEFINE: FileCheck %s
+//
+// RUN: %{command}
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0 + d2, d1 + d3)>  // first indexing map: the 8x8 input, addressed by output position plus kernel offset
+#map1 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>  // second indexing map: the 3x3 kernel
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>  // third indexing map: the 6x6 output
+
+#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>
+
+module {
+  func.func @conv2d_all_sparse_CSR(%arg0: tensor<8x8xi32, #DCSR>, %arg1: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> {
+    %0 = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR>  // output operand handed to the generic op via outs
+    %1 = linalg.generic {
+           indexing_maps = [#map, #map1, #map2],
+           iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
+           ins(%arg0, %arg1 : tensor<8x8xi32, #DCSR>, tensor<3x3xi32>)
+           outs(%0 : tensor<6x6xi32, #DCSR>) {
+      ^bb0(%in: i32, %in_0: i32, %out: i32):
+        %2 = arith.muli %in, %in_0 : i32  // input element times kernel element
+        %3 = arith.addi %out, %2 : i32  // accumulate into the running output value
+        linalg.yield %3 : i32
+      } -> tensor<6x6xi32, #DCSR>
+    return %1 : tensor<6x6xi32, #DCSR>
+  }
+
+  func.func @entry() {
+    %c0 = arith.constant 0 : index
+    %i0 = arith.constant 0 : i32  // padding operand of the vector.transfer_read at the end
+
+    // A typical 3x3 edge detection filter, given as a dense constant.
+    %filter = arith.constant dense<[
+      [  1,  0, -1 ],
+      [  0,  0,  0 ],
+      [ -1,  0,  1 ]
+    ]> : tensor<3x3xi32>
+
+    %input = arith.constant dense<[
+      [ 1, 2, 3, 4, 0, 6, 7, 8 ],
+      [ 2, 2, 4, 4, 0, 0, 6, 8 ],
+      [ 2, 2, 4, 4, 0, 0, 6, 8 ],
+      [ 2, 2, 3, 4, 0, 0, 7, 8 ],
+      [ 1, 3, 3, 4, 0, 0, 6, 8 ],
+      [ 3, 2, 3, 4, 0, 0, 7, 8 ],
+      [ 1, 3, 3, 4, 3, 6, 6, 8 ],
+      [ 1, 3, 3, 4, 3, 0, 7, 8 ]
+    ]> : tensor<8x8xi32>
+
+    %sparse_filter_CSR = sparse_tensor.convert %filter  // NOTE(review): source and result types are identical and carry no encoding, so the filter stays dense despite the name - confirm intended
+      : tensor<3x3xi32> to tensor<3x3xi32>
+
+    %sparse_input_CSR = sparse_tensor.convert %input  // the input is the only operand converted to #DCSR
+      : tensor<8x8xi32> to tensor<8x8xi32, #DCSR>
+
+    %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %sparse_filter_CSR)
+      : (tensor<8x8xi32, #DCSR>,
+         tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR>
+
+    %out = sparse_tensor.convert %3  // back to a dense tensor so it can be read into a vector
+      : tensor<6x6xi32, #DCSR> to tensor<6x6xi32>
+    // Expected 6x6 convolution result, matched against the vector.print output below.
+    // CHECK: ( ( 0, 0, -1, -6, -1, 6 ),
+    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
+    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
+    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
+    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
+    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    //
+    %v2 = vector.transfer_read %out[%c0, %c0], %i0
+      : tensor<6x6xi32>, vector<6x6xi32>
+    vector.print %v2 : vector<6x6xi32>
+
+    return
+  }
+
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_slice_based.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_slice_based.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_slice_based.mlir
@@ -0,0 +1,97 @@
+// DEFINE: %{option} = "enable-index-reduction=true enable-runtime-library=false"
+// DEFINE: %{command} = mlir-opt %s --sparse-compiler=%{option} | \
+// DEFINE: mlir-cpu-runner \
+// DEFINE: -e entry -entry-point-result=void \
+// DEFINE: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// DEFINE: FileCheck %s
+//
+// RUN: %{command}
+
+#CCC = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed", "compressed", "compressed" ]
+}>
+
+func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor<?x?x?xf32> {  // Returns a dense %s1 x %s2 x %s3 tensor with every element set to %f.
+  %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor<?x?x?xf32>
+  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+  return %ret : tensor<?x?x?xf32>
+}
+
+func.func @conv_3d_CCC(%arg0: tensor<?x?x?xf32, #CCC>, %arg1: tensor<?x?x?xf32>) -> tensor<?x?x?xf32, #CCC> {  // linalg.conv_3d of a #CCC input with a dense filter into a 6x6x6 #CCC result.
+  %c6 = arith.constant 6 : index
+  %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor<?x?x?xf32, #CCC>  // sparse output operand, sized 6x6x6 at run time
+  %ret = linalg.conv_3d
+     ins (%arg0, %arg1: tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32>)
+    outs (%s: tensor<?x?x?xf32, #CCC>) -> tensor<?x?x?xf32, #CCC>
+  return %ret : tensor<?x?x?xf32, #CCC>
+}
+
+func.func @entry() {  // Builds an 8x8x8 input and a 3x3x3 filter, runs the sparse convolution and prints the 6x6x6 result.
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %c6 = arith.constant 6 : index
+  %c8 = arith.constant 8 : index
+  %f10 = arith.constant 10.00000e+00 : f32  // the single distinguished input value, inserted at [0, 3, 0] below
+  %val = arith.constant 2.00000e+00 : f32  // fill value for both the filter and the input
+  %zero = arith.constant 0.00000e+00 : f32  // fill value for %out3D and padding of the transfer_read
+
+  %filter3D = call @alloc_3d_filled_f32(%c3, %c3, %c3, %val) : (index, index, index, f32) -> (tensor<?x?x?xf32>)
+  %in3D_tmp = call @alloc_3d_filled_f32(%c8, %c8, %c8, %val) : (index, index, index, f32) -> (tensor<?x?x?xf32>)
+  %in3D = tensor.insert %f10 into %in3D_tmp[%c0, %c3, %c0] : tensor<?x?x?xf32>
+  %out3D = call @alloc_3d_filled_f32(%c6, %c6, %c6, %zero) : (index, index, index, f32) -> (tensor<?x?x?xf32>)  // NOTE(review): %out3D is not referenced again in this function
+
+  %in3D_CCC = sparse_tensor.convert %in3D
+    : tensor<?x?x?xf32> to tensor<?x?x?xf32, #CCC>
+  %CCC_ret = call @conv_3d_CCC(%in3D_CCC, %filter3D) : (tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32>) -> (tensor<?x?x?xf32, #CCC>)  // 27 taps * 2 * 2 = 108; windows covering input [0, 3, 0] add (10 - 2) * 2 = 16, giving 124
+  // CHECK:     ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
+  %1 = sparse_tensor.convert %CCC_ret
+    : tensor<?x?x?xf32, #CCC> to tensor<?x?x?xf32>
+  %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero
+    : tensor<?x?x?xf32>, vector<6x6x6xf32>
+  vector.print %v1 : vector<6x6x6xf32>
+
+  // Free the resources
+  bufferization.dealloc_tensor %in3D : tensor<?x?x?xf32>
+  bufferization.dealloc_tensor %filter3D : tensor<?x?x?xf32>
+
+  bufferization.dealloc_tensor %in3D_CCC : tensor<?x?x?xf32, #CCC>
+  bufferization.dealloc_tensor %CCC_ret : tensor<?x?x?xf32, #CCC>
+
+  return
+}