diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
@@ -264,6 +264,9 @@
     unsigned depth; // the depth (relative to dependentDimMap[tid][lvl]).
   };
 
+  using LoopBodyBuilder = llvm::function_ref<void(OpBuilder &, Location, Value,
+                                                  MutableArrayRef<Value>)>;
+
   /// Linearizes address for dense dimension (i.e., p = (i * d0) + j).
   Value genAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl,
                    Value iv);
@@ -318,9 +321,13 @@
                               ArrayRef<TensorId> tids, ArrayRef<Level> lvls);
 
+  /// Emits a for loop to iterate over a tensor level with the provided lower
+  /// bound `lo` and upper bound `hi`.
+  /// Apart from iterating over a single tensor level, this kind of for loop
+  /// can also be used for slice-driven loops on dense levels.
   Operation *emitForLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
-                                        TensorId tid, Level lvl,
-                                        MutableArrayRef<Value> reduc,
+                                        TensorId tid, Level lvl, Value lo,
+                                        Value hi, MutableArrayRef<Value> reduc,
                                         bool isParallel);
 
   /// Emits a while loop to iterate over a sparse level that has been sliced.
@@ -400,9 +407,16 @@
   /// Retrieves the most recent slices on lvl. To reduce affine expression like
   /// d0 + d1 + d2, we need two slices (one of size d1 + d2, and the other of
-  /// size d2). This methods returns the latter slice (of size d2), which is
-  /// also the final slice on the level.
-  const SliceInfo &getFinalSliceOnLvl(TensorId tid, Level lvl);
+  /// size d2). This method returns the latter slice (of size d2).
+  const SliceInfo &getMostRecentSliceOnLvl(TensorId tid, Level lvl);
+
+  /// Similar to getMostRecentSliceOnLvl, but asserts that the most recent
+  /// slice is the final slice needed to fully reduce the dependencies.
+  const SliceInfo &getFinalSliceOnLvl(TensorId tid, Level lvl) {
+    const SliceInfo &info = getMostRecentSliceOnLvl(tid, lvl);
+    assert(info.depth == dependentLvlMap[tid][lvl].size() - 1);
+    return info;
+  }
 
   /// Get the total number of constraints needed to fully *resolve*
   /// dependent levels on tensor[tid].
@@ -430,18 +444,15 @@
   genSliceLvlTraverseLoop(OpBuilder &builder, Location loc, Value pLo,
                           Value pHi, Value offset, Value size, TensorId tid,
                           Level lvl, ValueRange userReduc, bool genYield,
-                          /*bodyBuilder=*/
-                          llvm::function_ref<void(OpBuilder &, Location, Value,
-                                                  MutableArrayRef<Value>)>);
+                          LoopBodyBuilder bodyBuilder);
 
   /// Generates a nested loop that iterates over tid on all the coordinates on
   /// lvl.
-  ValueRange genUnResolvedSliceTreeTraverse(
-      OpBuilder &builder, Location loc, Value offset, TensorId tid, Level lvl,
-      size_t depth, ValueRange userReduc,
-      /*bodyBody=*/
-      llvm::function_ref<void(OpBuilder &, Location, Value,
-                              MutableArrayRef<Value>)>);
+  ValueRange
+  genUnResolvedSliceTreeTraverse(OpBuilder &builder, Location loc, TensorId tid,
+                                 ArrayRef<const SliceInfo *> unResLvls,
+                                 ValueRange userReduc,
+                                 LoopBodyBuilder bodyBuilder);
 
   /// Generates code to get the first non-empty slice of tid on lvl, when all
   /// the previous level before `lvl` are resolved (or lvl is the first level).
@@ -459,6 +470,11 @@
   void genUnResolvedSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
                                Level lvl);
 
+  /// Invalidates the index kept in slice position buffers (by setting it to
+  /// zero).
+  /// TODO: We should instead use an SSA value for the index.
+  void invalidateSliceIterIdx(OpBuilder &builder, Location loc, TensorId tid,
+                              Level lvl);
   /// Generates code to get the first non-empty slice of tid on lvl.
   /// return true if has already been resolved.
bool genSliceBegin(OpBuilder &builder, Location loc, TensorId tid, Level lvl); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp @@ -29,6 +29,8 @@ (builder.create(loc, arith::CmpIPredicate::p, l, r) \ .getResult()) +#define ADDI(lhs, rhs) (builder.create(loc, lhs, rhs)) + #define C_IDX(v) (constantIndex(builder, loc, v)) /// Generates a pointer/index load from the sparse storage scheme. Narrower @@ -495,16 +497,16 @@ // There is an additional item in sliceStack for the input tensor. // assert(sliceResolvedConstraints[tid] + 1 == sliceStack[tid].size()); } else { - Value c1 = C_IDX(1); - Value c2 = C_IDX(2); - - // pIdx += 2, we finished the current lvl, advance the pointer index of - // the previous level by two to skip the [pLo, pHi] for current level. - // TODO: we could probably use an SSA value for it. - Value sPtrBuf = slicePosBuffer[tid][lvl].back(); - Value curP = genIndexLoad(builder, loc, sPtrBuf, c1); - Value nexP = builder.create(loc, curP, c2); - builder.create(loc, nexP, sPtrBuf, c1); + Value c1 = C_IDX(1), c2 = C_IDX(2); + if (!isDenseDLT(lvlTypes[tid][lvl])) { + // pIdx += 2, we finished the current lvl, advance the pointer index of + // the previous level by two to skip the [pLo, pHi] for current level. + // TODO: we could probably use an SSA value for it. + Value sPtrBuf = slicePosBuffer[tid][lvl].back(); + Value curP = genIndexLoad(builder, loc, sPtrBuf, c1); + Value nexP = builder.create(loc, curP, c2); + builder.create(loc, nexP, sPtrBuf, c1); + } } } loopSeqStack.pop_back(); @@ -542,11 +544,9 @@ } } -Operation *LoopEmitter::emitForLoopOverTensorAtLvl(OpBuilder &builder, - Location loc, TensorId tid, - Level dstLvl, - MutableArrayRef reduc, - bool isParallel) { +Operation *LoopEmitter::emitForLoopOverTensorAtLvl( + OpBuilder &builder, Location loc, TensorId tid, Level dstLvl, Value lo, + Value hi, MutableArrayRef reduc, bool isParallel) { bool isSparseCond = isCompressedDLT(lvlTypes[tid][dstLvl]) || isSingletonDLT(lvlTypes[tid][dstLvl]); @@ -556,9 +556,6 @@ // biggest range). const Level srcLvl = reassoc.front(); Value step = C_IDX(1); - Value lo = isSparseCond ? posits[tid][srcLvl] // current offset - : loopSeqStack.back().first; // universal index - Value hi = highs[tid][srcLvl]; Operation *loop = nullptr; Value iv; @@ -678,32 +675,31 @@ ArrayRef lvls, MutableArrayRef reduc, bool isParallel) { // TODO: support multiple return on parallel for? assert(!isParallel || reduc.size() <= 1); - bool isSparseCond = false, isSliceCond = false; + bool isSparseCond = false, isSparseSliceCond = false; size_t tid = tids.front(), lvl = lvls.front(); - for (auto [t, l] : llvm::zip(tids, lvls)) { assert(lvlTypes[t].size() > l); // Must be a valid tid, dim pair assert(!coords[t][l] || // We cannot re-enter the same level !dependentLvlMap[t][l].empty()); // unless it is a slice-driver loop - auto dimType = lvlTypes[t][l]; + auto lvlType = lvlTypes[t][l]; // Must be a recognizable DLT. - assert(isDenseDLT(dimType) || isCompressedDLT(dimType) || - isSingletonDLT(dimType)); + assert(isDenseDLT(lvlType) || isCompressedDLT(lvlType) || + isSingletonDLT(lvlType)); - // This is a slice-driven loop. - if (!dependentLvlMap[t][l].empty()) { - assert(!isSliceCond && !isSparseCond); - isSliceCond = true; + // This is a slice-driven loop on sparse level. 
+ if (!dependentLvlMap[t][l].empty() && !isDenseDLT(lvlType)) { + assert(!isSparseSliceCond && !isSparseCond); + isSparseSliceCond = true; tid = t; lvl = l; continue; } - bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType); + bool isSparse = isCompressedDLT(lvlType) || isSingletonDLT(lvlType); // We can at most have one sparse input, otherwise, a while loop is // required to co-iterate multiple sparse tensors. assert(!isSparseCond || !isSparse); - assert(!isSliceCond || !isSparseCond); + assert(!isSparseSliceCond || !isSparseCond); if (isSparse) { tid = t; lvl = l; @@ -711,8 +707,24 @@ isSparseCond = isSparseCond || isSparse; } + DimLevelType lvlType = lvlTypes[tid][lvl]; + // TODO: Dense slice driven loop can be generated using for loop as well. + assert(!isSparseSliceCond || !isDenseDLT(lvlType)); + bool isDenseSliceCond = + isDenseDLT(lvlType) && !dependentLvlMap[tid][lvl].empty(); + // if the slice is fully reduced, we can now use TACO-based algorithm to + // iterate it. Operation *l = nullptr; - if (isSliceCond) { + + // At most one tensor used as condition in for loop; + SmallVector condTid; + SmallVector condLvl; + // There Might be multiple dense slice driven tensor. + SmallVector sliceTids; + SmallVector sliceLvls; + SmallVector sliceReduc; + + if (isSparseSliceCond) { bool fullyReduced = depFullyReduced(tid, lvl); if (!fullyReduced) { l = emitSliceDrivenLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc); @@ -725,22 +737,63 @@ lvl, reduc); } levelReducedDep[tid][lvl]++; - // NOTE: we can also prepare for next dim here in advance - // Pushes the loop into stack. - loopStack.emplace_back( - ArrayRef(), ArrayRef(), ArrayRef(tid), - ArrayRef(lvl), ArrayRef(fullyReduced), l, - builder.getInsertionBlock(), coords[tid][lvl], loopTag); + sliceTids.push_back(tid); + sliceLvls.push_back(lvl); + sliceReduc.push_back(fullyReduced); } else { - l = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc, isParallel); - // NOTE: we can also prepare for next dim here in advance - // Pushes the loop into stack. - loopStack.emplace_back(ArrayRef(tid), ArrayRef(lvl), - ArrayRef(), ArrayRef(), - ArrayRef(), l, builder.getInsertionBlock(), - coords[tid][lvl], loopTag); + Value lo = isSparseCond ? posits[tid][lvl] // current offset + : loopSeqStack.back().first; // universal index + Value hi = highs[tid][lvl]; + if (isDenseSliceCond) { + bool fullyReduced = depFullyReduced(tid, lvl); + Value sliceSz = sliceSizes[tid][lvl][sliceStack[tid].back().depth - 1]; + // Adjust for loop hi for dense slice-driven loop. + if (fullyReduced) { + hi = sliceSz; + condTid.push_back(tid); + condLvl.push_back(lvl); + } else { + hi = builder.create(loc, lvlSizes[tid][lvl], sliceSz); + hi = builder.create(loc, hi, C_IDX(1)); + } + } else { + condTid.push_back(tid); + condLvl.push_back(lvl); + } + l = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, lo, hi, reduc, + isParallel); } - + Value iv = coords[tid][lvl]; + for (auto [t, l] : llvm::zip(tids, lvls)) { + // We only need to handle slice-driven loops on dense level here. + // If it is a slice-driven loop on sparse level, it needs a while loop to + // insert break statements, and it must have been handled correctly in L692. + if (!dependentLvlMap[t][l].empty() && isDenseDLT(lvlTypes[t][l])) { + // Pushes sliced levels to build correct LoopInfo. 
+ bool fullyReduc = depFullyReduced(t, l); + SliceInfo &info = sliceStack[t].back(); + if (fullyReduc) { + posits[t][l] = + genAddress(builder, loc, t, l, + builder.create(loc, info.offset, iv)); + } else { + // Puts sliced dense loop into LoopInfo so that LoopEmitter knows how to + // exit it. + sliceTids.push_back(t); + sliceLvls.push_back(l); + sliceReduc.push_back(fullyReduc); + // Update the slice information as we enter the new loop. + assert(*info.slicedOnLvl == l); + info.minCrd = info.offset = iv; + info.isNonEmpty = constantI1(builder, loc, true); + levelReducedDep[t][l]++; + } + } + } + // NOTE: we can also prepare for next dim here in advance + // Pushes the loop into stack. + loopStack.emplace_back(condTid, condLvl, sliceTids, sliceLvls, sliceReduc, l, + builder.getInsertionBlock(), iv, loopTag); // Emit extra locals. emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tids, lvls); return l; @@ -1098,6 +1151,10 @@ assert(tids.size() == lvls.size()); for (auto [tid, lvl] : llvm::zip(tids, lvls)) { if (isDenseDLT(lvlTypes[tid][lvl])) { + // Slice-driven dense level should have be handled already. + if (!dependentLvlMap[tid][lvl].empty()) + continue; + auto enc = getSparseTensorEncoding(tensors[tid].getType()); if (enc && !isSparseOutput(tid)) { bool validPos = lvl == 0 || posits[tid][lvl - 1]; @@ -1119,6 +1176,18 @@ MutableArrayRef reduc) { const LoopInfo &loopInfo = loopStack.back(); rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock); + for (auto [tid, lvl, reduced] : llvm::zip( + loopInfo.slicedTids, loopInfo.slicedLvls, loopInfo.sliceReduced)) { + SliceInfo &info = sliceStack[tid].back(); + assert(isDenseDLT(lvlTypes[tid][lvl])); + assert(*info.slicedOnLvl == lvl && !reduced); + (void)reduced; + // Resets slices pointers as the resolved slices are invalidated after we + // moves forward to the next slice. + invalidateSliceIterIdx(rewriter, loc, tid, lvl); + info.minCrd = info.offset = info.isNonEmpty = Value(); + levelReducedDep[tid][lvl]--; + } if (auto forOp = llvm::dyn_cast(loopInfo.loop)) { if (!reduc.empty()) { assert(reduc.size() == forOp.getNumResults()); @@ -1212,6 +1281,8 @@ unsigned delta = 0; for (auto [tid, lvl, resolved] : llvm::zip( loopInfo.slicedTids, loopInfo.slicedLvls, loopInfo.sliceReduced)) { + // TODO: handle dense. + assert(isCompressedDLT(lvlTypes[tid][lvl])); levelReducedDep[tid][lvl]--; if (!resolved) { genSliceNextInduction(builder, loc, whileOp, tid, lvl, operands, o); @@ -1321,12 +1392,11 @@ // Slice-driven loop related methods. 
//===----------------------------------------------------------------------===// -const LoopEmitter::SliceInfo &LoopEmitter::getFinalSliceOnLvl(TensorId tid, - Level lvl) { +const LoopEmitter::SliceInfo &LoopEmitter::getMostRecentSliceOnLvl(TensorId tid, + Level lvl) { for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie; it++) { if (it->slicedOnLvl == lvl) { - assert(it->depth == dependentLvlMap[tid][lvl].size() - 1); return *it; } } @@ -1346,9 +1416,7 @@ std::pair LoopEmitter::genSliceLvlTraverseLoop( OpBuilder &builder, Location loc, Value loopLo, Value loopHi, Value offset, Value size, TensorId tid, Level lvl, ValueRange userReduc, bool genYield, - llvm::function_ref)> - bodyBuilder) { + LoopBodyBuilder bodyBuilder) { Value c1 = C_IDX(1); Value sliceHi = builder.create(loc, offset, size); @@ -1416,40 +1484,106 @@ } ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse( - OpBuilder &builder, Location loc, Value offset, TensorId tid, Level lvl, - size_t depth, ValueRange userReduc, - llvm::function_ref)> - bodyBuilder) { - + OpBuilder &builder, Location loc, TensorId tid, + ArrayRef unResLvls, ValueRange userReduc, + LoopBodyBuilder bodyBuilder) { + // assert(unResLvls.size() == 1 && "TODO"); Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2); - // TODO: it only works on all compressed tensor. - Value sPtrBuf = slicePosBuffer[tid][lvl][depth]; - Value pSt = c2; // pointer starting index - Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize - - auto forOp = - scf::buildLoopNest( - builder, loc, pSt, mSz, c2, userReduc, - [this, c1, tid, lvl, offset, sPtrBuf, - bodyBuilder](OpBuilder &builder, Location loc, ValueRange ivs, - ValueRange iterArgs) -> scf::ValueVector { + const SliceInfo &frontSlice = *unResLvls.back(); + Level firstLvl = *frontSlice.slicedOnLvl; + assert(!lvlFullyResolved(tid, firstLvl) && "TODO"); + + // FIXME: it is not zero when the first level is fully resolved. + Value pos = c0; + OpBuilder::InsertPoint ip; + SmallVector innerArgs(userReduc.begin(), userReduc.end()); + scf::ForOp outterMost = nullptr; + if (!lvlFullyResolved(tid, firstLvl)) { + if (isCompressedDLT(lvlTypes[tid][firstLvl])) { + unsigned depth = frontSlice.depth - 1; + Value offset = frontSlice.offset; + Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth]; + Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize + outterMost = builder.create( + loc, c2, mSz, c2, innerArgs, + [this, c1, tid, firstLvl, offset, sPtrBuf, &ip, &pos, &innerArgs]( + OpBuilder &builder, Location loc, Value iv, ValueRange iterArgs) { // generate traversal for each level. 
- Value loopLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front()); - Value loopHi = genIndexLoad( - builder, loc, sPtrBuf, - builder.create(loc, ivs.front(), c1)); - return genSliceLvlTraverseLoop(builder, loc, loopLo, loopHi, offset, - sliceSizes[tid][lvl].back(), tid, - lvl, iterArgs, true, bodyBuilder) - .second; - }) - .loops.front(); + Value loopLo = genIndexLoad(builder, loc, sPtrBuf, iv); + Value loopHi = genIndexLoad(builder, loc, sPtrBuf, ADDI(iv, c1)); + ValueRange itArgs = + genSliceLvlTraverseLoop( + builder, loc, loopLo, loopHi, offset, + sliceSizes[tid][firstLvl].back(), tid, firstLvl, iterArgs, + false, + [&](OpBuilder &builder, Location, Value iv, + MutableArrayRef reduc) { + ip = builder.saveInsertionPoint(); + pos = iv; + innerArgs.assign(reduc.begin(), reduc.end()); + }) + .second; + builder.create(loc, itArgs); + }); + } else if (isDenseDLT(lvlTypes[tid][firstLvl])) { + assert(firstLvl == 0); // This must be the first level. + Value lb = frontSlice.offset; + Value sliceSz = + sliceSizes[tid][*frontSlice.slicedOnLvl][frontSlice.depth - 1]; + Value ub = ADDI(lb, sliceSz); + outterMost = builder.create( + loc, lb, ub, c1, innerArgs, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iterArgs) { + ip = builder.saveInsertionPoint(); + pos = iv; + innerArgs.assign(iterArgs.begin(), iterArgs.end()); + }); + } + // We generated the loop for the first slice above, now remove it. + unResLvls = unResLvls.drop_back(); + } + // Reset the insertion point into the loop body. + builder.restoreInsertionPoint(ip); + if (!unResLvls.empty()) { + // Fills in dense slices levels in between. + SmallVector lbs, ubs, steps, lvlSzs; + for (const SliceInfo *slice : llvm::reverse(unResLvls)) { + Level sliceLvl = *slice->slicedOnLvl; + assert(isDenseDLT(lvlTypes[tid][sliceLvl])); + Value offset = slice->offset; + Value sliceSz = sliceSizes[tid][sliceLvl][slice->depth - 1]; + lbs.push_back(offset); + ubs.push_back(builder.create(loc, offset, sliceSz)); + steps.push_back(c1); + lvlSzs.push_back(lvlSizes[tid][sliceLvl]); + } + auto denseNest = scf::buildLoopNest( + builder, loc, lbs, ubs, steps, innerArgs, + [&innerArgs, &lvlSzs, &pos, + bodyBuilder](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange iterArgs) -> scf::ValueVector { + for (auto em : llvm::enumerate(ivs)) { + // Linearizes postion: pos = (pos * lvlsize) + iv; + pos = builder.create(loc, pos, lvlSzs[em.index()]); + pos = builder.create(loc, pos, em.value()); + } + innerArgs.assign(iterArgs.begin(), iterArgs.end()); + // Generates user request loop body. + // TODO: we do not have to check inbound for dense levels + bodyBuilder(builder, loc, pos, innerArgs); + return innerArgs; + }); + builder.create(loc, denseNest.results); + } else { + // Generates user request loop body. + bodyBuilder(builder, loc, pos, innerArgs); + builder.create(loc, innerArgs); + } // Insert after current while operation. - builder.setInsertionPointAfter(forOp); - return forOp.getResults(); + builder.setInsertionPointAfter(outterMost); + return outterMost.getResults(); } void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc, @@ -1457,6 +1591,13 @@ assert(lvl == 0 && "TODO: handle non-first level"); Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2), c3 = C_IDX(3), c4 = C_IDX(4); + if (isDenseDLT(lvlTypes[tid][lvl])) { + // Dense slice begin is trivial. 
+ sliceStack[tid].emplace_back(/*minCoord=*/c0, /*offset=*/c0, + /*nonEmpty=*/constantI1(builder, loc, true), + lvl, /*depth=*/1); + return; + } Value size = sliceSizes[tid][0][0]; Value sPtrBuf = slicePosBuffer[tid][0][0]; Value pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1); @@ -1482,18 +1623,41 @@ void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc, TensorId tid, Level lvl) { - assert(isCompressedDLT(lvlTypes[tid][lvl])); Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2); - const SliceInfo &sliceInfo = sliceStack[tid].back(); - unsigned prevLvl = *sliceInfo.slicedOnLvl; - assert(lvl >= prevLvl); - // Either lvl = prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one + unsigned depth = levelReducedDep[tid][lvl]; + Value size = sliceSizes[tid][lvl][depth]; + // Dense slice begin is trivial + if (isDenseDLT(lvlTypes[tid][lvl])) { + sliceStack[tid].emplace_back(c0, c0, constantI1(builder, loc, false), lvl, + depth + 1); + return; + } + + assert(isCompressedDLT(lvlTypes[tid][lvl])); + // Unhandled Cases: + // + // 1st, lvl = prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one // variable need to be reduced on the same level). - // Or lvl > prevSliceLvl + 1, i.e., t[..., d2, d3 + d4] (having a + // + // 2nd, lvl > prevSliceLvl + 1, i.e., t[..., d2, d3 + d4] (having a // simple dim expression in between). - assert(lvl == prevLvl + 1 && "TODO: not yet implemented"); + assert(lvl == *sliceStack[tid].back().slicedOnLvl + 1); + // Check slice stack integrity. - assert(slicePosBuffer[tid][prevLvl].size() == sliceInfo.depth); + assert(slicePosBuffer[tid][lvl - 1].size() == sliceStack[tid].back().depth); + + SmallVector unResSlices; + for (Level curLvl = lvl; curLvl >= 1; curLvl--) { + Level prevLvl = curLvl - 1; + unResSlices.push_back(&getMostRecentSliceOnLvl(tid, prevLvl)); + if (!isDenseDLT(lvlTypes[tid][prevLvl]) || lvlFullyResolved(tid, prevLvl)) { + break; + } + } + + assert(!unResSlices.empty() && + !lvlFullyResolved(tid, *unResSlices.front()->slicedOnLvl)); + Value sPtrBuf = slicePosBuffer[tid][lvl].back(); SmallVector reduc = { constantI1(builder, loc, false), // isNonEmpty @@ -1502,7 +1666,7 @@ }; ValueRange result = genUnResolvedSliceTreeTraverse( - builder, loc, sliceInfo.offset, tid, prevLvl, sliceInfo.depth - 1, reduc, + builder, loc, tid, unResSlices, reduc, [this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc, Value iv, MutableArrayRef reduc) { @@ -1543,8 +1707,6 @@ curMemSz = builder.create(loc, curMemSz, c2); }); - unsigned depth = levelReducedDep[tid][lvl]; - Value size = sliceSizes[tid][lvl][depth]; Value isNonEmpty = result[0]; Value minCrd = result[1]; // Two metadata [memSize, idx]. @@ -1561,6 +1723,10 @@ Value c1 = C_IDX(1), c2 = C_IDX(2); if (depFullyReduced(tid, lvl)) { + // Do not need to prepare for slice driven loop on dense level after it is + // fully reduced. + if (isDenseDLT(lvlTypes[tid][lvl])) + return true; // If constraints on the tensor is fully resolved. We do not need to // generates slice begin any more, instead we fall back to TACO-based // algorithm to (co)iterates over the slice. @@ -1589,8 +1755,7 @@ const SliceInfo &sliceInfo = sliceStack[tid].back(); auto baseEnc = getSparseTensorEncoding(tensors[tid].getType()); - if (baseEnc.isSlice()) - llvm_unreachable("TODO: not yet implemented"); + assert(!baseEnc.isSlice() && "TODO: not yet implemented"); // Generate caches required to fast compute next-non-empty slices with // increasing offset for slice-base loop. 
// We do not need cache for dense levels. @@ -1639,6 +1804,16 @@ return false; } +void LoopEmitter::invalidateSliceIterIdx(OpBuilder &builder, Location loc, + TensorId tid, Level lvl) { + for (unsigned i = 0; i <= lvl; i++) { + if (!isDenseDLT(lvlTypes[tid][i])) { + builder.create(loc, C_IDX(0), + slicePosBuffer[tid][i].back(), C_IDX(1)); + } + } +} + void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc, const Operation *op, TensorId tid, Level lvl, @@ -1648,14 +1823,11 @@ llvm_unreachable("TODO"); // else generate code to compute next non empty slice. - Value c0 = C_IDX(0); - Value c1 = C_IDX(1); - Value c2 = C_IDX(2); + Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2); auto whileOp = llvm::cast(op); SliceInfo &info = sliceStack[tid].back(); assert(info.slicedOnLvl == lvl); - // // We forward to the next non empty slice by // if (minCrd > offset) { @@ -1671,8 +1843,7 @@ Value absOffset = info.offset; // Resets slices pointers as the resolved slices are invalidated after we // moves forward to the next slice. - for (unsigned i = 0; i <= lvl; i++) - builder.create(loc, c0, slicePosBuffer[tid][i].back(), c1); + invalidateSliceIterIdx(builder, loc, tid, lvl); SmallVector reduc = {info.minCrd, info.isNonEmpty, absOffset}; Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1]; @@ -1870,4 +2041,5 @@ } #undef CMPI +#undef ADDI #undef C_IDX diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -1481,11 +1481,11 @@ std::optional lvl, DimLevelType dlt, bool isIdxReduc) { assert(env.merger().loop(b) == idx); - // FIXME: Dense index reduction can reuse the universal index as well. - if (!isIdxReduc && (isDenseDLT(dlt) || isUndefDLT(dlt))) { + if (isDenseDLT(dlt) || isUndefDLT(dlt)) { needsUniv = true; - } else { - // sparse/singleton levels. + } + if (isCompressedDLT(dlt) || isSingletonDLT(dlt) || isIdxReduc) { + // sparse/singleton levels, or a dense/sparse index reduction loop. tids.push_back(tid); lvls.push_back(*lvl); } @@ -1578,7 +1578,7 @@ tids.push_back(tid); lvls.push_back(*lvl); numloopCond++; - } else if (isDenseDLT(dlt)) { + } else if (isDenseDLT(dlt) || isIdxReduc) { tids.push_back(tid); lvls.push_back(*lvl); } else { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir @@ -1,9 +1,4 @@ -// UNSUPPORTED: target={{.*}} -// FIXME: The test case is disabled (for now) because affine index on sparse tensor -// are not handled efficiently by sparse compiler, the test case will be re-enabled -// after new algorithm is implemented. - -// DEFINE: %{option} = enable-runtime-library=true +// DEFINE: %{option} = "enable-runtime-library=true enable-index-reduction=true" // DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option} // DEFINE: %{run} = mlir-cpu-runner \ // DEFINE: -e entry -entry-point-result=void \ @@ -13,16 +8,16 @@ // RUN: %{compile} | %{run} // // Do the same run, but now with direct IR generation. 
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true" +// REDEFINE: %{option} = "enable-runtime-library=false enable-index-reduction=true" // RUN: %{compile} | %{run} // // Do the same run, but now with direct IR generation and vectorization. -// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true" +// REDEFINE: %{option} = "enable-runtime-library=false enable-index-reduction=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true" // RUN: %{compile} | %{run} // Do the same run, but now with direct IR generation and, if available, VLA // vectorization. -// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA" +// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-index-reduction=true enable-arm-sve=%ENABLE_VLA" // REDEFINE: %{run} = %lli \ // REDEFINE: --entry-function=entry_lli \ // REDEFINE: --extra-module=%S/Inputs/main_for_lli.ll \ @@ -33,6 +28,7 @@ #DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }> #CSR = #sparse_tensor.encoding<{dimLevelType = ["dense", "compressed"]}> +#CDR = #sparse_tensor.encoding<{dimLevelType = ["compressed", "dense"]}> #CSC = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], dimOrdering = affine_map<(i,j) -> (j,i)> @@ -42,46 +38,55 @@ module { func.func @conv2d(%input: tensor<8x8xi32>, - %filter: tensor<3x3xi32, #DCSR>, + %filter: tensor<3x3xi32>, %output: tensor<6x6xi32>) -> tensor<6x6xi32> { %0 = linalg.conv_2d - ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) + ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32>) outs (%output: tensor<6x6xi32>) -> tensor<6x6xi32> return %0 : tensor<6x6xi32> } func.func @conv2d_sparse_out(%input: tensor<8x8xi32>, - %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> { + %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> { %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> %0 = linalg.conv_2d - ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>) + ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32>) outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> return %0 : tensor<6x6xi32, #DCSR> } func.func @conv2d_all_sparse_DCSR(%input: tensor<8x8xi32, #DCSR>, - %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> { + %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> { %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> %0 = linalg.conv_2d - ins (%input, %filter: tensor<8x8xi32, #DCSR>, tensor<3x3xi32, #DCSR>) + ins (%input, %filter: tensor<8x8xi32, #DCSR>, tensor<3x3xi32>) outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> return %0 : tensor<6x6xi32, #DCSR> } func.func @conv2d_all_sparse_CSR(%input: tensor<8x8xi32, #CSR>, - %filter: tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR> { + %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #CSR> { %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSR> %0 = linalg.conv_2d - ins (%input, %filter: tensor<8x8xi32, #CSR>, tensor<3x3xi32, #CSR>) + ins (%input, %filter: tensor<8x8xi32, #CSR>, tensor<3x3xi32>) outs (%s: tensor<6x6xi32, #CSR>) -> tensor<6x6xi32, #CSR> return %0 : tensor<6x6xi32, #CSR> } + func.func @conv2d_all_sparse_CD(%input: tensor<8x8xi32, #CDR>, + %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #CDR> { + %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CDR> + %0 = linalg.conv_2d + ins (%input, %filter: tensor<8x8xi32, #CDR>, tensor<3x3xi32>) + outs (%s: 
tensor<6x6xi32, #CDR>) -> tensor<6x6xi32, #CDR> + return %0 : tensor<6x6xi32, #CDR> + } + func.func @conv2d_all_sparse_CSC(%input: tensor<8x8xi32, #CSC>, - %filter: tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC> { + %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #CSC> { %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSC> %0 = linalg.conv_2d - ins (%input, %filter: tensor<8x8xi32, #CSC>, tensor<3x3xi32, #CSC>) + ins (%input, %filter: tensor<8x8xi32, #CSC>, tensor<3x3xi32>) outs (%s: tensor<6x6xi32, #CSC>) -> tensor<6x6xi32, #CSC> return %0 : tensor<6x6xi32, #CSC> } @@ -96,12 +101,6 @@ [ 0, 0, 0 ], [ -1, 0, 1 ] ]> : tensor<3x3xi32> - %sparse_filter_DCSR = sparse_tensor.convert %filter - : tensor<3x3xi32> to tensor<3x3xi32, #DCSR> - %sparse_filter_CSR = sparse_tensor.convert %filter - : tensor<3x3xi32> to tensor<3x3xi32, #CSR> - %sparse_filter_CSC = sparse_tensor.convert %filter - : tensor<3x3xi32> to tensor<3x3xi32, #CSC> %input = arith.constant dense<[ @@ -118,26 +117,31 @@ : tensor<8x8xi32> to tensor<8x8xi32, #DCSR> %sparse_input_CSR = sparse_tensor.convert %input : tensor<8x8xi32> to tensor<8x8xi32, #CSR> + %sparse_input_CD = sparse_tensor.convert %input + : tensor<8x8xi32> to tensor<8x8xi32, #CDR> %sparse_input_CSC = sparse_tensor.convert %input : tensor<8x8xi32> to tensor<8x8xi32, #CSC> // Call the kernel. %output = arith.constant dense<0> : tensor<6x6xi32> - %0 = call @conv2d(%input, %sparse_filter_DCSR, %output) + %0 = call @conv2d(%input, %filter, %output) : (tensor<8x8xi32>, - tensor<3x3xi32, #DCSR>, tensor<6x6xi32>) -> tensor<6x6xi32> - %1 = call @conv2d_sparse_out(%input, %sparse_filter_DCSR) + tensor<3x3xi32>, tensor<6x6xi32>) -> tensor<6x6xi32> + %1 = call @conv2d_sparse_out(%input, %filter) : (tensor<8x8xi32>, - tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> - %2 = call @conv2d_all_sparse_DCSR(%sparse_input_DCSR, %sparse_filter_DCSR) + tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> + %2 = call @conv2d_all_sparse_DCSR(%sparse_input_DCSR, %filter) : (tensor<8x8xi32, #DCSR>, - tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> - %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %sparse_filter_CSR) + tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> + %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %filter) : (tensor<8x8xi32, #CSR>, - tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR> - %4 = call @conv2d_all_sparse_CSC(%sparse_input_CSC, %sparse_filter_CSC) + tensor<3x3xi32>) -> tensor<6x6xi32, #CSR> + %4 = call @conv2d_all_sparse_CD(%sparse_input_CD, %filter) + : (tensor<8x8xi32, #CDR>, + tensor<3x3xi32>) -> tensor<6x6xi32, #CDR> + %5 = call @conv2d_all_sparse_CSC(%sparse_input_CSC, %filter) : (tensor<8x8xi32, #CSC>, - tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC> + tensor<3x3xi32>) -> tensor<6x6xi32, #CSC> // Verify the output. 
@@ -183,6 +187,21 @@ : tensor<6x6xi32>, vector<6x6xi32> vector.print %v2 : vector<6x6xi32> + // + // Should be the same as dense output + // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), + // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), + // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), + // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), + // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), + // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) + // + %all_sparse_CD = sparse_tensor.convert %4 + : tensor<6x6xi32, #CDR> to tensor<6x6xi32> + %v4 = vector.transfer_read %all_sparse_CD[%c0, %c0], %i0 + : tensor<6x6xi32>, vector<6x6xi32> + vector.print %v4 : vector<6x6xi32> + // // Should be the same as dense output // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), @@ -207,25 +226,23 @@ // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) // - %all_sparse_CSC = sparse_tensor.convert %4 + %all_sparse_CSC = sparse_tensor.convert %5 : tensor<6x6xi32, #CSC> to tensor<6x6xi32> - %v4 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0 + %v5 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0 : tensor<6x6xi32>, vector<6x6xi32> - vector.print %v4 : vector<6x6xi32> + vector.print %v5 : vector<6x6xi32> // Release the resources. - bufferization.dealloc_tensor %sparse_filter_DCSR : tensor<3x3xi32, #DCSR> - bufferization.dealloc_tensor %sparse_filter_CSR : tensor<3x3xi32, #CSR> - bufferization.dealloc_tensor %sparse_filter_CSC : tensor<3x3xi32, #CSC> - bufferization.dealloc_tensor %sparse_input_DCSR : tensor<8x8xi32, #DCSR> bufferization.dealloc_tensor %sparse_input_CSR : tensor<8x8xi32, #CSR> bufferization.dealloc_tensor %sparse_input_CSC : tensor<8x8xi32, #CSC> + bufferization.dealloc_tensor %sparse_input_CD : tensor<8x8xi32, #CDR> bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR> bufferization.dealloc_tensor %2 : tensor<6x6xi32, #DCSR> bufferization.dealloc_tensor %3 : tensor<6x6xi32, #CSR> - bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CSC> + bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CDR> + bufferization.dealloc_tensor %5 : tensor<6x6xi32, #CSC> return } } diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_slice_based.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_slice_based.mlir deleted file mode 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_slice_based.mlir +++ /dev/null @@ -1,81 +0,0 @@ -// DEFINE: %{option} = "enable-index-reduction=true enable-runtime-library=false" -// DEFINE: %{command} = mlir-opt %s --sparse-compiler=%{option} | \ -// DEFINE: mlir-cpu-runner \ -// DEFINE: -e entry -entry-point-result=void \ -// DEFINE: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ -// DEFINE: FileCheck %s -// -// RUN: %{command} - -#map = affine_map<(d0, d1, d2, d3) -> (d0 + d2, d1 + d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d2, d3)> -#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> - -#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }> - -module { - func.func @conv2d_all_sparse_CSR(%arg0: tensor<8x8xi32, #DCSR>, %arg1: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> { - %0 = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR> - %1 = linalg.generic { - indexing_maps = [#map, #map1, #map2], - iterator_types = ["parallel", "parallel", "reduction", "reduction"]} - ins(%arg0, %arg1 : tensor<8x8xi32, #DCSR>, tensor<3x3xi32>) - outs(%0 : tensor<6x6xi32, #DCSR>) { - ^bb0(%in: i32, %in_0: i32, %out: i32): - %2 = arith.muli %in, %in_0 : i32 - %3 = arith.addi %out, %2 : i32 - linalg.yield %3 : i32 - } -> tensor<6x6xi32, #DCSR> - 
return %1 : tensor<6x6xi32, #DCSR> - } - - func.func @entry() { - %c0 = arith.constant 0 : index - %i0 = arith.constant 0 : i32 - - // A typical edge detection filter. - %filter = arith.constant dense<[ - [ 1, 0, -1 ], - [ 0, 0, 0 ], - [ -1, 0, 1 ] - ]> : tensor<3x3xi32> - - %input = arith.constant dense<[ - [ 1, 2, 3, 4, 0, 6, 7, 8 ], - [ 2, 2, 4, 4, 0, 0, 6, 8 ], - [ 2, 2, 4, 4, 0, 0, 6, 8 ], - [ 2, 2, 3, 4, 0, 0, 7, 8 ], - [ 1, 3, 3, 4, 0, 0, 6, 8 ], - [ 3, 2, 3, 4, 0, 0, 7, 8 ], - [ 1, 3, 3, 4, 3, 6, 6, 8 ], - [ 1, 3, 3, 4, 3, 0, 7, 8 ] - ]> : tensor<8x8xi32> - - %sparse_filter_CSR = sparse_tensor.convert %filter - : tensor<3x3xi32> to tensor<3x3xi32> - - %sparse_input_CSR = sparse_tensor.convert %input - : tensor<8x8xi32> to tensor<8x8xi32, #DCSR> - - %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %sparse_filter_CSR) - : (tensor<8x8xi32, #DCSR>, - tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> - - %out = sparse_tensor.convert %3 - : tensor<6x6xi32, #DCSR> to tensor<6x6xi32> - // - // CHECK: ( ( 0, 0, -1, -6, -1, 6 ), - // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ), - // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ), - // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ), - // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ), - // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) ) - // - %v2 = vector.transfer_read %out[%c0, %c0], %i0 - : tensor<6x6xi32>, vector<6x6xi32> - vector.print %v2 : vector<6x6xi32> - - return - } - -} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir @@ -1,9 +1,4 @@ -// UNSUPPORTED: target={{.*}} -// FIXME: The test case is disabled (for now) because affine index on sparse tensor -// are not handled efficiently by sparse compiler, the test case will be re-enabled -// after new algorithm is implemented. - -// DEFINE: %{option} = enable-runtime-library=true +// DEFINE: %{option} = "enable-index-reduction=true enable-runtime-library=true" // DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option} // DEFINE: %{run} = mlir-cpu-runner \ // DEFINE: -e entry -entry-point-result=void \ @@ -13,16 +8,16 @@ // RUN: %{compile} | %{run} // // Do the same run, but now with direct IR generation. -// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true" +// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true" // RUN: %{compile} | %{run} // // Do the same run, but now with direct IR generation and vectorization. -// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true" +// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true" // RUN: %{compile} | %{run} // Do the same run, but now with direct IR generation and, if available, VLA // vectorization. 
-// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA" +// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true" // REDEFINE: %{run} = %lli \ // REDEFINE: --entry-function=entry_lli \ // REDEFINE: --extra-module=%S/Inputs/main_for_lli.ll \ @@ -39,6 +34,10 @@ dimLevelType = [ "compressed", "dense", "compressed" ] }> +#DDC = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "compressed", "compressed" ] +}> + // Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor { %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor @@ -53,24 +52,33 @@ return %ret : tensor } -func.func @conv_3d_CCC(%arg0: tensor, %arg1: tensor) -> tensor { +func.func @conv_3d_CCC(%arg0: tensor, %arg1: tensor) -> tensor { %c6 = arith.constant 6 : index %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor %ret = linalg.conv_3d - ins (%arg0, %arg1: tensor, tensor) + ins (%arg0, %arg1: tensor, tensor) outs (%s: tensor) -> tensor return %ret : tensor } -func.func @conv_3d_CDC(%arg0: tensor, %arg1: tensor) -> tensor { +func.func @conv_3d_CDC(%arg0: tensor, %arg1: tensor) -> tensor { %c6 = arith.constant 6 : index %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor %ret = linalg.conv_3d - ins (%arg0, %arg1: tensor, tensor) + ins (%arg0, %arg1: tensor, tensor) outs (%s: tensor) -> tensor return %ret : tensor } +func.func @conv_3d_DDC(%arg0: tensor, %arg1: tensor) -> tensor { + %c6 = arith.constant 6 : index + %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor + %ret = linalg.conv_3d + ins (%arg0, %arg1: tensor, tensor) + outs (%s: tensor) -> tensor + return %ret : tensor +} + func.func @entry() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -88,17 +96,15 @@ %in3D_CCC = sparse_tensor.convert %in3D : tensor to tensor - %filter3D_CCC = sparse_tensor.convert %filter3D - : tensor to tensor - %in3D_CDC = sparse_tensor.convert %in3D : tensor to tensor - %filter3D_CDC = sparse_tensor.convert %filter3D - : tensor to tensor + %in3D_DDC = sparse_tensor.convert %in3D + : tensor to tensor %dense_ret = call @conv_3d(%in3D, %filter3D, %out3D) : (tensor, tensor, tensor) -> (tensor) - %CCC_ret = call @conv_3d_CCC(%in3D_CCC, %filter3D_CCC) : (tensor, tensor) -> (tensor) - %CDC_ret = call @conv_3d_CDC(%in3D_CDC, %filter3D_CDC) : (tensor, tensor) -> (tensor) + %CCC_ret = call @conv_3d_CCC(%in3D_CCC, %filter3D) : (tensor, tensor) -> (tensor) + %CDC_ret = call @conv_3d_CDC(%in3D_CDC, %filter3D) : (tensor, tensor) -> (tensor) + %DDC_ret = call @conv_3d_DDC(%in3D_DDC, %filter3D) : (tensor, tensor) -> (tensor) // CHECK:( ( ( 108, 108, 108, 108, 108, 108 ), // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), @@ -224,18 +230,59 @@ : tensor, vector<6x6x6xf32> vector.print %v2 : vector<6x6x6xf32> + // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // 
CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), + // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) + %3 = sparse_tensor.convert %DDC_ret + : tensor to tensor + %v3 = vector.transfer_read %3[%c0, %c0, %c0], %zero + : tensor, vector<6x6x6xf32> + vector.print %v2 : vector<6x6x6xf32> + // Free the resources bufferization.dealloc_tensor %in3D : tensor bufferization.dealloc_tensor %filter3D : tensor bufferization.dealloc_tensor %out3D : tensor bufferization.dealloc_tensor %in3D_CDC : tensor - bufferization.dealloc_tensor %filter3D_CDC : tensor bufferization.dealloc_tensor %in3D_CCC : tensor - bufferization.dealloc_tensor %filter3D_CCC : tensor + bufferization.dealloc_tensor %in3D_DDC : tensor bufferization.dealloc_tensor %CCC_ret : tensor bufferization.dealloc_tensor %CDC_ret : tensor - + bufferization.dealloc_tensor %DDC_ret : tensor return } diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_slice_based.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_slice_based.mlir deleted file mode 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_slice_based.mlir +++ /dev/null @@ -1,97 +0,0 @@ -// DEFINE: %{option} = "enable-index-reduction=true enable-runtime-library=false" -// DEFINE: %{command} = mlir-opt %s --sparse-compiler=%{option} | \ -// DEFINE: mlir-cpu-runner \ -// DEFINE: -e entry -entry-point-result=void \ -// DEFINE: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \ -// DEFINE: FileCheck %s -// -// RUN: %{command} - -#CCC = #sparse_tensor.encoding<{ - dimLevelType = [ "compressed", "compressed", "compressed" ] -}> - -func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor { - %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor - %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor - return %ret : tensor -} - -func.func @conv_3d_CCC(%arg0: tensor, %arg1: tensor) -> tensor { - %c6 = arith.constant 6 : index - %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor - %ret = linalg.conv_3d - ins (%arg0, %arg1: tensor, tensor) - outs (%s: tensor) -> tensor - return %ret : tensor -} - -func.func @entry() { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c6 = arith.constant 6 : index - %c8 = arith.constant 8 : index - %f10 = arith.constant 10.00000e+00 : f32 - %val = arith.constant 2.00000e+00 : f32 - 
%zero = arith.constant 0.00000e+00 : f32 - - %filter3D = call @alloc_3d_filled_f32(%c3, %c3, %c3, %val) : (index, index, index, f32) -> (tensor) - %in3D_tmp = call @alloc_3d_filled_f32(%c8, %c8, %c8, %val) : (index, index, index, f32) -> (tensor) - %in3D = tensor.insert %f10 into %in3D_tmp[%c0, %c3, %c0] : tensor - %out3D = call @alloc_3d_filled_f32(%c6, %c6, %c6, %zero) : (index, index, index, f32) -> (tensor) - - %in3D_CCC = sparse_tensor.convert %in3D - : tensor to tensor - %CCC_ret = call @conv_3d_CCC(%in3D_CCC, %filter3D) : (tensor, tensor) -> (tensor) - // CHECK: ( ( ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 124, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), - // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), - // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), - // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), - // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ), - // CHECK-SAME: ( ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), - // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) - %1 = sparse_tensor.convert %CCC_ret - : tensor to tensor - %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero - : tensor, vector<6x6x6xf32> - vector.print %v1 : vector<6x6x6xf32> - - // Free the resources - bufferization.dealloc_tensor %in3D : tensor - bufferization.dealloc_tensor %filter3D : tensor - - bufferization.dealloc_tensor %in3D_CCC : tensor - bufferization.dealloc_tensor %CCC_ret : tensor - - return -}
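
Note on the dense slice-driven scheme used throughout this patch: below is a minimal scalar sketch (plain C++ with illustrative names, not the LoopEmitter API) of what the emitted loops compute for a 1-D dense level of size d0 and a filter of size sz. The outer loop enumerates slice offsets over [0, d0 - sz + 1), mirroring the `hi = lvlSize - sliceSz + 1` bound used while the slice is not yet fully reduced; the inner loop visits the coordinates offset + j inside the slice, mirroring the `hi = sliceSz` bound once the slice is fully reduced; and the dense position is linearized as p = (i * d0) + j, as described for genAddress.

#include <cstdio>
#include <vector>

// Illustrative scalar model of a dense slice-driven loop nest (not LoopEmitter code).
int main() {
  const int d0 = 8, sz = 3; // dense level size, slice (filter) size
  std::vector<int> input = {1, 2, 3, 4, 0, 6, 7, 8};
  std::vector<int> filter = {1, 0, -1};
  std::vector<int> output(d0 - sz + 1, 0);

  // Outer loop over slice offsets: hi = d0 - sz + 1 while the slice is unresolved.
  for (int offset = 0; offset < d0 - sz + 1; ++offset) {
    // Inner loop within the slice: hi = sz once the slice is fully reduced.
    for (int j = 0; j < sz; ++j) {
      int coord = offset + j;   // absolute coordinate on the dense level
      int pos = 0 * d0 + coord; // linearized address, p = (i * d0) + j, with parent pos 0
      output[offset] += input[pos] * filter[j];
    }
  }
  for (int v : output)
    std::printf("%d ", v);
  std::printf("\n");
  return 0;
}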