diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SparseTensor/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/SparseTensor/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ SparseStorageSpecifierToLLVM.cpp SparseTensorCodegen.cpp SparseTensorConversion.cpp + SparseTensorLoopEmitter.cpp SparseTensorPasses.cpp SparseTensorRewriting.cpp SparseTensorStorageLayout.cpp diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h @@ -14,6 +14,7 @@ #define MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_CODEGENENV_H_ #include "CodegenUtils.h" +#include "SparseTensorLoopEmitter.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" @@ -22,7 +23,6 @@ namespace mlir { namespace sparse_tensor { - /// The code generation environment class aggregates a number of data /// structures that are needed during the code generation phase of /// sparsification. This environment simplifies passing around such @@ -45,9 +45,9 @@ linalg::GenericOp op() const { return linalgOp; } const SparsificationOptions &options() const { return sparseOptions; } Merger &merger() { return latticeMerger; } - SparseTensorLoopEmitter *emitter() { return loopEmitter; } + LoopEmitter *emitter() { return loopEmitter; } - void startEmit(OpOperand *so, unsigned lv, SparseTensorLoopEmitter *le); + void startEmit(OpOperand *so, unsigned lv, LoopEmitter *le); /// Generates loop boundary statements (entering/exiting loops). The function /// passes and updates the passed-in parameters. @@ -135,7 +135,7 @@ // Loop emitter helper class (keep reference in scope!). // TODO: move emitter constructor up in time? - SparseTensorLoopEmitter *loopEmitter; + LoopEmitter *loopEmitter; // Topological sort. std::vector topSort; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp @@ -24,8 +24,7 @@ expFilled(), expAdded(), expCount(), redVal(), redExp(-1u), redCustom(-1u) {} -void CodegenEnv::startEmit(OpOperand *so, unsigned lv, - SparseTensorLoopEmitter *le) { +void CodegenEnv::startEmit(OpOperand *so, unsigned lv, LoopEmitter *le) { assert(sparseOut == nullptr && loopEmitter == nullptr && insChain == nullptr && "must only start emitting once"); sparseOut = so; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h @@ -313,274 +313,6 @@ return !rtp || rtp.getRank() == 0; } -//===----------------------------------------------------------------------===// -// SparseTensorLoopEmiter class, manages sparse tensors and helps to -// generate loop structure to (co)-iterate sparse tensors. 
-// -// An example usage: -// To generate the following loops over T1 and T2 -// -// for i in TENSOR_1_0 { -// for j : TENSOR_2_0 { -// for k : TENSOR_1_1 {} -// for k : TENSOR_2_1 {} -// } -// } -// -// One can use -// -// SparseTensorLoopEmiter loopEmiter({T1, T1}); -// loopEmiter.initializeLoopEmit(); -// loopEmiter.enterLoopOverTensorAtDim(T1, 0); -// loopEmiter.enterLoopOverTensorAtDim(T2, 0); -// loopEmiter.enterLoopOverTensorAtDim(T1, 1); -// loopEmiter.exitCurrentLoop(); -// loopEmiter.enterLoopOverTensorAtDim(T2, 1); -// loopEmiter.exitCurrentLoop(); // exit k -// loopEmiter.exitCurrentLoop(); // exit j -// loopEmiter.exitCurrentLoop(); // exit i -//===----------------------------------------------------------------------===// - -class SparseTensorLoopEmitter { -public: - /// Optional callback function to setup dense output tensors when - /// initializing the loop emitter (e.g., to fill a dense output with zeros). - using OutputUpdater = function_ref; - - SparseTensorLoopEmitter() = default; - - /// Takes an array of tensors inputs, on which the generated loops will - /// iterate on. The index of the tensor in the array is also the tensor id - /// (tid) used in related functions. If isSparseOut is set, loop emitter - /// assume that the sparse output tensor is empty, and will always generate - /// loops on it based on the dim sizes. An optional array could be provided - /// (by sparsification) to indicate the loop id sequence that will be - /// generated. It is used to establish the mapping between affineDimExpr to - /// the corresponding loop index in the loop stack that are maintained by the - /// loop emitter. - void initialize(ValueRange tensors, StringAttr loopTag = nullptr, - bool hasOutput = false, bool isSparseOut = false, - ArrayRef topSort = {}); - - explicit SparseTensorLoopEmitter(ValueRange tensors, - StringAttr loopTag = nullptr, - bool hasOutput = false, - bool isSparseOut = false, - ArrayRef topSort = {}); - - /// Starts a loop emitting session by generating all the buffers needed to - /// iterate tensors. - void initializeLoopEmit(OpBuilder &builder, Location loc, - OutputUpdater updater = nullptr); - - /// Generates a list of operations to compute the affine expression. - Value genAffine(OpBuilder &builder, AffineExpr a, Location loc); - - /// Enters a new loop sequence, the loops within the same sequence starts - /// from the break points of previous loop instead of starting over from 0. - /// e.g., - /// { - /// // loop sequence start. - /// p0 = while(xxx) - /// ... - /// break p0 - /// - /// // Starts loop from p0 - /// for (i = p0; i < end; i++) - /// ... - /// // loop sequence end. - /// } - void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef tids, - ArrayRef dims); - - // exit the current loop sequence, this will reset universal index to 0. - void exitCurrentLoopSeq() { - assert(loopSeqStack.size() == loopStack.size() + 1); - loopSeqStack.pop_back(); - } - - // TODO: Gets rid of `dim` in the argument list? Track the dimension we - // are currently at internally. Then it would be enterNextDimForTensor. - // Still need a way to specify the dim for non annoated dense tensor though, - // as it can be accessed out of order. - /// Emits loop over tensor_tid_dim, it assumes that loops between - /// tensor_tid_[0, dim - 1] have already been generated. - /// The function will also perform in-place update on the `reduc` vector to - /// return the reduction variable used inside the generated loop. 
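Note: the usage comment above (repeated verbatim in the new header added later in this patch) only sketches the call sequence. Under the signatures this patch declares, a driver might look roughly like the sketch below. It is schematic, not a standalone program: the names rewriter, loc, t1 and t2 are assumptions, the class is referred to by its new name LoopEmitter, tensor ids are simply positions in the constructor operand list, and each loop level is wrapped in its own loop sequence the way sparsification does; none of this code appears in the patch itself.

// Schematic driver only; assumes an MLIR RewriterBase `rewriter`, a Location
// `loc`, and two ranked sparse tensor SSA values `t1`, `t2` in the caller.
#include "SparseTensorLoopEmitter.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

static void emitExampleLoops(RewriterBase &rewriter, Location loc, Value t1,
                             Value t2) {
  // Tensor ids are positions in the operand list: t1 -> 0, t2 -> 1.
  LoopEmitter emitter(ValueRange{t1, t2});
  emitter.initializeLoopEmit(rewriter, loc);

  // for i in TENSOR_1_0 {
  emitter.enterNewLoopSeq(rewriter, loc, /*tids=*/{0}, /*dims=*/{0});
  emitter.enterLoopOverTensorAtDim(rewriter, loc, {0}, {0});
  //   for j in TENSOR_2_0 {
  emitter.enterNewLoopSeq(rewriter, loc, {1}, {0});
  emitter.enterLoopOverTensorAtDim(rewriter, loc, {1}, {0});
  //     for k in TENSOR_1_1 {}
  emitter.enterNewLoopSeq(rewriter, loc, {0}, {1});
  emitter.enterLoopOverTensorAtDim(rewriter, loc, {0}, {1});
  emitter.exitCurrentLoop(rewriter, loc);
  emitter.exitCurrentLoopSeq();
  //     for k in TENSOR_2_1 {}
  emitter.enterNewLoopSeq(rewriter, loc, {1}, {1});
  emitter.enterLoopOverTensorAtDim(rewriter, loc, {1}, {1});
  emitter.exitCurrentLoop(rewriter, loc);
  emitter.exitCurrentLoopSeq();
  //   } exit j
  emitter.exitCurrentLoop(rewriter, loc);
  emitter.exitCurrentLoopSeq();
  // } exit i
  emitter.exitCurrentLoop(rewriter, loc);
  emitter.exitCurrentLoopSeq();
}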
- Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc, - ArrayRef tids, - ArrayRef dims, - MutableArrayRef reduc = {}, - bool isParallel = false); - - Operation *enterFilterLoopOverTensorAtDim(OpBuilder &builder, Location loc, - size_t tid, size_t dim, - AffineExpr affine, - MutableArrayRef reduc = {}); - - void genDenseAffineAddressAtCurLevel(OpBuilder &builder, Location loc, - size_t tid, size_t dim, - AffineExpr affine); - - /// Emits a co-iteration loop over a set of tensors. - Operation *enterCoIterationOverTensorsAtDims( - OpBuilder &builder, Location loc, ArrayRef tids, - ArrayRef dims, bool needsUniv, MutableArrayRef reduc = {}); - - void exitCurrentLoop(RewriterBase &rewriter, Location loc, - MutableArrayRef reduc = {}); - - /// Returns the array of coordinate for all the loop generated till now. - void getCoordinateArray(SmallVectorImpl &coords) const { - for (auto &l : loopStack) - coords.push_back(l.iv); - } - - /// Gets loop induction variable at the given level. - unsigned getCurrentDepth() const { return loopStack.size(); } - - /// Gets loop induction variable at the given level. - Value getLoopIV(size_t level) const { - if (level < loopStack.size()) - return loopStack[level].iv; - return nullptr; - } - - /// - /// Getters. - /// - const std::vector> &getPidxs() const { return pidxs; }; - const std::vector> &getCoord() const { return coord; }; - const std::vector> &getHighs() const { return highs; }; - const std::vector> &getPtrBuffer() const { - return ptrBuffer; - }; - const std::vector> &getIdxBuffer() const { - return idxBuffer; - }; - const std::vector &getValBuffer() const { return valBuffer; }; - - constexpr static llvm::StringLiteral getLoopEmitterLoopAttrName() { - return llvm::StringLiteral("Emitted from"); - } - -private: - struct LoopLevelInfo { - LoopLevelInfo(ArrayRef tids, ArrayRef dims, Operation *loop, - Value iv, StringAttr loopTag) - : tids(tids), dims(dims), loop(loop), iv(iv) { - // Attached a special tag to loop emitter generated loop. - if (loopTag) - loop->setAttr(SparseTensorLoopEmitter::getLoopEmitterLoopAttrName(), - loopTag); - } - // TODO: maybe use a vector for tid and dim? - // The set of tensors that the loop is operating on - const llvm::SmallVector tids; - // The corresponding dims for the tensors - const llvm::SmallVector dims; - const Operation *loop; // the loop operation - const Value iv; // the induction variable for the loop - }; - - /// Linearizes address for dense dimension (i.e., p = (i * d0) + j). - Value genAddress(OpBuilder &builder, Location loc, size_t tid, size_t dim, - Value iv) { - Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1]; - Value mul = builder.create(loc, highs[tid][dim], p); - Value add = builder.create(loc, mul, iv); - return add; - } - - bool isOutputTensor(size_t tid) { - return hasOutput && tid == tensors.size() - 1; - } - - bool isSparseOutput(size_t tid) { return isOutputTensor(tid) && isSparseOut; } - - /// Setups [lo, hi] for iterating tensor[dim], it assumes that tensor[0 - /// ...dims-1] has already been setup. - void prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid, - size_t dim); - - /// Emits extra locals, since the locals might not be in simplified lattices - /// point used to generate the loops, but are still required to generates - /// expressions. 
- void emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder, Location loc, - ArrayRef tids, - ArrayRef dims); - - /// Exits a for loop, returns the reduction results, e.g., - /// For sequential for loops: - /// %ret = for () { - /// ... - /// %val = addi %args, %c - /// yield %val - /// } - /// For parallel loops, the following generated code by users: - /// %ret = parallel () init(%args) { - /// ... - /// %val = op %args, %c - /// } - /// will be transformed into - /// %ret = parallel () init(%args) { - /// ... - /// scf.reduce(%c) bb0(%0, %1){ - /// %val = op %0, %1 - /// scf.reduce.return %val - /// } - /// } - /// NOTE: only one instruction will be moved into reduce block, - /// transformation will fail if multiple instructions are used to compute - /// the reduction value. Return %ret to user, while %val is provided by - /// users (`reduc`). - void exitForLoop(RewriterBase &rewriter, Location loc, - MutableArrayRef reduc); - - /// Exits a while loop, returns the reduction results. - void exitCoIterationLoop(OpBuilder &builder, Location loc, - MutableArrayRef reduc); - - /// A optional string attribute that should be attached to the loop - /// generated by loop emitter, it might help following passes to identify - /// loops that operates on sparse tensors more easily. - StringAttr loopTag; - /// Whether the loop emitter needs to treat the last tensor as the output - /// tensor. - bool hasOutput; - bool isSparseOut; - /// Input and (optional) output tensors. - std::vector tensors; - /// The dim type array for each tensor. - std::vector> dimTypes; - /// Sparse iteration information (by tensor and dim). These arrays - /// are updated to remain current within the current loop. - std::vector> pidxs; - std::vector> coord; - std::vector> highs; - std::vector> ptrBuffer; // to_pointers - std::vector> idxBuffer; // to_indices - std::vector valBuffer; // to_value - - // Loop Stack, stores the information of all the nested loops that are - // alive. - std::vector loopStack; - - // Loop Sequence Stack, stores the unversial index for the current loop - // sequence. - std::vector loopSeqStack; - - // Maps AffineDimExpr to the index of the loop in loopStack. - // TODO: We should probably use a callback function here to make it more - // general. - std::vector sparsiferLoopLvlMap; - - // TODO: not yet used, it should track the current level for each tensor - // to help eliminate `dim` paramters from above APIs. - // std::vector curLv; -}; - } // namespace sparse_tensor } // namespace mlir diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "CodegenUtils.h" +#include "SparseTensorLoopEmitter.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" @@ -21,25 +22,6 @@ using namespace mlir; using namespace mlir::sparse_tensor; -/// Generates a pointer/index load from the sparse storage scheme. Narrower -/// data types need to be zero extended before casting the value into the -/// index type used for looping and indexing. -static Value genIndexLoad(OpBuilder &builder, Location loc, Value ptr, - Value s) { - // For the scalar case, we simply zero extend narrower indices into 64-bit - // values before casting to index without a performance penalty. 
Here too, - // however, indices that already are 64-bit, in theory, cannot express the - // full range as explained above. - Value load = builder.create(loc, ptr, s); - if (!load.getType().isa()) { - if (load.getType().getIntOrFloatBitWidth() < 64) - load = builder.create(loc, builder.getI64Type(), load); - load = - builder.create(loc, builder.getIndexType(), load); - } - return load; -} - /// If the tensor is a sparse constant, generates and returns the pair of /// the constants for the indices and the values. static Optional> @@ -90,652 +72,6 @@ return val; } -//===----------------------------------------------------------------------===// -// Sparse tensor loop emitter class implementations -//===----------------------------------------------------------------------===// - -SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors, - StringAttr loopTag, - bool hasOutput, - bool isSparseOut, - ArrayRef topSort) { - initialize(tensors, loopTag, hasOutput, isSparseOut, topSort); -} - -void SparseTensorLoopEmitter::initialize(ValueRange tensors, StringAttr loopTag, - bool hasOutput, bool isSparseOut, - ArrayRef topSort) { - // First initializes fields. - this->loopTag = loopTag; - this->hasOutput = hasOutput; - this->isSparseOut = isSparseOut; - this->tensors.assign(tensors.begin(), tensors.end()); - this->dimTypes.assign(tensors.size(), std::vector()); - this->pidxs.assign(tensors.size(), std::vector()); - this->coord.assign(tensors.size(), std::vector()); - this->highs.assign(tensors.size(), std::vector()); - this->ptrBuffer.assign(tensors.size(), std::vector()); - this->idxBuffer.assign(tensors.size(), std::vector()); - this->valBuffer.assign(tensors.size(), nullptr); - this->loopStack.reserve(topSort.size()); - this->sparsiferLoopLvlMap.assign(topSort.size(), 0); - - for (size_t tid = 0, e = tensors.size(); tid < e; tid++) { - auto t = tensors[tid]; - // a scalar or 0-dimension tensors - if (isZeroRankedTensorOrScalar(t.getType())) - continue; - auto rtp = t.getType().cast(); - auto rank = static_cast(rtp.getRank()); - auto enc = getSparseTensorEncoding(rtp); - // We always treat sparse output tensor as dense so that we always iterate - // it based on dim size. - if (enc && !(isOutputTensor(tid) && isSparseOut)) - for (auto dimTp : enc.getDimLevelType()) - dimTypes[tid].push_back(dimTp); - else - dimTypes[tid].assign(rank, DimLevelType::Dense); - - // Initialize using empty value. - pidxs[tid].assign(rank, Value()); - coord[tid].assign(rank, Value()); - highs[tid].assign(rank, Value()); - ptrBuffer[tid].assign(rank, Value()); - idxBuffer[tid].assign(rank, Value()); - } - - // FIXME: This map should be maintained outside loop emitter. - for (unsigned i = 0, e = topSort.size(); i < e; i++) { - // This is an inverse map of the topologically sorted loop index from - // sparsifier. This is needed to map the AffineDimExpr back to the loopStack - // index used in loop emitter. - sparsiferLoopLvlMap[topSort[i]] = i; - } -} - -void SparseTensorLoopEmitter::initializeLoopEmit( - OpBuilder &builder, Location loc, - SparseTensorLoopEmitter::OutputUpdater updater) { - // For every tensor, find lower and upper bound on dimensions, set the - // same bounds on loop indices, and obtain dense or sparse buffer(s). - for (size_t t = 0, e = tensors.size(); t < e; t++) { - auto tensor = tensors[t]; - auto rtp = tensor.getType().dyn_cast(); - if (!rtp) - // Skips only scalar, zero ranked tensor still need to be bufferized and - // (probably) filled with zeros by users. 
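Note: the genIndexLoad helper deleted from CodegenUtils.cpp above reappears unchanged in the new SparseTensorLoopEmitter.cpp; its job is to widen a narrow stored pointer/index value to the machine index type before it is used for looping and subscripting. A minimal standalone analogue, assuming a 16-bit overhead type (the name loadIndex is hypothetical):

#include <cstddef>
#include <cstdint>
#include <vector>

// Standalone model of genIndexLoad: sparse storage may keep positions and
// indices in a narrow integer type, but loop bounds and subscripts are
// computed in the machine index type, so every load is widened first.
static size_t loadIndex(const std::vector<uint16_t> &buffer, size_t p) {
  // The zero-extension is implicit in this integral conversion; the MLIR
  // version materializes it as arith.extui followed by arith.index_cast.
  return static_cast<size_t>(buffer[p]);
}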
- continue; - auto rank = rtp.getRank(); - auto shape = rtp.getShape(); - auto enc = getSparseTensorEncoding(rtp); - auto dynShape = {ShapedType::kDynamic}; - // Scan all dimensions of current tensor. - for (int64_t d = 0; d < rank; d++) { - // This should be called only once at beginning. - assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !highs[t][d]); - // Handle sparse storage schemes. - if (isCompressedDLT(dimTypes[t][d])) { - auto ptrTp = - MemRefType::get(dynShape, getPointerOverheadType(builder, enc)); - auto indTp = - MemRefType::get(dynShape, getIndexOverheadType(builder, enc)); - auto dim = builder.getIndexAttr(d); - // Generate sparse primitives to obtains pointer and indices. - ptrBuffer[t][d] = builder.create(loc, ptrTp, tensor, dim); - idxBuffer[t][d] = builder.create(loc, indTp, tensor, dim); - } else if (isSingletonDLT(dimTypes[t][d])) { - // Singleton dimension, fetch indices. - auto indTp = - MemRefType::get(dynShape, getIndexOverheadType(builder, enc)); - auto dim = builder.getIndexAttr(d); - idxBuffer[t][d] = builder.create(loc, indTp, tensor, dim); - } else { - // Dense dimension, nothing to fetch. - assert(isDenseDLT(dimTypes[t][d])); - } - - // Find upper bound in current dimension. - unsigned p = toOrigDim(enc, d); - Value up = mlir::linalg::createOrFoldDimOp(builder, loc, tensor, p); - highs[t][d] = up; - } - - // Perform the required bufferization. Dense inputs materialize - // from the input tensors. Sparse inputs use sparse primitives to obtain the - // values. - // Delegates extra output initialization to clients. - bool isOutput = isOutputTensor(t); - Type elementType = rtp.getElementType(); - if (!enc) { - // Non-annotated dense tensors. - auto denseTp = MemRefType::get(shape, elementType); - Value denseVal = - builder.create(loc, denseTp, tensor); - // Dense outputs need special handling. - if (isOutput && updater) - denseVal = updater(builder, loc, denseVal, tensor); - - valBuffer[t] = denseVal; - } else { - // Annotated sparse tensors. - // We also need the value buffer for annotated all dense `sparse` tensor. - auto dynShape = {ShapedType::kDynamic}; - auto sparseTp = MemRefType::get(dynShape, elementType); - valBuffer[t] = builder.create(loc, sparseTp, tensor); - } - // NOTE: we can also prepare for 0 dim here in advance, this will hosit - // some loop preparation from tensor iteration, but will also (undesirably) - // hosit the code ouside if conditions. - } -} - -void SparseTensorLoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc, - ArrayRef tids, - ArrayRef dims) { - // Universal Index start from 0 - assert(loopSeqStack.size() == loopStack.size()); - // Universal index starts from 0 - loopSeqStack.emplace_back(constantIndex(builder, loc, 0)); - // Prepares for all the tensors used in the current loop sequence. 
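Note: initializeLoopEmit above materializes one to_pointers, to_indices and values buffer per dimension, and prepareLoopOverTensorAtDim (further down) later turns a parent position into the half-open position range of a compressed child level. A plain-C++ sketch of that storage scheme for a single CSR-like level, with hypothetical names CompressedLevel and sumRow:

#include <cstddef>
#include <vector>

// Plain-C++ model of one compressed level, mirroring the ptrBuffer /
// idxBuffer / valBuffer roles above: for a parent position p, the children
// occupy positions [pointers[p], pointers[p + 1]), and each position carries
// a coordinate and a value.
struct CompressedLevel {
  std::vector<size_t> pointers; // ptrBuffer: to_pointers
  std::vector<size_t> indices;  // idxBuffer: to_indices
  std::vector<double> values;   // valBuffer: to_value
};

static double sumRow(const CompressedLevel &lvl, size_t parentPos) {
  size_t pLo = lvl.pointers[parentPos];     // what prepareLoopOverTensorAtDim
  size_t pHi = lvl.pointers[parentPos + 1]; // loads into pidxs / highs
  double sum = 0.0;
  for (size_t p = pLo; p < pHi; ++p) {
    size_t coord = lvl.indices[p]; // coordinate recovered from the index array
    (void)coord;                   // a real kernel would use the coordinate
    sum += lvl.values[p];
  }
  return sum;
}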
- for (auto [tid, dim] : llvm::zip(tids, dims)) - prepareLoopOverTensorAtDim(builder, loc, tid, dim); -} - -Value SparseTensorLoopEmitter::genAffine(OpBuilder &builder, AffineExpr a, - Location loc) { - switch (a.getKind()) { - case AffineExprKind::DimId: { - unsigned idx = a.cast().getPosition(); - return loopStack[sparsiferLoopLvlMap[idx]].iv; - } - case AffineExprKind::Add: { - auto binOp = a.cast(); - return builder.create( - loc, genAffine(builder, binOp.getLHS(), loc), - genAffine(builder, binOp.getRHS(), loc)); - } - case AffineExprKind::Mul: { - auto binOp = a.cast(); - return builder.create( - loc, genAffine(builder, binOp.getLHS(), loc), - genAffine(builder, binOp.getRHS(), loc)); - } - case AffineExprKind::Constant: { - int64_t c = a.cast().getValue(); - return constantIndex(builder, loc, c); - } - default: - llvm_unreachable("unexpected affine subscript"); - } -} - -Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim( - OpBuilder &builder, Location loc, ArrayRef tids, - ArrayRef dims, MutableArrayRef reduc, bool isParallel) { - // TODO: support multiple return on parallel for? - assert(!isParallel || reduc.size() <= 1); - - bool isSparseInput = false; - size_t tid = tids.front(), dim = dims.front(); - for (auto [t, d] : llvm::zip(tids, dims)) { - assert(dimTypes[t].size() > d); // Must be a valid tid, dim pair - assert(!coord[t][d]); // We cannot re-enter the same level - auto dimType = dimTypes[t][d]; - // Must be a recognizable DLT. - assert(isDenseDLT(dimType) || isCompressedDLT(dimType) || - isSingletonDLT(dimType)); - bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType); - // We can at most have one sparse input, otherwise, a while loop is required - // to co-iterate multiple sparse tensors. - assert(!isSparseInput || !isSparse); - if (isSparse) { - tid = t; - dim = d; - } - isSparseInput = isSparseInput || isSparse; - } - - Value step = constantIndex(builder, loc, 1); - Value lo = isSparseInput ? pidxs[tid][dim] // current offset - : loopSeqStack.back(); // univeral tid - Value hi = highs[tid][dim]; - Operation *loop = nullptr; - Value iv; - if (isParallel) { - scf::ParallelOp parOp = - builder.create(loc, lo, hi, step, reduc); - builder.setInsertionPointToStart(parOp.getBody()); - assert(parOp.getNumReductions() == reduc.size()); - iv = parOp.getInductionVars()[0]; - - // In-place update on the reduction variable vector. - // Note that the init vals is not the actual reduction variables but instead - // used as a `special handle` to (temporarily) represent them. The - // expression on init vals will be moved into scf.reduce and replaced with - // the block arguments when exiting the loop (see exitForLoop). This is - // needed as we can not build the actual reduction block and get the actual - // reduction varaible before users fill parallel loop body. - for (int i = 0, e = reduc.size(); i < e; i++) - reduc[i] = parOp.getInitVals()[i]; - loop = parOp; - } else { - scf::ForOp forOp = builder.create(loc, lo, hi, step, reduc); - builder.setInsertionPointToStart(forOp.getBody()); - iv = forOp.getInductionVar(); - - // In-place update on the reduction variable vector. - assert(forOp.getNumRegionIterArgs() == reduc.size()); - for (int i = 0, e = reduc.size(); i < e; i++) - reduc[i] = forOp.getRegionIterArg(i); - loop = forOp; - } - assert(loop && iv); - - if (isSparseInput) { - pidxs[tid][dim] = iv; - // Generating a load on the indices array yields the coordinate. 
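Note: genAffine above lowers an affine subscript recursively, mapping each dimension through sparsiferLoopLvlMap to the induction variable of the corresponding loop on the stack. A standalone model of that evaluation over a tiny expression tree (AffineNode and evalAffine are hypothetical names):

#include <cstddef>
#include <cstdint>
#include <vector>

// Plain-C++ model of genAffine's recursion: Dim nodes read the induction
// variable of the loop that the level map points at, and Add / Mul / Cst
// nodes become the corresponding arithmetic.
struct AffineNode {
  enum Kind { Dim, Add, Mul, Cst } kind;
  int64_t value = 0;                       // dim position or constant value
  const AffineNode *lhs = nullptr, *rhs = nullptr;
};

static int64_t evalAffine(const AffineNode &a, const std::vector<int64_t> &ivs,
                          const std::vector<size_t> &loopLvlMap) {
  switch (a.kind) {
  case AffineNode::Dim:
    return ivs[loopLvlMap[a.value]]; // loopStack[sparsiferLoopLvlMap[idx]].iv
  case AffineNode::Add:
    return evalAffine(*a.lhs, ivs, loopLvlMap) +
           evalAffine(*a.rhs, ivs, loopLvlMap);
  case AffineNode::Mul:
    return evalAffine(*a.lhs, ivs, loopLvlMap) *
           evalAffine(*a.rhs, ivs, loopLvlMap);
  case AffineNode::Cst:
    return a.value;
  }
  return 0; // unreachable for well-formed nodes
}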
- Value ptr = idxBuffer[tid][dim]; - coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv); - } else { - // Dense tensor, the coordinates is the inducation variable. - coord[tid][dim] = iv; - } - // NOTE: we can also prepare for next dim here in advance - // Push the loop into stack - loopStack.emplace_back(ArrayRef(tid), ArrayRef(dim), loop, - coord[tid][dim], loopTag); - // Emit extra locals. - emitExtraLocalsForTensorsAtDenseDims(builder, loc, tids, dims); - - return loop; -} - -Operation *SparseTensorLoopEmitter::enterFilterLoopOverTensorAtDim( - OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine, - MutableArrayRef reduc) { - assert(!affine.isa() && !isDenseDLT(dimTypes[tid][dim])); - assert(dimTypes[tid].size() > dim); - // We can not re-enter the same level. - assert(!coord[tid][dim]); - - Value step = constantIndex(builder, loc, 1); - - Value lo = pidxs[tid][dim]; - Value hi = highs[tid][dim]; - - // TODO: We should instead use a whileOp for filter loop to allow early - // break when exceeding (for ordered dimensions). - // TODO: There are many other potiential opportunities that we might apply in - // the future. E.g., we could use binary search to located the pointer index. - scf::ForOp forOp = builder.create(loc, lo, hi, step, reduc); - - // In-place update on the reduction variable vector. - assert(forOp.getNumRegionIterArgs() == reduc.size()); - for (int i = 0, e = reduc.size(); i < e; i++) - reduc[i] = forOp.getRegionIterArg(i); - - builder.setInsertionPointToStart(forOp.getBody()); - Value iv = forOp.getInductionVar(); - - pidxs[tid][dim] = iv; - // Generating a load on the indices array yields the coordinate. - Value ptr = idxBuffer[tid][dim]; - coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv); - - // Generate an if condition to filter out indices that is not equal to the - // result of the affine expression. - Value expected = genAffine(builder, affine, loc); - auto pred = builder.create(loc, arith::CmpIPredicate::eq, - coord[tid][dim], expected); - SmallVector types; - for (Value red : reduc) { - types.push_back(red.getType()); - } - - bool hasReduc = !types.empty(); - scf::IfOp ifOp = - builder.create(loc, types, pred, /*else*/ hasReduc); - if (hasReduc) { - // scf.for (a) -> v - // %s = scf.if (a) -> v - // user-generated code. - // else - // yield a - // yield %s - builder.create(loc, ifOp.getResults()); - builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - // On mismatch. - builder.create(loc, reduc); - } - // Set the insert point to matched branch. - builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - - // NOTE: we can also prepare for next dim here in advance - // Push the loop into stack - loopStack.emplace_back(ArrayRef(tid), ArrayRef(dim), forOp, - coord[tid][dim], nullptr); - return forOp; -} - -void SparseTensorLoopEmitter::genDenseAffineAddressAtCurLevel( - OpBuilder &builder, Location loc, size_t tid, size_t dim, - AffineExpr affine) { - Value affineV = genAffine(builder, affine, loc); - pidxs[tid][dim] = genAddress(builder, loc, tid, dim, affineV); -} - -Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims( - OpBuilder &builder, Location loc, ArrayRef tids, - ArrayRef dims, bool needsUniv, MutableArrayRef reduc) { - assert(tids.size() == dims.size()); - SmallVector types; - SmallVector operands; - // Construct the while-loop with a parameter for each index. 
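Note: enterFilterLoopOverTensorAtDim above scans an entire position range and guards the loop body with an equality test between the stored coordinate and the value of the affine subscript, threading any reduction through both branches of the generated scf.if. A standalone analogue with one reduction value and no early break (matching the TODO about unordered dimensions); filterSum is a hypothetical name:

#include <cstddef>
#include <vector>

// Model of the filter loop: visit every position in [lo, hi), load the stored
// coordinate, and only fold positions whose coordinate equals the expected
// affine value; other positions forward the running reduction unchanged.
static double filterSum(const std::vector<size_t> &indices,
                        const std::vector<double> &values, size_t lo,
                        size_t hi, size_t expected) {
  double reduc = 0.0;
  for (size_t p = lo; p < hi; ++p) {
    size_t coord = indices[p];
    if (coord == expected) // the generated scf.if on coord == affine value
      reduc += values[p];  // "then" branch: user code updates the reduction
    // "else" branch simply yields the incoming reduction unchanged.
  }
  return reduc;
}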
- Type indexType = builder.getIndexType(); - for (auto [tid, dim] : llvm::zip(tids, dims)) { - if (isCompressedDLT(dimTypes[tid][dim]) || - isSingletonDLT(dimTypes[tid][dim])) { - assert(pidxs[tid][dim]); - types.push_back(indexType); - operands.push_back(pidxs[tid][dim]); - } - } - // The position where user-supplied reduction variable starts. - for (Value rec : reduc) { - types.push_back(rec.getType()); - operands.push_back(rec); - } - if (needsUniv) { - types.push_back(indexType); - // Update universal index. - operands.push_back(loopSeqStack.back()); - } - assert(types.size() == operands.size()); - scf::WhileOp whileOp = builder.create(loc, types, operands); - - SmallVector locs(types.size(), loc); - Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs); - Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs); - - // Build the "before" region, which effectively consists - // of a conjunction of "i < upper" tests on all induction. - builder.setInsertionPointToStart(&whileOp.getBefore().front()); - Value cond; - unsigned o = 0; - for (auto [tid, dim] : llvm::zip(tids, dims)) { - if (isCompressedDLT(dimTypes[tid][dim]) || - isSingletonDLT(dimTypes[tid][dim])) { - Value op1 = before->getArgument(o); - Value op2 = highs[tid][dim]; - Value opc = builder.create(loc, arith::CmpIPredicate::ult, - op1, op2); - cond = cond ? builder.create(loc, cond, opc) : opc; - // Update - pidxs[tid][dim] = after->getArgument(o++); - } - } - builder.create(loc, cond, before->getArguments()); - - // Generates while body. - builder.setInsertionPointToStart(&whileOp.getAfter().front()); - Value min; - for (auto [tid, dim] : llvm::zip(tids, dims)) { - // Prepares for next level. - if (isCompressedDLT(dimTypes[tid][dim]) || - isSingletonDLT(dimTypes[tid][dim])) { - Value ptr = idxBuffer[tid][dim]; - Value s = pidxs[tid][dim]; - Value load = genIndexLoad(builder, loc, ptr, s); - coord[tid][dim] = load; - if (!needsUniv) { - if (min) { - Value cmp = builder.create( - loc, arith::CmpIPredicate::ult, load, min); - min = builder.create(loc, cmp, load, min); - } else { - min = load; - } - } - } - } - - if (needsUniv) { - assert(!min); - // Otherwise, universal index is the minimal pidx. - min = after->getArguments().back(); - } - - // Sets up the loop stack. - loopStack.emplace_back(tids, dims, whileOp, min, loopTag); - assert(loopStack.size() == loopSeqStack.size()); - - // Emits extra locals - emitExtraLocalsForTensorsAtDenseDims(builder, loc, tids, dims); - - // Updates reduction variables - assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0)); - // In-place update on reduction variable. - for (unsigned i = 0, e = reduc.size(); i < e; i++) - reduc[i] = after->getArgument(o + i); - - return whileOp; -} - -void SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, - Location loc, - size_t tid, - size_t dim) { - assert(dimTypes[tid].size() > dim); - auto dimType = dimTypes[tid][dim]; - - if (isDenseDLT(dimType)) - return; - - // Either the first dimension, or the previous dimension has been set. - assert(dim == 0 || pidxs[tid][dim - 1]); - Value c0 = constantIndex(builder, loc, 0); - Value c1 = constantIndex(builder, loc, 1); - if (isCompressedDLT(dimType)) { - Value ptr = ptrBuffer[tid][dim]; - - Value pLo = dim == 0 ? 
c0 : pidxs[tid][dim - 1]; - pidxs[tid][dim] = genIndexLoad(builder, loc, ptr, pLo); - - Value pHi = builder.create(loc, pLo, c1); - highs[tid][dim] = genIndexLoad(builder, loc, ptr, pHi); - return; - } - if (isSingletonDLT(dimType)) { - Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1]; - Value pHi = builder.create(loc, pLo, c1); - - pidxs[tid][dim] = pLo; - highs[tid][dim] = pHi; - return; - } - - llvm_unreachable("Unrecognizable dimesion type!"); -} - -void SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDenseDims( - OpBuilder &builder, Location loc, ArrayRef tids, - ArrayRef dims) { - // Initialize dense positions. Note that we generate dense indices of the - // output tensor unconditionally, since they may not appear in the lattice, - // but may be needed for linearized codegen. - for (auto [tid, dim] : llvm::zip(tids, dims)) { - if (isDenseDLT(dimTypes[tid][dim])) { - auto enc = getSparseTensorEncoding(tensors[tid].getType()); - if (enc && !isSparseOutput(tid)) { - bool validPidx = dim == 0 || pidxs[tid][dim - 1]; - if (!validPidx) { - // We might not find the pidx for the sparse output tensor as it is - // unconditionally required by the sparsification. - assert(isOutputTensor(tid)); - continue; - } - pidxs[tid][dim] = - genAddress(builder, loc, tid, dim, loopStack.back().iv); - // NOTE: we can also prepare for next dim here in advance - } - } - } -} - -void SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc, - MutableArrayRef reduc) { - LoopLevelInfo &loopInfo = loopStack.back(); - auto &dims = loopStack.back().dims; - auto &tids = loopStack.back().tids; - auto forOp = llvm::dyn_cast(loopInfo.loop); - if (forOp) { - if (!reduc.empty()) { - assert(reduc.size() == forOp.getNumResults()); - rewriter.create(loc, reduc); - } - // Exit the loop. - rewriter.setInsertionPointAfter(forOp); - // In-place update reduction variables. - for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++) - reduc[i] = forOp.getResult(i); - } else { - auto parOp = llvm::cast(loopInfo.loop); - if (!reduc.empty()) { - assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1); - Operation *redExp = reduc.front().getDefiningOp(); - // Reduction expression should have no use. - assert(redExp->getUses().empty()); - // This must be a binary operation. - // NOTE: This is users' responsibilty to ensure the operation are - // commutative. - assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1); - - Value redVal = parOp.getInitVals().front(); - Value curVal; - if (redExp->getOperand(0) == redVal) - curVal = redExp->getOperand(1); - else if (redExp->getOperand(1) == redVal) - curVal = redExp->getOperand(0); - // One of the operands must be the init value (which is also the - // previous reduction value). - assert(curVal); - // The reduction expression should be the only user of the reduction val - // inside the parallel for. - unsigned numUsers = 0; - for (Operation *op : redVal.getUsers()) { - if (op->getParentOp() == parOp) - numUsers++; - } - assert(numUsers == 1); - (void)numUsers; // to silence unused variable warning in release build - - rewriter.setInsertionPointAfter(redExp); - auto redOp = rewriter.create(loc, curVal); - // Attach to the reduction op. - Block *redBlock = &redOp.getRegion().getBlocks().front(); - rewriter.setInsertionPointToEnd(redBlock); - Operation *newRed = rewriter.clone(*redExp); - // Replaces arguments of the reduction expression by using the block - // arguments from scf.reduce. 
- rewriter.updateRootInPlace( - newRed, [&]() { newRed->setOperands(redBlock->getArguments()); }); - // Erases the out-dated reduction expression. - rewriter.eraseOp(redExp); - rewriter.setInsertionPointToEnd(redBlock); - rewriter.create(loc, newRed->getResult(0)); - } - rewriter.setInsertionPointAfter(parOp); - // In-place update reduction variables. - for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++) - reduc[i] = parOp.getResult(i); - } - - // Finished iterating a tensor, clean up - // We only do the clean up on for loop as while loops do not necessarily - // finish the iteration on a sparse tensor - for (auto [tid, dim] : llvm::zip(tids, dims)) { - // Reset to null. - coord[tid][dim] = Value(); - pidxs[tid][dim] = Value(); - // Dense dimension, high is fixed. - if (!isDenseDLT(dimTypes[tid][dim])) - highs[tid][dim] = Value(); - } -} - -void SparseTensorLoopEmitter::exitCoIterationLoop( - OpBuilder &builder, Location loc, MutableArrayRef reduc) { - auto whileOp = llvm::cast(loopStack.back().loop); - auto &dims = loopStack.back().dims; - auto &tids = loopStack.back().tids; - Value iv = loopStack.back().iv; - // Generation while loop induction at the end. - builder.setInsertionPointToEnd(&whileOp.getAfter().front()); - // Finalize the induction. Note that the induction could be performed - // in the individual if-branches to avoid re-evaluating the conditions. - // However, that would result in a rather elaborate forest of yield - // instructions during code generation. Moreover, performing the induction - // after the if-statements more closely resembles code generated by TACO. - unsigned o = 0; - SmallVector operands; - Value one = constantIndex(builder, loc, 1); - for (auto [tid, dim] : llvm::zip(tids, dims)) { - if (isCompressedDLT(dimTypes[tid][dim]) || - isSingletonDLT(dimTypes[tid][dim])) { - Value op1 = coord[tid][dim]; - Value op3 = pidxs[tid][dim]; - Value cmp = - builder.create(loc, arith::CmpIPredicate::eq, op1, iv); - Value add = builder.create(loc, op3, one); - operands.push_back(builder.create(loc, cmp, add, op3)); - // Following loops continue iteration from the break point of the - // current while loop. - pidxs[tid][dim] = whileOp->getResult(o++); - // The coordinates are invalid now. - coord[tid][dim] = nullptr; - // highs remains unchanged. - } - } - - // Reduction value from users. - for (auto &i : reduc) { - operands.push_back(i); - // In place update reduction variable. - i = whileOp->getResult(o++); - } - - // An (optional) universal index. - if (operands.size() < whileOp.getNumResults()) { - assert(operands.size() + 1 == whileOp.getNumResults()); - // The last one is the universial index. - operands.push_back(builder.create(loc, iv, one)); - // update the loop starting point of current loop sequence - loopSeqStack.back() = whileOp->getResult(o++); - } - - assert(o == operands.size()); - builder.create(loc, operands); - builder.setInsertionPointAfter(whileOp); -} - -void SparseTensorLoopEmitter::exitCurrentLoop(RewriterBase &rewriter, - Location loc, - MutableArrayRef reduc) { - // Clean up the values, it would help use to discover potential bug at a - // earlier stage (instead of silently using a wrong value). 
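Note: taken together, enterCoIterationOverTensorsAtDims and exitCoIterationLoop above implement a classic two-pointer merge: the while condition requires every sparse operand to still have positions left, the body operates on the minimum coordinate, and on exit exactly the operands whose coordinate matched that minimum advance by one position. A standalone sketch for two sparse operands, omitting the universal index (dotSparse is a hypothetical name):

#include <algorithm>
#include <cstddef>
#include <vector>

// Plain-C++ model of co-iterating two sorted coordinate streams, mirroring
// the while-loop construction and the conditional position increments that
// exitCoIterationLoop emits.
static double dotSparse(const std::vector<size_t> &ia,
                        const std::vector<double> &va,
                        const std::vector<size_t> &ib,
                        const std::vector<double> &vb) {
  size_t pa = 0, pb = 0; // the two pidxs
  double sum = 0.0;
  while (pa < ia.size() && pb < ib.size()) { // conjunction of "pidx < high"
    size_t ca = ia[pa], cb = ib[pb];
    size_t iv = std::min(ca, cb);            // loop coordinate = min coordinate
    if (ca == iv && cb == iv)
      sum += va[pa] * vb[pb];                // both operands present at iv
    // Advance exactly the operands whose coordinate equals the minimum.
    pa += (ca == iv) ? 1 : 0;
    pb += (cb == iv) ? 1 : 0;
  }
  return sum;
}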
- LoopLevelInfo &loopInfo = loopStack.back(); - assert(loopInfo.tids.size() == loopInfo.dims.size()); - SmallVector red; - if (llvm::isa(loopInfo.loop)) { - exitCoIterationLoop(rewriter, loc, reduc); - } else { - exitForLoop(rewriter, loc, reduc); - } - - assert(loopStack.size() == loopSeqStack.size()); - loopStack.pop_back(); -} - //===----------------------------------------------------------------------===// // ExecutionEngine/SparseTensorUtils helper functions. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorLoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorLoopEmitter.h new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorLoopEmitter.h @@ -0,0 +1,283 @@ +//===- SparseTensorLoopEmitter.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_ +#define MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_ + +#include + +#include "mlir/Dialect/SparseTensor/IR/Enums.h" +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +namespace sparse_tensor { + +//===----------------------------------------------------------------------===// +// SparseTensorLoopEmiter class, manages sparse tensors and helps to +// generate loop structure to (co)-iterate sparse tensors. +// +// An example usage: +// To generate the following loops over T1 and T2 +// +// for i in TENSOR_1_0 { +// for j : TENSOR_2_0 { +// for k : TENSOR_1_1 {} +// for k : TENSOR_2_1 {} +// } +// } +// +// One can use +// +// SparseTensorLoopEmiter loopEmiter({T1, T1}); +// loopEmiter.initializeLoopEmit(); +// loopEmiter.enterLoopOverTensorAtDim(T1, 0); +// loopEmiter.enterLoopOverTensorAtDim(T2, 0); +// loopEmiter.enterLoopOverTensorAtDim(T1, 1); +// loopEmiter.exitCurrentLoop(); +// loopEmiter.enterLoopOverTensorAtDim(T2, 1); +// loopEmiter.exitCurrentLoop(); // exit k +// loopEmiter.exitCurrentLoop(); // exit j +// loopEmiter.exitCurrentLoop(); // exit i +//===----------------------------------------------------------------------===// + +class LoopEmitter { +public: + /// Optional callback function to setup dense output tensors when + /// initializing the loop emitter (e.g., to fill a dense output with zeros). + using OutputUpdater = function_ref; + + LoopEmitter() = default; + + /// Takes an array of tensors inputs, on which the generated loops will + /// iterate on. The index of the tensor in the array is also the tensor id + /// (tid) used in related functions. If isSparseOut is set, loop emitter + /// assume that the sparse output tensor is empty, and will always generate + /// loops on it based on the dim sizes. An optional array could be provided + /// (by sparsification) to indicate the loop id sequence that will be + /// generated. It is used to establish the mapping between affineDimExpr to + /// the corresponding loop index in the loop stack that are maintained by the + /// loop emitter. 
+ void initialize(ValueRange tensors, StringAttr loopTag = nullptr, + bool hasOutput = false, bool isSparseOut = false, + ArrayRef topSort = {}); + + explicit LoopEmitter(ValueRange tensors, StringAttr loopTag = nullptr, + bool hasOutput = false, bool isSparseOut = false, + ArrayRef topSort = {}); + + /// Starts a loop emitting session by generating all the buffers needed to + /// iterate tensors. + void initializeLoopEmit(OpBuilder &builder, Location loc, + OutputUpdater updater = nullptr); + + /// Generates a list of operations to compute the affine expression. + Value genAffine(OpBuilder &builder, AffineExpr a, Location loc); + + /// Enters a new loop sequence, the loops within the same sequence starts + /// from the break points of previous loop instead of starting over from 0. + /// e.g., + /// { + /// // loop sequence start. + /// p0 = while(xxx) + /// ... + /// break p0 + /// + /// // Starts loop from p0 + /// for (i = p0; i < end; i++) + /// ... + /// // loop sequence end. + /// } + void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef dims); + + // exit the current loop sequence, this will reset universal index to 0. + void exitCurrentLoopSeq() { + assert(loopSeqStack.size() == loopStack.size() + 1); + loopSeqStack.pop_back(); + } + + // TODO: Gets rid of `dim` in the argument list? Track the dimension we + // are currently at internally. Then it would be enterNextDimForTensor. + // Still need a way to specify the dim for non annoated dense tensor though, + // as it can be accessed out of order. + /// Emits loop over tensor_tid_dim, it assumes that loops between + /// tensor_tid_[0, dim - 1] have already been generated. + /// The function will also perform in-place update on the `reduc` vector to + /// return the reduction variable used inside the generated loop. + Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc, + ArrayRef tids, + ArrayRef dims, + MutableArrayRef reduc = {}, + bool isParallel = false); + + Operation *enterFilterLoopOverTensorAtDim(OpBuilder &builder, Location loc, + size_t tid, size_t dim, + AffineExpr affine, + MutableArrayRef reduc = {}); + + void genDenseAffineAddressAtCurLevel(OpBuilder &builder, Location loc, + size_t tid, size_t dim, + AffineExpr affine); + + /// Emits a co-iteration loop over a set of tensors. + Operation *enterCoIterationOverTensorsAtDims( + OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef dims, bool needsUniv, MutableArrayRef reduc = {}); + + void exitCurrentLoop(RewriterBase &rewriter, Location loc, + MutableArrayRef reduc = {}); + + /// Returns the array of coordinate for all the loop generated till now. + void getCoordinateArray(SmallVectorImpl &coords) const { + for (auto &l : loopStack) + coords.push_back(l.iv); + } + + /// Gets loop induction variable at the given level. + unsigned getCurrentDepth() const { return loopStack.size(); } + + /// Gets loop induction variable at the given level. + Value getLoopIV(size_t level) const { + if (level < loopStack.size()) + return loopStack[level].iv; + return nullptr; + } + + /// + /// Getters. 
+ /// + const std::vector> &getPidxs() const { return pidxs; }; + const std::vector> &getCoord() const { return coord; }; + const std::vector> &getHighs() const { return highs; }; + const std::vector> &getPtrBuffer() const { + return ptrBuffer; + }; + const std::vector> &getIdxBuffer() const { + return idxBuffer; + }; + const std::vector &getValBuffer() const { return valBuffer; }; + + constexpr static llvm::StringLiteral getLoopEmitterLoopAttrName() { + return llvm::StringLiteral("Emitted from"); + } + +private: + struct LoopLevelInfo { + LoopLevelInfo(ArrayRef tids, ArrayRef dims, Operation *loop, + Value iv, StringAttr loopTag) + : tids(tids), dims(dims), loop(loop), iv(iv) { + // Attached a special tag to loop emitter generated loop. + if (loopTag) + loop->setAttr(LoopEmitter::getLoopEmitterLoopAttrName(), loopTag); + } + // TODO: maybe use a vector for tid and dim? + // The set of tensors that the loop is operating on + const llvm::SmallVector tids; + // The corresponding dims for the tensors + const llvm::SmallVector dims; + const Operation *loop; // the loop operation + const Value iv; // the induction variable for the loop + }; + + /// Linearizes address for dense dimension (i.e., p = (i * d0) + j). + Value genAddress(OpBuilder &builder, Location loc, size_t tid, size_t dim, + Value iv); + + bool isOutputTensor(size_t tid) { + return hasOutput && tid == tensors.size() - 1; + } + + bool isSparseOutput(size_t tid) { return isOutputTensor(tid) && isSparseOut; } + + /// Setups [lo, hi] for iterating tensor[dim], it assumes that tensor[0 + /// ...dims-1] has already been setup. + void prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid, + size_t dim); + + /// Emits extra locals, since the locals might not be in simplified lattices + /// point used to generate the loops, but are still required to generates + /// expressions. + void emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder, Location loc, + ArrayRef tids, + ArrayRef dims); + + /// Exits a for loop, returns the reduction results, e.g., + /// For sequential for loops: + /// %ret = for () { + /// ... + /// %val = addi %args, %c + /// yield %val + /// } + /// For parallel loops, the following generated code by users: + /// %ret = parallel () init(%args) { + /// ... + /// %val = op %args, %c + /// } + /// will be transformed into + /// %ret = parallel () init(%args) { + /// ... + /// scf.reduce(%c) bb0(%0, %1){ + /// %val = op %0, %1 + /// scf.reduce.return %val + /// } + /// } + /// NOTE: only one instruction will be moved into reduce block, + /// transformation will fail if multiple instructions are used to compute + /// the reduction value. Return %ret to user, while %val is provided by + /// users (`reduc`). + void exitForLoop(RewriterBase &rewriter, Location loc, + MutableArrayRef reduc); + + /// Exits a while loop, returns the reduction results. + void exitCoIterationLoop(OpBuilder &builder, Location loc, + MutableArrayRef reduc); + + /// A optional string attribute that should be attached to the loop + /// generated by loop emitter, it might help following passes to identify + /// loops that operates on sparse tensors more easily. + StringAttr loopTag; + /// Whether the loop emitter needs to treat the last tensor as the output + /// tensor. + bool hasOutput; + bool isSparseOut; + /// Input and (optional) output tensors. + std::vector tensors; + /// The dim type array for each tensor. + std::vector> dimTypes; + /// Sparse iteration information (by tensor and dim). 
These arrays + /// are updated to remain current within the current loop. + std::vector> pidxs; + std::vector> coord; + std::vector> highs; + std::vector> ptrBuffer; // to_pointers + std::vector> idxBuffer; // to_indices + std::vector valBuffer; // to_value + + // Loop Stack, stores the information of all the nested loops that are + // alive. + std::vector loopStack; + + // Loop Sequence Stack, stores the unversial index for the current loop + // sequence. + std::vector loopSeqStack; + + // Maps AffineDimExpr to the index of the loop in loopStack. + // TODO: We should probably use a callback function here to make it more + // general. + std::vector sparsiferLoopLvlMap; + + // TODO: not yet used, it should track the current level for each tensor + // to help eliminate `dim` paramters from above APIs. + // std::vector curLv; +}; + +} // namespace sparse_tensor +} // namespace mlir + +#endif // MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_ diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorLoopEmitter.cpp copy from mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp copy to mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorLoopEmitter.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorLoopEmitter.cpp @@ -1,4 +1,4 @@ -//===- CodegenUtils.cpp - Utilities for generating MLIR -------------------===// +//===- SparseTensorLoopEmitter.cpp ---------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,21 +6,23 @@ // //===----------------------------------------------------------------------===// +#include "SparseTensorLoopEmitter.h" #include "CodegenUtils.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/Types.h" -#include "mlir/IR/Value.h" +#include "mlir/Dialect/SCF/IR/SCF.h" using namespace mlir; using namespace mlir::sparse_tensor; +//===----------------------------------------------------------------------===// +// File local helper functions. +//===----------------------------------------------------------------------===// + /// Generates a pointer/index load from the sparse storage scheme. Narrower /// data types need to be zero extended before casting the value into the /// index type used for looping and indexing. @@ -40,71 +42,26 @@ return load; } -/// If the tensor is a sparse constant, generates and returns the pair of -/// the constants for the indices and the values. -static Optional> -genSplitSparseConstant(OpBuilder &builder, Location loc, Value tensor) { - if (auto constOp = tensor.getDefiningOp()) { - if (auto attr = constOp.getValue().dyn_cast()) { - DenseElementsAttr indicesAttr = attr.getIndices(); - Value indices = builder.create(loc, indicesAttr); - DenseElementsAttr valuesAttr = attr.getValues(); - Value values = builder.create(loc, valuesAttr); - return std::make_pair(indices, values); - } - } - return {}; -} - -/// Generates the code to copy the index at indices[ivs] to ind, and return -/// the value at value[ivs]. 
-static Value genIndexAndValueForSparse(OpBuilder &builder, Location loc, - Value indices, Value values, - SmallVectorImpl &indicesArray, - ValueRange ivs, unsigned rank) { - for (unsigned i = 0; i < rank; i++) { - Value idx = constantIndex(builder, loc, i); - Value val = builder.create(loc, indices, - ValueRange{ivs[0], idx}); - val = builder.create(loc, builder.getIndexType(), val); - // builder.create(loc, val, ind, idx); - indicesArray.push_back(val); - } - return builder.create(loc, values, ivs[0]); -} - -/// Generates the code to read the value from tensor[ivs], and conditionally -/// stores the indices ivs to the memory in ind. The generated code looks like -/// the following and the insertion point after this routine is inside the -/// if-then branch behind the assignment to ind. This is to ensure that the -/// code that uses the ind, such as an addEltX call generated after, is inside -/// the if-then branch. -/// if (tensor[ivs] != 0) -/// ind = ivs -static Value genIndexAndValueForDense(OpBuilder &builder, Location loc, - Value tensor, - SmallVectorImpl &indicesArray, - ValueRange ivs) { - Value val = genValueForDense(builder, loc, tensor, ivs); - indicesArray.append(ivs.begin(), ivs.end()); - return val; -} - //===----------------------------------------------------------------------===// // Sparse tensor loop emitter class implementations //===----------------------------------------------------------------------===// -SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors, - StringAttr loopTag, - bool hasOutput, - bool isSparseOut, - ArrayRef topSort) { +Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, size_t tid, + size_t dim, Value iv) { + Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1]; + Value mul = builder.create(loc, highs[tid][dim], p); + Value add = builder.create(loc, mul, iv); + return add; +} + +LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput, + bool isSparseOut, ArrayRef topSort) { initialize(tensors, loopTag, hasOutput, isSparseOut, topSort); } -void SparseTensorLoopEmitter::initialize(ValueRange tensors, StringAttr loopTag, - bool hasOutput, bool isSparseOut, - ArrayRef topSort) { +void LoopEmitter::initialize(ValueRange tensors, StringAttr loopTag, + bool hasOutput, bool isSparseOut, + ArrayRef topSort) { // First initializes fields. this->loopTag = loopTag; this->hasOutput = hasOutput; @@ -153,9 +110,8 @@ } } -void SparseTensorLoopEmitter::initializeLoopEmit( - OpBuilder &builder, Location loc, - SparseTensorLoopEmitter::OutputUpdater updater) { +void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc, + LoopEmitter::OutputUpdater updater) { // For every tensor, find lower and upper bound on dimensions, set the // same bounds on loop indices, and obtain dense or sparse buffer(s). 
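Note: the new out-of-line genAddress definition in the copied file linearizes dense positions level by level: the position at dimension d is the parent position scaled by that dimension's extent plus the current induction variable. A standalone arithmetic model for a 2-D row-major buffer (linearize2d is a hypothetical name):

#include <cstddef>

// Model of genAddress: at each dense dimension the running position is scaled
// by that dimension's extent ("highs") and the induction variable is added,
// producing a flat offset into the value buffer.
static size_t linearize2d(size_t i, size_t j, size_t d0, size_t d1) {
  size_t p0 = 0 * d0 + i;  // dim 0: the parent position is the constant 0
  size_t p1 = p0 * d1 + j; // dim 1: p = (i * d1) + j
  return p1;
}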
for (size_t t = 0, e = tensors.size(); t < e; t++) { @@ -229,9 +185,9 @@ } } -void SparseTensorLoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc, - ArrayRef tids, - ArrayRef dims) { +void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc, + ArrayRef tids, + ArrayRef dims) { // Universal Index start from 0 assert(loopSeqStack.size() == loopStack.size()); // Universal index starts from 0 @@ -241,8 +197,7 @@ prepareLoopOverTensorAtDim(builder, loc, tid, dim); } -Value SparseTensorLoopEmitter::genAffine(OpBuilder &builder, AffineExpr a, - Location loc) { +Value LoopEmitter::genAffine(OpBuilder &builder, AffineExpr a, Location loc) { switch (a.getKind()) { case AffineExprKind::DimId: { unsigned idx = a.cast().getPosition(); @@ -269,7 +224,7 @@ } } -Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim( +Operation *LoopEmitter::enterLoopOverTensorAtDim( OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef dims, MutableArrayRef reduc, bool isParallel) { // TODO: support multiple return on parallel for? @@ -350,7 +305,7 @@ return loop; } -Operation *SparseTensorLoopEmitter::enterFilterLoopOverTensorAtDim( +Operation *LoopEmitter::enterFilterLoopOverTensorAtDim( OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine, MutableArrayRef reduc) { assert(!affine.isa() && !isDenseDLT(dimTypes[tid][dim])); @@ -417,14 +372,15 @@ return forOp; } -void SparseTensorLoopEmitter::genDenseAffineAddressAtCurLevel( - OpBuilder &builder, Location loc, size_t tid, size_t dim, - AffineExpr affine) { +void LoopEmitter::genDenseAffineAddressAtCurLevel(OpBuilder &builder, + Location loc, size_t tid, + size_t dim, + AffineExpr affine) { Value affineV = genAffine(builder, affine, loc); pidxs[tid][dim] = genAddress(builder, loc, tid, dim, affineV); } -Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims( +Operation *LoopEmitter::enterCoIterationOverTensorsAtDims( OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef dims, bool needsUniv, MutableArrayRef reduc) { assert(tids.size() == dims.size()); @@ -521,10 +477,8 @@ return whileOp; } -void SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, - Location loc, - size_t tid, - size_t dim) { +void LoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, + size_t tid, size_t dim) { assert(dimTypes[tid].size() > dim); auto dimType = dimTypes[tid][dim]; @@ -557,9 +511,10 @@ llvm_unreachable("Unrecognizable dimesion type!"); } -void SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDenseDims( - OpBuilder &builder, Location loc, ArrayRef tids, - ArrayRef dims) { +void LoopEmitter::emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder, + Location loc, + ArrayRef tids, + ArrayRef dims) { // Initialize dense positions. Note that we generate dense indices of the // output tensor unconditionally, since they may not appear in the lattice, // but may be needed for linearized codegen. 
@@ -582,8 +537,8 @@ } } -void SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc, - MutableArrayRef reduc) { +void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc, + MutableArrayRef reduc) { LoopLevelInfo &loopInfo = loopStack.back(); auto &dims = loopStack.back().dims; auto &tids = loopStack.back().tids; @@ -663,8 +618,8 @@ } } -void SparseTensorLoopEmitter::exitCoIterationLoop( - OpBuilder &builder, Location loc, MutableArrayRef reduc) { +void LoopEmitter::exitCoIterationLoop(OpBuilder &builder, Location loc, + MutableArrayRef reduc) { auto whileOp = llvm::cast(loopStack.back().loop); auto &dims = loopStack.back().dims; auto &tids = loopStack.back().tids; @@ -718,9 +673,8 @@ builder.setInsertionPointAfter(whileOp); } -void SparseTensorLoopEmitter::exitCurrentLoop(RewriterBase &rewriter, - Location loc, - MutableArrayRef reduc) { +void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc, + MutableArrayRef reduc) { // Clean up the values, it would help use to discover potential bug at a // earlier stage (instead of silently using a wrong value). LoopLevelInfo &loopInfo = loopStack.back(); @@ -735,457 +689,3 @@ assert(loopStack.size() == loopSeqStack.size()); loopStack.pop_back(); } - -//===----------------------------------------------------------------------===// -// ExecutionEngine/SparseTensorUtils helper functions. -//===----------------------------------------------------------------------===// - -OverheadType mlir::sparse_tensor::overheadTypeEncoding(unsigned width) { - switch (width) { - case 64: - return OverheadType::kU64; - case 32: - return OverheadType::kU32; - case 16: - return OverheadType::kU16; - case 8: - return OverheadType::kU8; - case 0: - return OverheadType::kIndex; - } - llvm_unreachable("Unsupported overhead bitwidth"); -} - -OverheadType mlir::sparse_tensor::overheadTypeEncoding(Type tp) { - if (tp.isIndex()) - return OverheadType::kIndex; - if (auto intTp = tp.dyn_cast()) - return overheadTypeEncoding(intTp.getWidth()); - llvm_unreachable("Unknown overhead type"); -} - -Type mlir::sparse_tensor::getOverheadType(Builder &builder, OverheadType ot) { - switch (ot) { - case OverheadType::kIndex: - return builder.getIndexType(); - case OverheadType::kU64: - return builder.getIntegerType(64); - case OverheadType::kU32: - return builder.getIntegerType(32); - case OverheadType::kU16: - return builder.getIntegerType(16); - case OverheadType::kU8: - return builder.getIntegerType(8); - } - llvm_unreachable("Unknown OverheadType"); -} - -OverheadType -mlir::sparse_tensor::pointerOverheadTypeEncoding(SparseTensorEncodingAttr enc) { - return overheadTypeEncoding(enc.getPointerBitWidth()); -} - -OverheadType -mlir::sparse_tensor::indexOverheadTypeEncoding(SparseTensorEncodingAttr enc) { - return overheadTypeEncoding(enc.getIndexBitWidth()); -} - -Type mlir::sparse_tensor::getPointerOverheadType(Builder &builder, - SparseTensorEncodingAttr enc) { - return getOverheadType(builder, pointerOverheadTypeEncoding(enc)); -} - -Type mlir::sparse_tensor::getIndexOverheadType(Builder &builder, - SparseTensorEncodingAttr enc) { - return getOverheadType(builder, indexOverheadTypeEncoding(enc)); -} - -// TODO: Adjust the naming convention for the constructors of -// `OverheadType` so we can use the `MLIR_SPARSETENSOR_FOREVERY_O` x-macro -// here instead of `MLIR_SPARSETENSOR_FOREVERY_FIXED_O`; to further reduce -// the possibility of typo bugs or things getting out of sync. 
-StringRef mlir::sparse_tensor::overheadTypeFunctionSuffix(OverheadType ot) {
-  switch (ot) {
-  case OverheadType::kIndex:
-    return "0";
-#define CASE(ONAME, O)                                                         \
-  case OverheadType::kU##ONAME:                                                \
-    return #ONAME;
-    MLIR_SPARSETENSOR_FOREVERY_FIXED_O(CASE)
-#undef CASE
-  }
-  llvm_unreachable("Unknown OverheadType");
-}
-
-StringRef mlir::sparse_tensor::overheadTypeFunctionSuffix(Type tp) {
-  return overheadTypeFunctionSuffix(overheadTypeEncoding(tp));
-}
-
-PrimaryType mlir::sparse_tensor::primaryTypeEncoding(Type elemTp) {
-  if (elemTp.isF64())
-    return PrimaryType::kF64;
-  if (elemTp.isF32())
-    return PrimaryType::kF32;
-  if (elemTp.isF16())
-    return PrimaryType::kF16;
-  if (elemTp.isBF16())
-    return PrimaryType::kBF16;
-  if (elemTp.isInteger(64))
-    return PrimaryType::kI64;
-  if (elemTp.isInteger(32))
-    return PrimaryType::kI32;
-  if (elemTp.isInteger(16))
-    return PrimaryType::kI16;
-  if (elemTp.isInteger(8))
-    return PrimaryType::kI8;
-  if (auto complexTp = elemTp.dyn_cast<ComplexType>()) {
-    auto complexEltTp = complexTp.getElementType();
-    if (complexEltTp.isF64())
-      return PrimaryType::kC64;
-    if (complexEltTp.isF32())
-      return PrimaryType::kC32;
-  }
-  llvm_unreachable("Unknown primary type");
-}
-
-StringRef mlir::sparse_tensor::primaryTypeFunctionSuffix(PrimaryType pt) {
-  switch (pt) {
-#define CASE(VNAME, V)                                                         \
-  case PrimaryType::k##VNAME:                                                  \
-    return #VNAME;
-    MLIR_SPARSETENSOR_FOREVERY_V(CASE)
-#undef CASE
-  }
-  llvm_unreachable("Unknown PrimaryType");
-}
-
-StringRef mlir::sparse_tensor::primaryTypeFunctionSuffix(Type elemTp) {
-  return primaryTypeFunctionSuffix(primaryTypeEncoding(elemTp));
-}
-
-//===----------------------------------------------------------------------===//
-// Misc code generators.
-//===----------------------------------------------------------------------===//
-
-mlir::Attribute mlir::sparse_tensor::getOneAttr(Builder &builder, Type tp) {
-  if (tp.isa<FloatType>())
-    return builder.getFloatAttr(tp, 1.0);
-  if (tp.isa<IndexType>())
-    return builder.getIndexAttr(1);
-  if (auto intTp = tp.dyn_cast<IntegerType>())
-    return builder.getIntegerAttr(tp, APInt(intTp.getWidth(), 1));
-  if (tp.isa<RankedTensorType, VectorType>()) {
-    auto shapedTp = tp.cast<ShapedType>();
-    if (auto one = getOneAttr(builder, shapedTp.getElementType()))
-      return DenseElementsAttr::get(shapedTp, one);
-  }
-  llvm_unreachable("Unsupported attribute type");
-}
-
-Value mlir::sparse_tensor::genIsNonzero(OpBuilder &builder, mlir::Location loc,
-                                        Value v) {
-  Type tp = v.getType();
-  Value zero = constantZero(builder, loc, tp);
-  if (tp.isa<FloatType>())
-    return builder.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UNE, v,
-                                         zero);
-  if (tp.isIntOrIndex())
-    return builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne, v,
-                                         zero);
-  if (tp.dyn_cast<ComplexType>())
-    return builder.create<complex::NotEqualOp>(loc, v, zero);
-  llvm_unreachable("Non-numeric type");
-}
-
-void mlir::sparse_tensor::genReshapeDstShape(
-    Location loc, PatternRewriter &rewriter, SmallVectorImpl<Value> &dstShape,
-    ArrayRef<Value> srcShape, ArrayRef<int64_t> staticDstShape,
-    ArrayRef<ReassociationIndices> reassociation) {
-  // Collapse shape.
-  if (reassociation.size() < srcShape.size()) {
-    unsigned start = 0;
-    for (const auto &map : llvm::enumerate(reassociation)) {
-      auto dstDim = constantIndex(rewriter, loc, 1);
-      for (unsigned i = start; i < start + map.value().size(); i++) {
-        dstDim = rewriter.create<arith::MulIOp>(loc, dstDim, srcShape[i]);
-      }
-      dstShape.push_back(dstDim);
-      start = start + map.value().size();
-    }
-    assert(start == srcShape.size());
-    return;
-  }
-
-  // Expand shape.
-  assert(reassociation.size() == srcShape.size());
-  unsigned start = 0;
-  // Expand the i-th dimension in srcShape.
-  for (unsigned i = 0, size = srcShape.size(); i < size; i++) {
-    const auto &map = reassociation[i];
-    auto srcDim = srcShape[i];
-    // Iterate through dimensions expanded from the i-th dimension.
-    for (unsigned j = start; j < start + map.size(); j++) {
-      // There can be only one dynamic sized dimension among dimensions
-      // expanded from the i-th dimension in srcShape.
-      // For example, if srcDim = 8, then the expanded shape could be <2x?x2>,
-      // but not <2x?x?>.
-      if (staticDstShape[j] == ShapedType::kDynamic) {
-        // The expanded dimension has dynamic size. We compute the dimension
-        // by dividing srcDim by the product of the static dimensions.
-        int64_t product = 1;
-        for (unsigned k = start; k < start + map.size(); k++) {
-          if (staticDstShape[k] != ShapedType::kDynamic) {
-            product *= staticDstShape[k];
-          }
-        }
-        // Compute the dynamic dimension size.
-        Value productVal = constantIndex(rewriter, loc, product);
-        Value dynamicSize =
-            rewriter.create<arith::DivUIOp>(loc, srcDim, productVal);
-        dstShape.push_back(dynamicSize);
-      } else {
-        // The expanded dimension is statically known.
-        dstShape.push_back(constantIndex(rewriter, loc, staticDstShape[j]));
-      }
-    }
-    start = start + map.size();
-  }
-  assert(start == staticDstShape.size());
-}
-
-void mlir::sparse_tensor::translateIndicesArray(
-    OpBuilder &builder, Location loc,
-    ArrayRef<ReassociationIndices> reassociation, ValueRange srcIndices,
-    ArrayRef<Value> srcShape, ArrayRef<Value> dstShape,
-    SmallVectorImpl<Value> &dstIndices) {
-  unsigned i = 0;
-  unsigned start = 0;
-  unsigned dstRank = dstShape.size();
-  unsigned srcRank = srcShape.size();
-  assert(srcRank == srcIndices.size());
-  bool isCollapse = srcRank > dstRank;
-  ArrayRef<Value> shape = isCollapse ? srcShape : dstShape;
-  // Iterate over reassociation map.
-  for (const auto &map : llvm::enumerate(reassociation)) {
-    // Prepare strides information in dimension slice.
-    Value linear = constantIndex(builder, loc, 1);
-    for (unsigned j = start, end = start + map.value().size(); j < end; j++) {
-      linear = builder.create<arith::MulIOp>(loc, linear, shape[j]);
-    }
-    // Start expansion.
-    Value val;
-    if (!isCollapse)
-      val = srcIndices[i];
-    // Iterate over dimension slice.
-    for (unsigned j = start, end = start + map.value().size(); j < end; j++) {
-      linear = builder.create<arith::DivUIOp>(loc, linear, shape[j]);
-      if (isCollapse) {
-        Value old = srcIndices[j];
-        Value mul = builder.create<arith::MulIOp>(loc, old, linear);
-        val = val ? builder.create<arith::AddIOp>(loc, val, mul) : mul;
-      } else {
-        Value old = val;
-        val = builder.create<arith::DivUIOp>(loc, val, linear);
-        assert(dstIndices.size() == j);
-        dstIndices.push_back(val);
-        val = builder.create<arith::RemUIOp>(loc, old, linear);
-      }
-    }
-    // Finalize collapse.
-    if (isCollapse) {
-      assert(dstIndices.size() == i);
-      dstIndices.push_back(val);
-    }
-    start += map.value().size();
-    i++;
-  }
-  assert(dstIndices.size() == dstRank);
-}
-
-FlatSymbolRefAttr mlir::sparse_tensor::getFunc(ModuleOp module, StringRef name,
-                                               TypeRange resultType,
-                                               ValueRange operands,
-                                               EmitCInterface emitCInterface) {
-  MLIRContext *context = module.getContext();
-  auto result = SymbolRefAttr::get(context, name);
-  auto func = module.lookupSymbol<func::FuncOp>(result.getAttr());
-  if (!func) {
-    OpBuilder moduleBuilder(module.getBodyRegion());
-    func = moduleBuilder.create<func::FuncOp>(
-        module.getLoc(), name,
-        FunctionType::get(context, operands.getTypes(), resultType));
-    func.setPrivate();
-    if (static_cast<bool>(emitCInterface))
-      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
-                    UnitAttr::get(context));
-  }
-  return result;
-}
-
-func::CallOp mlir::sparse_tensor::createFuncCall(
-    OpBuilder &builder, Location loc, StringRef name, TypeRange resultType,
-    ValueRange operands, EmitCInterface emitCInterface) {
-  auto module = builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
-  FlatSymbolRefAttr fn =
-      getFunc(module, name, resultType, operands, emitCInterface);
-  return builder.create<func::CallOp>(loc, resultType, fn, operands);
-}
-
-Type mlir::sparse_tensor::getOpaquePointerType(OpBuilder &builder) {
-  return LLVM::LLVMPointerType::get(builder.getI8Type());
-}
-
-Value mlir::sparse_tensor::genAlloca(OpBuilder &builder, Location loc,
-                                     unsigned sz, Type tp) {
-  return genAlloca(builder, loc, constantIndex(builder, loc, sz), tp);
-}
-
-Value mlir::sparse_tensor::genAlloca(OpBuilder &builder, Location loc, Value sz,
-                                     Type tp) {
-  auto memTp = MemRefType::get({ShapedType::kDynamic}, tp);
-  return builder.create<memref::AllocaOp>(loc, memTp, ValueRange{sz});
-}
-
-Value mlir::sparse_tensor::genAllocaScalar(OpBuilder &builder, Location loc,
-                                           Type tp) {
-  return builder.create<memref::AllocaOp>(loc, MemRefType::get({}, tp));
-}
-
-Value mlir::sparse_tensor::allocaBuffer(OpBuilder &builder, Location loc,
-                                        ValueRange values) {
-  const unsigned sz = values.size();
-  assert(sz >= 1);
-  Value buffer = genAlloca(builder, loc, sz, values[0].getType());
-  for (unsigned i = 0; i < sz; i++) {
-    Value idx = constantIndex(builder, loc, i);
-    builder.create<memref::StoreOp>(loc, values[i], buffer, idx);
-  }
-  return buffer;
-}
-
-Value mlir::sparse_tensor::allocDenseTensor(OpBuilder &builder, Location loc,
-                                            RankedTensorType tensorTp,
-                                            ValueRange sizes) {
-  Type elemTp = tensorTp.getElementType();
-  auto shape = tensorTp.getShape();
-  auto memTp = MemRefType::get(shape, elemTp);
-  SmallVector<Value> dynamicSizes;
-  for (unsigned i = 0, rank = tensorTp.getRank(); i < rank; i++) {
-    if (shape[i] == ShapedType::kDynamic)
-      dynamicSizes.push_back(sizes[i]);
-  }
-  Value mem = builder.create<memref::AllocOp>(loc, memTp, dynamicSizes);
-  Value zero = constantZero(builder, loc, elemTp);
-  builder.create<linalg::FillOp>(loc, ValueRange{zero}, ValueRange{mem});
-  return mem;
-}
-
-void mlir::sparse_tensor::deallocDenseTensor(OpBuilder &builder, Location loc,
-                                             Value buffer) {
-  builder.create<memref::DeallocOp>(loc, buffer);
-}
-
-Value mlir::sparse_tensor::genValueForDense(OpBuilder &builder, Location loc,
-                                            Value tensor, ValueRange ivs) {
-  Value val = builder.create<memref::LoadOp>(loc, tensor, ivs);
-  Value cond = genIsNonzero(builder, loc, val);
-  scf::IfOp ifOp = builder.create<scf::IfOp>(loc, cond, /*else*/ false);
-  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-  return val;
-}
-
-// FIXME:
-// 1. Dense tensors loop should be generated by loop emitter.
-// 2. Support reduction variables to propagate SSA chains properly.
-void mlir::sparse_tensor::genDenseTensorOrSparseConstantIterLoop(
-    OpBuilder &builder, Location loc, Value src, unsigned rank,
-    function_ref<void(OpBuilder &, Location, Value, ValueRange)> bodyBuilder) {
-  SmallVector<Value> indicesArray;
-  SmallVector<Value> lo;
-  SmallVector<Value> hi;
-  SmallVector<Value> st;
-  Value zero = constantIndex(builder, loc, 0);
-  Value one = constantIndex(builder, loc, 1);
-  auto indicesValues = genSplitSparseConstant(builder, loc, src);
-  bool isCOOConstant = indicesValues.has_value();
-  Value indices;
-  Value values;
-  if (isCOOConstant) {
-    indices = indicesValues->first;
-    values = indicesValues->second;
-    lo.push_back(zero);
-    hi.push_back(linalg::createOrFoldDimOp(builder, loc, values, 0));
-    st.push_back(one);
-  } else {
-    for (unsigned i = 0; i < rank; i++) {
-      lo.push_back(zero);
-      hi.push_back(linalg::createOrFoldDimOp(builder, loc, src, i));
-      st.push_back(one);
-    }
-  }
-
-  scf::buildLoopNest(
-      builder, loc, lo, hi, st, {},
-      [&](OpBuilder &builder, Location loc, ValueRange ivs,
-          ValueRange args) -> scf::ValueVector {
-        Value val;
-        if (isCOOConstant)
-          val = genIndexAndValueForSparse(builder, loc, indices, values,
-                                          indicesArray, ivs, rank);
-        else
-          val = genIndexAndValueForDense(builder, loc, src, indicesArray, ivs);
-        bodyBuilder(builder, loc, val, indicesArray);
-        return {};
-      });
-}
-
-void mlir::sparse_tensor::sizesFromSrc(OpBuilder &builder,
-                                       SmallVectorImpl<Value> &sizes,
-                                       Location loc, Value src) {
-  unsigned rank = src.getType().cast<ShapedType>().getRank();
-  for (unsigned i = 0; i < rank; i++)
-    sizes.push_back(linalg::createOrFoldDimOp(builder, loc, src, i));
-}
-
-Operation *mlir::sparse_tensor::getTop(Operation *op) {
-  for (; isa<scf::ForOp>(op->getParentOp()) ||
-         isa<scf::WhileOp>(op->getParentOp()) ||
-         isa<scf::ParallelOp>(op->getParentOp()) ||
-         isa<scf::IfOp>(op->getParentOp());
-       op = op->getParentOp())
-    ;
-  return op;
-}
-
-void sparse_tensor::foreachInSparseConstant(
-    Location loc, RewriterBase &rewriter, SparseElementsAttr attr,
-    function_ref<void(ArrayRef<Value>, Value)> callback) {
-  int64_t rank = attr.getType().getRank();
-  // Foreach on constant.
-  DenseElementsAttr indicesAttr = attr.getIndices();
-  DenseElementsAttr valuesAttr = attr.getValues();
-
-  SmallVector<Value> coords;
-  for (int i = 0, e = valuesAttr.size(); i < e; i++) {
-    coords.clear();
-    for (int j = 0; j < rank; j++) {
-      auto coordAttr = indicesAttr.getValues<IntegerAttr>()[i * rank + j];
-      auto coord =
-          rewriter.create<arith::ConstantIndexOp>(loc, coordAttr.getInt());
-      // Remaps coordinates.
-      coords.push_back(coord);
-    }
-    Value val;
-    if (attr.getElementType().isa<ComplexType>()) {
-      auto valAttr = valuesAttr.getValues<ArrayAttr>()[i];
-      val = rewriter.create<complex::ConstantOp>(loc, attr.getElementType(),
-                                                 valAttr);
-    } else {
-      auto valAttr = valuesAttr.getValues<TypedAttr>()[i];
-      // Remaps value.
-      val = rewriter.create<arith::ConstantOp>(loc, valAttr);
-    }
-    assert(val);
-    callback(coords, val);
-  }
-}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodegenUtils.h"
+#include "SparseTensorLoopEmitter.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
@@ -893,7 +894,7 @@
     auto enc = getSparseTensorEncoding(rtp);
 
     // 1. Generates loop for the sparse input.
-    SparseTensorLoopEmitter loopEmitter(
+    LoopEmitter loopEmitter(
         ValueRange{input},
         StringAttr::get(getContext(), ForeachOp::getOperationName()));
     loopEmitter.initializeLoopEmit(rewriter, loc);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
@@ -17,6 +17,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodegenUtils.h"
+#include "SparseTensorLoopEmitter.h"
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -381,11 +382,11 @@
       }
       return true;
     }
    // An invariant or reduction. In both cases, we treat this as an
-      // invariant value, and rely on later replacing and folding to
-      // construct a proper reduction chain for the latter case.
-      if (codegen)
-        vexp = genVectorInvariantValue(rewriter, vl, exp);
-      return true;
+    // invariant value, and rely on later replacing and folding to
+    // construct a proper reduction chain for the latter case.
+    if (codegen)
+      vexp = genVectorInvariantValue(rewriter, vl, exp);
+    return true;
   }
   // Something defined outside the loop-body is invariant.
   Operation *def = exp.getDefiningOp();
@@ -509,9 +510,8 @@
       forOpNew = rewriter.create<scf::ForOp>(
           loc, forOp.getLowerBound(), forOp.getUpperBound(), step, vinit);
       forOpNew->setAttr(
-          SparseTensorLoopEmitter::getLoopEmitterLoopAttrName(),
-          forOp->getAttr(
-              SparseTensorLoopEmitter::getLoopEmitterLoopAttrName()));
+          LoopEmitter::getLoopEmitterLoopAttrName(),
+          forOp->getAttr(LoopEmitter::getLoopEmitterLoopAttrName()));
       rewriter.setInsertionPointToStart(forOpNew.getBody());
     } else {
       forOp.setStep(step);
@@ -578,8 +578,8 @@
 
   ForOpRewriter(MLIRContext *context, unsigned vectorLength,
                 bool enableVLAVectorization, bool enableSIMDIndex32)
-      : OpRewritePattern(context),
-        vl{vectorLength, enableVLAVectorization, enableSIMDIndex32} {}
+      : OpRewritePattern(context), vl{vectorLength, enableVLAVectorization,
+                                      enableSIMDIndex32} {}
 
   LogicalResult matchAndRewrite(scf::ForOp op,
                                 PatternRewriter &rewriter) const override {
@@ -587,7 +587,7 @@
     // sparse compiler, which means no data dependence analysis is required,
     // and its loop-body is very restricted in form.
     if (!op.getRegion().hasOneBlock() || !isIntValue(op.getStep(), 1) ||
-        !op->hasAttr(SparseTensorLoopEmitter::getLoopEmitterLoopAttrName()))
+        !op->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName()))
       return failure();
     // Analyze (!codegen) and rewrite (codegen) loop-body.
     if (vectorizeStmt(rewriter, op, vl, /*codegen=*/false) &&
@@ -615,8 +615,7 @@
     Value inp = op.getSource();
     if (auto redOp = inp.getDefiningOp<vector::ReductionOp>()) {
       if (auto forOp = redOp.getVector().getDefiningOp<scf::ForOp>()) {
-        if (forOp->hasAttr(
-                SparseTensorLoopEmitter::getLoopEmitterLoopAttrName())) {
+        if (forOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName())) {
           rewriter.replaceOp(op, redOp.getVector());
           return success();
         }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -12,6 +12,7 @@
 
 #include "CodegenEnv.h"
 #include "CodegenUtils.h"
+#include "SparseTensorLoopEmitter.h"
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -1564,7 +1565,7 @@
     SmallVector<Value> tensors;
     for (OpOperand &t : op->getOpOperands())
       tensors.push_back(t.get());
-    SparseTensorLoopEmitter lpe(
+    LoopEmitter lpe(
         tensors, StringAttr::get(op.getContext(),
                                  linalg::GenericOp::getOperationName()),
         /*hasOutput=*/true, /*isSparseOut=*/sparseOut != nullptr,