diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h --- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h @@ -230,6 +230,10 @@ return tensor(b) == outTensor && index(b) == i; } + unsigned getOutTensorID() const { return outTensor; } + + unsigned getSynTensorID() const { return outTensor + 1; } + /// Returns true if given tensor iterates *only* in the given tensor /// expression. For the output tensor, this defines a "simply dynamic" /// operation [Bik96]. For instance: a(i) *= 2.0 or a(i) += a(i) for diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h @@ -34,127 +34,6 @@ /// `createFuncCall()`, and `replaceOpWithFuncCall()`. enum class EmitCInterface : bool { Off = false, On = true }; -//===----------------------------------------------------------------------===// -// SparseTensorLoopEmiter class, manages sparse tensors and helps to generate -// loop structure to (co-iterate) sparse tensors. -// -// An example usage: -// To generate following loops over T1 and T2 -// -// for i in T1[0] { -// for j : T2[0] { -// for k : T1[1] {} -// for k : T2[1] {} -// } -// } -// -// One can use -// -// SparseTensorLoopEmiter loopEmiter({T1, T1}); -// loopEmiter.initializeLoopEmit(); -// loopEmiter.enterLoopOverTensorAtDim(T1, 0); -// loopEmiter.enterLoopOverTensorAtDim(T2, 0); -// loopEmiter.enterLoopOverTensorAtDim(T1, 1); -// loopEmiter.exitCurrentLoop(); -// loopEmiter.enterLoopOverTensorAtDim(T2, 1); -// for 0 -> 3: -// loopEmiter.exitCurrentLoop(); -//===----------------------------------------------------------------------===// - -// TODO: Sparsification should also rely on this class to generate loops. -class SparseTensorLoopEmitter { -public: - /// Constructor: take an array of tensors inputs, on which the generated loops - /// will iterate on. The index of the tensor in the array is also the - /// tensor id (tid) used in related functions. - explicit SparseTensorLoopEmitter(ValueRange tensors, - bool isLastOutput = false); - - /// - /// Core functions. - /// - - /// Starts a loop emitting session: - /// 1. Generates all the buffers needed to iterate tensors. - /// 2. Generates the lo/hi bounds to iterate tensors[0]. - void initializeLoopEmit(OpBuilder &builder, Location loc); - - // TODO: Gets rid of `dim` in the argument list? Track the dimension we - // are currently at internally. Then it would be enterNextDimForTensor. - - /// Emits loop over tensor[dim], it assumes that loops between - /// tensor[0...dim - 1] have already been generated. - /// It also prepares to enter tensor[dim + 1]. - Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc, - size_t tid, size_t dim, - ArrayRef reduc = {}); - - /// Emits a coiteration loop over a set of tensors. - // TODO: not yet implemented - void enterCoiterationOverTensorsAtDims(OpBuilder &builder, Location loc, - ArrayRef ts, - ArrayRef ds); - - /// Emits extra locals, since the locals might not be in simplified lattices - /// point used to generate the loops, but are still required to generates - /// expressions. 
- Value emitExtraLocalsForTensorsAtDims(OpBuilder &builder, Location loc, - size_t tid, size_t dim); - - void exitCurrentLoop(); - - /// Return the array of coordinate for all the loop generated till now. - void getCoordinateArray(SmallVectorImpl &coords) { - for (auto &l : loopStack) - coords.push_back(l.idx); - } - - /// - /// Getters. - /// - - Value getTensorValueBuffer(size_t tid) { return valBuffer[tid]; } - Value getLastLevelTensorPointerIndex(size_t tid) { - return pidxs[tid].back(); - }; - -private: - struct LoopLevelInfo { - LoopLevelInfo(ArrayRef ts, ArrayRef ds, Value idx) - : tensors(ts), dims(ds), idx(idx) {} - llvm::SmallVector tensors; - llvm::SmallVector dims; - Value idx; - }; - - /// Return false if tid[dim] is a dense dimension that does not need to be - /// prepared (to be used by sparsification for needUniv). - bool prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid, - size_t dim); - - /// Input (TODO: and output) tensors. - std::vector tensors; - /// The dim type array for each tensor. - std::vector> dims; - /// Sparse iteration information (by tensor and dim). These arrays - /// are updated to remain current within the current loop. - std::vector> pidxs; - std::vector> coord; - std::vector> highs; - /// Universal dense indices and upper bounds (by index). The sizes array is - /// set once with the inferred dimension sizes. - std::vector> sizes; - std::vector> ptrBuffer; // to_pointers - std::vector> idxBuffer; // to_indices - std::vector valBuffer; // to_value - - bool isLastOutput; // Is the last tensor output tensor - std::vector loopStack; - // TODO: not yet used, it should track the current level for each tensor - // to help eliminate `dim` paramters from above APIs. - std::vector curLv; -}; - //===----------------------------------------------------------------------===// // ExecutionEngine/SparseTensorUtils helper functions. //===----------------------------------------------------------------------===// @@ -400,6 +279,233 @@ return constantI8(builder, loc, static_cast(dlt)); } +/// Computes the shape of destination tensor of a reshape operator. This is only +/// used when operands have dynamic shape. The shape of the destination is +/// stored into dstShape. +void genReshapeDstShape(Location loc, PatternRewriter &rewriter, + SmallVector &dstShape, + ArrayRef srcShape, + ArrayRef staticDstShape, + ArrayRef reassociation); + +/// Helper method to translate indices during a reshaping operation. +void translateIndicesArray(OpBuilder &builder, Location loc, + ArrayRef reassociation, + ValueRange srcIndices, ArrayRef srcShape, + ArrayRef dstShape, + SmallVectorImpl &dstIndices); + +inline bool isZeroRankedTensorOrScalar(Type type) { + auto rtp = type.dyn_cast(); + return !rtp || rtp.getRank() == 0; +} + +//===----------------------------------------------------------------------===// +// SparseTensorLoopEmiter class, manages sparse tensors and helps to generate +// loop structure to (co)-iterate sparse tensors. 
+//
+// An example usage:
+// To generate the following loops over T1 and T2
+//
+// for i in TENSOR_1_0 {
+//   for j : TENSOR_2_0 {
+//     for k : TENSOR_1_1 {}
+//     for k : TENSOR_2_1 {}
+//   }
+// }
+//
+// One can use
+//
+// SparseTensorLoopEmitter loopEmitter({T1, T2});
+// loopEmitter.initializeLoopEmit();
+// loopEmitter.enterLoopOverTensorAtDim(T1, 0);
+// loopEmitter.enterLoopOverTensorAtDim(T2, 0);
+// loopEmitter.enterLoopOverTensorAtDim(T1, 1);
+// loopEmitter.exitCurrentLoop();
+// loopEmitter.enterLoopOverTensorAtDim(T2, 1);
+// loopEmitter.exitCurrentLoop(); // exit k
+// loopEmitter.exitCurrentLoop(); // exit j
+// loopEmitter.exitCurrentLoop(); // exit i
+//===----------------------------------------------------------------------===//
+
+// TODO: Sparsification should also rely on this class to generate loops.
+class SparseTensorLoopEmitter {
+public:
+  /// Optional callback function to set up dense output tensors when
+  /// initializing the loop emitter (e.g., to fill a dense output with zeros).
+  using OutputUpdater = function_ref<Value(OpBuilder &builder, Location loc,
+                                           Value memref, Value tensor)>;
+
+  /// Constructor: takes an array of input tensors, over which the generated
+  /// loops will iterate. The index of a tensor in the array is also the
+  /// tensor id (tid) used in related functions.
+  /// If isSparseOut is set, the loop emitter assumes that the sparse output
+  /// tensor is empty, and always generates loops over it based on the dim
+  /// sizes.
+  explicit SparseTensorLoopEmitter(ValueRange tensors,
+                                   bool isLastOutput = false,
+                                   bool isSparseOut = false);
+
+  /// Starts a loop emitting session by generating all the buffers needed to
+  /// iterate over the tensors.
+  void initializeLoopEmit(OpBuilder &builder, Location loc,
+                          OutputUpdater updater = nullptr);
+
+  /// Enters a new loop sequence; the loops within the same sequence start
+  /// from the break points of the previous loop instead of starting over
+  /// from 0.
+  /// e.g.,
+  /// {
+  ///   // loop sequence start.
+  ///   p0 = while(xxx)
+  ///     ...
+  ///     break p0
+  ///
+  ///   // Starts the loop from p0
+  ///   for (i = p0; i < end; i++)
+  ///     ...
+  ///   // loop sequence end.
+  /// }
+  void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
+                       ArrayRef<size_t> dims);
+
+  /// Exits the current loop sequence; this resets the universal index to 0.
+  void exitCurrentLoopSeq() {
+    assert(loopSeqStack.size() == loopStack.size() + 1);
+    loopSeqStack.pop_back();
+  }
+
+  // TODO: Get rid of `dim` in the argument list? Track the dimension we
+  // are currently at internally. Then it would be enterNextDimForTensor.
+  // We still need a way to specify the dim for non-annotated dense tensors,
+  // though, as they can be accessed out of order.
+  /// Emits a loop over tensor[dim]; it assumes that the loops over
+  /// tensor[0...dim - 1] have already been generated.
+  /// It also prepares to enter tensor[dim + 1].
+  /// The function also performs an in-place update on the `reduc` vector to
+  /// return the reduction variables used inside the generated loop.
+  Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
+                                      size_t tid, size_t dim,
+                                      MutableArrayRef<Value> reduc = {},
+                                      bool isParallel = false,
+                                      ArrayRef<size_t> extraTids = {},
+                                      ArrayRef<size_t> extraDims = {});
+
+  /// Emits a co-iteration loop over a set of tensors.
+ Operation *enterCoIterationOverTensorsAtDims( + OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef dims, bool needsUniv, MutableArrayRef reduc = {}, + ArrayRef extraTids = {}, ArrayRef extraDims = {}); + + SmallVector exitCurrentLoop(OpBuilder &builder, Location loc, + ArrayRef reduc = {}); + + /// Returns the array of coordinate for all the loop generated till now. + void getCoordinateArray(SmallVectorImpl &coords) const { + for (auto &l : loopStack) + coords.push_back(l.iv); + } + + /// Gets loop induction variable at the given level. + Value getLoopIV(size_t level) const { + if (level < loopStack.size()) + return loopStack[level].iv; + return nullptr; + } + + /// + /// Getters. + /// + const std::vector> &getPidxs() const { return pidxs; }; + const std::vector> &getCoord() const { return coord; }; + const std::vector> &getHighs() const { return highs; }; + const std::vector> &getPtrBuffer() const { + return ptrBuffer; + }; + const std::vector> &getIdxBuffer() const { + return idxBuffer; + }; + const std::vector &getValBuffer() const { return valBuffer; }; + +private: + struct LoopLevelInfo { + LoopLevelInfo(ArrayRef tids, ArrayRef dims, Operation *loop, + Value iv) + : tids(tids), dims(dims), loop(loop), iv(iv) {} + // TODO: maybe use a vector for tid and dim? + // The set of tensors that the loop is operating on + const llvm::SmallVector tids; + // The corresponding dims for the tensors + const llvm::SmallVector dims; + const Operation *loop; // the loop operation + const Value iv; // the induction variable for the loop + }; + + /// Linearizes address for dense dimension (i.e., p = (i * d0) + j). + Value genAddress(OpBuilder &builder, Location loc, size_t tid, size_t dim, + Value iv) { + Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1]; + Value mul = builder.create(loc, highs[tid][dim], p); + Value add = builder.create(loc, mul, iv); + return add; + } + + bool isOutputTensor(size_t tid) { + return isLastOutput && tid == tensors.size() - 1; + } + + /// Setups [lo, hi] for iterating tensor[dim], it assumes that tensor[0 + /// ...dims-1] has already been setup. + void prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid, + size_t dim); + + /// Emits extra locals, since the locals might not be in simplified lattices + /// point used to generate the loops, but are still required to generates + /// expressions. + void emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder, Location loc, + ArrayRef tids, + ArrayRef dims); + + /// Exits a for loop, returns the reduction results, e.g., + /// %ret = for () { + /// ... + /// yield %val + /// } + /// Return %ret to user, while %val is provided by users (`reduc`) + SmallVector exitForLoop(OpBuilder &builder, Location loc, + ArrayRef reduc); + + /// Exits a while loop, returns the reduction results. + SmallVector exitCoiterationLoop(OpBuilder &builder, Location loc, + ArrayRef reduc); + + // Is the last tensor output tensor + bool isLastOutput; + /// Input and (optional) output tensors. + std::vector tensors; + /// The dim type array for each tensor. + std::vector> dimTypes; + /// Sparse iteration information (by tensor and dim). These arrays + /// are updated to remain current within the current loop. + std::vector> pidxs; + std::vector> coord; + std::vector> highs; + /// Universal dense indices and upper bounds (by index). The sizes array is + /// set once with the inferred dimension sizes. 
+ std::vector> ptrBuffer; // to_pointers + std::vector> idxBuffer; // to_indices + std::vector valBuffer; // to_value + + // Loop Stack, stores the information of all the nested loops that are alive. + std::vector loopStack; + + // Loop Sequence Stack, stores the unversial index for the current loop + // sequence. + std::vector loopSeqStack; + + // TODO: not yet used, it should track the current level for each tensor + // to help eliminate `dim` paramters from above APIs. + // std::vector curLv; +}; + } // namespace sparse_tensor } // namespace mlir diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp @@ -95,45 +95,49 @@ //===----------------------------------------------------------------------===// SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors, - bool isLastOutput) - : tensors(tensors.begin(), tensors.end()), dims(tensors.size()), - pidxs(tensors.size()), coord(tensors.size()), highs(tensors.size()), - sizes(tensors.size()), ptrBuffer(tensors.size()), - idxBuffer(tensors.size()), valBuffer(tensors.size()), - isLastOutput(isLastOutput), loopStack(), curLv(tensors.size(), 0) { - for (size_t i = 0, e = tensors.size(); i < e; i++) { - auto t = tensors[i]; - auto rtp = t.getType().dyn_cast(); - if (!rtp) // a scalar (0-dimension tensors) + bool isLastOutput, + bool isSparseOut) + : isLastOutput(isLastOutput), tensors(tensors.begin(), tensors.end()), + dimTypes(tensors.size()), pidxs(tensors.size()), coord(tensors.size()), + highs(tensors.size()), ptrBuffer(tensors.size()), + idxBuffer(tensors.size()), valBuffer(tensors.size()), loopStack() { + for (size_t tid = 0, e = tensors.size(); tid < e; tid++) { + auto t = tensors[tid]; + // a scalar or 0-dimension tensors + if (isZeroRankedTensorOrScalar(t.getType())) continue; - + auto rtp = t.getType().cast(); auto rank = static_cast(rtp.getRank()); auto enc = getSparseTensorEncoding(rtp); - if (enc) + // We always treat sparse output tensor as dense so that we always iterate + // it based on dim size. + if (enc && !(isOutputTensor(tid) && isSparseOut)) for (auto dimTp : enc.getDimLevelType()) - dims[i].push_back(dimTp); + dimTypes[tid].push_back(dimTp); else - dims[i].assign(rank, DimLevelType::Dense); + dimTypes[tid].assign(rank, DimLevelType::Dense); // Initialize using empty value. - pidxs[i].assign(rank, Value()); - coord[i].assign(rank, Value()); - highs[i].assign(rank, Value()); - sizes[i].assign(rank, Value()); - ptrBuffer[i].assign(rank, Value()); - idxBuffer[i].assign(rank, Value()); + pidxs[tid].assign(rank, Value()); + coord[tid].assign(rank, Value()); + highs[tid].assign(rank, Value()); + ptrBuffer[tid].assign(rank, Value()); + idxBuffer[tid].assign(rank, Value()); } } -void SparseTensorLoopEmitter::initializeLoopEmit(OpBuilder &builder, - Location loc) { +void SparseTensorLoopEmitter::initializeLoopEmit( + OpBuilder &builder, Location loc, + SparseTensorLoopEmitter::OutputUpdater updater) { // For every tensor, find lower and upper bound on dimensions, set the // same bounds on loop indices, and obtain dense or sparse buffer(s). - // TODO: Provides ability to generate loop on output buffer (with undef - // dim level in Merger in GenericOp Sparsification). 
   for (size_t t = 0, e = tensors.size(); t < e; t++) {
     auto tensor = tensors[t];
-    auto rtp = tensor.getType().cast<RankedTensorType>();
+    auto rtp = tensor.getType().dyn_cast<RankedTensorType>();
+    if (!rtp)
+      // Skip only scalars; zero-ranked tensors still need to be bufferized
+      // and (probably) filled with zeros by the user.
+      continue;
     auto rank = rtp.getRank();
     auto shape = rtp.getShape();
     auto enc = getSparseTensorEncoding(rtp);
@@ -141,10 +145,9 @@
     // Scan all dimensions of current tensor.
     for (int64_t d = 0; d < rank; d++) {
       // This should be called only once at beginning.
-      assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !sizes[t][d] &&
-             !highs[t][d]);
+      assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !highs[t][d]);
       // Handle sparse storage schemes.
-      if (isCompressedDLT(dims[t][d])) {
+      if (isCompressedDLT(dimTypes[t][d])) {
        auto ptrTp =
            MemRefType::get(dynShape, getPointerOverheadType(builder, enc));
        auto indTp =
@@ -153,7 +156,7 @@
         // Generate sparse primitives to obtains pointer and indices.
         ptrBuffer[t][d] = builder.create<ToPointersOp>(loc, ptrTp, tensor, dim);
         idxBuffer[t][d] = builder.create<ToIndicesOp>(loc, indTp, tensor, dim);
-      } else if (isSingletonDLT(dims[t][d])) {
+      } else if (isSingletonDLT(dimTypes[t][d])) {
        // Singleton dimension, fetch indices.
        auto indTp =
            MemRefType::get(dynShape, getIndexOverheadType(builder, enc));
@@ -161,105 +164,225 @@
         idxBuffer[t][d] = builder.create<ToIndicesOp>(loc, indTp, tensor, dim);
       } else {
         // Dense dimension, nothing to fetch.
-        assert(isDenseDLT(dims[t][d]));
+        assert(isDenseDLT(dimTypes[t][d]));
       }
       // Find upper bound in current dimension.
       unsigned p = toOrigDim(enc, d);
       Value up = mlir::linalg::createOrFoldDimOp(builder, loc, tensor, p);
-      sizes[t][d] = highs[t][d] = up;
+      highs[t][d] = up;
     }
+
     // Perform the required bufferization. Dense inputs materialize
-    // from the input tensors. Dense outputs need special handling.
-    // Sparse inputs use sparse primitives to obtain the values.
+    // from the input tensors. Sparse inputs use sparse primitives to obtain
+    // the values.
+    // Delegate extra output initialization to clients.
+    bool isOutput = isOutputTensor(t);
     Type elementType = rtp.getElementType();
-
     if (!enc) {
       // Non-annotated dense tensors.
       auto denseTp = MemRefType::get(shape, elementType);
-      if (isLastOutput && t == tensors.size() - 1)
-        llvm_unreachable("TODO: not yet handled");
-      else
-        valBuffer[t] =
-            builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
+      Value denseVal =
+          builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
+      // Dense outputs need special handling.
+      if (isOutput && updater)
+        denseVal = updater(builder, loc, denseVal, tensor);
+
+      valBuffer[t] = denseVal;
     } else {
       // Annotated sparse tensors.
+      // We also need the value buffer for an annotated all-dense `sparse`
+      // tensor.
      auto dynShape = {ShapedType::kDynamicSize};
      auto sparseTp = MemRefType::get(dynShape, elementType);
      valBuffer[t] = builder.create<ToValuesOp>(loc, sparseTp, tensor);
     }
-    // Prepare to enter the first dim for all (input) tensors
-    prepareLoopOverTensorAtDim(builder, loc, t, 0);
+    // NOTE: we could also prepare for dim 0 here in advance; this would hoist
+    // some loop preparation out of tensor iteration, but would also
+    // (undesirably) hoist the code outside the if conditions.
   }
 }

+void SparseTensorLoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
+                                              ArrayRef<size_t> tids,
+                                              ArrayRef<size_t> dims) {
+  // Universal index starts from 0.
+  assert(loopSeqStack.size() == loopStack.size());
+  loopSeqStack.emplace_back(constantIndex(builder, loc, 0));
+  // Prepare for all the tensors used in the current loop sequence.
+ for (auto [tid, dim] : llvm::zip(tids, dims)) + prepareLoopOverTensorAtDim(builder, loc, tid, dim); +} + Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim( OpBuilder &builder, Location loc, size_t tid, size_t dim, - ArrayRef reduc) { - assert(dims[tid].size() > dim); + MutableArrayRef reduc, bool isParallel, ArrayRef extraTids, + ArrayRef extraDims) { + assert(dimTypes[tid].size() > dim); // We can not re-enter the same level. assert(!coord[tid][dim]); + Value step = constantIndex(builder, loc, 1); - auto dimType = dims[tid][dim]; - bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType); + auto dimType = dimTypes[tid][dim]; + bool isSparseInput = isCompressedDLT(dimType) || isSingletonDLT(dimType); assert(isDenseDLT(dimType) || isCompressedDLT(dimType) || isSingletonDLT(dimType)); - Value lo = isSparse ? pidxs[tid][dim] : constantIndex(builder, loc, 0); + Value lo = isSparseInput ? pidxs[tid][dim] // current offset + : loopSeqStack.back(); // univeral tid Value hi = highs[tid][dim]; - // TODO: support reduction. - if (!reduc.empty()) - llvm_unreachable("TODO: not implemented yet"); - scf::ForOp forOp = builder.create(loc, lo, hi, step, reduc); builder.setInsertionPointToStart(forOp.getBody()); Value iv = forOp.getInductionVar(); - Operation *loop = forOp; - assert(iv); - if (isSparse) { + if (isSparseInput) { pidxs[tid][dim] = iv; // Generating a load on the indices array yields the coordinate. Value ptr = idxBuffer[tid][dim]; - // TODO: generates load for vector value. coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv); } else { // Dense tensor, the coordinates is the inducation variable. coord[tid][dim] = iv; // generate pidx for dense dim (pidx = i * sz + j) - // TODO: handle vector loop. - Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1]; - Value mul = builder.create(loc, sizes[tid][dim], p); - Value add = builder.create(loc, mul, iv); - pidxs[tid][dim] = add; + auto enc = getSparseTensorEncoding(tensors[tid].getType()); + if (enc) + pidxs[tid][dim] = genAddress(builder, loc, tid, dim, iv); } - // Prepares for next dim if this is not currently the innermost dimension. - if (dim != dims[tid].size() - 1) - prepareLoopOverTensorAtDim(builder, loc, tid, dim + 1); + // NOTE: we can also prepares for next dim here in advance + // Push the loop into stack + loopStack.emplace_back(ArrayRef(tid), ArrayRef(dim), forOp, + coord[tid][dim]); + // Emit extra locals. + emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims); + + // In-place update on the reduction variable vector. + assert(forOp.getNumRegionIterArgs() == reduc.size()); + for (int i = 0, e = reduc.size(); i < e; i++) + reduc[i] = forOp.getRegionIterArg(i); + return forOp; +} + +Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims( + OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef dims, bool needsUniv, MutableArrayRef reduc, + ArrayRef extraTids, ArrayRef extraDims) { + assert(tids.size() == dims.size()); + SmallVector types; + SmallVector operands; + // Construct the while-loop with a parameter for each index. + Type indexType = builder.getIndexType(); + for (auto [tid, dim] : llvm::zip(tids, dims)) { + if (isCompressedDLT(dimTypes[tid][dim]) || + isSingletonDLT(dimTypes[tid][dim])) { + assert(pidxs[tid][dim]); + types.push_back(indexType); + operands.push_back(pidxs[tid][dim]); + } + } + // The position where user-supplied reduction variable starts. 
+ for (Value rec : reduc) { + types.push_back(rec.getType()); + operands.push_back(rec); + } + if (needsUniv) { + types.push_back(indexType); + // Update universal index. + operands.push_back(loopSeqStack.back()); + } + assert(types.size() == operands.size()); + scf::WhileOp whileOp = builder.create(loc, types, operands); + + SmallVector locs(types.size(), loc); + Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs); + Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs); + + // Build the "before" region, which effectively consists + // of a conjunction of "i < upper" tests on all induction. + builder.setInsertionPointToStart(&whileOp.getBefore().front()); + Value cond; + unsigned o = 0; + for (auto [tid, dim] : llvm::zip(tids, dims)) { + if (isCompressedDLT(dimTypes[tid][dim]) || + isSingletonDLT(dimTypes[tid][dim])) { + Value op1 = before->getArgument(o); + Value op2 = highs[tid][dim]; + Value opc = builder.create(loc, arith::CmpIPredicate::ult, + op1, op2); + cond = cond ? builder.create(loc, cond, opc) : opc; + // Update + pidxs[tid][dim] = after->getArgument(o++); + } + } + builder.create(loc, cond, before->getArguments()); + + // Generates while body. + builder.setInsertionPointToStart(&whileOp.getAfter().front()); + Value min; + for (auto [tid, dim] : llvm::zip(tids, dims)) { + // Prepares for next level. + if (isCompressedDLT(dimTypes[tid][dim]) || + isSingletonDLT(dimTypes[tid][dim])) { + Value ptr = idxBuffer[tid][dim]; + Value s = pidxs[tid][dim]; + Value load = genIndexLoad(builder, loc, ptr, s); + coord[tid][dim] = load; + if (!needsUniv) { + if (min) { + Value cmp = builder.create( + loc, arith::CmpIPredicate::ult, load, min); + min = builder.create(loc, cmp, load, min); + } else { + min = load; + } + } + } + } - loopStack.push_back(LoopLevelInfo({tid}, {dim}, coord[tid][dim])); - return loop; -} + if (needsUniv) { + assert(!min); + // Otherwise, universal index is the minimal pidx. + min = after->getArguments().back(); + } + + for (auto [tid, dim] : llvm::zip(tids, dims)) { + // All dense dim (as well as sparse output tensor) shared the same pidx in + // the while loop. + if (isDenseDLT(dimTypes[tid][dim])) { + pidxs[tid][dim] = min; + // generate pidx for dense dim (pidx = i * sz + j) + auto enc = getSparseTensorEncoding(tensors[tid].getType()); + if (enc) + pidxs[tid][dim] = genAddress(builder, loc, tid, dim, min); + } + // NOTE: we can also prepares for next dim here in advance + } + // Sets up the loop stack. + loopStack.emplace_back(tids, dims, whileOp, min); + assert(loopStack.size() == loopSeqStack.size()); -void SparseTensorLoopEmitter::enterCoiterationOverTensorsAtDims( - OpBuilder &builder, Location loc, ArrayRef ts, - ArrayRef ds) { - llvm_unreachable("TODO: unimplemented"); + // Emits extra locals + emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims); + + // Updates reduction variables + assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0)); + // In-place update on reduction variable. + for (unsigned i = 0, e = reduc.size(); i < e; i++) + reduc[i] = after->getArgument(o + i); + + return whileOp; } -bool SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, +void SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid, size_t dim) { - // TODO: generate loop iteration on output tensor based on the shape - // instead of pointer/indices arrays. 
- assert(dims[tid].size() > dim); - auto dimType = dims[tid][dim]; + assert(dimTypes[tid].size() > dim); + auto dimType = dimTypes[tid][dim]; if (isDenseDLT(dimType)) - return false; + return; // Either the first dimension, or the previous dimension has been set. assert(dim == 0 || pidxs[tid][dim - 1]); @@ -269,11 +392,11 @@ Value ptr = ptrBuffer[tid][dim]; Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1]; - Value pHi = builder.create(loc, pLo, c1); - pidxs[tid][dim] = genIndexLoad(builder, loc, ptr, pLo); + + Value pHi = builder.create(loc, pLo, c1); highs[tid][dim] = genIndexLoad(builder, loc, ptr, pHi); - return true; + return; } if (isSingletonDLT(dimType)) { Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1]; @@ -281,34 +404,139 @@ pidxs[tid][dim] = pLo; highs[tid][dim] = pHi; - return true; + return; } llvm_unreachable("Unrecognizable dimesion type!"); } -Value SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDims( - OpBuilder &builder, Location loc, size_t tid, size_t dim) { - llvm_unreachable("TODO: not implemented yet"); +// FIXME: Make this call private +void SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDenseDims( + OpBuilder &builder, Location loc, ArrayRef tids, + ArrayRef dims) { + // Initialize dense positions. Note that we generate dense indices of the + // output tensor unconditionally, since they may not appear in the lattice, + // but may be needed for linearized codegen. + for (auto [tid, dim] : llvm::zip(tids, dims)) { + assert(isDenseDLT(dimTypes[tid][dim])); + auto enc = getSparseTensorEncoding(tensors[tid].getType()); + if (enc) { + bool validPidx = dim == 0 || pidxs[tid][dim - 1]; + if (!validPidx) { + // We might not find the pidx for the sparse output tensor as it is + // unconditionally required by the sparsification. + assert(isOutputTensor(tid)); + continue; + } + pidxs[tid][dim] = genAddress(builder, loc, tid, dim, loopStack.back().iv); + // NOTE: we can also prepares for next dim here in advance + } + } } -void SparseTensorLoopEmitter::exitCurrentLoop() { - // Clean up the values, it would help use to discover potential bug at a - // earlier stage (instead of silently using a wrong value). +SmallVector +SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc, + ArrayRef reduc) { LoopLevelInfo &loopInfo = loopStack.back(); - assert(loopInfo.tensors.size() == loopInfo.dims.size()); - for (auto info : llvm::zip(loopInfo.tensors, loopInfo.dims)) { - auto tid = std::get<0>(info); - auto dim = std::get<1>(info); - assert(pidxs[tid][dim] && coord[tid][dim] && highs[tid][dim]); + auto &dims = loopStack.back().dims; + auto &tids = loopStack.back().tids; + auto forOp = llvm::cast(loopInfo.loop); + if (!reduc.empty()) { + assert(reduc.size() == forOp.getNumResults()); + builder.setInsertionPointToEnd(forOp.getBody()); + builder.create(loc, reduc); + } + + // Finished iterating a tensor, clean up + // We only do the clean up on for loop as while loops do not necessarily + // finish the iteration on a sparse tensor + for (auto [tid, dim] : llvm::zip(tids, dims)) { // Reset to null. - pidxs[tid][dim] = Value(); coord[tid][dim] = Value(); - if (!isDenseDLT(dims[tid][dim])) - // Dense dimension, high is fixed. + pidxs[tid][dim] = Value(); + // Dense dimension, high is fixed. 
+ if (!isDenseDLT(dimTypes[tid][dim])) highs[tid][dim] = Value(); } + // exit the loop + builder.setInsertionPointAfter(forOp); + return forOp.getResults(); +} + +SmallVector +SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc, + ArrayRef reduc) { + auto whileOp = llvm::cast(loopStack.back().loop); + auto &dims = loopStack.back().dims; + auto &tids = loopStack.back().tids; + Value iv = loopStack.back().iv; + // Generation while loop induction at the end. + builder.setInsertionPointToEnd(&whileOp.getAfter().front()); + // Finalize the induction. Note that the induction could be performed + // in the individual if-branches to avoid re-evaluating the conditions. + // However, that would result in a rather elaborate forest of yield + // instructions during code generation. Moreover, performing the induction + // after the if-statements more closely resembles code generated by TACO. + unsigned o = 0; + SmallVector operands; + Value one = constantIndex(builder, loc, 1); + for (auto [tid, dim] : llvm::zip(tids, dims)) { + if (isCompressedDLT(dimTypes[tid][dim]) || + isSingletonDLT(dimTypes[tid][dim])) { + Value op1 = coord[tid][dim]; + Value op3 = pidxs[tid][dim]; + Value cmp = + builder.create(loc, arith::CmpIPredicate::eq, op1, iv); + Value add = builder.create(loc, op3, one); + operands.push_back(builder.create(loc, cmp, add, op3)); + // Following loops continue iteration from the break point of the + // current while loop. + pidxs[tid][dim] = whileOp->getResult(o++); + // The coordinates are invalid now. + coord[tid][dim] = nullptr; + // highs remains unchanged. + } + } + + // Reduction value from users. + SmallVector ret; + for (auto red : reduc) { + operands.push_back(red); + ret.push_back(whileOp->getResult(o++)); + } + + // An (optional) universal index. + if (operands.size() < whileOp.getNumResults()) { + assert(operands.size() + 1 == whileOp.getNumResults()); + // The last one is the universial index. + operands.push_back(builder.create(loc, iv, one)); + // update the loop starting point of current loop sequence + loopSeqStack.back() = whileOp->getResult(o++); + } + + assert(o == operands.size()); + builder.create(loc, operands); + builder.setInsertionPointAfter(whileOp); + return ret; +} + +SmallVector +SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc, + ArrayRef reduc) { + // Clean up the values, it would help use to discover potential bug at a + // earlier stage (instead of silently using a wrong value). + LoopLevelInfo &loopInfo = loopStack.back(); + assert(loopInfo.tids.size() == loopInfo.dims.size()); + SmallVector red; + if (llvm::isa(loopInfo.loop)) { + red = exitCoiterationLoop(builder, loc, reduc); + } else { + red = exitForLoop(builder, loc, reduc); + } + + assert(loopStack.size() == loopSeqStack.size()); loopStack.pop_back(); + return red; } //===----------------------------------------------------------------------===// @@ -500,9 +728,9 @@ auto srcDim = srcShape[i]; // Iterate through dimensions expanded from the i-th dimension. for (unsigned j = start; j < start + map.size(); j++) { - // There can be only one dynamic sized dimension among dimensions expanded - // from the i-th dimension in srcShape. For example, if srcDim = 8, then - // the expanded shape could be <2x?x2>, but not <2x?x?>. + // There can be only one dynamic sized dimension among dimensions + // expanded from the i-th dimension in srcShape. For example, if srcDim + // = 8, then the expanded shape could be <2x?x2>, but not <2x?x?>. 
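// A worked instance of the rule above (illustrative only, using the example
// numbers from the comment): with srcDim = 8 and the expanded group <2x?x2>,
// the product of the static extents is 2 * 2 = 4, so the dynamic extent is
// srcDim / 4 = 8 / 4 = 2 and <2x?x2> resolves to <2x2x2>. A group such as
// <2x?x?> would leave that division ambiguous, which is why it is rejected.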
if (staticDstShape[j] == ShapedType::kDynamicSize) { // The expanded dimension has dynamic size. We compute the dimension // by dividing srcDim by the product of the static dimensions. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -477,29 +477,35 @@ // 1. Generates loop for the sparse input. SparseTensorLoopEmitter loopEmitter(ValueRange{input}); loopEmitter.initializeLoopEmit(rewriter, loc); - for (int64_t i = 0; i < rank; i++) + for (int64_t i = 0; i < rank; i++) { + // TODO: provide utility function for loop sequences that only contains + // one for loop? + loopEmitter.enterNewLoopSeq(rewriter, loc, 0, static_cast(i)); loopEmitter.enterLoopOverTensorAtDim(rewriter, loc, 0, i); + } SmallVector coords; coords.reserve(rank); loopEmitter.getCoordinateArray(coords); - Value vals = loopEmitter.getTensorValueBuffer(0); - Value pidx = loopEmitter.getLastLevelTensorPointerIndex(0); + Value vals = loopEmitter.getValBuffer()[0]; + Value pidx = loopEmitter.getPidxs()[0].back(); // Loads the value from sparse tensor using pointer index; // loads the value from dense tensor using coordinate array. Value val = enc ? rewriter.create(loc, vals, pidx) : rewriter.create(loc, vals, coords); - for (int64_t i = 0; i < rank; i++) - loopEmitter.exitCurrentLoop(); - // 2. Inline the block in the foreach operator. Block::iterator inlinePos = rewriter.getInsertionPoint(); Block *srcBlock = op.getBody(); // Remove sparse_tensor.yield. rewriter.eraseOp(srcBlock->getTerminator()); + for (int64_t i = 0; i < rank; i++) { + loopEmitter.exitCurrentLoop(rewriter, loc); + loopEmitter.exitCurrentLoopSeq(); + } + SmallVector args; // Remap coordinates. for (int64_t i = 0; i < rank; i++) { diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -40,6 +40,8 @@ namespace { +constexpr unsigned INVALID_ID = std::numeric_limits::max(); + // Iteration graph sorting. enum SortMask { kSparseOnly = 0x0, @@ -53,34 +55,16 @@ // Code generation. struct CodeGen { - CodeGen(SparsificationOptions o, unsigned numTensors, unsigned numLoops, - OpOperand *op, unsigned nest, std::vector &ts) - : options(o), loops(numLoops), sizes(numLoops), buffers(numTensors), - pointers(numTensors, std::vector(numLoops)), - indices(numTensors, std::vector(numLoops)), - highs(numTensors, std::vector(numLoops)), - pidxs(numTensors, std::vector(numLoops)), - idxs(numTensors, std::vector(numLoops)), sparseOut(op), - outerParNest(nest), topSort(ts) {} + CodeGen(SparsificationOptions o, ValueRange tensors, unsigned numTensors, + unsigned numLoops, OpOperand *op, unsigned nest, + std::vector &ts) + : options(o), loopEmitter(tensors, /*isLastOutput=*/true, + /*isSparseOut=*/op != nullptr), + sparseOut(op), outerParNest(nest), topSort(ts) {} /// Sparsification options. SparsificationOptions options; - /// Universal dense indices and upper bounds (by index). The loops array - /// is updated with the value of the universal dense index in the current - /// loop. The sizes array is set once with the inferred dimension sizes. 
- std::vector loops; - std::vector sizes; - /// Buffers for storing dense and sparse numerical values (by tensor). - /// This array is set once during bufferization of all tensors. - std::vector buffers; - /// Sparse storage schemes (1-D): pointers and indices (by tensor and index). - /// This array is set once during bufferization of all sparse tensors. - std::vector> pointers; - std::vector> indices; - /// Sparse iteration information (by tensor and index). These arrays - /// are updated to remain current within the current loop. - std::vector> highs; - std::vector> pidxs; - std::vector> idxs; + /// Loop emitter helper class. + SparseTensorLoopEmitter loopEmitter; /// Current reduction, updated during code generation. When indices of a /// reduction are exhausted, all inner loops can use a scalarized reduction. unsigned redExp = -1u; @@ -98,8 +82,48 @@ Value expCount; // Topsort (reference should remain in scope). std::vector &topSort; + + // From tensor id + loop id => dim id. + // TODO: This map should probably be maintained by Merger (it can be set up + // together with dimLvlType Map). + std::vector> loopIdxToDim; + + // Initialize the above two mapping. + void buildLoopIdxToDimMap(linalg::GenericOp op); + + Value getLoopIdxValue(size_t loopIdx) const { + for (unsigned lv = 0; lv < topSort.size(); lv++) + if (topSort[lv] == loopIdx) + return loopEmitter.getLoopIV(lv); + + llvm_unreachable("invalid loop index"); + } }; +void CodeGen::buildLoopIdxToDimMap(linalg::GenericOp op) { + size_t numLoops = op.getNumLoops(); + size_t numTensors = op.getNumOperands(); + loopIdxToDim.assign(numTensors, std::vector(numLoops, INVALID_ID)); + + for (OpOperand &t : op->getOpOperands()) { + auto map = op.getMatchingIndexingMap(&t); + auto enc = getSparseTensorEncoding(t.get().getType()); + // Scan all dimensions of current tensor. + unsigned tid = t.getOperandNumber(); + for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { + auto a = map.getResult(toOrigDim(enc, d)).dyn_cast(); + if (a) { + unsigned loopId = a.getPosition(); + // Fills the mapping. + loopIdxToDim[tid][loopId] = d; + } + // Else a compound affine, do nothing. (at least we are good for + // now, as we only support compound affine expr on non-annoated dense + // tensors). + } + } +} + } // namespace //===----------------------------------------------------------------------===// @@ -128,13 +152,15 @@ /// same index is used more than once. Also rejects compound affine /// expressions in sparse dimensions. 
static bool findAffine(Merger &merger, unsigned tensor, AffineExpr a, - DimLevelType dim) { + DimLevelType dim, bool setLvlFormat = true) { switch (a.getKind()) { case AffineExprKind::DimId: { unsigned idx = a.cast().getPosition(); if (!isUndefDLT(merger.getDimLevelType(tensor, idx))) return false; // used more than once - merger.setDimLevelType(tensor, idx, dim); + + if (setLvlFormat) + merger.setDimLevelType(tensor, idx, dim); return true; } case AffineExprKind::Add: @@ -142,8 +168,10 @@ if (!isDenseDLT(dim)) return false; // compound only in dense dim auto binOp = a.cast(); - return findAffine(merger, tensor, binOp.getLHS(), dim) && - findAffine(merger, tensor, binOp.getRHS(), dim); + // We do not set dim level format for affine expresssion like d0 + d1 on + // both loop index at d0 and d1, + return findAffine(merger, tensor, binOp.getLHS(), dim, false) && + findAffine(merger, tensor, binOp.getRHS(), dim, false); } case AffineExprKind::Constant: return isDenseDLT(dim); // const only in dense dim @@ -411,106 +439,45 @@ // Sparse compiler synthesis methods (statements and expressions). //===----------------------------------------------------------------------===// -/// Generates buffer for the output tensor. Note that all sparse kernels -/// assume that when all elements are written to (viz. x(i) = y(i) * z(i)), -/// the output buffer is already initialized to all zeroes and only nonzeroes -/// values are computed and written out. For updates (viz. x(i) += y(i) * z(i)), -/// only nonzeroes values are used for the updates and no assumption on the -/// original contents of the output buffer is necessary. -static Value genOutputBuffer(CodeGen &codegen, OpBuilder &builder, - linalg::GenericOp op, MemRefType denseTp, - ArrayRef args) { - Location loc = op.getLoc(); - OpOperand *lhs = op.getOutputOperand(0); - Value tensor = lhs->get(); - bool isInit = op.isInitTensor(lhs); - // An output tensor can simply materialize from the buffer of the tensor that - // appears in the outs() clause. For updates, this has the advantage that only - // the nonzero value are involved in the computation, keeping the operation - // O(nnz). In all other cases, we are forced to zero out the buffer to enforce - // the assumption above, which may negatively impact running complexity - // (viz. O(n^2 + nnz) vs. O(nnz) for matrices). - // TODO: use better analysis to avoid zeroing out the buffer? - Value init = builder.create(loc, denseTp, tensor); - if (!isInit) { - Value zero = constantZero(builder, loc, denseTp.getElementType()); - builder.create(loc, ValueRange{zero}, ValueRange{init}); - } - return init; -} - /// Local bufferization of all dense and sparse data structures. static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op) { Location loc = op.getLoc(); - assert(op->getNumOperands() == op.getNumInputs() + 1); - // For every tensor, find lower and upper bound on dimensions, set the - // same bounds on loop indices, and obtain dense or sparse buffer(s). - auto dynShape = {ShapedType::kDynamicSize}; - SmallVector args; - for (OpOperand &t : op->getOpOperands()) { - unsigned tensor = t.getOperandNumber(); - auto shape = op.getShape(&t); - auto map = op.getMatchingIndexingMap(&t); - auto enc = getSparseTensorEncoding(t.get().getType()); - // Scan all dimensions of current tensor. 
- args.clear(); - for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) { - AffineExpr a = map.getResult(toOrigDim(enc, d)); - if (a.getKind() != AffineExprKind::DimId) - continue; // compound - unsigned idx = a.cast().getPosition(); - // Handle the different storage schemes. - if (isCompressedDLT(merger.getDimLevelType(tensor, idx))) { - // Compressed dimension, fetch pointer and indices. - auto ptrTp = - MemRefType::get(dynShape, getPointerOverheadType(builder, enc)); - auto indTp = - MemRefType::get(dynShape, getIndexOverheadType(builder, enc)); - auto dim = builder.getIndexAttr(d); - codegen.pointers[tensor][idx] = - builder.create(loc, ptrTp, t.get(), dim); - codegen.indices[tensor][idx] = - builder.create(loc, indTp, t.get(), dim); - } else if (isSingletonDLT(merger.getDimLevelType(tensor, idx))) { - // Singleton dimension, fetch indices. - auto indTp = - MemRefType::get(dynShape, getIndexOverheadType(builder, enc)); - auto dim = builder.getIndexAttr(d); - codegen.indices[tensor][idx] = - builder.create(loc, indTp, t.get(), dim); - } else { - // Dense dimension, nothing to fetch. - assert(isDenseDLT(merger.getDimLevelType(tensor, idx))); - } - // Find upper bound in current dimension. - unsigned p = toOrigDim(enc, d); - Value up = linalg::createOrFoldDimOp(builder, loc, t.get(), p); - if (ShapedType::isDynamic(shape[p])) - args.push_back(up); - assert(codegen.highs[tensor][idx] == nullptr); - codegen.sizes[idx] = codegen.highs[tensor][idx] = up; - } - // Perform the required bufferization. Dense inputs materialize - // from the input tensors. Dense outputs need special handling. - // Sparse inputs use sparse primitives to obtain the values. - Type elementType = getElementTypeOrSelf(t.get().getType()); - if (!enc) { - // Non-annotated dense tensors. - auto denseTp = MemRefType::get(shape, elementType); - if (tensor < op.getNumInputs()) - codegen.buffers[tensor] = - builder.create(loc, denseTp, t.get()); - else - codegen.buffers[tensor] = - genOutputBuffer(codegen, builder, op, denseTp, args); - } else if (&t != codegen.sparseOut) { - // Annotated sparse tensors (not involved in output). - auto sparseTp = MemRefType::get(dynShape, elementType); - codegen.buffers[tensor] = - builder.create(loc, sparseTp, t.get()); - } - } + assert(op.getNumOperands() == op.getNumInputs() + 1); + + codegen.loopEmitter.initializeLoopEmit( + builder, loc, + /// Generates buffer for the output tensor. Note that all sparse kernels + /// assume that when all elements are written to (viz. x(i) = y(i) * + /// z(i)), the output buffer is already initialized to all zeroes and only + /// nonzeroes values are computed and written out. For updates (viz. x(i) + /// += y(i) * z(i)), only nonzeroes values are used for the updates and no + /// assumption on the original contents of the output buffer is necessary. + [&op](OpBuilder &builder, Location loc, Value memref, + Value tensor) -> Value { + // Must not be a sparse tensor. + assert(!getSparseTensorEncoding(tensor.getType())); + OpOperand *lhs = op.getOutputOperand(0); + // Two output tensors should match. + assert(lhs->get() == tensor); + bool isInit = op.isInitTensor(lhs); + // An output tensor can simply materialize from the buffer of the tensor + // that appears in the outs() clause. For updates, this has the + // advantage that only the nonzero value are involved in the + // computation, keeping the operation O(nnz). 
In all other cases, we are + // forced to zero out the buffer to enforce the assumption above, which + // may negatively impact running complexity (viz. O(n^2 + nnz) vs. + // O(nnz) for matrices). + // TODO: use better analysis to avoid zeroing out the buffer? + Value init = memref; + if (!isInit) { + Value zero = constantZero(builder, loc, + getElementTypeOrSelf(tensor.getType())); + builder.create(loc, ValueRange{zero}, + ValueRange{init}); + } + return init; + }); } /// Generates an affine expression. @@ -522,7 +489,7 @@ switch (a.getKind()) { case AffineExprKind::DimId: { unsigned idx = a.cast().getPosition(); - return codegen.loops[idx]; // universal dense index + return codegen.getLoopIdxValue(idx); // universal dense index } case AffineExprKind::Add: { auto binOp = a.cast(); @@ -552,7 +519,7 @@ AffineExpr a = map.getResult(toOrigDim(enc, map.getNumResults() - 1)); assert(a.getKind() == AffineExprKind::DimId); unsigned idx = a.cast().getPosition(); - return codegen.loops[idx]; + return codegen.getLoopIdxValue(idx); } /// Generates subscript for load/store on a dense or sparse tensor. @@ -566,18 +533,18 @@ if (enc) { // Note that currently, all sparse subscripts are simple. // TODO: accept affine too? - AffineExpr a = map.getResult(toOrigDim(enc, rank - 1)); - assert(a.getKind() == AffineExprKind::DimId); - unsigned idx = a.cast().getPosition(); - assert(codegen.pidxs[tensor][idx] != nullptr); - args.push_back(codegen.pidxs[tensor][idx]); // position index + assert(map.getResult(toOrigDim(enc, rank - 1)).getKind() == + AffineExprKind::DimId); + Value pidx = codegen.loopEmitter.getPidxs()[tensor].back(); + assert(pidx); + args.push_back(pidx); // position index } else { for (unsigned d = 0; d < rank; d++) { AffineExpr a = map.getResult(d); args.push_back(genAffine(codegen, builder, a, op.getLoc())); } } - return codegen.buffers[tensor]; + return codegen.loopEmitter.getValBuffer()[tensor]; } /// Generates insertion code to implement dynamic tensor load. @@ -622,8 +589,8 @@ unsigned rank = op.getRank(t); SmallVector indices; for (unsigned i = 0; i < rank; i++) { - assert(codegen.loops[codegen.topSort[i]]); - indices.push_back(codegen.loops[codegen.topSort[i]]); + assert(codegen.loopEmitter.getLoopIV(i)); + indices.push_back(codegen.loopEmitter.getLoopIV(i)); } builder.create(loc, rhs, t->get(), indices); return; @@ -717,42 +684,16 @@ builder.create(loc, rhs, ptr, args); } -/// Generates a pointer/index load from the sparse storage scheme. Narrower -/// data types need to be zero extended before casting the value into the -/// index type used for looping and indexing. -static Value genLoad(CodeGen &codegen, OpBuilder &builder, Location loc, - Value ptr, Value s) { - // Simply zero extends narrower indices into 64-bit values before casting to - // index without a performance penalty. - Value load = builder.create(loc, ptr, s); - if (!load.getType().isa()) { - if (load.getType().getIntOrFloatBitWidth() < 64) - load = builder.create(loc, builder.getI64Type(), load); - load = - builder.create(loc, builder.getIndexType(), load); - } - return load; -} - /// Generates an invariant value. -static Value genInvariantValue(Merger &merger, CodeGen &codegen, - OpBuilder &builder, unsigned exp) { - Value val = merger.exp(exp).val; - return val; -} - -/// Generates an address computation "sz * p + i". 
-static Value genAddress(CodeGen &codegen, OpBuilder &builder, Location loc, - Value size, Value p, Value i) { - Value mul = builder.create(loc, size, p); - return builder.create(loc, mul, i); +inline static Value genInvariantValue(Merger &merger, CodeGen &codegen, + OpBuilder &builder, unsigned exp) { + return merger.exp(exp).val; } /// Generates an index value. -static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, unsigned idx, - unsigned ldx) { - Value ival = codegen.loops[idx]; - return ival; +inline static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, + unsigned idx) { + return codegen.getLoopIdxValue(idx); } /// Semi-ring branches are simply inlined by the sparse compiler. Prior @@ -764,7 +705,7 @@ Block *block, Value e, unsigned ldx) { if (Operation *def = e.getDefiningOp()) { if (auto indexOp = dyn_cast(def)) - return genIndexValue(codegen, rewriter, indexOp.getDim(), ldx); + return genIndexValue(codegen, rewriter, indexOp.getDim()); if (def->getBlock() == block) { for (unsigned i = 0, n = def->getNumOperands(); i < n; i++) def->setOperand( @@ -785,7 +726,7 @@ if (merger.exp(exp).kind == Kind::kInvariant) return genInvariantValue(merger, codegen, rewriter, exp); if (merger.exp(exp).kind == Kind::kIndex) - return genIndexValue(codegen, rewriter, merger.exp(exp).index, ldx); + return genIndexValue(codegen, rewriter, merger.exp(exp).index); if (merger.exp(exp).kind == Kind::kReduce) { // Make custom reduction identity accessible for expanded access pattern. @@ -826,7 +767,7 @@ unsigned idx = a.cast().getPosition(); if (idx == ldx) atLevel = true; - return codegen.loops[idx] != nullptr; // no longer in play? + return codegen.getLoopIdxValue(idx) != nullptr; // no longer in play? } case AffineExprKind::Add: case AffineExprKind::Mul: { @@ -901,6 +842,7 @@ if (!lhs || codegen.outerParNest != op.getRank(lhs) - 1 || at != codegen.outerParNest) return; // not needed at this level + assert(codegen.redVal == nullptr); // Generate start or end of an expanded access pattern. Value tensor = lhs->get(); Location loc = op.getLoc(); @@ -923,8 +865,8 @@ assert(codegen.expValues); SmallVector indices; for (unsigned i = 0; i < at; i++) { - assert(codegen.loops[codegen.topSort[i]]); - indices.push_back(codegen.loops[codegen.topSort[i]]); + assert(codegen.loopEmitter.getLoopIV(i)); + indices.push_back(codegen.loopEmitter.getLoopIV(i)); } builder.create(loc, codegen.expValues, codegen.expFilled, codegen.expAdded, codegen.expCount, tensor, @@ -934,68 +876,9 @@ } } -/// Generates initialization code for the subsequent loop sequence at -/// current index level. Returns true if the loop sequence needs to -/// maintain the universal index. -static bool genInit(Merger &merger, CodeGen &codegen, OpBuilder &builder, - linalg::GenericOp op, unsigned at, BitVector &inits) { - std::vector &topSort(codegen.topSort); - bool needsUniv = false; - Location loc = op.getLoc(); - unsigned idx = topSort[at]; - - // Initialize sparse positions. - for (unsigned b = 0, be = inits.size(); b < be; b++) { - if (!inits[b]) - continue; - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - if (isCompressedDLT(merger.getDimLevelType(b))) { - // Initialize sparse index that will implement the iteration: - // for pidx_idx = pointers(pidx_idx-1), pointers(1+pidx_idx-1) - unsigned pat = at; - for (; pat != 0; pat--) { - if (codegen.pidxs[tensor][topSort[pat - 1]]) - break; - } - Value ptr = codegen.pointers[tensor][idx]; - Value one = constantIndex(builder, loc, 1); - Value p0 = (pat == 0) ? 
constantIndex(builder, loc, 0) - : codegen.pidxs[tensor][topSort[pat - 1]]; - codegen.pidxs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p0); - Value p1 = builder.create(loc, p0, one); - codegen.highs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p1); - } else if (isSingletonDLT(merger.getDimLevelType(b))) { - // Initialize sparse index that will implement the "iteration": - // for pidx_idx = pidx_idx-1, 1+pidx_idx-1 - // We rely on subsequent loop unrolling to get rid of the loop - // if it is not involved in co-iteration with anything else. - unsigned pat = at; - for (; pat != 0; pat--) { - if (codegen.pidxs[tensor][topSort[pat - 1]]) - break; - } - Value one = constantIndex(builder, loc, 1); - Value p0 = (pat == 0) ? constantIndex(builder, loc, 0) - : codegen.pidxs[tensor][topSort[pat - 1]]; - codegen.pidxs[tensor][idx] = p0; - codegen.highs[tensor][idx] = builder.create(loc, p0, one); - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - // Dense index still in play. - needsUniv = true; - } - } - - // Initialize the universal dense index. - codegen.loops[idx] = constantIndex(builder, loc, 0); - return needsUniv; -} - -/// Returns parallelization strategy. Any implicit loop in the Linalg operation -/// that is marked "parallel" is a candidate. Whether it is actually converted -/// to a parallel operation depends on the requested strategy. +/// Returns parallelization strategy. Any implicit loop in the Linalg +/// operation that is marked "parallel" is a candidate. Whether it is actually +/// converted to a parallel operation depends on the requested strategy. static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction, bool isSparse) { // Reject parallelization of sparse output. @@ -1020,32 +903,16 @@ /// Generates a for-loop on a single index. static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, bool isOuter, bool isInner, - unsigned idx, BitVector &indices) { - unsigned fb = indices.find_first(); - unsigned tensor = merger.tensor(fb); - assert(idx == merger.index(fb)); + unsigned idx, size_t tid, size_t dim, + ArrayRef extraTids, + ArrayRef extraDims) { + Location loc = op.getLoc(); auto iteratorTypes = op.getIteratorTypesArray(); bool isReduction = linalg::isReductionIterator(iteratorTypes[idx]); - bool isSparse = isCompressedDLT(merger.getDimLevelType(fb)) || - isSingletonDLT(merger.getDimLevelType(fb)); + bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) || + isSingletonDLT(merger.getDimLevelType(tid, idx)); bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse); - - // Loop bounds and increment. - Location loc = op.getLoc(); - Value lo = isSparse ? codegen.pidxs[tensor][idx] : codegen.loops[idx]; - Value hi = isSparse ? codegen.highs[tensor][idx] : codegen.sizes[idx]; - Value step = constantIndex(builder, loc, 1); - - // Emit a parallel loop. - if (isParallel) { - scf::ParallelOp parOp = builder.create(loc, lo, hi, step); - if (isSparse) - codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0]; - else - codegen.loops[idx] = parOp.getInductionVars()[0]; - builder.setInsertionPointToStart(parOp.getBody()); - return parOp; - } + assert(!isParallel); // Emit a sequential or vector loop. 
SmallVector operands; @@ -1054,182 +921,70 @@ if (codegen.expValues) operands.push_back(codegen.expCount); - scf::ForOp forOp = builder.create(loc, lo, hi, step, operands); + Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim( + builder, loc, tid, dim, operands, isParallel, extraTids, extraDims); + // The operands should be updated by loop emitter already. if (codegen.redVal) - updateReduc(merger, codegen, forOp.getRegionIterArgs().front()); + updateReduc(merger, codegen, operands.front()); if (codegen.expValues) - codegen.expCount = forOp.getRegionIterArgs().back(); - // Assign induction variable to sparse or dense index. - Value iv = forOp.getInductionVar(); - if (isSparse) - codegen.pidxs[tensor][idx] = iv; - else - codegen.loops[idx] = iv; - - builder.setInsertionPointToStart(forOp.getBody()); - return forOp; + codegen.expCount = operands.back(); + + return loop; } /// Emit a while-loop for co-iteration over multiple indices. static Operation *genWhile(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, unsigned idx, bool needsUniv, - BitVector &indices) { - SmallVector types; + ArrayRef condTids, ArrayRef condDims, + ArrayRef extraTids, + ArrayRef extraDims) { + SmallVector operands; + // Construct the while-loop with a parameter for each index. - Type indexType = builder.getIndexType(); - for (unsigned b = 0, be = indices.size(); b < be; b++) { - if (!indices[b]) - continue; - if (isCompressedDLT(merger.getDimLevelType(b)) || - isSingletonDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - types.push_back(indexType); - operands.push_back(codegen.pidxs[tensor][idx]); - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - } - } - if (codegen.redVal) { - types.push_back(codegen.redVal.getType()); + if (codegen.redVal) operands.push_back(codegen.redVal); - } - if (codegen.expValues) { - types.push_back(indexType); + if (codegen.expValues) operands.push_back(codegen.expCount); - } - if (needsUniv) { - types.push_back(indexType); - operands.push_back(codegen.loops[idx]); - } - assert(types.size() == operands.size()); - Location loc = op.getLoc(); - scf::WhileOp whileOp = builder.create(loc, types, operands); - SmallVector locs(types.size(), loc); - Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs); - Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs); + Operation *loop = codegen.loopEmitter.enterCoIterationOverTensorsAtDims( + builder, op.getLoc(), condTids, condDims, needsUniv, operands, extraTids, + extraDims); - // Build the "before" region, which effectively consists - // of a conjunction of "i < upper" tests on all induction. - builder.setInsertionPointToStart(&whileOp.getBefore().front()); - Value cond; - unsigned o = 0; - for (unsigned b = 0, be = indices.size(); b < be; b++) { - if (!indices[b]) - continue; - if (isCompressedDLT(merger.getDimLevelType(b)) || - isSingletonDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - Value op1 = before->getArgument(o); - Value op2 = codegen.highs[tensor][idx]; - Value opc = builder.create(loc, arith::CmpIPredicate::ult, - op1, op2); - cond = cond ? 
builder.create(loc, cond, opc) : opc; - codegen.pidxs[tensor][idx] = after->getArgument(o++); - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - } - } if (codegen.redVal) - updateReduc(merger, codegen, after->getArgument(o++)); + updateReduc(merger, codegen, operands.front()); if (codegen.expValues) - codegen.expCount = after->getArgument(o++); - if (needsUniv) - codegen.loops[idx] = after->getArgument(o++); - assert(o == operands.size()); - builder.create(loc, cond, before->getArguments()); - builder.setInsertionPointToStart(&whileOp.getAfter().front()); - return whileOp; + codegen.expCount = operands.back(); + + return loop; } /// Generates a for-loop or a while-loop, depending on whether it implements /// singleton iteration or co-iteration over the given conjunction. static Operation *genLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, unsigned at, bool needsUniv, - BitVector &indices) { + ArrayRef condTids, ArrayRef condDims, + ArrayRef extraTids, + ArrayRef extraDims) { + assert(condTids.size() == condDims.size()); + assert(extraTids.size() == extraDims.size()); unsigned idx = codegen.topSort[at]; - if (indices.count() == 1) { + if (condTids.size() == 1) { bool isOuter = at == 0; bool isInner = at == codegen.topSort.size() - 1; - return genFor(merger, codegen, builder, op, isOuter, isInner, idx, indices); - } - return genWhile(merger, codegen, builder, op, idx, needsUniv, indices); -} - -/// Generates the local variables for this loop, consisting of the sparse -/// indices, restored universal dense index, and dense positions. -static void genLocals(Merger &merger, CodeGen &codegen, OpBuilder &builder, - linalg::GenericOp op, unsigned at, bool needsUniv, - BitVector &locals) { - std::vector &topSort(codegen.topSort); - Location loc = op.getLoc(); - unsigned idx = topSort[at]; - - // Initialize sparse indices. - Value min; - for (unsigned b = 0, be = locals.size(); b < be; b++) { - if (!locals[b]) - continue; - if (isCompressedDLT(merger.getDimLevelType(b)) || - isSingletonDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - Value ptr = codegen.indices[tensor][idx]; - Value s = codegen.pidxs[tensor][idx]; - Value load = genLoad(codegen, builder, loc, ptr, s); - codegen.idxs[tensor][idx] = load; - if (!needsUniv) { - if (min) { - Value cmp = builder.create( - loc, arith::CmpIPredicate::ult, load, min); - min = builder.create(loc, cmp, load, min); - } else { - min = load; - } - } - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - } - } - - // Merge dense universal index over minimum. - if (min) { - assert(!needsUniv); - codegen.loops[idx] = min; - } - - // Initialize dense positions. Note that we generate dense indices of the - // output tensor unconditionally, since they may not appear in the lattice, - // but may be needed for linearized codegen. - for (unsigned b = 0, be = locals.size(); b < be; b++) { - if ((locals[b] || merger.isOutTensor(b, idx)) && - isDenseDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - unsigned pat = at; - for (; pat != 0; pat--) - if (codegen.pidxs[tensor][topSort[pat - 1]]) - break; - Value p = (pat == 0) ? 
constantIndex(builder, loc, 0) - : codegen.pidxs[tensor][topSort[pat - 1]]; - codegen.pidxs[tensor][idx] = genAddress( - codegen, builder, loc, codegen.sizes[idx], p, codegen.loops[idx]); - } + return genFor(merger, codegen, builder, op, isOuter, isInner, idx, + condTids.front(), condDims.front(), extraTids, extraDims); } + return genWhile(merger, codegen, builder, op, idx, needsUniv, condTids, + condDims, extraTids, extraDims); } /// Generates the induction structure for a while-loop. -static void genWhileInduction(Merger &merger, CodeGen &codegen, - OpBuilder &builder, linalg::GenericOp op, - unsigned idx, bool needsUniv, - BitVector &induction, scf::WhileOp whileOp) { +static void finalizeWhileOp(Merger &merger, CodeGen &codegen, + OpBuilder &builder, linalg::GenericOp op, + unsigned idx, bool needsUniv, BitVector &induction, + scf::WhileOp whileOp) { Location loc = op.getLoc(); // Finalize each else branch of all if statements. if (codegen.redVal || codegen.expValues) { @@ -1251,71 +1006,6 @@ } } builder.setInsertionPointToEnd(&whileOp.getAfter().front()); - // Finalize the induction. Note that the induction could be performed - // in the individual if-branches to avoid re-evaluating the conditions. - // However, that would result in a rather elaborate forest of yield - // instructions during code generation. Moreover, performing the induction - // after the if-statements more closely resembles code generated by TACO. - unsigned o = 0; - SmallVector operands; - Value one = constantIndex(builder, loc, 1); - for (unsigned b = 0, be = induction.size(); b < be; b++) { - if (!induction[b]) - continue; - if (isCompressedDLT(merger.getDimLevelType(b)) || - isSingletonDLT(merger.getDimLevelType(b))) { - unsigned tensor = merger.tensor(b); - assert(idx == merger.index(b)); - Value op1 = codegen.idxs[tensor][idx]; - Value op2 = codegen.loops[idx]; - Value op3 = codegen.pidxs[tensor][idx]; - Value cmp = builder.create(loc, arith::CmpIPredicate::eq, - op1, op2); - Value add = builder.create(loc, op3, one); - operands.push_back(builder.create(loc, cmp, add, op3)); - codegen.pidxs[tensor][idx] = whileOp->getResult(o++); - } else { - assert(isDenseDLT(merger.getDimLevelType(b)) || - isUndefDLT(merger.getDimLevelType(b))); - } - } - if (codegen.redVal) { - operands.push_back(codegen.redVal); - updateReduc(merger, codegen, whileOp->getResult(o++)); - } - if (codegen.expValues) { - operands.push_back(codegen.expCount); - codegen.expCount = whileOp->getResult(o++); - } - if (needsUniv) { - operands.push_back( - builder.create(loc, codegen.loops[idx], one)); - codegen.loops[idx] = whileOp->getResult(o++); - } - assert(o == operands.size()); - builder.create(loc, operands); - builder.setInsertionPointAfter(whileOp); -} - -/// Generates the induction structure for a for-loop. -static void genForInduction(Merger &merger, CodeGen &codegen, - OpBuilder &builder, linalg::GenericOp op, - Operation *loop) { - Location loc = op.getLoc(); - unsigned o = 0; - SmallVector operands; - if (codegen.redVal) { - operands.push_back(codegen.redVal); - updateReduc(merger, codegen, loop->getResult(o++)); - } - if (codegen.expValues) { - operands.push_back(codegen.expCount); - codegen.expCount = loop->getResult(o++); - } - assert(o == operands.size()); - if (o > 0) - builder.create(loc, operands); - builder.setInsertionPointAfter(loop); } /// Generates a single if-statement within a while-loop. 
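The while-loop induction that the deleted code built as IR (advance each sparse tensor whose coordinate matches the merged loop index, then bump the universal index) is the behavior the loop emitter's exit path is now expected to reproduce. A minimal standalone model of that advance rule on plain integers; CoIterState and advance are illustrative names, not part of the codebase:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Plain-integer model of the advance rule the removed genWhileInduction code
// emitted as IR: after the loop body, every tensor whose current coordinate
// equals the merged loop index moves one position forward, and the universal
// (dense) index, when present, is unconditionally incremented.
struct CoIterState {
  std::vector<uint64_t> pidx;  // current position per sparse tensor
  std::vector<uint64_t> coord; // coordinate loaded at that position
  uint64_t univ = 0;           // universal dense index
};

void advance(CoIterState &s, uint64_t loopIdx, bool needsUniv) {
  for (size_t t = 0; t < s.pidx.size(); ++t) {
    // pidx = (coord == loopIdx) ? pidx + 1 : pidx
    if (s.coord[t] == loopIdx)
      s.pidx[t] += 1;
  }
  if (needsUniv)
    s.univ += 1;
}

int main() {
  // Two tensors co-iterating; only the first participates at index 3.
  CoIterState s{{5, 9}, {3, 7}, 3};
  advance(s, /*loopIdx=*/3, /*needsUniv=*/true);
  std::cout << s.pidx[0] << " " << s.pidx[1] << " " << s.univ << "\n"; // 6 9 4
  return 0;
}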
@@ -1333,8 +1023,10 @@ Value clause; if (isCompressedDLT(merger.getDimLevelType(b)) || isSingletonDLT(merger.getDimLevelType(b))) { - Value op1 = codegen.idxs[tensor][idx]; - Value op2 = codegen.loops[idx]; + auto dim = codegen.loopIdxToDim[tensor][idx]; + assert(dim != INVALID_ID); + Value op1 = codegen.loopEmitter.getCoord()[tensor][dim]; + Value op2 = codegen.getLoopIdxValue(idx); clause = builder.create(loc, arith::CmpIPredicate::eq, op1, op2); } else { @@ -1380,15 +1072,33 @@ static bool startLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, unsigned exp, unsigned at, unsigned idx, unsigned ldx, unsigned lts) { - assert(!codegen.loops[idx]); + assert(!codegen.getLoopIdxValue(idx)); // Emit invariants at this loop sequence level. genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/true); // Emit access pattern expansion for sparse tensor output. genExpansion(merger, codegen, builder, op, at, /*atStart=*/true); // Emit further intitialization at this loop sequence level. unsigned l0 = merger.set(lts)[0]; - bool needsUniv = - genInit(merger, codegen, builder, op, at, merger.lat(l0).bits); + bool needsUniv = false; + + SmallVector ts; + SmallVector ds; + for (auto b : merger.lat(l0).bits.set_bits()) { + if (isDenseDLT(merger.getDimLevelType(b)) || + isUndefDLT(merger.getDimLevelType(b))) { + needsUniv = true; + } else { + unsigned tensor = merger.tensor(b); + assert(idx == merger.index(b)); + size_t dim = codegen.loopIdxToDim[tensor][idx]; + assert(dim != INVALID_ID); + ts.push_back(tensor); + ds.push_back(dim); + } + } + + codegen.loopEmitter.enterNewLoopSeq(builder, op.getLoc(), ts, ds); + // Maintain the universal index only if it is actually // consumed by a subsequent lattice point. if (needsUniv) { @@ -1406,11 +1116,57 @@ static Operation *startLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, unsigned at, unsigned li, bool needsUniv) { + const BitVector &simple = merger.lat(li).simple; + const BitVector &all = merger.lat(li).bits; + assert(simple.size() == all.size()); + // The set of tensors + dims to generate loops on + SmallVector condTids, condDims; + // The set of (dense) tensors that is optimized from condition, yet still + // need extra locals to iterate on them. + SmallVector extraTids, extraDims; + // First converts bits to array + dim pair + for (unsigned b = 0, e = simple.size(); b < e; b++) { + size_t tid = merger.tensor(b); + size_t idx = codegen.topSort[at]; + if (simple.test(b)) { + // the simplified condition must be a subset of the original condition. + assert(all.test(b)); + assert(merger.index(b) == idx); + if (isUndefDLT(merger.getDimLevelType(b))) { + // This could be a synthetic tensor (for invariants and sparse output + // tensor). + // In both cases, we mean to generate loops over output tensor. + // e.g., + // out[i][j] = invariant; + if (merger.getSynTensorID() == tid) + tid = merger.getOutTensorID(); + } + auto dim = codegen.loopIdxToDim[tid][idx]; + if (dim != INVALID_ID) { + // dim could be invalid if this is a zero ranked tensor + condTids.push_back(tid); + condDims.push_back(dim); + } + } else if ((all.test(b) || merger.isOutTensor(b, idx)) && + isDenseDLT(merger.getDimLevelType(b))) { + // Note that we generate dense indices of the output tensor + // unconditionally, since they may not appear in the lattice, but may be + // needed for linearized codegen. + assert(merger.index(b) == idx); + // Only dense dimensions should be optimized from conditions. 
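The classification above turns lattice bits into two parallel (tensor id, dimension) lists: the conditions the loop actually (co-)iterates, and the dense tensors that only need their position locals refreshed. A minimal standalone sketch of that classification over plain structs; dimOf, synTid, outTid, and kInvalidId are simplified stand-ins for loopIdxToDim, getSynTensorID, getOutTensorID, and INVALID_ID, not the real Merger/CodeGen members:

#include <cstddef>
#include <vector>

constexpr size_t kInvalidId = ~size_t(0); // plays the role of INVALID_ID

enum class DimLevel { Dense, Compressed, Singleton, Undef };

struct Bit {
  size_t tensor; // tensor id of this lattice bit
  DimLevel dlt;  // dimension level type at the current loop index
  bool inSimple; // bit set in lat(li).simple
  bool inAll;    // bit set in lat(li).bits
};

struct LoopConditions {
  std::vector<size_t> condTids, condDims;   // tensors/dims to (co-)iterate
  std::vector<size_t> extraTids, extraDims; // dense tensors needing locals only
};

// `dimOf[t]` maps tensor t to its dimension at the current loop index
// (kInvalidId for zero-ranked tensors); `synTid`/`outTid` model the synthetic
// and output tensor ids.
LoopConditions classify(const std::vector<Bit> &bits,
                        const std::vector<size_t> &dimOf, size_t synTid,
                        size_t outTid) {
  LoopConditions lc;
  for (const Bit &b : bits) {
    size_t tid = b.tensor;
    if (b.inSimple) {
      // Undefined level types come from the synthetic tensor; loops are
      // generated over the output tensor instead (e.g. out[i][j] = invariant).
      if (b.dlt == DimLevel::Undef && tid == synTid)
        tid = outTid;
      size_t dim = dimOf[tid];
      if (dim != kInvalidId) { // skip zero-ranked tensors
        lc.condTids.push_back(tid);
        lc.condDims.push_back(dim);
      }
    } else if ((b.inAll || tid == outTid) && b.dlt == DimLevel::Dense) {
      // Dense dims dropped from the simplified condition (and the dense
      // output) still need their position locals materialized.
      lc.extraTids.push_back(tid);
      lc.extraDims.push_back(dimOf[tid]);
    }
  }
  return lc;
}

int main() {
  // Three bits: a sparse input (tid 0), the synthetic tensor (tid 2, undef),
  // and a dense output (tid 1) kept out of the simplified condition.
  std::vector<Bit> bits = {{0, DimLevel::Compressed, true, true},
                           {2, DimLevel::Undef, true, true},
                           {1, DimLevel::Dense, false, true}};
  std::vector<size_t> dimOf = {0, 0, kInvalidId};
  LoopConditions lc = classify(bits, dimOf, /*synTid=*/2, /*outTid=*/1);
  // Expected: condTids = {0, 1}, extraTids = {1}.
  return (lc.condTids.size() == 2 && lc.extraTids.size() == 1) ? 0 : 1;
}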
+ assert(isDenseDLT(merger.getDimLevelType(b))); + auto dim = codegen.loopIdxToDim[tid][idx]; + assert(dim != INVALID_ID); + extraTids.push_back(tid); + extraDims.push_back(dim); + } + } // Emit the for/while-loop control. Operation *loop = genLoop(merger, codegen, builder, op, at, needsUniv, - merger.lat(li).simple); + condTids, condDims, extraTids, extraDims); // Emit the locals for this loop. - genLocals(merger, codegen, builder, op, at, needsUniv, merger.lat(li).bits); + // genLocals(merger, codegen, builder, op, at, needsUniv, + // merger.lat(li).bits); return loop; } @@ -1420,21 +1176,36 @@ unsigned li, bool needsUniv) { // End a while-loop. if (auto whileOp = dyn_cast(loop)) { - genWhileInduction(merger, codegen, builder, op, idx, needsUniv, - merger.lat(li).bits, whileOp); - return needsUniv; + finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv, + merger.lat(li).bits, whileOp); + } else { + needsUniv = false; } - // End a for-loop. - genForInduction(merger, codegen, builder, op, loop); - return false; + + SmallVector reduc; + if (codegen.redVal) + reduc.push_back(codegen.redVal); + if (codegen.expValues) + reduc.push_back(codegen.expCount); + + auto loopRet = + codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc); + assert(reduc.size() == loopRet.size()); + + if (codegen.redVal) + updateReduc(merger, codegen, loopRet.front()); + if (codegen.expValues) + codegen.expCount = loopRet.back(); + + return needsUniv; } /// Ends a loop sequence at given level. static void endLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder, linalg::GenericOp op, unsigned exp, unsigned at, unsigned idx, unsigned ldx) { - assert(codegen.loops[idx]); - codegen.loops[idx] = Value(); + assert(codegen.getLoopIdxValue(idx) == nullptr); + codegen.loopEmitter.exitCurrentLoopSeq(); // Unmark bookkeeping of invariants and loop index. genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/false); // Finalize access pattern expansion for sparse tensor output. @@ -1514,7 +1285,7 @@ } else { // To rematerialize an non-annotated tensor, simply load it // from the bufferized value. - Value val = codegen.buffers.back(); // value array + Value val = codegen.loopEmitter.getValBuffer().back(); // value array rewriter.replaceOpWithNewOp(op, resType, val); } } @@ -1582,10 +1353,17 @@ // Inadmissible expression, reject. return failure(); - // Recursively generates code if admissible. merger.setHasSparseOut(sparseOut != nullptr); - CodeGen codegen(options, numTensors, numLoops, sparseOut, outerParNest, - topSort); + + SmallVector tensors; + for (OpOperand &t : op->getOpOperands()) + tensors.push_back(t.get()); + + // Recursively generates code if admissible. + CodeGen codegen(options, tensors, numTensors, numLoops, sparseOut, + outerParNest, topSort); + // TODO: maybe merger should be responsible of maintaining the map. 
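The control-flow hooks used above (enterNewLoopSeq, enterLoopOverTensorAtDim, exitCurrentLoop, exitCurrentLoopSeq) assume a strictly nested call discipline, and the reduction value and expansion count travel through them as a positional vector: pushed in a fixed order before each call and read back as front()/back() afterwards. A minimal standalone model of that protocol, with plain integers instead of MLIR Values and a toy emitter instead of the real SparseTensorLoopEmitter:

#include <cassert>
#include <cstddef>
#include <vector>

// Toy stand-in for the loop emitter: it only checks that loop sequences and
// loops are entered and exited in strictly nested (stack) order, and that the
// caller's reduction operands are threaded through each exit positionally.
class ToyLoopEmitter {
public:
  void enterLoopSeq() { frames.push_back(kSeq); }
  void enterLoop(std::vector<int> &reduc) {
    frames.push_back(kLoop);
    // A real emitter would rewrite these to the loop's region arguments;
    // the toy just records how many values are carried.
    carried.push_back(reduc.size());
  }
  std::vector<int> exitLoop(std::vector<int> &reduc) {
    assert(!frames.empty() && frames.back() == kLoop);
    assert(reduc.size() == carried.back() && "positional contract broken");
    frames.pop_back();
    carried.pop_back();
    return reduc; // the real emitter returns the loop results, same arity
  }
  void exitLoopSeq() {
    assert(!frames.empty() && frames.back() == kSeq);
    frames.pop_back();
  }
  ~ToyLoopEmitter() { assert(frames.empty() && "unbalanced enter/exit"); }

private:
  enum Frame { kSeq, kLoop };
  std::vector<Frame> frames;
  std::vector<size_t> carried;
};

int main() {
  ToyLoopEmitter emitter;
  int redVal = 0, expCount = 7; // stand-ins for codegen.redVal / expCount

  emitter.enterLoopSeq();
  std::vector<int> reduc = {redVal, expCount}; // fixed push order
  emitter.enterLoop(reduc);
  // ... loop body ...
  std::vector<int> results = emitter.exitLoop(reduc);
  redVal = results.front(); // same positions on the way out
  expCount = results.back();
  emitter.exitLoopSeq();
  return (redVal == 0 && expCount == 7) ? 0 : 1;
}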
+ codegen.buildLoopIdxToDimMap(op); genBuffers(merger, codegen, rewriter, op); genStmt(merger, codegen, rewriter, op, exp, 0); genResult(merger, codegen, rewriter, op); diff --git a/mlir/test/Dialect/SparseTensor/sorted_coo.mlir b/mlir/test/Dialect/SparseTensor/sorted_coo.mlir --- a/mlir/test/Dialect/SparseTensor/sorted_coo.mlir +++ b/mlir/test/Dialect/SparseTensor/sorted_coo.mlir @@ -74,10 +74,10 @@ // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> -// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> +// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] { // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_13]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_14]]] : memref<32xf64> @@ -120,12 +120,12 @@ // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref -// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x64xf64> -// CHECK: linalg.fill ins(%[[VAL_3]] : f64) outs(%[[VAL_14]] : memref<32x64xf64>) -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x64xf64> +// CHECK-DAG: linalg.fill ins(%[[VAL_3]] : f64) outs(%[[VAL_14]] : memref<32x64xf64>) +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_19:.*]]:2 = scf.while (%[[VAL_20:.*]] = %[[VAL_15]], %[[VAL_21:.*]] = %[[VAL_17]]) : (index, index) -> (index, index) { // 
CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_16]] : index // CHECK: %[[VAL_23:.*]] = arith.cmpi ult, %[[VAL_21]], %[[VAL_18]] : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir @@ -113,10 +113,10 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref // CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32xf32>) -// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32xf32>) // CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_17:.*]] = arith.cmpi ult, %[[VAL_15]], %[[VAL_13]] : index // CHECK: scf.condition(%[[VAL_17]]) %[[VAL_15]], %[[VAL_16]] : index, index @@ -166,9 +166,9 @@ // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>) -// CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref -// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref +// CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>) // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_3]] { // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_11]]] : memref @@ -206,9 +206,9 @@ // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>) -// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>) 
+// CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_4]] { // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_12]]] : memref @@ -314,9 +314,9 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index // CHECK: scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index @@ -371,9 +371,9 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) -// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] { // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_14]]] : memref<32xf32> @@ -408,9 +408,9 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = 
%[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index // CHECK: scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index @@ -465,9 +465,9 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> // CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) -// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] { // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_13]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref @@ -502,11 +502,11 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_14]] : index // CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_16]] : index @@ -585,11 +585,11 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref 
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_14]] : index // CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_16]] : index @@ -647,11 +647,11 @@ // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>) -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>) +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_18:.*]]:2 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index // CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index @@ -740,11 +740,11 @@ // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] -// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>) -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>) +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_18:.*]]:2 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]]) : (index, index) -> (index, index) { // CHECK: 
%[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index // CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index @@ -881,11 +881,11 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref -// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref -// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_18:.*]]:3 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]], %[[VAL_21:.*]] = %[[VAL_13]]) : (index, index, f32) -> (index, index, f32) { // CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index // CHECK: %[[VAL_23:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index @@ -989,12 +989,12 @@ // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_2]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_21:.*]]:3 = scf.while (%[[VAL_22:.*]] = %[[VAL_17]], %[[VAL_23:.*]] = %[[VAL_19]], %[[VAL_24:.*]] = %[[VAL_15]]) : (index, index, f32) -> (index, index, f32) { // CHECK: %[[VAL_25:.*]] = arith.cmpi ult, %[[VAL_22]], %[[VAL_18]] : index // CHECK: %[[VAL_26:.*]] = arith.cmpi ult, %[[VAL_23]], %[[VAL_20]] : index @@ -1106,13 +1106,13 @@ // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.pointers %[[VAL_3]] {dimension = 0 : index} : tensor> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.indices %[[VAL_3]] {dimension = 0 : index} : tensor> to memref // 
CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_3]] : tensor> to memref -// CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_4]], %[[VAL_5]] : tensor +// CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_4]] -// CHECK: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_18]] : memref) -// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref -// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_7]]] : memref +// CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_18]] : memref) +// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref +// CHECK-DAG: %[[VAL_21:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_7]]] : memref // CHECK: %[[VAL_23:.*]]:3 = scf.while (%[[VAL_24:.*]] = %[[VAL_19]], %[[VAL_25:.*]] = %[[VAL_21]], %[[VAL_26:.*]] = %[[VAL_5]]) : (index, index, index) -> (index, index, index) { // CHECK: %[[VAL_27:.*]] = arith.cmpi ult, %[[VAL_24]], %[[VAL_20]] : index // CHECK: %[[VAL_28:.*]] = arith.cmpi ult, %[[VAL_25]], %[[VAL_22]] : index @@ -1284,13 +1284,13 @@ // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_2]] {dimension = 0 : index} : tensor> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_2]] : tensor> to memref // CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref -// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref -// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref +// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref +// CHECK-DAG: %[[VAL_22:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref +// CHECK-DAG: %[[VAL_23:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_24:.*]]:4 = scf.while (%[[VAL_25:.*]] = %[[VAL_18]], %[[VAL_26:.*]] = %[[VAL_20]], %[[VAL_27:.*]] = %[[VAL_22]], %[[VAL_28:.*]] = %[[VAL_17]]) : (index, index, index, f64) -> (index, index, index, f64) { // CHECK: %[[VAL_29:.*]] = arith.cmpi ult, %[[VAL_25]], %[[VAL_19]] : index // CHECK: %[[VAL_30:.*]] = arith.cmpi ult, %[[VAL_26]], %[[VAL_21]] : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir @@ -962,7 +962,7 @@ // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] 
{dimension = 1 : index} : tensor> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref -// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor +// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor // CHECK: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_11]] : memref) // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] { @@ -1015,7 +1015,7 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref // CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_2]], %[[VAL_4]] : tensor +// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref // CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_3]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref @@ -1095,7 +1095,7 @@ // CHECK-DAG: %[[VAL_19:.*]] = sparse_tensor.values %[[VAL_2]] : tensor> to memref // CHECK-DAG: %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_3]] : memref // CHECK-DAG: %[[VAL_21:.*]] = bufferization.to_memref %[[VAL_4]] : memref -// CHECK-DAG: %[[VAL_22:.*]] = tensor.dim %[[VAL_5]], %[[VAL_6]] : tensor +// CHECK-DAG: %[[VAL_22:.*]] = tensor.dim %[[VAL_2]], %[[VAL_6]] : tensor // CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_21]][] : memref // CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir @@ -1126,11 +1126,11 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 2 : index} : tensor> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 2 : index} : tensor> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK-DAG: %[[VAL_10:.*]] = tensor.dim %[[VAL_2]], %[[VAL_5]] : tensor +// CHECK-DAG: %[[VAL_10:.*]] = tensor.dim %[[VAL_1]], %[[VAL_6]] : tensor // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK-DAG: %[[VAL_13:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor -// CHECK-DAG: %[[VAL_14:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor +// CHECK-DAG: %[[VAL_13:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_0]] : memref // CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_5]] to %[[VAL_13]] step %[[VAL_6]] { // CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_5]] to %[[VAL_10]] step %[[VAL_6]] { @@ -1247,7 +1247,7 @@ // CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor // CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor // CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor> +// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> // CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_12]][] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir 
b/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir @@ -20,8 +20,8 @@ // CHECK: %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref // CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] { // CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref -// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref +// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref // CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] { // CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref @@ -38,8 +38,8 @@ // CHECK: %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref // CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] { // CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref -// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref +// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref // CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] { // CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref @@ -57,8 +57,8 @@ // CHECK: %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref // CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] { // CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref -// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref +// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index // CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref // CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] { // CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_index.mlir b/mlir/test/Dialect/SparseTensor/sparse_index.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_index.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_index.mlir @@ -25,14 +25,15 @@ // CHECK-DAG: %[[VAL_4:.*]] = tensor.dim %[[VAL_0]], %[[VAL_1]] : tensor // CHECK-HIR-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> // CHECK-HIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { -// CHECK-HIR: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64> -// CHECK-HIR: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref -// CHECK-HIR: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index -// CHECK-HIR: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_15]]] : memref +// CHECK-HIR-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64> +// CHECK-HIR-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref +// CHECK-HIR-DAG: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index +// CHECK-HIR-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_15]]] : memref // CHECK-HIR: scf.for %[[VAL_17:.*]] = %[[VAL_14]] to %[[VAL_16]] step %[[VAL_5]] { -// CHECK-HIR: 
%[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref -// CHECK-HIR: %[[VAL_19:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64> -// CHECK-HIR: %[[VAL_20:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref +// CHECK-HIR-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref +// CHECK-HIR-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64> +// CHECK-HIR-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref // CHECK-HIR: %[[VAL_21:.*]] = arith.mulf %[[VAL_20]], %[[VAL_13]] : f64 // CHECK-HIR: %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] : f64 // CHECK-HIR: memref.store %[[VAL_22]], %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64> diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir @@ -60,17 +60,17 @@ // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> -// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor -// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor -// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_2]] : tensor +// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor // CHECK: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_10]] : memref) -// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] { -// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] { -// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_8]], %[[VAL_11]] : index +// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_4]] { +// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] { +// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_7]], %[[VAL_11]] : index // CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index -// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_4]] { -// CHECK: %[[VAL_16:.*]] = arith.muli %[[VAL_6]], %[[VAL_14]] : index +// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] { +// CHECK: %[[VAL_16:.*]] = arith.muli %[[VAL_8]], %[[VAL_14]] : index // CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_15]] : index // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_17]]] : memref // CHECK: memref.store %[[VAL_18]], %[[VAL_10]]{{\[}}%[[VAL_15]], %[[VAL_11]], %[[VAL_12]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir b/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir @@ -117,8 +117,8 @@ // CHECK-RWT: %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref // CHECK-RWT: scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] { // CHECK-RWT: %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref -// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index // CHECK-RWT: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref +// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index // CHECK-RWT: %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref // CHECK-RWT: scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] { // CHECK-RWT: %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref @@ -266,8 +266,8 @@ // CHECK-RWT: %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : 
memref // CHECK-RWT: scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] { // CHECK-RWT: %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref -// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index // CHECK-RWT: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref +// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index // CHECK-RWT: %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref // CHECK-RWT: scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] { // CHECK-RWT: %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir b/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir --- a/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir @@ -27,17 +27,17 @@ // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 2.200000e+00 : f32 // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : f32 -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : memref<32x16xf32> -// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref -// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref -// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref +// CHECK-DAG: %[[VAL_8:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : f32 +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref +// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref // CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_17]] to %[[VAL_18]] step %[[VAL_7]] { // CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_19]]] : memref // CHECK: %[[VAL_21:.*]] = memref.load 
%[[VAL_11]]{{\[}}%[[VAL_19]]] : memref