mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
Show All 19 Lines
using namespace mlir;
using namespace mlir::sparse_tensor;

//===----------------------------------------------------------------------===//
// File local helper functions.
//===----------------------------------------------------------------------===//

#define CMPI(p, l, r)                                                          \
  (builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, l, r)           \
       .getResult())

#define C_IDX(v) (constantIndex(builder, loc, v))

/// Generates a pointer/index load from the sparse storage scheme. Narrower
/// data types need to be zero extended before casting the value into the
/// index type used for looping and indexing.
static Value genIndexLoad(OpBuilder &builder, Location loc, Value mem,
                          Value s) {
wrengr: I'd prefer these be defined as `static inline` functions rather than as macros, since that…
Peiming: I will stick with this. The reason I use macro is that I want to avoid typing `builder` and `loc`.
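For reference, a minimal sketch of the `static inline` alternative wrengr is suggesting might look like the following; the helper names `cmpi` and `constIdx` are hypothetical and not part of this patch, and the trade-off is exactly the one Peiming mentions (every call site must pass `builder` and `loc` explicitly):

// Hypothetical static inline equivalents of the CMPI/C_IDX macros above.
static inline Value cmpi(OpBuilder &builder, Location loc,
                         arith::CmpIPredicate p, Value lhs, Value rhs) {
  return builder.create<arith::CmpIOp>(loc, p, lhs, rhs).getResult();
}
static inline Value constIdx(OpBuilder &builder, Location loc, int64_t v) {
  return constantIndex(builder, loc, v);
}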
  // For the scalar case, we simply zero extend narrower indices into 64-bit
  // values before casting to index without a performance penalty. Here too,
  // however, indices that already are 64-bit, in theory, cannot express the
  // full range as explained above.
  Value load = builder.create<memref::LoadOp>(loc, mem, s);
wrengr: Please don't undo this variable naming. The "mem" matches other places, and avoids confusion about whether "ptr" means MemRef vs the old "pointer" (now "position") vs llvm-pointers vs...
Peiming: Sorry, probably it got overlooked during rebasing
  if (!load.getType().isa<IndexType>()) {
    if (load.getType().getIntOrFloatBitWidth() < 64)
      load = builder.create<arith::ExtUIOp>(loc, builder.getI64Type(), load);
wrengr: You should just use `ValueRange::getTypes()`
    load =
        builder.create<arith::IndexCastOp>(loc, builder.getIndexType(), load);
  }
  return load;
}
static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor,
                            Level lvl) {
  auto enc = getSparseTensorEncoding(tensor.getType());
Show All 15 Lines
static Value toSliceCrd(OpBuilder &builder, Location loc, Value crd,
                        Value offset, Value stride, Value tensor, Level lvl) {
  // tensorCrd = sliceCrd * stride + offset
  crd = builder.create<arith::MulIOp>(loc, crd, stride);
  crd = builder.create<arith::AddIOp>(loc, crd, offset);
  return crd;
}
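For orientation, fromSliceCrd further below (its body is collapsed in this view) is documented as the inverse of this mapping; a rough sketch of what such an inverse presumably computes, assuming unsigned index arithmetic and that the remainder is the "extra" value it returns, is:

// Sketch only, not the actual fromSliceCrd implementation:
// sliceCrd = (tensorCrd - offset) / stride, rem = (tensorCrd - offset) % stride
Value diff = builder.create<arith::SubIOp>(loc, crd, offset);
Value sliceCrd = builder.create<arith::DivUIOp>(loc, diff, stride);
Value rem = builder.create<arith::RemUIOp>(loc, diff, stride);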
/// Generates code to compute the *absolute* offset of the slice based on the
/// provided minimum coordinate in the slice.
/// E.g., when reducing d0 + d1 + d2, we need two slices to fully reduce the
/// expression, i.e., s1 = slice(T, d0), s2 = slice(s1, d1). The *absolute*
/// offset is the offset computed relative to the initial tensor T.
///
aartbik: This computation does not match my mental interpretation of the text above (L79)
/// When isNonEmpty == false, the computed offset is meaningless and should not
/// be used during runtime; the method currently generates code that returns 0
/// in that case.
///
aartbik: "// offset" adds very little. Either use a sentence or remove it.
/// offset = isNonEmpty && minCrd >= size ? minCrd - size + 1 : 0;
static Value offsetFromMinCoord(OpBuilder &builder, Location loc, Value minCrd,
                                Value size, Value isNonEmpty) {
  Value geSize = CMPI(uge, minCrd, size);
  Value pred = builder.create<arith::AndIOp>(loc, isNonEmpty, geSize);
  Value mp1 = builder.create<arith::AddIOp>(loc, minCrd, C_IDX(1));
  Value mms = builder.create<arith::SubIOp>(loc, mp1, size);
  // This is the absolute offset relative to the underlying tensor.
  return builder.create<arith::SelectOp>(loc, pred, mms, C_IDX(0));
}
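As a concrete check of the formula: with isNonEmpty true, minCrd = 5, and size = 4, the predicate holds and the generated select yields 5 - 4 + 1 = 2; with minCrd = 3 (less than size), or with an empty slice, the select yields 0.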
/// Converts a coordinate relative to the underlying tensor to the coordinate
/// relative to the slice, and returns an extra remainder value.
// FIXME: that description says "tensorCrd -> sliceCrd"; but the function
// name suggests it should be "sliceCrd -> tensorCrd".
static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
                                            Value crd, Value offset,
                                            Value stride, Value tensor,
                                            Level lvl) {
Show All 16 Lines: LoopEmitter::genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd,
  const auto [newCrd, crdRem] =
      fromSliceCrd(builder, loc, crd, offset, stride, slice, lvl);
  SmallVector<Value, 3> conds; // at most 3 conditions
  // First, coord >= offset (skip the check if offset is known to be 0).
  if (auto staticOffset = enc.getStaticLvlSliceOffset(lvl);
      !(staticOffset.has_value() && *staticOffset == 0)) {
    auto geOffset = CMPI(uge, crd, offset);
    conds.push_back(geOffset);
  }
  // Second, coord_in_slice < length
  auto ltLength = CMPI(ult, newCrd, lvlSizes[tid][lvl]);
  conds.push_back(ltLength);
  // Third, rem == 0 (skip the check if stride is known to be 1).
  if (auto staticStride = enc.getStaticLvlSliceStride(lvl);
      !(staticStride.has_value() && *staticStride == 1)) {
    auto fitStride = CMPI(eq, crdRem, C_IDX(0));
    conds.push_back(fitStride);
  }
  // Must meet all conditions to be a valid coordinate in slice.
  auto pred = conds.front();
  for (auto cond : ValueRange(conds).drop_front())
    pred = builder.create<arith::AndIOp>(loc, pred, cond);
  return {newCrd, pred};
}
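As a worked example of the three conditions: for a slice with offset = 2 and stride = 3 (and assuming fromSliceCrd computes the quotient/remainder of crd - offset), the tensor coordinate crd = 8 gives newCrd = 2 and crdRem = 0, so it passes all checks provided the slice length exceeds 2; crd = 7 fails the stride check because (7 - 2) % 3 = 2.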
//===----------------------------------------------------------------------===//
// Sparse tensor loop emitter class implementations
//===----------------------------------------------------------------------===//

Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, TensorId tid,
                              Level lvl, Value crd) {
  Value pos = lvl == 0 ? C_IDX(0) : posits[tid][lvl - 1];
  Value mul = builder.create<arith::MulIOp>(loc, highs[tid][lvl], pos);
  if (isSparseSlices[tid])
    crd = toSliceCrd(builder, loc, crd, sliceOffsets[tid][lvl],
                     sliceStrides[tid][lvl], tensors[tid], lvl);
  Value add = builder.create<arith::AddIOp>(loc, mul, crd);
  return add;
}
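For intuition: on a 2-D tensor with all-dense levels of sizes m and n, two applications of this routine compute the value address of element (i, j) as i * n + j, i.e. pos (the address produced for the previous level) scaled by highs[tid][lvl] plus the current coordinate.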
Show All 26 Lines: auto whileOp = builder.create<scf::WhileOp>(
          builder.setInsertionPointToStart(ifInBound.elseBlock());
          builder.create<scf::YieldOp>(loc, constantI1(builder, loc, false));
        }
        builder.create<scf::ConditionOp>(loc, ifInBound.getResults()[0], ivs);
      },
      /*afterBuilder=*/
      [](OpBuilder &builder, Location loc, ValueRange ivs) {
        // pos ++
        Value nextPos = builder.create<arith::AddIOp>(loc, ivs[0], C_IDX(1));
        builder.create<scf::YieldOp>(loc, nextPos);
      });
  // Return the segment high.
  return whileOp.getResult(0);
}
Value LoopEmitter::genSparseCrd(OpBuilder &builder, Location loc, TensorId tid,
                                Level dstLvl) {
  Value crd = C_IDX(0);
  const auto reassoc = getCollapseReassociation(tid, dstLvl);
  const unsigned reassocSize = reassoc.size();
  for (unsigned i = 0; i < reassocSize; i++) {
    const Level srcLvl = reassoc[i];
    // A load on the coordinates array yields the coordinate.
    const Value mem = coordinatesBuffers[tid][srcLvl];
    /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
    const Value pos = posits[tid][dstLvl];
Show All 40 Lines: void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
  const LoopOrd numLoops = topSort.size();
  // These zeros will be overwritten below, but we need to initialize
  // them to something since we'll need random-access assignment.
  this->loopIdToOrd.assign(numLoops, 0);
  this->loopStack.reserve(numLoops);
  this->loopSeqStack.reserve(numLoops);

  // Index-reduction related fields.
  this->dependentLvlMap.assign(
      numTensors, std::vector<std::vector<std::pair<TensorId, Level>>>());
  this->slicePosBuffer.assign(numTensors, std::vector<std::vector<Value>>());
  this->sliceSizes.assign(numTensors, std::vector<std::vector<Value>>());
  this->sliceStack.assign(numTensors, std::vector<SliceInfo>());
  this->levelReducedDep.assign(numTensors, std::vector<unsigned>());
aartbik: why the empty lines here?
  // Initialize nested types of `TensorId`-indexed fields.
wrengr: You should use the `numTensors` variable instead of calling `tensors.size()` repeatedly. (This is for code clarity rather than performance reasons.)
  for (TensorId tid = 0; tid < numTensors; tid++) {
    const Value t = tensors[tid];
    // A scalar or 0-dimensional tensor.
    if (isZeroRankedTensorOrScalar(t.getType()))
      continue;
    auto rtp = getRankedTensorType(t);
    if (auto reshape = t.getDefiningOp<tensor::CollapseShapeOp>();
Show All 24 Lines
    highs[tid].assign(lvlRank, Value());
    segHi[tid].assign(lvlRank, Value());
    posits[tid].assign(lvlRank, Value());
    coords[tid].assign(lvlRank, Value());
    positionsBuffers[tid].assign(lvlRank, Value());
    coordinatesBuffers[tid].assign(lvlRank, Value());
    sliceOffsets[tid].assign(lvlRank, Value());
    sliceStrides[tid].assign(lvlRank, Value());

    // Slice-driven loops related initialization.
    levelReducedDep[tid].assign(lvlRank, 0);
    dependentLvlMap[tid].assign(lvlRank,
                                std::vector<std::pair<TensorId, Level>>());
    slicePosBuffer[tid].assign(lvlRank, std::vector<Value>());
    sliceSizes[tid].assign(lvlRank, std::vector<Value>());
    sliceStack[tid].emplace_back(/*minCrd=*/Value(),
                                 /*offset=*/Value(), /*isNonEmpty*/ Value(),
                                 std::nullopt, 0);
    if (dimGetter) {
      auto reassoc = collapseReassoc[tid];
      Level dstRank = reassoc ? reassoc.size() : lvlRank;
      for (Level l = 0; l < dstRank; l++) {
        dependentLvlMap[tid][l] = dimGetter(tid, l);
        unsigned depends = dependentLvlMap[tid][l].size();
        if (depends == 0)
          continue;
wrengr: Please keep this as `Level l`
        // TODO: View-based collapse and dependent index reduction are not
        // compatible right now.
        assert(!reassoc);
        // We need `depends - 1` slices to fully reduce the affine expression.
aartbik: We need `depends - 1` slices, to make sure you don't read depends as part of the sentence
        sliceSizes[tid][l].assign(depends - 1, nullptr);
        slicePosBuffer[tid][l].assign(depends - 1, nullptr);
      }
wrengr: It would be clearer to combine these together. Also, it would be clearer to use `continue` rather than extra indentation for the conditional. Putting those together, maybe use something like: if (depends == 0) continue; assert(!reassoc); sliceSizes[...
    }
  }
  // Construct the inverse of the `topSort` from the sparsifier.
  // This is needed to map `AffineDimExpr`s back to the `LoopOrd`
  // used in loop emitter.
  // FIXME: This map should be maintained outside loop emitter.
  for (LoopOrd n = 0; n < numLoops; n++)
Show All 83 Lines: if (!enc) {
      // Annotated sparse tensors.
      // We also need the value buffer for all-dense annotated "sparse" tensors.
      valBuffer[t] = genToValues(builder, loc, tensor);
    }
    // NOTE: we can also prepare for 0 lvl here in advance, this will hoist
    // some loop preparation from tensor iteration, but will also (undesirably)
    // hoist the code outside if-conditions.
  }
  Type indexType = builder.getIndexType();
  Value c0 = constantZero(builder, loc, indexType);
  for (TensorId t = 0, e = tensors.size(); t < e; t++) {
wrengr: Please use `TensorId` here. I am just about to upload the CLs that make that into a newtype, so it's important to use the correct type instead of just using size_t/unsigned everywhere
    auto rtp = tensors[t].getType().dyn_cast<RankedTensorType>();
wrengr: Please define a `dyn_cast` variant of the `getSparseTensorType` function, and use that here and everywhere else. The SparseTensorType was created specifically to help avoid several code legibility and correctness concerns, so you should be using it everywhere possible.
    if (!rtp)
      continue;
    Level lvlRank = SparseTensorType(rtp).getLvlRank();
wrengr: You should be using `SparseTensorType::getLevelRank` here, since it is specifically the level-rank you want not the dim-rank
    for (Level lvl = 0; lvl < lvlRank; lvl++) {
wrengr: Please use `Level` for all levels. Even though it's just a typedef for now, I will be converting it to a proper type in the near future, so you should use the correct type rather than just using unsigned everywhere
      if (!dependentLvlMap[t][lvl].empty()) {
        ArrayRef<std::pair<TensorId, Level>> depLvls = dependentLvlMap[t][lvl];
aartbik: The comment applies to the assert, but the declaration is in between
        // Needs at least two operands to form a non-trivial affine expression.
        assert(depLvls.size() > 1);

        Value size = c0;
        for (unsigned e = depLvls.size() - 1; e >= 1; e--) {
          auto [dt, dd] = depLvls[e];
          size = builder.create<arith::AddIOp>(loc, size, lvlSizes[dt][dd]);
          sliceSizes[t][lvl][e - 1] = size;
        }
      }
    }
  }
  localInsertPos = builder.getInsertionPoint()->getPrevNode();
}
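One possible shape for the dyn_cast-style helper wrengr asks for above; the name tryGetSparseTensorType and its exact placement are hypothetical, not something this patch adds:

// Hypothetical helper, sketched only to illustrate the review suggestion:
// returns std::nullopt when the value is not a ranked tensor.
static std::optional<SparseTensorType> tryGetSparseTensorType(Value v) {
  if (auto rtp = v.getType().dyn_cast<RankedTensorType>())
    return SparseTensorType(rtp);
  return std::nullopt;
}

With such a helper, the loop above could read `if (auto stt = tryGetSparseTensorType(tensors[t]))` and then use `stt->getLvlRank()` directly instead of constructing SparseTensorType(rtp) by hand.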
void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
                                  ArrayRef<TensorId> tids,
                                  ArrayRef<Level> lvls) {
  // TODO: sort
  assert(loopSeqStack.size() == loopStack.size());
  // Prepares for all the tensors used in the current loop sequence.
  std::vector<std::tuple<TensorId, Level, bool>> slicedTids;
  for (auto [tid, lvl] : llvm::zip(tids, lvls)) {
    if (!dependentLvlMap[tid][lvl].empty()) {
      bool fullyRed = genSliceBegin(builder, loc, tid, lvl);
      slicedTids.emplace_back(tid, lvl, fullyRed);
    } else {
      prepareLoopOverTensorAtLvl(builder, loc, tid, lvl);
    }
  }
  // Universal Index starts from 0.
  loopSeqStack.emplace_back(C_IDX(0), std::move(slicedTids));
}

void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) {
  assert(loopSeqStack.size() == loopStack.size() + 1);
  const auto &slicedTids = loopSeqStack.back().second;
wrengr: I think it'd be clearer to just use `const auto &` here
  // Depending on whether the slice is resolved or not at the current loop
  // sequence, end them in different ways.
aartbik: Please elaborate. "Pop out" is not at all representative for what follows
  for (auto [tid, lvl, res] : slicedTids) {
    if (!res) {
      // If this is an unresolved-slice-driven loop, pop out the slice.
      assert(sliceStack[tid].back().slicedOnLvl == lvl);
      sliceStack[tid].pop_back();
    } else {
      // Else this is a resolved slice; advance the position similar to TACO.
      Value c1 = C_IDX(1), c2 = C_IDX(2);
      // pIdx += 2: we finished the current lvl, so advance the pointer index
      // of the previous level by two to skip the [pLo, pHi] for the current
      // level.
      Value sPtrBuf = slicePosBuffer[tid][lvl].back();
      Value curP = genIndexLoad(builder, loc, sPtrBuf, c1);
      Value nexP = builder.create<arith::AddIOp>(loc, curP, c2);
      // TODO: we could probably use an SSA value for it.
      builder.create<memref::StoreOp>(loc, nexP, sPtrBuf, c1);
    }
  }
  loopSeqStack.pop_back();
}
Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    // FIXME: since the one callsite in Sparsification passes in a
    // level-expression, the `getPosition` must in fact be a `Dimension`.
    // However, elsewhere we have been led to expect that `loopIdToOrd`
    // should be indexed by `LoopId`...
Show All 10 Lines
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::MulIOp>(
        loc, genAffine(builder, loc, binOp.getLHS()),
        genAffine(builder, loc, binOp.getRHS()));
  }
  case AffineExprKind::Constant: {
    int64_t c = a.cast<AffineConstantExpr>().getValue();
    return C_IDX(c);
  }
  default:
    llvm_unreachable("unexpected affine subscript");
  }
}
Operation *LoopEmitter::emitForLoopOverTensorAtLvl(OpBuilder &builder,
                                                   Location loc, TensorId tid,
                                                   Level dstLvl,
                                                   MutableArrayRef<Value> reduc,
                                                   bool isParallel) {
  bool isSparseCond = isCompressedDLT(lvlTypes[tid][dstLvl]) ||
                      isSingletonDLT(lvlTypes[tid][dstLvl]);
  const auto reassoc = getCollapseReassociation(tid, dstLvl);
  // TODO: support dynamic slices.
  // Uses the first dimension here to build the loop bound (which is also the
  // biggest range).
  const Level srcLvl = reassoc.front();
  Value step = C_IDX(1);
  Value lo = isSparseCond ? posits[tid][srcLvl]        // current offset
                          : loopSeqStack.back().first; // universal index
  Value hi = highs[tid][srcLvl];
  Operation *loop = nullptr;
  Value iv;
  if (isParallel) {
    assert(collapseReassoc[tid] == nullptr);
    scf::ParallelOp parOp =
        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(parOp.getBody());
Show All 19 Lines
    assert(forOp.getNumRegionIterArgs() == reduc.size());
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = forOp.getRegionIterArg(i);
    loop = forOp;
  }
  assert(loop && iv);

  Value crd;
  if (isSparseCond) {
    assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
    // For COO, the position is the same across consecutive levels.
    /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
    llvm::for_each(reassoc,
                   [this, tid, iv](Level srcLvl) { posits[tid][srcLvl] = iv; });
    crd = genSparseCrd(builder, loc, tid, dstLvl);
  } else {
    // Dense tensor, the coordinate is the induction variable.
    crd = iv;
  }

  if (isSparseSlices[tid] && isSparseCond) {
    // For sparse level slices, we need to filter out invalid coordinates that
    // are not included in the slice.
    SmallVector<Type> types;
    for (Value red : reduc)
      types.push_back(red.getType());
    auto [trans, pred] = genSliceLegitPredicate(builder, loc, crd, tid, srcLvl);
    bool hasReduc = !types.empty();
Show All 12 Lines: if (hasReduc) {
      builder.create<scf::YieldOp>(loc, reduc);
    }
    // Set the insertion point to matched branch.
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
    crd = trans;
  }
  assert(crd);
  coords[tid][dstLvl] = crd;
  return loop;
}

Operation *LoopEmitter::emitWhileLoopOverSliceAtSparseLvl(
    OpBuilder &builder, Location loc, Value pLo, Value pHi, Value offset,
    Value sliceSize, TensorId tid, Level lvl, MutableArrayRef<Value> reduc) {
  // TODO: we should generalize the method to support iteration over normal
  // slices as well, to allow early break.
  Operation *insertPoint = nullptr;
  Operation *loop =
      genSliceLvlTraverseLoop(
          builder, loc, pLo, pHi, offset, sliceSize, tid, lvl, reduc,
wrengr: Use "l" or "lvl" here. The name "d" is reserved for things of `Dimension` type, whereas this has `Level` type.
          /*genYield=*/false, // unaware of the yield values from user yet
          [this, tid, lvl, reduc, offset,
           &insertPoint](OpBuilder &builder, Location loc, Value iv,
                         MutableArrayRef<Value> innerReduc) {
            assert(innerReduc.size() == reduc.size());
            // Updates users' reduction variables in place.
            for (unsigned i = 0, e = reduc.size(); i < e; i++)
              reduc[i] = innerReduc[i];
            // Loads the coordinates.
            Value absC =
                genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv);
            // We need to subtract the offset to get relative coordinates.
            // TODO: how to assert relC >= 0 during runtime?
            insertPoint = builder.create<arith::SubIOp>(loc, absC, offset);
            posits[tid][lvl] = iv;
            coords[tid][lvl] = insertPoint->getResult(0);
          })
          .first;
  // Sets the insertion point inside the loop body.
aartbik: isn't that always the case here? Should that not be part of the method description then?
  builder.setInsertionPointAfter(insertPoint);
  return loop;
}
Operation *LoopEmitter::enterLoopOverTensorAtLvl(
    OpBuilder &builder, Location loc, ArrayRef<TensorId> tids,
    ArrayRef<Level> lvls, MutableArrayRef<Value> reduc, bool isParallel) {
  // TODO: support multiple return on parallel for?
  assert(!isParallel || reduc.size() <= 1);
aartbik: If the slice
  bool isSparseCond = false, isSliceCond = false;
  size_t tid = tids.front(), lvl = lvls.front();
aartbik: I find this block of code extremely hard to read. Any way to factor this into slightly smaller…
Peiming: better now?
aartbik: Yes, although it could still use a bit more doc on what each block does (on entry of each block).
  // Finds out the tensor level that we should use to generate loops. Among all
  // the tensor levels, there is at most one sparse tensor level.
  for (auto [t, l] : llvm::zip(tids, lvls)) {
    assert(lvlTypes[t].size() > l);         // Must be a valid tid, dim pair
    assert(!coords[t][l] ||                 // We cannot re-enter the same level
           !dependentLvlMap[t][l].empty()); // unless it is a slice-driver loop
    auto dimType = lvlTypes[t][l];
    // Must be a recognizable DLT.
    assert(isDenseDLT(dimType) || isCompressedDLT(dimType) ||
           isSingletonDLT(dimType));

    // This is a slice-driven loop.
    if (!dependentLvlMap[t][l].empty()) {
      assert(!isSliceCond && !isSparseCond);
      isSliceCond = true;
      tid = t;
      lvl = l;
      continue;
    }

    bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType);
    // We can at most have one sparse input, otherwise, a while loop is
    // required to co-iterate multiple sparse tensors.
    assert(!isSparseCond || !isSparse);
    assert(!isSliceCond || !isSparseCond);
    if (isSparse) {
      tid = t;
      lvl = l;
    }
    isSparseCond = isSparseCond || isSparse;
  }

  // Generates loops differently depending on whether we need a slice-driven
  // loop or a simple level traversal loop.
  Operation *l = nullptr;
  if (isSliceCond) {
    bool fullyReduced = depFullyReduced(tid, lvl);
    if (!fullyReduced) {
      l = emitSliceDrivenLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc);
    } else {
      // If the slice is fully reduced, we can now use TACO-based algorithm to
      // iterate it.
      l = emitWhileLoopOverSliceAtSparseLvl(
          builder, loc, posits[tid][lvl], highs[tid][lvl],
          getFinalSliceOnLvl(tid, lvl).offset, sliceSizes[tid][lvl].back(), tid,
          lvl, reduc);
    }
    levelReducedDep[tid][lvl]++;
    // We can also prepare for the next dim here in advance.
    // Pushes the loop into stack.
    loopStack.emplace_back(
        ArrayRef<TensorId>(), ArrayRef<Level>(), ArrayRef<TensorId>(tid),
        ArrayRef<Level>(lvl), ArrayRef<bool>(fullyReduced), l,
        builder.getInsertionBlock(), coords[tid][lvl], loopTag);
  } else {
    l = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc, isParallel);
    // We can also prepare for the next dim here in advance.
    // Pushes the loop into stack.
    loopStack.emplace_back(ArrayRef<TensorId>(tid), ArrayRef<Level>(lvl),
                           ArrayRef<TensorId>(), ArrayRef<Level>(),
                           ArrayRef<bool>(), l, builder.getInsertionBlock(),
                           coords[tid][lvl], loopTag);
  }
  // Emit extra locals.
  emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tids, lvls);
  return l;
}
Operation *LoopEmitter::enterFilterLoopOverTensorAtLvl(
    OpBuilder &builder, Location loc, TensorId tid, Level lvl,
    AffineExpr affine, MutableArrayRef<Value> reduc) {
  assert(isValidLevel(tid, lvl));
  assert(!affine.isa<AffineDimExpr>() && !isDenseDLT(lvlTypes[tid][lvl]));
  // We can not re-enter the same level.
  assert(!coords[tid][lvl]);
  // TODO: We should instead use a whileOp for filter loop to allow early
  // break when exceeding (for ordered levels).
  // TODO: There are many other potential opportunities that we might apply in
  // the future. E.g., we could use binary search to locate positions.
  const Value step = C_IDX(1);
  const Value pLo = posits[tid][lvl];
  const Value pHi = highs[tid][lvl];
  scf::ForOp forOp = builder.create<scf::ForOp>(loc, pLo, pHi, step, reduc);
  // In-place update on the reduction variable vector.
  assert(forOp.getNumRegionIterArgs() == reduc.size());
  for (int i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = forOp.getRegionIterArg(i);
  builder.setInsertionPointToStart(forOp.getBody());
  // The induction variable gives the position.
  const Value pos = forOp.getInductionVar();
  posits[tid][lvl] = pos;
  // Generating a load on the coordinates array yields the crd.
  const Value mem = coordinatesBuffers[tid][lvl];
  const Value crd = genIndexLoad(builder, loc, mem, pos);
  coords[tid][lvl] = crd;
  // Generate an if-condition to filter out coordinates that are not
  // equal to the result of the affine expression.
  Value expected = genAffine(builder, loc, affine);
  auto pred = CMPI(eq, crd, expected);
  SmallVector<Type> types;
  for (Value red : reduc) {
    types.push_back(red.getType());
  }
  bool hasReduc = !types.empty();
  scf::IfOp ifOp =
      builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
Show All 9 Lines: if (hasReduc) {
    // On mismatch.
    builder.create<scf::YieldOp>(loc, reduc);
  }
  // Set the insert point to matched branch.
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
  // NOTE: we can also prepare for next lvl here in advance
  // Push the loop into stack
  loopStack.emplace_back(ArrayRef<TensorId>(tid), ArrayRef<Level>(lvl),
                         ArrayRef<TensorId>(), ArrayRef<Level>(),
                         ArrayRef<bool>(), forOp, builder.getInsertionBlock(),
                         coords[tid][lvl], nullptr);
  return forOp;
}
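Regarding wrengr's earlier `ValueRange::getTypes()` remark: the manual type-collection loop above could likely be written along these lines (a sketch under the assumption that, as in the current code, the scf.if only needs an else-branch when there are reduction values):

// Sketch: derive the scf.if result types directly from the reduction values.
SmallVector<Type> types = llvm::to_vector(ValueRange(reduc).getTypes());
bool hasReduc = !types.empty();
scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, pred, /*else=*/hasReduc);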
void LoopEmitter::genDenseAffineAddress(OpBuilder &builder, Location loc,
                                        TensorId tid, Level lvl,
                                        AffineExpr lvlExpr) {
  assert(isDenseDLT(lvlTypes[tid][lvl]));
  // For dense levels, the level-coordinate also serves as the position.
  Value lvlCrd = genAffine(builder, loc, lvlExpr);
  posits[tid][lvl] = genAddress(builder, loc, tid, lvl, lvlCrd);
}
Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
    OpBuilder &builder, Location loc, ArrayRef<TensorId> tids,
    ArrayRef<Level> lvls, bool needsUniv, MutableArrayRef<Value> reduc) {
  // NOTE: the slice driven tensor-related reduction variable must
aartbik: A note "make sure" is very ambiguous. Is that a note to self, or something that the code actively does.
  // appear before normal tensors.
aartbik: appears first than normal tensors → appears before normal tensors?
  assert(tids.size() == lvls.size());
  SmallVector<Type> types;
  SmallVector<Value> operands;
  // Construct the while-loop with a parameter for each coordinate.
  const Type indexType = builder.getIndexType();
wrengr: Why remove the const? It's clearer to know when local variables will never change
Peiming: rebase mistake.
  for (auto [tid, lvl] : llvm::zip(tids, lvls)) {
    // TODO: support coiteration with slice driven tensors.
    const auto lvlTp = lvlTypes[tid][lvl];
    assert(dependentLvlMap[tid][lvl].empty() && "TODO: not yet implemented");
    if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) {
      const auto reassoc = getCollapseReassociation(tid, lvl);
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
wrengr: Please don't undo my factoring this out into a local variable. The condition is much easier to read when (1) it's all on one line, and (2) avoids repeating common expressions which forces the reader to double check if they are indeed the same or not.
Peiming: Okay, it is a mistake I made during rebasing.
        if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
          // This is the segment high for each non-unique level.
          types.push_back(indexType);
          operands.push_back(C_IDX(0));
        }
      }
      const auto pos = posits[tid][reassoc.front()];
      assert(pos);
      types.push_back(indexType);
      operands.push_back(pos);
    }
  }
  // The position where user-supplied reduction variable starts.
  for (Value rec : reduc) {
    types.push_back(rec.getType());
    operands.push_back(rec);
  }
  if (needsUniv) {
    types.push_back(indexType);
    // Update universal index.
    operands.push_back(loopSeqStack.back().first);
  }
  assert(types.size() == operands.size());
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);
  SmallVector<Location> locs(types.size(), loc);
  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
Show All 12 Lines: if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) {
      if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
        // Links the SSA chain for segHi.
        segHi[tid][reassoc[i]] = after->getArgument(o++);
      }
    }
    Value op1 = before->getArgument(o);
    // We used the first level bound as the bound for the collapsed set of
    // levels.
    Value op2 = highs[tid][reassoc.front()];
    Value opc = CMPI(ult, op1, op2);
    cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
    // Update positions
    Value pos = after->getArgument(o++);
    // For COO, the position is the same across consecutive levels.
    /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
    llvm::for_each(reassoc, [this, tid, pos](Level srcLvl) {
      posits[tid][srcLvl] = pos;
    });
Show All 27 Lines: if (!slicesPreds.empty()) {
    SmallVector<Value> yields(after->getArguments());
    // Generates a list of if statements
    //   pos = in_slice ? pos : pos + 1
    // TODO: instead of always picking pos + 1, we should set pos = high to
    // break the loop if the coordinates are larger than the slice size.
    //
    // This "idx" is the index into `llvm::zip(tids, lvls)`
    for (auto [pred, idx] : slicesPreds) {
      Value nextPos = builder.create<arith::AddIOp>(loc, yields[idx], C_IDX(1));
      yields[idx] =
          builder.create<arith::SelectOp>(loc, pred, yields[idx], nextPos);
    }
    Value pred = slicesPreds.front().first;
    for (int i = 1, e = slicesPreds.size(); i < e; i++) {
      pred = builder.create<arith::AndIOp>(loc, pred, slicesPreds[i].first);
    }
Show All 13 Lines: Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
  Value min;
  // Finds the minimum coordinate
  if (!needsUniv) {
    for (auto [tid, lvl] : llvm::zip(tids, lvls)) {
      const auto lvlTp = lvlTypes[tid][lvl];
      if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) {
        const auto crd = coords[tid][lvl];
        if (min) {
          Value cmp = CMPI(ult, crd, min);
          min = builder.create<arith::SelectOp>(loc, cmp, crd, min);
        } else {
          min = crd;
        }
      }
    }
  } else {
    assert(!min);
    // Otherwise, universal index is the minimal pos.
    min = after->getArguments().back();
  }
  // Sets up the loop stack.
  loopStack.emplace_back(tids, lvls, ArrayRef<TensorId>(), ArrayRef<Level>(),
                         ArrayRef<bool>(), whileOp, builder.getInsertionBlock(),
                         min, loopTag);
  assert(loopStack.size() == loopSeqStack.size());
  for (auto [tid, dstLvl] : llvm::zip(tids, lvls)) {
    const auto reassoc = getCollapseReassociation(tid, dstLvl);
    assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
    // TODO: Refactors this into smaller functions.
    // NOTE: For all the collapsed level (except for the last one, that is why
    // the loop ends with `reassoc.size() - 1`), as each iteration is advanced
Show All 53 Lines
void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
                                             TensorId tid, Level dstLvl) {
  assert(isValidLevel(tid, dstLvl));
  const auto lvlTp = lvlTypes[tid][dstLvl];
  if (isDenseDLT(lvlTp))
    return;
  const Value c0 = C_IDX(0);
  const Value c1 = C_IDX(1);
  for (const Level srcLvl : getCollapseReassociation(tid, dstLvl)) {
    // Either the first level, or the previous level has been set.
    /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
    assert(srcLvl == 0 || posits[tid][srcLvl - 1]);
    if (!isCompressedDLT(lvlTp) && !isSingletonDLT(lvlTp))
      continue;
    if (isCompressedDLT(lvlTp)) {
      const Value mem = positionsBuffers[tid][srcLvl];
Show All 135 Lines
} | } | ||||
void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc, | void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc, | ||||
MutableArrayRef<Value> reduc) { | MutableArrayRef<Value> reduc) { | ||||
const LoopInfo &loopInfo = loopStack.back(); | const LoopInfo &loopInfo = loopStack.back(); | ||||
auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop); | auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop); | ||||
builder.setInsertionPointToEnd(loopInfo.userCodeBlock); | builder.setInsertionPointToEnd(loopInfo.userCodeBlock); | ||||
Value iv = loopInfo.iv; | Value iv = loopInfo.iv; | ||||
// Finalize the induction. Note that the induction could be performed | // Finalize the induction. Note that the induction could be performed | ||||
// in the individual if-branches to avoid re-evaluating the conditions. | // in the individual if-branches to avoid re-evaluating the conditions. | ||||
// However, that would result in a rather elaborate forest of yield | // However, that would result in a rather elaborate forest of yield | ||||
// instructions during code generation. Moreover, performing the induction | // instructions during code generation. Moreover, performing the induction | ||||
// after the if-statements more closely resembles code generated by TACO. | // after the if-statements more closely resembles code generated by TACO. | ||||
unsigned o = 0; | unsigned o = 0; | ||||
SmallVector<Value> operands; | SmallVector<Value> operands; | ||||
Value one = constantIndex(builder, loc, 1); | unsigned delta = 0; | ||||
for (auto [tid, lvl, resolved] : llvm::zip( | |||||
loopInfo.slicedTids, loopInfo.slicedLvls, loopInfo.sliceReduced)) { | |||||
levelReducedDep[tid][lvl]--; | |||||
if (!resolved) { | |||||
genSliceNextInduction(builder, loc, whileOp, tid, lvl, operands, o); | |||||
continue; | |||||
wrengr: It would be clearer to use `break` in the then-branch. That keeps you from needing to indent the else-branch (which is very long), and helps the reader avoid needing to check to see if there's something else after the else-branch. | |||||
Peiming: I found `else` is easier to follow, because the control flow is more straightforward. | |||||
aartbik: I agree the else is very long and deep. Why not
if (!resolved) {
  genSlice
  continue;
}
.... | |||||
} | |||||
// TODO: We need to distinguish coiteration loops from slice-driven loops, and | |||||
// the fully reduced while ops that iterate over a single slice. | |||||
// FIXME: since we did not implement coiteration, this must be an iteration | |||||
// over a fully resolved slice only. | |||||
assert(loopInfo.slicedTids.size() == 1 && loopInfo.tids.empty()); | |||||
// The if guard that filters out out-of-range coordinates. | |||||
assert(llvm::isa<scf::IfOp>(builder.getInsertionBlock()->getParentOp())); | |||||
posits[tid][lvl] = whileOp->getResult(o++); | |||||
// FIXME: we are not using `continue` here since we do not support | |||||
// coiteration on slices, but it needs to be treated similarly to the | |||||
// universal index. | |||||
o++; // skip continue flag. | |||||
// Since we do not push the above two results of the whileOp into the | |||||
// operands vector, the vector is smaller than the actual number of return | |||||
// values of the whileOp. This is because the yield is generated inside the | |||||
// IfOp within the whileOp, so that we only iterate over the in-bound | |||||
// coordinates of the slice. | |||||
delta += 2; | |||||
}; | |||||
Value one = C_IDX(1); | |||||
for (auto [tid, dstLvl] : llvm::zip(loopInfo.tids, loopInfo.lvls)) { | for (auto [tid, dstLvl] : llvm::zip(loopInfo.tids, loopInfo.lvls)) { | ||||
const auto lvlTp = lvlTypes[tid][dstLvl]; | const auto lvlTp = lvlTypes[tid][dstLvl]; | ||||
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) { | if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) { | ||||
const auto reassoc = getCollapseReassociation(tid, dstLvl); | const auto reassoc = getCollapseReassociation(tid, dstLvl); | ||||
assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType())); | assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType())); | ||||
for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) { | for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) { | ||||
const Level srcLvl = reassoc[i]; | const Level srcLvl = reassoc[i]; | ||||
if (!isUniqueDLT(lvlTypes[tid][srcLvl])) { | if (!isUniqueDLT(lvlTypes[tid][srcLvl])) { | ||||
operands.push_back(segHi[tid][srcLvl]); | operands.push_back(segHi[tid][srcLvl]); | ||||
o++; | o++; | ||||
} | } | ||||
} | } | ||||
const Value crd = coords[tid][dstLvl]; | const Value crd = coords[tid][dstLvl]; | ||||
const Value pos = posits[tid][dstLvl]; | const Value pos = posits[tid][dstLvl]; | ||||
Value cmp = | Value cmp = CMPI(eq, crd, iv); | ||||
builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, crd, iv); | |||||
// If the loop contains a coiteration over a non-unique level, we fast | // If the loop contains a coiteration over a non-unique level, we fast | ||||
// forward all the duplicated coords by setting the position to the | // forward all the duplicated coords by setting the position to the | ||||
// segment high. | // segment high. | ||||
Value add = !isUniqueDLT(lvlTypes[tid][reassoc.back()]) | Value add = !isUniqueDLT(lvlTypes[tid][reassoc.back()]) | ||||
? segHi[tid][reassoc.back()] | ? segHi[tid][reassoc.back()] | ||||
: builder.create<arith::AddIOp>(loc, pos, one); | : builder.create<arith::AddIOp>(loc, pos, one); | ||||
operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, pos)); | operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, pos)); | ||||
// Reduction value from users. | // Reduction value from users. | ||||
for (auto &i : reduc) { | for (auto &i : reduc) { | ||||
operands.push_back(i); | operands.push_back(i); | ||||
// In place update reduction variable. | // In place update reduction variable. | ||||
i = whileOp->getResult(o++); | i = whileOp->getResult(o++); | ||||
} | } | ||||
// An (optional) universal index. | // An (optional) universal index. | ||||
if (operands.size() < whileOp.getNumResults()) { | if (operands.size() + delta < whileOp.getNumResults()) { | ||||
assert(operands.size() + 1 == whileOp.getNumResults()); | assert(operands.size() + delta + 1 == whileOp.getNumResults()); | ||||
// The last one is the universal index. | // The last one is the universal index. | ||||
operands.push_back(builder.create<arith::AddIOp>(loc, iv, one)); | operands.push_back(builder.create<arith::AddIOp>(loc, iv, one)); | ||||
// Update the loop starting point of the current loop sequence. | // Update the loop starting point of the current loop sequence. | ||||
loopSeqStack.back() = whileOp->getResult(o++); | loopSeqStack.back().first = whileOp->getResult(o++); | ||||
} | } | ||||
assert(o == operands.size()); | assert(o == operands.size() + delta); | ||||
builder.create<scf::YieldOp>(loc, operands); | builder.create<scf::YieldOp>(loc, operands); | ||||
builder.setInsertionPointAfter(whileOp); | builder.setInsertionPointAfter(whileOp); | ||||
} | } | ||||
void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc, | void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc, | ||||
MutableArrayRef<Value> reduc) { | MutableArrayRef<Value> reduc) { | ||||
// Clean up the values; this helps us discover potential bugs at an | // Clean up the values; this helps us discover potential bugs at an | ||||
// earlier stage (instead of silently using a wrong value). | // earlier stage (instead of silently using a wrong value). | ||||
const LoopInfo &loopInfo = loopStack.back(); | const LoopInfo &loopInfo = loopStack.back(); | ||||
assert(loopInfo.tids.size() == loopInfo.lvls.size()); | assert(loopInfo.tids.size() == loopInfo.lvls.size()); | ||||
SmallVector<Value> red; | SmallVector<Value> red; | ||||
if (llvm::isa<scf::WhileOp>(loopInfo.loop)) { | if (llvm::isa<scf::WhileOp>(loopInfo.loop)) { | ||||
exitWhileLoop(rewriter, loc, reduc); | exitWhileLoop(rewriter, loc, reduc); | ||||
} else { | } else { | ||||
exitForLoop(rewriter, loc, reduc); | exitForLoop(rewriter, loc, reduc); | ||||
} | } | ||||
assert(loopStack.size() == loopSeqStack.size()); | assert(loopStack.size() == loopSeqStack.size()); | ||||
loopStack.pop_back(); | loopStack.pop_back(); | ||||
} | } | ||||
//===----------------------------------------------------------------------===// | |||||
// Slice-driven loop related methods. | |||||
aartbik: Ok, this block is where all the magic happens ;-) I need to do one more careful pass over this.. | |||||
//===----------------------------------------------------------------------===// | |||||
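// Returns the number of dependencies on level `lvl` of tensor `tid` that | |||||
// have not yet been reduced (zero when the level carries no dependencies). | |||||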
unsigned LoopEmitter::remDepOnLevel(TensorId tid, Level lvl) const { | |||||
aartbik: I think this still needs some work to make reading the block easier. The problem is that you have very concise comments in the header. So I would still give every implementation function here an entry comment, but one that shows what is generated, using some pseudo-code of the output. WDYT? | |||||
unsigned totalDependencies = dependentLvlMap[tid][lvl].size(); | |||||
if (totalDependencies != 0) { | |||||
assert(totalDependencies >= 2); | |||||
return totalDependencies - levelReducedDep[tid][lvl]; | |||||
} | |||||
return totalDependencies; | |||||
} | |||||
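// Returns the most recent slice on level `lvl` of tensor `tid`, which must be | |||||
// the final slice needed to fully reduce the dependencies on that level. | |||||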
const LoopEmitter::SliceInfo &LoopEmitter::getFinalSliceOnLvl(TensorId tid, | |||||
Level lvl) { | |||||
// Finds the most-recent slice using a reverse iteration. | |||||
for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie; | |||||
it++) { | |||||
aartbik: this one seems out of place (all others generate stuff); perhaps move it up or down in the… | |||||
if (it->slicedOnLvl == lvl) { // the level matched | |||||
// It must be the final slice needed to fully reduce the expression. | |||||
assert(it->depth == dependentLvlMap[tid][lvl].size() - 1); | |||||
return *it; | |||||
} | |||||
} | |||||
llvm_unreachable("Failed to find sliceInfo"); | |||||
} | |||||
// Generates a while loop to iterate over a slice sparse level as follows. | |||||
// | |||||
// while(loopLo < loopHi) { | |||||
// if (coords[loopLo] < offset + size) { | |||||
// body_builder | |||||
// } else { | |||||
// break; | |||||
// } | |||||
// loopLo ++; | |||||
// } | |||||
std::pair<Operation *, ValueRange> LoopEmitter::genSliceLvlTraverseLoop( | |||||
OpBuilder &builder, Location loc, Value loopLo, Value loopHi, Value offset, | |||||
Value size, TensorId tid, Level lvl, ValueRange userReduc, bool genYield, | |||||
llvm::function_ref<void(OpBuilder &, Location, Value, | |||||
MutableArrayRef<Value>)> | |||||
bodyBuilder) { | |||||
Value c1 = C_IDX(1); | |||||
Value sliceHi = builder.create<arith::AddIOp>(loc, offset, size); | |||||
SmallVector<Value> reduc = { | |||||
loopLo, // loop lower bounds | |||||
constantI1(builder, loc, true), // continue | |||||
}; | |||||
// Append user required reduction value. | |||||
reduc.append(userReduc.begin(), userReduc.end()); | |||||
scf::WhileOp whileOp = builder.create<scf::WhileOp>( | |||||
loc, ValueRange(reduc).getTypes(), reduc, | |||||
/*beforeBuilder=*/ | |||||
[loopHi](OpBuilder &builder, Location loc, ValueRange args) { | |||||
Value lo = args[0]; | |||||
Value cont = args[1]; | |||||
Value inBound = CMPI(ult, lo, loopHi); | |||||
Value cond = builder.create<arith::AndIOp>(loc, cont, inBound); | |||||
// Continue if we have not yet hit a break and are still within bound. | |||||
builder.create<scf::ConditionOp>(loc, cond, args); | |||||
}, | |||||
/*afterBuilder=*/ | |||||
[this, c1, tid, lvl, sliceHi, genYield, | |||||
bodyBuilder](OpBuilder &builder, Location loc, ValueRange args) { | |||||
Value iv = args[0]; | |||||
Value coord = | |||||
genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv); | |||||
Value cont = CMPI(ult, coord, sliceHi); | |||||
TypeRange types = args.drop_front(2).getTypes(); | |||||
auto ifOp = builder.create<scf::IfOp>(loc, types, cont, true); | |||||
{ | |||||
aartbik: here and a few other places, no period at end; please make one last pass over all new comments here. | |||||
// Two reduction variables maintained by us. | |||||
SmallVector<Value> ifRet = args.drop_front(2); | |||||
assert(ifRet.size() == args.size() - 2); | |||||
OpBuilder::InsertionGuard guard(builder); | |||||
// If coord >= sliceHi. | |||||
builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); | |||||
// Coordinate is out of bound, just yield. | |||||
builder.create<scf::YieldOp>(loc, ifRet); | |||||
// If coord < sliceHi. | |||||
builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); | |||||
// Delegates to users' callback. | |||||
bodyBuilder(builder, loc, iv, ifRet); | |||||
if (genYield) { | |||||
builder.setInsertionPointToEnd(&ifOp.getThenRegion().front()); | |||||
builder.create<scf::YieldOp>(loc, ifRet); | |||||
} | |||||
} | |||||
// Marks this special ifOp so that sparsification does not finalize it. | |||||
ifOp->setAttr(getLoopEmitterLoopAttrName(), | |||||
StringAttr::get(builder.getContext(), "slice")); | |||||
// Insertion point restored to after ifOp. | |||||
SmallVector<Value> yields; | |||||
// Increase induction variable. | |||||
yields.push_back(builder.create<arith::AddIOp>(loc, iv, c1)); | |||||
// Terminates the while loop according to the continue flag. | |||||
yields.push_back(cont); | |||||
yields.append(ifOp.getResults().begin(), ifOp.getResults().end()); | |||||
builder.create<scf::YieldOp>(loc, yields); | |||||
}); | |||||
builder.setInsertionPointAfter(whileOp); | |||||
return std::make_pair(whileOp, whileOp.getResults().drop_front(2)); | |||||
} | |||||
// Generates a loop nest that traverses all the unresolved levels in between. | |||||
// TODO: it can only handle tensors with all-compressed levels. | |||||
// | |||||
// for(int i = 0; i < slicePos.size(); i+=2) { | |||||
// loopLo = slicePos[i]; | |||||
// loopHi = slicePos[i + 1]; | |||||
// | |||||
// // Then the same loop generated by genSliceLvlTraverse above. | |||||
// while (loopLo < loopHi) { | |||||
// if (pos[loopLo] < sliceHi) { | |||||
// bodyBuilder(); | |||||
// } else { | |||||
// break; | |||||
// } | |||||
// loopLo ++; | |||||
// } | |||||
// } | |||||
ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse( | |||||
OpBuilder &builder, Location loc, Value offset, TensorId tid, Level lvl, | |||||
size_t depth, ValueRange userReduc, | |||||
llvm::function_ref<void(OpBuilder &, Location, Value, | |||||
MutableArrayRef<Value>)> | |||||
bodyBuilder) { | |||||
Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2); | |||||
// TODO: it only works on tensors with all-compressed levels. | |||||
Value sPtrBuf = slicePosBuffer[tid][lvl][depth]; | |||||
Value pSt = c2; // pointer starting index | |||||
Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize | |||||
auto forOp = | |||||
scf::buildLoopNest( | |||||
builder, loc, pSt, mSz, c2, userReduc, | |||||
[this, c1, tid, lvl, offset, sPtrBuf, | |||||
bodyBuilder](OpBuilder &builder, Location loc, ValueRange ivs, | |||||
ValueRange iterArgs) -> scf::ValueVector { | |||||
// generate traversal for each level. | |||||
Value loopLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front()); | |||||
Value loopHi = genIndexLoad( | |||||
builder, loc, sPtrBuf, | |||||
builder.create<arith::AddIOp>(loc, ivs.front(), c1)); | |||||
return genSliceLvlTraverseLoop(builder, loc, loopLo, loopHi, offset, | |||||
sliceSizes[tid][lvl].back(), tid, | |||||
lvl, iterArgs, true, bodyBuilder) | |||||
.second; | |||||
}) | |||||
.loops.front(); | |||||
// Insert after current while operation. | |||||
builder.setInsertionPointAfter(forOp); | |||||
return forOp.getResults(); | |||||
} | |||||
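// Generates code to set up the slice on level `lvl` of tensor `tid` when all | |||||
// previous dependencies are already resolved: initializes the slice position | |||||
// buffer with [/*memSize=*/4, /*idx=*/0, /*pLo=*/0, pHi] and pushes a new | |||||
// SliceInfo with the minimal coordinate, non-emptiness flag, and absolute | |||||
// offset of the slice. | |||||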
void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc, | |||||
TensorId tid, Level lvl) { | |||||
assert(lvl == 0 && "TODO: handle non-first level"); | |||||
Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2), c3 = C_IDX(3), | |||||
c4 = C_IDX(4); | |||||
Value size = sliceSizes[tid][0][0]; | |||||
Value sPtrBuf = slicePosBuffer[tid][0][0]; | |||||
Value pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1); | |||||
// Fills out slicePosBuffer[tid][lvl][0] with [/*memSize =*/4, 0, 0, pHi]. | |||||
builder.create<memref::StoreOp>(loc, c4, sPtrBuf, c0); // memSize = 4 | |||||
builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c1); // index = 0 | |||||
builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c2); // pLo = 0; | |||||
builder.create<memref::StoreOp>(loc, pHi, sPtrBuf, c3); // loaded pHi. | |||||
// This is a non-empty tensor if 0 < pHi. | |||||
Value isNonEmpty = CMPI(ult, c0, pHi); | |||||
// On an ordered level, the minimal coordinate must be the first one. | |||||
// FIXME: Technically we should load the coordinate only when the slice is | |||||
// non-empty, but we assume that even for an empty sparse tensor a non-empty | |||||
// position/coordinate buffer is allocated for each level, so the load cannot | |||||
// go out of bounds and we can avoid generating an ifOp here. | |||||
Value minCrd = genIndexLoad(builder, loc, coordinatesBuffers[tid][0], c0); | |||||
// FIXME: We need the offset relative to the base slice. | |||||
Value absOffset = offsetFromMinCoord(builder, loc, minCrd, size, isNonEmpty); | |||||
sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, lvl, /*depth=*/1); | |||||
} | |||||
// Fills in the slicePosBuffer before the slice-driven loop begins. | |||||
// TODO: it can only handle tensors with all-compressed levels. | |||||
// | |||||
// // Loop generated by `genUnResolvedSliceTreeTraverse` | |||||
// for(int i = 0; i < slicePos.size(); i+=2) { | |||||
// loopLo = slicePos[i]; | |||||
// loopHi = slicePos[i + 1]; | |||||
// minCrd = max; | |||||
// while (loopLo < loopHi) { | |||||
// if (pos[loopLo] < sliceHi) { | |||||
// // bodyBuilder | |||||
// slicePos[tid].push_back(pos[loopLo]); | |||||
// slicePos[tid].push_back(pos[loopLo + 1]); | |||||
// minCrd = min(minCrd, crd[pos[loopLo]]); | |||||
// } else { | |||||
// break; | |||||
// } | |||||
// loopLo ++; | |||||
// } | |||||
// } | |||||
void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc, | |||||
TensorId tid, Level lvl) { | |||||
assert(isCompressedDLT(lvlTypes[tid][lvl])); | |||||
Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2); | |||||
const SliceInfo &sliceInfo = sliceStack[tid].back(); | |||||
unsigned prevLvl = *sliceInfo.slicedOnLvl; | |||||
assert(lvl >= prevLvl); | |||||
// Either lvl = prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one | |||||
// variable needs to be reduced on the same level). | |||||
// Or lvl > prevSlicedLvl + 1, i.e., t[..., d2, d3 + d4] (having a | |||||
// simple dim expression in between). | |||||
assert(lvl == prevLvl + 1 && "TODO: not yet implemented"); | |||||
// Check slice stack integrity. | |||||
assert(slicePosBuffer[tid][prevLvl].size() == sliceInfo.depth); | |||||
Value sPtrBuf = slicePosBuffer[tid][lvl].back(); | |||||
SmallVector<Value, 3> reduc = { | |||||
constantI1(builder, loc, false), // isNonEmpty | |||||
lvlSizes[tid][lvl], // minCoord | |||||
c2, // memSize | |||||
}; | |||||
ValueRange result = genUnResolvedSliceTreeTraverse( | |||||
builder, loc, sliceInfo.offset, tid, prevLvl, sliceInfo.depth - 1, reduc, | |||||
[this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc, | |||||
Value iv, | |||||
MutableArrayRef<Value> reduc) { | |||||
Value &nonEmpty = reduc[0]; | |||||
Value &minCrd = reduc[1]; | |||||
Value &curMemSz = reduc[2]; | |||||
Value pHi = builder.create<arith::AddIOp>(loc, iv, c1); | |||||
Value sPLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl], iv); | |||||
Value sPHi = | |||||
genIndexLoad(builder, loc, positionsBuffers[tid][lvl], pHi); | |||||
// isNonEmpty = isNonEmpty || lvlNonEmpty, i.e., as long as there is one | |||||
// non-empty lvl, the slice is non-empty. | |||||
Value lvlNonEmpty = CMPI(ult, sPLo, sPHi); | |||||
nonEmpty = builder.create<arith::OrIOp>(loc, lvlNonEmpty, nonEmpty); | |||||
// Update the minimum coordinate. | |||||
auto ifNonEmpty = builder.create<scf::IfOp>(loc, builder.getIndexType(), | |||||
lvlNonEmpty, true); | |||||
{ | |||||
// Generate Code as follows. | |||||
aartbik: I first thought this was commented out code ;-)
So make it
  Generate:
    code | |||||
// | |||||
// if (nonEmpty) { | |||||
// minCrd = min(minCrd, crd[pos[pLo]]); | |||||
// } | |||||
OpBuilder::InsertionGuard guard(builder); | |||||
builder.setInsertionPointToStart(ifNonEmpty.thenBlock()); | |||||
Value curC = | |||||
genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], sPLo); | |||||
Value isSmaller = CMPI(ult, curC, minCrd); | |||||
Value newMin = | |||||
builder.create<arith::SelectOp>(loc, isSmaller, curC, minCrd); | |||||
builder.create<scf::YieldOp>(loc, newMin); | |||||
builder.setInsertionPointToStart(ifNonEmpty.elseBlock()); | |||||
builder.create<scf::YieldOp>(loc, minCrd); | |||||
} | |||||
minCrd = ifNonEmpty.getResult(0); | |||||
builder.create<memref::StoreOp>(loc, sPLo, sPtrBuf, curMemSz); | |||||
Value nxtMemSize = builder.create<arith::AddIOp>(loc, curMemSz, c1); | |||||
builder.create<memref::StoreOp>(loc, sPHi, sPtrBuf, nxtMemSize); | |||||
// Updates the in-memory size of the cache: curMemSz += 2. | |||||
curMemSz = builder.create<arith::AddIOp>(loc, curMemSz, c2); | |||||
}); | |||||
unsigned depth = levelReducedDep[tid][lvl]; | |||||
Value size = sliceSizes[tid][lvl][depth]; | |||||
Value isNonEmpty = result[0]; | |||||
Value minCrd = result[1]; | |||||
// Two metadata [memSize, idx]. | |||||
// TODO: Can use an SSA value for these two metadata | |||||
builder.create<memref::StoreOp>(loc, result[2], sPtrBuf, c0); | |||||
builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c1); | |||||
// FIXME: we need the offset relative to the base slice. | |||||
Value absOffset = offsetFromMinCoord(builder, loc, minCrd, size, isNonEmpty); | |||||
sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, lvl, depth + 1); | |||||
} | |||||
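// Generates code to prepare a slice-driven loop on level `lvl` of tensor | |||||
// `tid`. Returns true when the dependencies on the level are fully reduced, | |||||
// in which case only the cached position bounds are loaded and the caller | |||||
// can (co)iterate over the slice directly; otherwise allocates the position | |||||
// caches (if not done yet), emits the slice construction, and returns false. | |||||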
bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid, | |||||
Level lvl) { | |||||
Value c1 = C_IDX(1), c2 = C_IDX(2); | |||||
if (depFullyReduced(tid, lvl)) { | |||||
// If the constraints on the tensor are fully resolved, we do not need to | |||||
// generate the slice begin any more; instead, we fall back to the TACO-based | |||||
// algorithm to (co)iterate over the slice. | |||||
Value pLoPtr = | |||||
genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), c1); | |||||
pLoPtr = builder.create<arith::AddIOp>(loc, pLoPtr, c2); | |||||
Value pHiPtr = builder.create<arith::AddIOp>(loc, pLoPtr, c1); | |||||
posits[tid][lvl] = | |||||
genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pLoPtr); | |||||
highs[tid][lvl] = | |||||
genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pHiPtr); | |||||
return true; | |||||
} | |||||
// The next non-empty slice can be computed efficiently only when the level | |||||
// is sorted. | |||||
const DimLevelType lvlType = lvlTypes[tid][lvl]; | |||||
assert(isOrderedDLT(lvlType)); | |||||
if (isSingletonDLT(lvlType)) { | |||||
llvm_unreachable("TODO: dense level should be easy to support, while " | |||||
"singleton level requres more efforts"); | |||||
} | |||||
assert(!dependentLvlMap[tid][lvl].empty()); | |||||
assert(!sliceStack[tid].empty()); | |||||
const SliceInfo &sliceInfo = sliceStack[tid].back(); | |||||
auto baseEnc = getSparseTensorEncoding(tensors[tid].getType()); | |||||
if (baseEnc.isSlice()) | |||||
llvm_unreachable("TODO: not yet implemented"); | |||||
// Generate the caches required to quickly compute the next non-empty slice | |||||
// with increasing offsets for the slice-driven loop. | |||||
// We do not need a cache for dense levels. | |||||
if (slicePosBuffer[tid][lvl][0] == nullptr && !isDenseDLT(lvlType)) { | |||||
OpBuilder::InsertionGuard guard(builder); | |||||
// The buffer can be reused, and the size is loop invariant: it only depends | |||||
// on the iteration graph's toposort. | |||||
builder.setInsertionPointAfter(localInsertPos); | |||||
Value bufSize = C_IDX(1); | |||||
Value c2 = C_IDX(2); | |||||
// Accumulates the size required to cache the pLo for the slice. | |||||
// E.g., if we want to cache the pIdx for slice<d0xd1xf64> on the second | |||||
// level, we need at most a memref<d0xindex>. | |||||
// NOTE: this is apparently an over-approximation when the previous | |||||
// level is compressed, and we could compute a precise memory size | |||||
// inside the loops. But that would also require us to allocate/free | |||||
// memory inside the loops. | |||||
// TODO: Maybe use an AllocaScopeOp inside the loop to resolve the issue? | |||||
for (Level curLevel = lvl; | |||||
curLevel >= 1 && !lvlFullyResolved(tid, curLevel - 1); curLevel--) { | |||||
auto depth = remDepOnLevel(tid, curLevel - 1); | |||||
assert(sliceSizes[tid][lvl].size() >= depth); | |||||
Value sz = *(sliceSizes[tid][lvl].rbegin() + depth - 1); | |||||
bufSize = builder.create<arith::MulIOp>(loc, bufSize, sz); | |||||
} | |||||
// For a pair of [pLo, pHi]. Note that we cannot compress pHi because slicing | |||||
// creates segments in the index buffer, so the pHi for the current | |||||
// level is no longer the pLo for the next level. | |||||
bufSize = builder.create<arith::MulIOp>(loc, bufSize, c2); | |||||
// Two additional metadata values {memSize, idx} at the head. | |||||
bufSize = builder.create<arith::AddIOp>(loc, bufSize, c2); | |||||
llvm::for_each( | |||||
slicePosBuffer[tid][lvl], [bufSize, loc, &builder](Value &cache) { | |||||
cache = genAlloca(builder, loc, bufSize, builder.getIndexType()); | |||||
}); | |||||
} | |||||
if (sliceInfo.isInitialTensor() || | |||||
(lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) { | |||||
// Either this is the first level, or the previous level has been fully resolved. | |||||
genResolvedSliceBegin(builder, loc, tid, lvl); | |||||
} else { | |||||
// The previous level has not been fully resolved. | |||||
genUnResolvedSliceBegin(builder, loc, tid, lvl); | |||||
} | |||||
return false; | |||||
} | |||||
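// Generates code that advances to the next non-empty slice on level `lvl` of | |||||
// tensor `tid` (see the pseudo-code below), pushes {nextNonEmpty, nextMinCrd, | |||||
// nextAbsOffset} onto `operands`, and refreshes the slice stack entry from | |||||
// the results of `whileOp` starting at `retIdx`. | |||||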
void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc, | |||||
const Operation *op, TensorId tid, | |||||
Level lvl, | |||||
SmallVectorImpl<Value> &operands, | |||||
unsigned &retIdx) { | |||||
if (!isCompressedDLT(lvlTypes[tid][lvl])) | |||||
llvm_unreachable("TODO"); | |||||
// Otherwise, generate code to compute the next non-empty slice. | |||||
Value c0 = C_IDX(0); | |||||
Value c1 = C_IDX(1); | |||||
Value c2 = C_IDX(2); | |||||
auto whileOp = llvm::cast<scf::WhileOp>(op); | |||||
SliceInfo &info = sliceStack[tid].back(); | |||||
assert(info.slicedOnLvl == lvl); | |||||
// | |||||
// We forward to the next non empty slice by | |||||
// if (minCrd > offset) { | |||||
// offset += 1 | |||||
// } else { | |||||
// minCrd = nextMinInSlice(); | |||||
// offset = minCrd - size + 1; | |||||
// } | |||||
// | |||||
// if (offset + size > parents.size) | |||||
// isNonEmpty = false; | |||||
// | |||||
Value absOffset = info.offset; | |||||
// Resets the slice pointers, as the resolved slices are invalidated after we | |||||
// move forward to the next slice. | |||||
for (unsigned i = 0; i <= lvl; i++) | |||||
builder.create<memref::StoreOp>(loc, c0, slicePosBuffer[tid][i].back(), c1); | |||||
SmallVector<Value, 3> reduc = {info.minCrd, info.isNonEmpty, absOffset}; | |||||
Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1]; | |||||
Value fastPathP = CMPI(ugt, info.minCrd, absOffset); | |||||
auto ifOp = builder.create<scf::IfOp>(loc, ValueRange(reduc).getTypes(), | |||||
fastPathP, true); | |||||
{ | |||||
OpBuilder::InsertionGuard guard(builder); | |||||
// Take the fast path | |||||
// if (minCrd > offset) { | |||||
// return offset += 1 | |||||
// } | |||||
builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); | |||||
reduc[2] = builder.create<arith::AddIOp>(loc, absOffset, c1); | |||||
// Yield offset + 1. | |||||
builder.create<scf::YieldOp>(loc, reduc); | |||||
// else /*minCrd == offset*/ { | |||||
// for (i = 0; i < slicePos.size(); i+=2) { | |||||
// if (crd[pos[slicePos[i]]] == minCrd) { | |||||
// slicePos[i]++; | |||||
// } | |||||
// minCrd=min(minCrd, crd[pos[slicePos[i]]]); | |||||
// } | |||||
// offset = minCrd - size + 1; | |||||
// } | |||||
builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); | |||||
reduc[2] = absOffset; // restore value. | |||||
Value pSt = c2; // pointer starting index | |||||
Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize | |||||
reduc[0] = lvlSizes[tid][lvl]; // next min coord | |||||
reduc[1] = constantI1(builder, loc, false); // isNonEmpty | |||||
auto loopArgs = static_cast<ValueRange>(reduc).drop_back(); | |||||
auto forOp = scf::buildLoopNest( | |||||
builder, loc, pSt, mSz, c2, loopArgs, | |||||
[this, tid, lvl, c1, sPtrBuf, | |||||
&info](OpBuilder &builder, Location loc, ValueRange ivs, | |||||
ValueRange iterArgs) -> scf::ValueVector { | |||||
Value curMinCrd = iterArgs[0]; | |||||
Value isNonEmpty = iterArgs[1]; | |||||
Type idxTp = builder.getIndexType(); | |||||
Value pLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front()); | |||||
Value pHi = | |||||
genIndexLoad(builder, loc, sPtrBuf, | |||||
builder.create<arith::AddIOp>(loc, ivs.front(), c1)); | |||||
// | |||||
// if (pLo < pHi) // Only loads when inbound. | |||||
// coord = load[pLo] | |||||
// if coord == minCrd | |||||
// pLo += 1 | |||||
// | |||||
// if (pLo < pHi) | |||||
// curMinCrd = min(curMinCrd, load[pLo]) | |||||
// | |||||
Value pred = CMPI(ult, pLo, pHi); | |||||
auto advPLo = builder.create<scf::IfOp>(loc, idxTp, pred, true); | |||||
/* if pLo < pHi */ { | |||||
builder.setInsertionPointToStart(&advPLo.getThenRegion().front()); | |||||
// coord = load[pLo] | |||||
Value coord = | |||||
aartbik: The next
genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo); | |||||
Value pred = CMPI(eq, coord, info.minCrd); | |||||
auto ifEqual = builder.create<scf::IfOp>(loc, idxTp, pred, true); | |||||
/* if coord == minCrd */ { | |||||
builder.setInsertionPointToStart( | |||||
&ifEqual.getThenRegion().front()); | |||||
// pLo += 1. | |||||
Value newPlo = builder.create<arith::AddIOp>(loc, pLo, c1); | |||||
builder.create<memref::StoreOp>(loc, newPlo, sPtrBuf, | |||||
ivs.front()); | |||||
builder.create<scf::YieldOp>(loc, newPlo); | |||||
} | |||||
/* else coord != minCrd */ { | |||||
builder.setInsertionPointToStart( | |||||
&ifEqual.getElseRegion().front()); | |||||
builder.create<scf::YieldOp>(loc, pLo); | |||||
} | |||||
builder.setInsertionPointAfter(ifEqual); | |||||
builder.create<scf::YieldOp>(loc, ifEqual.getResults()); | |||||
} | |||||
/* else pLo >= pHi */ { | |||||
builder.setInsertionPointToStart(&advPLo.getElseRegion().front()); | |||||
builder.create<scf::YieldOp>(loc, pLo); | |||||
} | |||||
builder.setInsertionPointAfter(advPLo); | |||||
pLo = advPLo.getResult(0); | |||||
Value lvlNonEmpty = CMPI(ult, pLo, pHi); | |||||
// Update minCrd. | |||||
auto newMin = | |||||
builder.create<scf::IfOp>(loc, idxTp, lvlNonEmpty, true); | |||||
builder.setInsertionPointToStart(&newMin.getThenRegion().front()); | |||||
builder.create<scf::YieldOp>( | |||||
loc, | |||||
genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo)); | |||||
builder.setInsertionPointToStart(&newMin.getElseRegion().front()); | |||||
builder.create<scf::YieldOp>(loc, curMinCrd); | |||||
builder.setInsertionPointAfter(newMin); | |||||
// isNonEmpty = isNonEmpty || lvlNonEmpty | |||||
isNonEmpty = | |||||
builder.create<arith::OrIOp>(loc, lvlNonEmpty, isNonEmpty); | |||||
curMinCrd = builder.create<arith::SelectOp>( | |||||
loc, CMPI(ult, newMin.getResult(0), curMinCrd), | |||||
newMin.getResult(0), curMinCrd); | |||||
return {curMinCrd, isNonEmpty}; | |||||
}); | |||||
builder.setInsertionPointAfter(forOp.loops.front()); | |||||
// minOffset = minCrd + 1 >= size ? minCrd + 1 - size : c0 | |||||
Value tmp = builder.create<arith::AddIOp>(loc, forOp.results.front(), c1); | |||||
Value minOffset = builder.create<arith::SubIOp>( | |||||
loc, tmp, sliceSizes[tid][lvl][info.depth - 1]); | |||||
Value p = CMPI(uge, tmp, sliceSizes[tid][lvl][info.depth - 1]); | |||||
minOffset = builder.create<arith::SelectOp>(loc, p, minOffset, c0); | |||||
SmallVector<Value, 3> yields; | |||||
yields.assign(forOp.results.begin(), forOp.results.end()); | |||||
yields.push_back(minOffset); | |||||
builder.create<scf::YieldOp>(loc, yields); | |||||
} | |||||
Value nextMinCrd = ifOp.getResults()[0]; | |||||
Value nextNonEmpty = ifOp.getResults()[1]; | |||||
// The next offset should at least be offset + 1; | |||||
Value minOffset = ifOp.getResults()[2]; | |||||
Value nxOffset = builder.create<arith::AddIOp>(loc, info.offset, c1); | |||||
Value maxPred = CMPI(ugt, minOffset, nxOffset); | |||||
Value nextAbsOffset = | |||||
builder.create<arith::SelectOp>(loc, maxPred, minOffset, nxOffset); | |||||
Value sliceUB = builder.create<arith::AddIOp>( | |||||
loc, nextAbsOffset, sliceSizes[tid][lvl][info.depth - 1]); | |||||
// FIXME: this only works if there is only one parent. | |||||
assert(info.depth - 1 == 0); | |||||
// nextNonEmpty = nextNonEmpty && (slice upper bound <= parent upper bound). | |||||
aartbik: Sets (or Increment), but use one style
nextNonEmpty = builder.create<arith::AndIOp>( | |||||
loc, nextNonEmpty, CMPI(ule, sliceUB, lvlSizes[tid][lvl])); | |||||
// FIXME: compute relative offset. | |||||
assert(info.depth - 1 == 0); | |||||
Value nextRelOffset = nextAbsOffset; | |||||
nextRelOffset = | |||||
builder.create<arith::SelectOp>(loc, nextNonEmpty, nextRelOffset, c0); | |||||
operands.push_back(nextNonEmpty); | |||||
operands.push_back(nextMinCrd); | |||||
operands.push_back(nextAbsOffset); // we push the absolute offset. | |||||
// Update the slice stack. | |||||
info.isNonEmpty = whileOp.getResult(retIdx++); | |||||
info.minCrd = whileOp.getResult(retIdx++); | |||||
info.offset = whileOp.getResult(retIdx++); | |||||
} | |||||
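// Emits a while loop driven by the non-emptiness of the current slice on | |||||
// level `lvl` of tensor `tid`. The loop carries {isNonEmpty, minCrd, offset} | |||||
// followed by the user reduction values, and the insertion point is left | |||||
// inside the loop body on return. | |||||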
Operation *LoopEmitter::emitSliceDrivenLoopOverTensorAtLvl( | |||||
OpBuilder &builder, Location loc, TensorId tid, Level lvl, | |||||
MutableArrayRef<Value> reduc) { | |||||
assert(!depFullyReduced(tid, lvl)); | |||||
SliceInfo &sliceInfo = sliceStack[tid].back(); | |||||
assert(sliceInfo.slicedOnLvl == lvl); | |||||
// The order matters! | |||||
SmallVector<Value, 3> operands{sliceInfo.isNonEmpty, sliceInfo.minCrd, | |||||
sliceInfo.offset}; | |||||
// Number of reductions maintained by us. | |||||
size_t numMetaReduc = operands.size(); | |||||
// Append user-required reduction values. | |||||
operands.append(reduc.begin(), reduc.end()); | |||||
assert(operands.size() == numMetaReduc + reduc.size()); | |||||
// while (slice.nonEmpty()) { | |||||
// bodyBuilder(); | |||||
// SliceNext(); | |||||
// } | |||||
auto whileOp = builder.create<scf::WhileOp>( | |||||
loc, ValueRange(operands).getTypes(), operands, | |||||
/*beforeBuilder=*/ | |||||
[](OpBuilder &builder, Location loc, ValueRange args) { | |||||
builder.create<scf::ConditionOp>(loc, /*isNonEmpty*/ args[0], args); | |||||
}, | |||||
/*afterBuilder=*/ | |||||
[this, tid, lvl, reduc, numMetaReduc, | |||||
&sliceInfo](OpBuilder &builder, Location loc, ValueRange args) { | |||||
assert(args.size() == reduc.size() + numMetaReduc); | |||||
sliceInfo.isNonEmpty = args[0]; | |||||
sliceInfo.minCrd = args[1]; | |||||
sliceInfo.offset = args[2]; | |||||
// The slice offset is used to coiterate with other tensors' | |||||
// coordinates. | |||||
Value c = sliceInfo.offset; | |||||
if (sliceInfo.depth > 1) { | |||||
// Coord is the offset relative to its parents. | |||||
// Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1] | |||||
llvm_unreachable("TODO: not yet implemented"); | |||||
} | |||||
coords[tid][lvl] = c; | |||||
for (unsigned i = 0, e = reduc.size(); i < e; i++) | |||||
reduc[i] = args[i + numMetaReduc]; | |||||
}); | |||||
// Set the insertion point to while loop body. | |||||
builder.setInsertionPointToEnd(&whileOp.getAfter().front()); | |||||
return whileOp; | |||||
} | |||||
#undef CMPI | |||||
#undef C_IDX |
I'd prefer these be defined as static inline functions rather than as macros, since that gives better type-safety and compiler error messages. If you need to use CMPI at several different types, then just use a template.
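For illustration, a rough sketch of what the suggested `static inline` alternative could look like (the helper names `cmpi` and `constIdx` are hypothetical, not part of this patch):

static inline Value cmpi(OpBuilder &builder, Location loc,
                         arith::CmpIPredicate p, Value lhs, Value rhs) {
  // Builds the same arith.cmpi as the CMPI macro, but with ordinary
  // type checking and readable compiler diagnostics.
  return builder.create<arith::CmpIOp>(loc, p, lhs, rhs).getResult();
}

static inline Value constIdx(OpBuilder &builder, Location loc, int64_t v) {
  // Builds the same index constant as the C_IDX macro.
  return constantIndex(builder, loc, v);
}

Call sites would then pass `builder` and `loc` explicitly, e.g. `cmpi(builder, loc, arith::CmpIPredicate::ult, lo, hi)`.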