This is an archive of the discontinued LLVM Phabricator instance.

[mlir][linalg] Add support for transitive fusion.
ClosedPublic

Authored by gysit on Sep 22 2021, 9:40 AM.

Details

Summary

Extend fusion on tensors to fuse producers greedily.

Diff Detail

Event Timeline

gysit created this revision.Sep 22 2021, 9:40 AM
gysit requested review of this revision.Sep 22 2021, 9:40 AM
gysit updated this revision to Diff 374271.Sep 22 2021, 9:45 AM

Remove rebasing artifact.

gysit updated this revision to Diff 379659.Oct 14 2021, 4:16 AM

Rebase.

This is the PR I'm looking for, thanks!

mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
71–77

Should we capture iter_args and extract slice from iter_args? I think we are doing destructive update, and everything should work on the iter_args. I.e., extract slice from iter_args, fill and matmul on the slice, insert the slice into iter_args and yield it.

If the inputs are fill+matmul, the dest of the fill op is a slice of iter_args. E.g.,

input:

module  {
  func @fuse_output(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
    %1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
    %2 = linalg.init_tensor [%0, %1] : tensor<?x?xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
    %4 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<?x?xf32>) -> tensor<?x?xf32>
    return %4 : tensor<?x?xf32>
  }
}

output:

...
    %5 = scf.for %arg2 = %c0 to %1 step %c4 iter_args(%arg3 = %2) -> (tensor<?x?xf32>) {
      %6 = scf.for %arg4 = %c0 to %0 step %c5 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
        %7 = affine.min #map0(%arg4)[%0]
        %8 = tensor.extract_slice %arg0[%arg4, 0] [%7, %4] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
        %9 = tensor.dim %arg1, %c0 : tensor<?x?xf32>
        %10 = affine.min #map1(%arg2)[%1]
        %11 = tensor.extract_slice %arg1[0, %arg2] [%9, %10] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
        %12 = tensor.extract_slice %arg5[%arg4, %arg2] [%7, %10] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
        %13 = tensor.dim %2, %c0 : tensor<?x?xf32>
        %14 = tensor.dim %2, %c1 : tensor<?x?xf32>
        %15 = affine.min #map2(%7, %arg4)[%13]
        %16 = affine.min #map2(%10, %arg2)[%14]
        %17 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
        %18 = linalg.matmul ins(%8, %11 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%17 : tensor<?x?xf32>) -> tensor<?x?xf32>
        %19 = tensor.insert_slice %18 into %arg5[%arg4, %arg2] [%7, %10] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
        scf.yield %19 : tensor<?x?xf32>
      }
      scf.yield %6 : tensor<?x?xf32>
    }
...

where %12 is tensor.extract_slice %arg5[%arg4, %arg2] [%7, %10] [1, 1]
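For illustration only, the destructive-update pattern described above (extract a slice from the iteration argument, fill and matmul on the slice, insert the slice back, yield) can be sketched in plain C++ over flat row-major vectors. The function name, tile size handling, and representation are hypothetical; the real transformation operates on MLIR IR, not scalar loops.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Scalar sketch of the tiled fill+matmul with destructive update:
// each output tile is extracted from the accumulator (the iter_arg),
// overwritten by the fused fill, updated by the tiled matmul, and
// inserted back. Boundary tiles are clamped, mirroring affine.min.
std::vector<float> tiledFillMatmul(const std::vector<float> &a,
                                   const std::vector<float> &b,
                                   int m, int n, int k, int tile) {
  std::vector<float> acc(m * n, 0.0f); // plays the role of the iter_arg
  for (int i0 = 0; i0 < m; i0 += tile) {
    for (int j0 = 0; j0 < n; j0 += tile) {
      int mt = std::min(tile, m - i0); // affine.min-style boundary clamp
      int nt = std::min(tile, n - j0);
      // "tensor.extract_slice" from the iteration argument.
      std::vector<float> slice(mt * nt);
      for (int i = 0; i < mt; ++i)
        for (int j = 0; j < nt; ++j)
          slice[i * nt + j] = acc[(i0 + i) * n + (j0 + j)];
      // "linalg.fill" on the slice (the fused producer).
      for (auto &v : slice)
        v = 0.0f;
      // "linalg.matmul" on the slice (the tiled consumer).
      for (int i = 0; i < mt; ++i)
        for (int j = 0; j < nt; ++j)
          for (int kk = 0; kk < k; ++kk)
            slice[i * nt + j] += a[(i0 + i) * k + kk] * b[kk * n + (j0 + j)];
      // "tensor.insert_slice" back into the iteration argument.
      for (int i = 0; i < mt; ++i)
        for (int j = 0; j < nt; ++j)
          acc[(i0 + i) * n + (j0 + j)] = slice[i * nt + j];
    }
  }
  return acc;
}
```

The point of the sketch is that both the fill and the matmul only ever touch the extracted slice, so the whole nest updates the accumulator in place tile by tile, exactly as the scf.for iter_args do in the IR above.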

hanchung added inline comments.Nov 1 2021, 5:16 PM
mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp
102–103

comment not needed, since the code describes what it does.

109

Maybe add an assertion for the assumption? Also, we don't need the comment once we have the assertion.

assert(map.isProjectedPermutation() && "expected projected permutation");

157–158

[optional] I'd prefer

int64_t tiledSliceDim = std::get<0>(it);
int64_t tiledProducerLoop = std::get<1>(it);
gysit added inline comments.Nov 3 2021, 1:40 AM
mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
71–77

That is right, and the code does update the iteration argument. I just did not add this to the test since I wanted to match the new functionality (fusing a chain of operations and considering the right index dimensions). I will try to improve the test to make this clearer.

There is one limitation that is on purpose, though. This revision always tiles the consumer and adds its outputs to the iteration arguments. However, it never adds additional iteration arguments while fusing producers. If a result of a producer is used outside of the fused tile loop nest, the original producer will thus remain in the code for now. We may handle this in a follow-up revision if a use case arises.

gysit updated this revision to Diff 384391.Nov 3 2021, 4:00 AM
gysit marked 2 inline comments as done.

Address comments.

gysit marked 2 inline comments as done.Nov 3 2021, 4:03 AM
gysit added inline comments.
mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp
109

I updated the comments and made the assertion more concise. In fact, it is sufficient if there is a one-to-one mapping from slice dimension to loop dimension. We do not care about dimensions that are not sliced.
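Purely as an illustration of the weakened condition described here (the actual check works on AffineMaps in FusionOnTensors.cpp), a standalone sketch: it suffices that the tiled slice dimensions map one-to-one onto distinct loop dimensions, while unsliced dimensions are left unconstrained. The vector-based representation and the function name are assumptions for the example.

```cpp
#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

// Check that every tiled slice dimension maps to a distinct loop
// dimension. Dimensions that are not sliced are not constrained, so
// the full map need not be a permutation. Illustrative stand-in for
// the AffineMap-based assertion discussed above.
bool sliceDimsMapOneToOne(const std::vector<int64_t> &sliceToLoop,
                          const std::vector<int64_t> &tiledSliceDims) {
  std::set<int64_t> seen;
  for (int64_t sliceDim : tiledSliceDims) {
    int64_t loopDim = sliceToLoop[sliceDim];
    if (!seen.insert(loopDim).second)
      return false; // two sliced dims hit the same loop dim
  }
  return true;
}
```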

hanchung accepted this revision.Nov 3 2021, 10:08 AM
hanchung added inline comments.
mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
71–77

Thanks for the explanation. I think we want to add additional iteration arguments. Let's do it in a follow-up revision.

This revision is now accepted and ready to land.Nov 3 2021, 10:08 AM
nicolasvasilache accepted this revision.Nov 4 2021, 6:24 AM

Sorry for the super long delay ... it fell off my radar, and I am happy that we have fancier e2e use cases to exercise this, and new users chiming in :)
Thanks also @hanchung for your review.

Minor cosmetic comments but LGTM!

mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp
98

Better doc comments + a minimal example would be useful (no need to fully spell out the IR; pseudo-IR with the relevant info is what we want).

Given a list of `tiledSliceDims` that represent ... return the indices of ...

Example:
for i
  for j
    for k
      something

getTiledProducerLoops( ... ) returns ...
100

static?

102–103

Always bias towards more and better comments; none of this is trivial to verbalise -> 'get the indexing map of the output operand in the producer op that matches the opResult'.

108

There should be an existing AffineMap::xxxinversexxx function that you can reuse or extend (or create if nothing fits) instead of rolling your own.
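For illustration, the core idea behind inverting a permutation map (MLIR has a helper along these lines for permutation AffineMaps) shown on a plain index vector; this standalone function is not the MLIR API itself, just the underlying computation.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Invert a permutation given as a vector: if perm[i] == j, then
// inverse[j] == i. Mirrors the idea of inverting a permutation
// AffineMap, shown here on plain indices.
std::vector<int64_t> inversePermutation(const std::vector<int64_t> &perm) {
  std::vector<int64_t> inverse(perm.size());
  for (int64_t i = 0; i < static_cast<int64_t>(perm.size()); ++i)
    inverse[perm[i]] = i;
  return inverse;
}
```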

198–199

The validity conditions change, but I see no doc change.
I think all the cases required for validity should be enumerated in the proper place in the doc and kept up to date.

336

I'd even check that they are all in the same block; you never know what will happen in the future.

354

I'd append Index/Indices to all these variable names to be super explicit.
I keep on expecting to see a Value coming from a DimOp here ...

360

loop *indices*.

421

Use candidates.pop_back_val() and drop the separate pop_back() call.
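For readers unfamiliar with the LLVM container API: SmallVector's pop_back_val() removes the last element and returns it in one call, replacing a back()/pop_back() pair. A minimal stand-in with std::vector (which lacks such a member) looks like:

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Remove and return the last element, combining back() and pop_back()
// the way llvm::SmallVector::pop_back_val() does. Illustrative helper,
// not part of any real library.
template <typename T>
T popBackVal(std::vector<T> &v) {
  T val = std::move(v.back());
  v.pop_back();
  return val;
}
```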

mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
71–77

You shouldn't need anything new in the compiler for this.
Your problem is that init_tensor should be created above and passed as an extra argument.
This is closer to the PL / compiler co-design question that I replied to internally.

There is something interesting to be done related to iter_args but it is quite more complex and I think unnecessary atm.

gysit updated this revision to Diff 384769.Nov 4 2021, 8:46 AM
gysit marked 10 inline comments as done.

Address comments.

gysit added a comment.Nov 4 2021, 8:47 AM

mark comments as done.

This revision was landed with ongoing or failed builds.Nov 4 2021, 9:25 AM
This revision was automatically updated to reflect the committed changes.