Diff 344332

mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp

Show All 20 Lines
#include "mlir/IR/Matchers.h"		#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"		#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LLVM.h"		#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"		#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;		using namespace mlir;
using namespace mlir::linalg;		using namespace mlir::linalg;

/// Implementation of fusion of generic ops and indexed_generic ops.		/// Conditions for elementwise fusion of generic operations.
static bool areElementwiseOpsFusable(LinalgOp producer, LinalgOp consumer,		static bool areElementwiseOpsFusable(GenericOp producer, GenericOp consumer,
unsigned consumerIdx) {		unsigned consumerIdx) {
// Producer and consumer must have tensor semantics.		// Producer and consumer must have tensor semantics.
if (!producer.hasTensorSemantics() || !consumer.hasTensorSemantics())		if (!producer.hasTensorSemantics() || !consumer.hasTensorSemantics())
return false;		return false;

// Verify that		// Verify that
// - the producer has all "parallel" iterator type.		// - the producer has all "parallel" iterator type.
if (producer.getNumParallelLoops() != producer.getNumLoops())		if (producer.getNumParallelLoops() != producer.getNumLoops())
▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines	static AffineMap getIndexingMapOfProducerOperandsInCoordinatesOfFusedOp(
// Compose t1 with fusedConsumerArgIndexMap gives an indexing map from		// Compose t1 with fusedConsumerArgIndexMap gives an indexing map from
// consumer loop/ fused loop -> producer arg tensor index.		// consumer loop/ fused loop -> producer arg tensor index.
return t1.compose(fusedConsumerArgIndexMap);		return t1.compose(fusedConsumerArgIndexMap);
}		}

/// Generate the region of the fused tensor operation. The region of the fused		/// Generate the region of the fused tensor operation. The region of the fused
/// op must be empty.		/// op must be empty.
static void		static void
generateFusedElementwiseOpRegion(PatternRewriter &rewriter, Operation *fusedOp,		generateFusedElementwiseOpRegion(PatternRewriter &rewriter, GenericOp fusedOp,
LinalgOp producer, LinalgOp consumer,		GenericOp producer, GenericOp consumer,
AffineMap consumerToProducerLoopsMap,		AffineMap consumerToProducerLoopsMap,
unsigned consumerIdx, unsigned nloops) {		unsigned consumerIdx, unsigned nloops) {
// Build the region of the fused op.		// Build the region of the fused op.
Block &producerBlock = producer->getRegion(0).front();		Block &producerBlock = producer->getRegion(0).front();
Block &consumerBlock = consumer->getRegion(0).front();		Block &consumerBlock = consumer->getRegion(0).front();
Block *fusedBlock = new Block();		Block *fusedBlock = new Block();
fusedOp->getRegion(0).push_back(fusedBlock);		fusedOp.region().push_back(fusedBlock);
BlockAndValueMapping mapper;		BlockAndValueMapping mapper;
OpBuilder::InsertionGuard guard(rewriter);		OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointToStart(fusedBlock);		rewriter.setInsertionPointToStart(fusedBlock);

// The block arguments are		// 2. Add an index operation for every fused loop dimension and use the
// [index_0, index_1, ... ,
// consumer_operand_0, ... , consumer_operand_(`consumerIdx`-1),
// producer_operand_0, ... , producer_operand_(n-1)],
// consumer_operand_(`consumerIdx`), .. consumer_operand_(m-1)]
// , where n is the number of producer's operand and m is the number
// consumer's operand.
// If both `numProducerIndices` and `numConsumerIndices` are zero, this is a
// generic op. In this case, there are no indices in block arguments.
unsigned numProducerIndices = isa<IndexedGenericOp>(producer.getOperation())
? producer.getNumLoops()
: 0;
unsigned numConsumerIndices = isa<IndexedGenericOp>(consumer.getOperation())
? consumer.getNumLoops()
: 0;
unsigned numFusedOpIndices =
(isa<IndexedGenericOp>(producer.getOperation()) ||
isa<IndexedGenericOp>(consumer.getOperation()))
? std::max(producer.getNumLoops(), consumer.getNumLoops())
: 0;

// 0. Firstly, add all the indices to the block arguments.
for (unsigned i = 0, e = numFusedOpIndices; i < e; ++i)
fusedBlock->addArgument(rewriter.getIndexType());
// 1. Map consumer indices to fusedBlock indices 1-1.
mapper.map(consumerBlock.getArguments().take_front(numConsumerIndices),
fusedBlock->getArguments().take_front(numConsumerIndices));
// 2a. Embed producer indices into fusedBlock index space 1-1.
for (auto it :
llvm::zip(producerBlock.getArguments().take_front(numProducerIndices),
fusedBlock->getArguments().take_front(numProducerIndices))) {
auto newIndex = rewriter.create<mlir::AffineApplyOp>(
producer.getLoc(),
consumerToProducerLoopsMap.getSubMap(std::get<0>(it).getArgNumber()),
fusedBlock->getArguments().take_front(numFusedOpIndices));
mapper.map(std::get<0>(it), newIndex);
}
// 2b. Add an index operation for every fused loop dimension and use the
// `consumerToProducerLoopsMap` to map the producer indices.		// `consumerToProducerLoopsMap` to map the producer indices.
if (producer.hasIndexSemantics()) {		if (producer.hasIndexSemantics()) {
// Add an index operation for every fused loop dimension.		// Add an index operation for every fused loop dimension.
unsigned numFusedOpLoops =		unsigned numFusedOpLoops =
std::max(producer.getNumLoops(), consumer.getNumLoops());		std::max(producer.getNumLoops(), consumer.getNumLoops());
SmallVector<Value> fusedIndices;		SmallVector<Value> fusedIndices;
fusedIndices.reserve(numFusedOpLoops);		fusedIndices.reserve(numFusedOpLoops);
llvm::transform(llvm::seq<uint64_t>(0, numFusedOpLoops),		llvm::transform(llvm::seq<uint64_t>(0, numFusedOpLoops),
std::back_inserter(fusedIndices), [&](uint64_t dim) {		std::back_inserter(fusedIndices), [&](uint64_t dim) {
return rewriter.create<IndexOp>(producer.getLoc(), dim);		return rewriter.create<IndexOp>(producer.getLoc(), dim);
});		});
for (IndexOp indexOp :		for (IndexOp indexOp :
llvm::make_early_inc_range(producerBlock.getOps<IndexOp>())) {		llvm::make_early_inc_range(producerBlock.getOps<IndexOp>())) {
Value newIndex = rewriter.create<mlir::AffineApplyOp>(		Value newIndex = rewriter.create<mlir::AffineApplyOp>(
producer.getLoc(),		producer.getLoc(),
consumerToProducerLoopsMap.getSubMap(indexOp.dim()), fusedIndices);		consumerToProducerLoopsMap.getSubMap(indexOp.dim()), fusedIndices);
mapper.map(indexOp.getResult(), newIndex);		mapper.map(indexOp.getResult(), newIndex);
}		}
}		}
// TODO: allow fusing the producer of an output operand.		// TODO: allow fusing the producer of an output operand.
assert(consumerIdx < consumer.getNumInputs() &&		assert(consumerIdx < consumer.getNumInputs() &&
"expected producer of input operand");		"expected producer of input operand");
// 3. Consumer input operands up to consumerIdx (exclusive).		// 3. Consumer input operands up to consumerIdx (exclusive).
for (BlockArgument bbArg : consumerBlock.getArguments()		for (BlockArgument bbArg : consumerBlock.getArguments().take_front(
.drop_front(numConsumerIndices)		consumerIdx)) // input assumption.
.take_front(consumerIdx)) // input assumption.
mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));		mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));

// Replacing consumerIdx requires getting the cloned, yielded, value from		// Replacing consumerIdx requires getting the cloned, yielded, value from
// the (cloned) producer block. This happens in step 9.		// the (cloned) producer block. This happens in step 9.

// 4. Splice in producer's input operands.		// 4. Splice in producer's input operands.
for (BlockArgument bbArg : producerBlock.getArguments()		for (BlockArgument bbArg :
.drop_front(numProducerIndices)		producerBlock.getArguments().take_front(producer.getNumInputs()))
.take_front(producer.getNumInputs()))
mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));		mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));

// 4.b. Producer output operand/map that is fused needs to be mapped to the		// 4.b. Producer output operand/map that is fused needs to be mapped to the
// producer bbArg if it is an "initTensor" (i.e. its value is actually read).		// producer bbArg if it is an "initTensor" (i.e. its value is actually read).
assert(producer->getNumResults() == 1 && "expected single result producer");		assert(producer->getNumResults() == 1 && "expected single result producer");
if (producer.isInitTensor(&producer.getOutputOpOperands()[0])) {		if (producer.isInitTensor(&producer.getOutputOpOperands()[0])) {
BlockArgument bbArg =		BlockArgument bbArg = producerBlock.getArguments()
producerBlock.getArguments()		.drop_front(producer.getNumInputs())
.drop_front(numConsumerIndices + producer.getNumInputs())
// TODO: bbArg index of		// TODO: bbArg index of
.front();		.front();
mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));		mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));
}		}
// 5. Remaining consumer's input operands (drop past index `consumerIdx`).		// 5. Remaining consumer's input operands (drop past index `consumerIdx`).
for (BlockArgument bbArg : consumerBlock.getArguments()		for (BlockArgument bbArg : consumerBlock.getArguments()
.drop_front(numConsumerIndices)
.take_front(consumer.getNumInputs())		.take_front(consumer.getNumInputs())
.drop_front(consumerIdx + 1))		.drop_front(consumerIdx + 1))
mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));		mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));
// 6. All of consumer's output operands.		// 6. All of consumer's output operands.
for (BlockArgument bbArg :		for (BlockArgument bbArg :
consumerBlock.getArguments().take_back(consumer.getNumOutputs()))		consumerBlock.getArguments().take_back(consumer.getNumOutputs()))
mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));		mapper.map(bbArg, fusedBlock->addArgument(bbArg.getType()));
// 7. All of producer's output operands except the one fused.		// 7. All of producer's output operands except the one fused.
Show All 19 Lines	generateFusedElementwiseOpRegion(PatternRewriter &rewriter, GenericOp fusedOp,
if (replacement == yieldOp.getOperand(producerResultNumber)) {		if (replacement == yieldOp.getOperand(producerResultNumber)) {
if (auto bb = replacement.dyn_cast<BlockArgument>())		if (auto bb = replacement.dyn_cast<BlockArgument>())
assert(bb.getOwner() != &producerBlock &&		assert(bb.getOwner() != &producerBlock &&
"yielded block argument must have been mapped");		"yielded block argument must have been mapped");
else		else
assert(!producer->isAncestor(replacement.getDefiningOp()) &&		assert(!producer->isAncestor(replacement.getDefiningOp()) &&
"yielded value must have been mapped");		"yielded value must have been mapped");
}		}
mapper.map(consumerBlock.getArgument(consumerIdx + numConsumerIndices),		mapper.map(consumerBlock.getArgument(consumerIdx), replacement);
replacement);
// 10. Clone operations from the consumer to the fused op.		// 10. Clone operations from the consumer to the fused op.
for (auto &op : consumerBlock.getOperations())		for (auto &op : consumerBlock.getOperations())
rewriter.clone(op, mapper);		rewriter.clone(op, mapper);

// Sanity checks.		// Sanity checks.
assert(fusedBlock->getNumArguments() ==		assert(fusedBlock->getNumArguments() == fusedOp.getNumOperands() &&
fusedOp->getNumOperands() + numFusedOpIndices &&		"Ill-formed GenericOp region");
"Ill-formed LinalgOp region");
}		}

static Optional<SmallVector<Value, 1>>		static Optional<SmallVector<Value>>
fuseElementwiseOpsImpl(LinalgOp producer, OpOperand &consumerOpOperand,		fuseElementwiseOpsImpl(GenericOp producer, OpOperand &consumerOpOperand,
const ControlElementwiseOpsFusionFn &controlFn,		const ControlElementwiseOpsFusionFn &controlFn,
PatternRewriter &rewriter) {		PatternRewriter &rewriter) {
LinalgOp consumer = cast<LinalgOp>(consumerOpOperand.getOwner());		auto consumer = cast<GenericOp>(consumerOpOperand.getOwner());
unsigned consumerIdx = consumerOpOperand.getOperandNumber();		unsigned consumerIdx = consumerOpOperand.getOperandNumber();
if (!areElementwiseOpsFusable(producer, consumer, consumerIdx) ||		if (!areElementwiseOpsFusable(producer, consumer, consumerIdx) ||
!controlFn(producer->getResult(0), consumerOpOperand))		!controlFn(producer->getResult(0), consumerOpOperand))
return llvm::None;		return llvm::None;

// TODO: allow fusing the producer of an output operand.		// TODO: allow fusing the producer of an output operand.
assert(consumerIdx < consumer.getNumInputs() &&		assert(consumerIdx < consumer.getNumInputs() &&
"expected producer of input operand");		"expected producer of input operand");
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	fuseElementwiseOpsImpl(GenericOp producer, OpOperand &consumerOpOperand,
// 6. All of consumer's output operands (skip operands: added by the builder).		// 6. All of consumer's output operands (skip operands: added by the builder).
// llvm::append_range(fusedOperands, consumer.getOutputs());		// llvm::append_range(fusedOperands, consumer.getOutputs());
llvm::append_range(fusedIndexMaps, consumer.getOutputIndexingMaps());		llvm::append_range(fusedIndexMaps, consumer.getOutputIndexingMaps());
// 7. All of producer's output operands/maps except the one fused.		// 7. All of producer's output operands/maps except the one fused.
// TODO: allow fusion of multi-result producers.		// TODO: allow fusion of multi-result producers.
assert(producer->getNumResults() == 1 && "expected single result producer");		assert(producer->getNumResults() == 1 && "expected single result producer");

// Generate the fused op.		// Generate the fused op.
Operation *fusedOp;		auto fusedOp = rewriter.create<GenericOp>(
if (isa<GenericOp>(producer.getOperation()) &&
isa<GenericOp>(consumer.getOperation())) {
fusedOp = rewriter.create<GenericOp>(
consumer.getLoc(), consumer->getResultTypes(),
/*inputs=*/fusedOperands,
// TODO: handle outputs.
consumer.getOutputs(), rewriter.getAffineMapArrayAttr(fusedIndexMaps),
consumer.iterator_types(),
/*doc=*/nullptr,
/*library_call=*/nullptr);
} else {
fusedOp = rewriter.create<IndexedGenericOp>(
consumer.getLoc(), consumer->getResultTypes(),		consumer.getLoc(), consumer->getResultTypes(),
/*inputs=*/fusedOperands,		/*inputs=*/fusedOperands,
// TODO: handle outputs.		// TODO: handle outputs.
consumer.getOutputs(), rewriter.getAffineMapArrayAttr(fusedIndexMaps),		consumer.getOutputs(), rewriter.getAffineMapArrayAttr(fusedIndexMaps),
consumer.iterator_types(),		consumer.iterator_types(),
/*doc=*/nullptr,		/*doc=*/nullptr,
/*library_call=*/nullptr);		/*library_call=*/nullptr);
}

// Construct an AffineMap from consumer loops to producer loops.		// Construct an AffineMap from consumer loops to producer loops.
// consumer loop -> tensor index		// consumer loop -> tensor index
AffineMap consumerResultIndexMap = consumer.getInputIndexingMap(consumerIdx);		AffineMap consumerResultIndexMap = consumer.getInputIndexingMap(consumerIdx);
// tensor index -> producer loop		// tensor index -> producer loop
AffineMap invProducerResultIndexMap =		AffineMap invProducerResultIndexMap =
inversePermutation(producerResultIndexMap);		inversePermutation(producerResultIndexMap);
assert(invProducerResultIndexMap &&		assert(invProducerResultIndexMap &&
"expected producer result indexig map to be invertible");		"expected producer result indexig map to be invertible");
// consumer loop -> producer loop		// consumer loop -> producer loop
AffineMap consumerToProducerLoopsMap =		AffineMap consumerToProducerLoopsMap =
invProducerResultIndexMap.compose(consumerResultIndexMap);		invProducerResultIndexMap.compose(consumerResultIndexMap);

generateFusedElementwiseOpRegion(rewriter, fusedOp, producer, consumer,		generateFusedElementwiseOpRegion(rewriter, fusedOp, producer, consumer,
consumerToProducerLoopsMap, consumerIdx,		consumerToProducerLoopsMap, consumerIdx,
consumer.getNumLoops());		consumer.getNumLoops());
return SmallVector<Value, 1>(fusedOp->getResults());		return SmallVector<Value>(fusedOp->getResults());
}		}

/// Linearize the expressions in `sourceMap` based on the `reassociationMaps`		/// Linearize the expressions in `sourceMap` based on the `reassociationMaps`
/// provided, given the shape of the source tensor that corresponds to the		/// provided, given the shape of the source tensor that corresponds to the
/// `sourceMap`. Note that this implicitly assumes that the tensors dimensions		/// `sourceMap`. Note that this implicitly assumes that the tensors dimensions
/// are "row-major" ordered logically.		/// are "row-major" ordered logically.
///		///
/// For example:		/// For example:
///		///
/// %0 = op ... : tensor<?x?x4x5xf32>		/// %0 = op ... : tensor<?x?x4x5xf32>
/// with output index_map `affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>`		/// with output index_map `affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>`
///		///
/// and reshape:		/// and reshape:
/// %1 = linalg.tensor_reshape %0 [affine_map<(i, j, k, l) -> (i)>,		/// %1 = linalg.tensor_reshape %0 [affine_map<(i, j, k, l) -> (i)>,
/// affine_map<(i, j, k, l) -> (j, k, l)>] :		/// affine_map<(i, j, k, l) -> (j, k, l)>] :
/// tensor<?x?x4x5xf32> into tensor<?x?xf32>		/// tensor<?x?x4x5xf32> into tensor<?x?xf32>
///		///
/// would be rewritten into:		/// would be rewritten into:
/// %0 = op ... : tensor<?x?x4x5xf32>		/// %0 = op ... : tensor<?x?x4x5xf32>
/// with output index_map		/// with output index_map
/// `affine_map<(d0, d1, d2, d3) -> (d0, d1 * 20 + d2 * 5 + d3)>`		/// `affine_map<(d0, d1, d2, d3) -> (d0, d1 * 20 + d2 * 5 + d3)>`
static AffineMap linearizeCollapsedDims(AffineMap sourceMap,		static AffineMap linearizeCollapsedDims(AffineMap sourceMap,
ArrayRef<int64_t> sourceShape,		ArrayRef<int64_t> sourceShape,
ArrayRef<AffineMap> reassociationMaps) {		ArrayRef<AffineMap> reassociationMaps) {
SmallVector<AffineExpr, 4> resultExprs;		SmallVector<AffineExpr> resultExprs;
resultExprs.reserve(reassociationMaps.size());		resultExprs.reserve(reassociationMaps.size());
ArrayRef<AffineExpr> sourceExprs = sourceMap.getResults();		ArrayRef<AffineExpr> sourceExprs = sourceMap.getResults();
MLIRContext *context = sourceMap.getContext();		MLIRContext *context = sourceMap.getContext();

// Compute the result exprs based on the reassociation maps.		// Compute the result exprs based on the reassociation maps.
for (AffineMap map : reassociationMaps) {		for (AffineMap map : reassociationMaps) {
ArrayRef<AffineExpr> collapsedDims = map.getResults();		ArrayRef<AffineExpr> collapsedDims = map.getResults();
// Assume that they are in-order and contiguous (already checked in		// Assume that they are in-order and contiguous (already checked in
// verifier).		// verifier).
assert(!collapsedDims.empty());		assert(!collapsedDims.empty());
unsigned startDim =		unsigned startDim =
collapsedDims.front().cast<AffineDimExpr>().getPosition();		collapsedDims.front().cast<AffineDimExpr>().getPosition();
SmallVector<int64_t, 4> sizes;		SmallVector<int64_t> sizes;
SmallVector<AffineExpr, 4> dimExprs;		SmallVector<AffineExpr> dimExprs;
		mravishankar Unsubmitted Done Reply Inline Actions I am ok with dropping the `, 4`, but one place where it gets complicated is when you try something like `SmallVector<T> vec = llvm::to_vector<4>(...);`. AFAIK, this will create a vector with `llvm::to_vector` and then, instead of a `move` assignment, will do a `copy` assignment since the types are different. So something to watch out for. mravishankar: I am ok with dropping the `, 4`, but one place where it gets complicated is when you try…
for (auto en :		for (auto en :
llvm::zip(sourceShape.slice(startDim, collapsedDims.size()),		llvm::zip(sourceShape.slice(startDim, collapsedDims.size()),
sourceExprs.slice(startDim, collapsedDims.size()))) {		sourceExprs.slice(startDim, collapsedDims.size()))) {
if (std::get<0>(en) == 1)		if (std::get<0>(en) == 1)
continue;		continue;
sizes.push_back(std::get<0>(en));		sizes.push_back(std::get<0>(en));
dimExprs.push_back(std::get<1>(en));		dimExprs.push_back(std::get<1>(en));
}		}
Show All 22 Lines	static bool isTensorReshapeOpFoldableByLinearization(TensorReshapeOp reshapeOp,
if ((asProducer && reshapeOp.getSrcType().hasStaticShape() &&		if ((asProducer && reshapeOp.getSrcType().hasStaticShape() &&
returnType.getRank() < operandType.getRank()) ||		returnType.getRank() < operandType.getRank()) ||
(!asProducer && reshapeOp.getResultType().hasStaticShape() &&		(!asProducer && reshapeOp.getResultType().hasStaticShape() &&
operandType.getRank() < returnType.getRank()))		operandType.getRank() < returnType.getRank()))
return false;		return false;
return useIndexMap.isPermutation();		return useIndexMap.isPermutation();
}		}

/// Based on the type of `op` create a linalg op of the same type, i.e. if `op`
/// is a linalg.generic operation, then create a `linalg.generic` operation with
/// the given `args`. Expects `op` to be `linalg.generic` or
/// `linalg.indexed_generic`.
template <typename... Args>
static LinalgOp createLinalgOpOfSameType(LinalgOp op, PatternRewriter &rewriter,
Args... args) {
if (isa<GenericOp>(op.getOperation()))
return rewriter.create<GenericOp>(args...);
if (isa<IndexedGenericOp>(op.getOperation()))
return rewriter.create<IndexedGenericOp>(args...);
llvm_unreachable(
"expected only linalg.generic or linalg.indexed_generic ops");
return nullptr;
}

/// Check if the reshape operation is only expansion into/collapsing of		/// Check if the reshape operation is only expansion into/collapsing of
/// unit-dimension.		/// unit-dimension.
static bool isUnitDimExpansionOnly(ArrayRef<int64_t> expandedShape,		static bool isUnitDimExpansionOnly(ArrayRef<int64_t> expandedShape,
ArrayRef<AffineMap> reassociation) {		ArrayRef<AffineMap> reassociation) {
for (auto &map : reassociation) {		for (auto &map : reassociation) {
unsigned numUnitDims = 0;		unsigned numUnitDims = 0;
for (AffineExpr expr : map.getResults()) {		for (AffineExpr expr : map.getResults()) {
unsigned position = expr.cast<AffineDimExpr>().getPosition();		unsigned position = expr.cast<AffineDimExpr>().getPosition();
if (expandedShape[position] == 1)		if (expandedShape[position] == 1)
numUnitDims++;		numUnitDims++;
}		}
if (numUnitDims != map.getNumResults() - 1)		if (numUnitDims != map.getNumResults() - 1)
return false;		return false;
}		}
return true;		return true;
}		}

/// Conditions for folding a generic/indexed-generic operation with a reshape op		/// Conditions for folding a generic operation with a reshape op by expanding
/// by expanding the iteration space dimensionality for tensor operations. These		/// the iteration space dimensionality for tensor operations. These are
/// are preconditions assumed by `foldReshapeByDimExpansion` which implements		/// preconditions assumed by `foldReshapeByDimExpansion` which implements the
/// the following fusion pattern.		/// following fusion pattern.
///		///
/// Consider		/// Consider
///		///
/// %c = linalg.generic ins(%a, %b : memref<?x?x?xf32>, memref<?x?xf32>)		/// %c = linalg.generic ins(%a, %b : memref<?x?x?xf32>, memref<?x?xf32>)
/// indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>,		/// indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>,
/// affine_map<(d0, d1, d2) -> (d1, d2)>,		/// affine_map<(d0, d1, d2) -> (d1, d2)>,
/// affine_map<(d0, d1, d2) -> (d0, d2, d1)>]		/// affine_map<(d0, d1, d2) -> (d0, d2, d1)>]
/// %d = linalg.tensor_reshape %c		/// %d = linalg.tensor_reshape %c
/// [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1)>,		/// [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1)>,
/// affine_map<(d0, d1, d2, d3, d4, d5) -> (d2)>,		/// affine_map<(d0, d1, d2, d3, d4, d5) -> (d2)>,
/// affine_map<(d0, d1, d2, d3, d4, d5) -> (d3, d4, d5)>]		/// affine_map<(d0, d1, d2, d3, d4, d5) -> (d3, d4, d5)>]
/// : tensor<?x?x?xf32> into tensor<?x?x?x?x?x?xf32>		/// : tensor<?x?x?xf32> into tensor<?x?x?x?x?x?xf32>
///		///
/// The reshape can be folded into the `linalgOp` if the		/// The reshape can be folded into the `genericOp` if its loop dimensionality
/// generic/indexed-generic op loop dimensionality is increased to match the		/// is increased to match the result (operand) of the tensor_reshape when the
/// result (operand) of the tensor_reshape when the reshape is expanding		/// reshape is expanding (folding). The indexing_map of the fused tensor in the
/// (folding). The indexing_map of the fused tensor in the `linalgOp` and the		/// `genericOp` and the reassociation map helps compute the indexing maps of
/// reassociation map helps compute the indexing maps of the modified op. For		/// the modified op. For the above example, based on the reassociation map it
/// the above example, based on the reassociation map it can be concluded that		/// can be concluded that
///		///
/// - The loop used to access the first dimension of the fused tensor is split		/// - The loop used to access the first dimension of the fused tensor is split
/// into two.		/// into two.
/// - The loop used to access the second dimension of the fused tensor is kept		/// - The loop used to access the second dimension of the fused tensor is kept
/// as is.		/// as is.
/// - The loop used to access the third dimension of the fused tensor is split		/// - The loop used to access the third dimension of the fused tensor is split
/// into three.		/// into three.
///		///
Show All 22 Lines
/// : tensor<?x?x?xf32> into tensor<?x?x?x?x?x?xf32>		/// : tensor<?x?x?xf32> into tensor<?x?x?x?x?x?xf32>
/// %1 = linalg.tensor_reshape %b		/// %1 = linalg.tensor_reshape %b
/// [affine_map<(e0, e1, e2, e3) -> (e0, e1, e2),		/// [affine_map<(e0, e1, e2, e3) -> (e0, e1, e2),
/// affine_map<(e0, e1, e2, e3) -> (e3)]		/// affine_map<(e0, e1, e2, e3) -> (e3)]
/// : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>		/// : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
///		///
/// The added reshapes are again expanding patterns, so they will get fused		/// The added reshapes are again expanding patterns, so they will get fused
/// with its producers if possible.		/// with its producers if possible.
static bool isFusableWithReshapeByDimExpansion(LinalgOp linalgOp,		static bool isFusableWithReshapeByDimExpansion(GenericOp genericOp,
unsigned fusedTensorIndex) {		unsigned fusedTensorIndex) {
// Is fusable only if:		// Is fusable only if:
// - The linalgOp is a generic op, or an indexed_generic.		// - All the indexing maps for operands and results are projected
// - All the indexing maps for operands and results in linalgOp are projected
// permutations.		// permutations.
// - The fused tensor is not a scalar.		// - The fused tensor is not a scalar.
// - All the loops in linalgOp are parallel loops.		// - All the loops are parallel loops.
return isa<GenericOp, IndexedGenericOp>(linalgOp.getOperation()) &&		return genericOp.hasTensorSemantics() &&
linalgOp.hasTensorSemantics() &&		llvm::all_of(genericOp.indexing_maps().getValue(),
llvm::all_of(linalgOp.indexing_maps().getValue(),
[](Attribute attr) {		[](Attribute attr) {
return attr.cast<AffineMapAttr>()		return attr.cast<AffineMapAttr>()
.getValue()		.getValue()
.isProjectedPermutation();		.isProjectedPermutation();
}) &&		}) &&
linalgOp.getIndexingMap(fusedTensorIndex).getNumResults() > 0 &&		genericOp.getIndexingMap(fusedTensorIndex).getNumResults() > 0 &&
llvm::all_of(linalgOp.iterator_types(), [](Attribute attr) {		llvm::all_of(genericOp.iterator_types(), [](Attribute attr) {
return attr.cast<StringAttr>().getValue() ==		return attr.cast<StringAttr>().getValue() ==
getParallelIteratorTypeName();		getParallelIteratorTypeName();
});		});
}		}

namespace {		namespace {
/// Information needed to expand a generic/indexed_generic operation to fold the		/// Information needed to expand a generic operation to fold the reshape with
/// reshape with it.		/// it.
class ExpansionInfo {		class ExpansionInfo {
public:		public:
// Computes the mapping from original dimensions of the op to the dimensions		// Computes the mapping from original dimensions of the op to the dimensions
// of the expanded op given the `indexingMap` of the fused operand/result of		// of the expanded op given the `indexingMap` of the fused operand/result of
// the generic/indexed_generic op, the `reassocationMaps` of the reshape op		// the generic op, the `reassocationMaps` of the reshape op and the shape of
// and the shape of the expanded op.		// the expanded op.
LogicalResult compute(LinalgOp linalgOp, unsigned fusedTensorIndex,		LogicalResult compute(LinalgOp linalgOp, unsigned fusedTensorIndex,
ArrayRef<AffineMap> reassociationMaps,		ArrayRef<AffineMap> reassociationMaps,
ArrayRef<int64_t> expandedShape);		ArrayRef<int64_t> expandedShape,
		PatternRewriter &rewriter);
unsigned getOrigOpNumDims() const { return reassociation.size(); }		unsigned getOrigOpNumDims() const { return reassociation.size(); }
unsigned getExpandedOpNumDims() const { return expandedOpNumDims; }		unsigned getExpandedOpNumDims() const { return expandedOpNumDims; }
ReassociationIndicesRef getExpandedDims(unsigned i) const {		ReassociationIndicesRef getExpandedDims(unsigned i) const {
return reassociation[i];		return reassociation[i];
}		}
ArrayRef<int64_t> getExpandedShapeOfDim(unsigned i) const {		ArrayRef<int64_t> getExpandedShapeOfDim(unsigned i) const {
return expandedShapeMap[i];		return expandedShapeMap[i];
}		}

private:		private:
/// Reassociation from the dimensions in the original operation to the		/// Reassociation from the dimensions in the original operation to the
/// dimension of the expanded operation.		/// dimension of the expanded operation.
SmallVector<ReassociationIndices, 4> reassociation;		SmallVector<ReassociationIndices> reassociation;
/// Mapping from extent of loops in the original operation, to the extent of		/// Mapping from extent of loops in the original operation, to the extent of
/// loops in the expanded operation.		/// loops in the expanded operation.
SmallVector<SmallVector<int64_t, 4>, 4> expandedShapeMap;		SmallVector<SmallVector<int64_t>> expandedShapeMap;
unsigned expandedOpNumDims;		unsigned expandedOpNumDims;
};		};
} // namespace		} // namespace

LogicalResult ExpansionInfo::compute(LinalgOp linalgOp,		LogicalResult ExpansionInfo::compute(LinalgOp linalgOp,
unsigned fusedTensorIndex,		unsigned fusedTensorIndex,
ArrayRef<AffineMap> reassociationMaps,		ArrayRef<AffineMap> reassociationMaps,
ArrayRef<int64_t> expandedShape) {		ArrayRef<int64_t> expandedShape,
		PatternRewriter &rewriter) {
if (reassociationMaps.empty())		if (reassociationMaps.empty())
return failure();		return failure();
AffineMap fusedIndexMap = linalgOp.getIndexingMap(fusedTensorIndex);		AffineMap fusedIndexMap = linalgOp.getIndexingMap(fusedTensorIndex);

Optional<SmallVector<int64_t, 4>> originalLoopRange =		Optional<SmallVector<int64_t, 4>> originalLoopRange =
linalgOp.getStaticLoopRanges();		linalgOp.getStaticLoopRanges();
if (!originalLoopRange)		if (!originalLoopRange)
return linalgOp.emitError("unable to find loop range for operation");		return rewriter.notifyMatchFailure(linalgOp, "unable to find loop range");

reassociation.clear();		reassociation.clear();
expandedShapeMap.clear();		expandedShapeMap.clear();
// Compute the number of dimension in the expanded op that correspond to each		// Compute the number of dimension in the expanded op that correspond to each
// dimension of the original op.		// dimension of the original op.
SmallVector<unsigned, 4> numExpandedDims(fusedIndexMap.getNumDims(), 1);		SmallVector<unsigned> numExpandedDims(fusedIndexMap.getNumDims(), 1);
expandedShapeMap.resize(fusedIndexMap.getNumDims());		expandedShapeMap.resize(fusedIndexMap.getNumDims());
for (auto resultExpr : llvm::enumerate(fusedIndexMap.getResults())) {		for (auto resultExpr : llvm::enumerate(fusedIndexMap.getResults())) {
unsigned pos = resultExpr.value().cast<AffineDimExpr>().getPosition();		unsigned pos = resultExpr.value().cast<AffineDimExpr>().getPosition();
AffineMap foldedDims = reassociationMaps[resultExpr.index()];		AffineMap foldedDims = reassociationMaps[resultExpr.index()];
numExpandedDims[pos] = foldedDims.getNumResults();		numExpandedDims[pos] = foldedDims.getNumResults();
ArrayRef<int64_t> shape =		ArrayRef<int64_t> shape =
expandedShape.slice(foldedDims.getDimPosition(0), numExpandedDims[pos]);		expandedShape.slice(foldedDims.getDimPosition(0), numExpandedDims[pos]);
expandedShapeMap[pos].assign(shape.begin(), shape.end());		expandedShapeMap[pos].assign(shape.begin(), shape.end());
Show All 18 Lines
/// Expanding the body of a linalg operation requires adaptations of the accessed		/// Expanding the body of a linalg operation requires adaptations of the accessed
/// loop indices. Specifically, access of indices in the original operation need		/// loop indices. Specifically, access of indices in the original operation need
/// to be replaced with linearizations of indices in the expanded op. That		/// to be replaced with linearizations of indices in the expanded op. That
/// requires the shape of the expanded dimensions to be static (at least all but		/// requires the shape of the expanded dimensions to be static (at least all but
/// the most significant). For now check that these are all statically sized.		/// the most significant). For now check that these are all statically sized.
/// Note that this could be extended to handle dynamic case, but the		/// Note that this could be extended to handle dynamic case, but the
/// implementation below uses `affine.apply` which seems to have issues when the		/// implementation below uses `affine.apply` which seems to have issues when the
/// shapes are not static.		/// shapes are not static.
LogicalResult isIndexedOpExpandable(LinalgOp linalgOp,		LogicalResult isGenericOpExpandable(GenericOp genericOp,
const ExpansionInfo &expansionInfo) {		const ExpansionInfo &expansionInfo,
		PatternRewriter &rewriter) {
		if (!genericOp.hasIndexSemantics())
		return success();
for (unsigned i : llvm::seq<unsigned>(0, expansionInfo.getOrigOpNumDims())) {		for (unsigned i : llvm::seq<unsigned>(0, expansionInfo.getOrigOpNumDims())) {
ArrayRef<int64_t> expandedShape = expansionInfo.getExpandedShapeOfDim(i);		ArrayRef<int64_t> expandedShape = expansionInfo.getExpandedShapeOfDim(i);
if (expandedShape.size() == 1)		if (expandedShape.size() == 1)
continue;		continue;
for (int64_t shape : expandedShape.drop_front()) {		for (int64_t shape : expandedShape.drop_front()) {
if (ShapedType::isDynamic(shape)) {		if (ShapedType::isDynamic(shape)) {
return linalgOp.emitError(		return rewriter.notifyMatchFailure(
"unable to fuse indexed generic op where the expanded dim is "		genericOp, "cannot expand due to index semantics and dynamic dims");
		mravishankarUnsubmitted Done Reply Inline Actions Nit: could you change this to either `return failure()` (or better, `return notifyMatchFailure()`). There is better error reporting now than when this was written (and this hasn't been updated). This just adds noise to the logs. mravishankar: Nit: could you change this to either `return failure()` (or better, `return notifyMatchFailure…
"dynamic");
}		}
}		}
}		}
return success();		return success();
}		}

/// Return the indexing map to use in the expanded op for the given		/// Return the indexing map to use in the expanded op for the given
/// `indexingMap` of the original operation.		/// `indexingMap` of the original operation.
static AffineMap		static AffineMap
getIndexingMapInExpandedOp(OpBuilder &builder, AffineMap indexingMap,		getIndexingMapInExpandedOp(OpBuilder &builder, AffineMap indexingMap,
const ExpansionInfo &expansionInfo) {		const ExpansionInfo &expansionInfo) {
SmallVector<AffineExpr, 4> newExprs;		SmallVector<AffineExpr> newExprs;
for (AffineExpr expr : indexingMap.getResults()) {		for (AffineExpr expr : indexingMap.getResults()) {
unsigned pos = expr.cast<AffineDimExpr>().getPosition();		unsigned pos = expr.cast<AffineDimExpr>().getPosition();
SmallVector<AffineExpr, 4> expandedExprs = llvm::to_vector<4>(		SmallVector<AffineExpr, 4> expandedExprs = llvm::to_vector<4>(
		mravishankarUnsubmitted Done Reply Inline Actions See comment above. If I am not mistaken, this is going to be a `copy` assignment. If so, I would rather leave `4` in there. (Ideally `llvm::to_vector` can be modified to handle the default case.) mravishankar: See comment above. If I am not mistaken, this is going to be a `copy` assignment. If so, I would…
llvm::map_range(expansionInfo.getExpandedDims(pos), [&](int64_t v) {		llvm::map_range(expansionInfo.getExpandedDims(pos), [&](int64_t v) {
return builder.getAffineDimExpr(static_cast<unsigned>(v));		return builder.getAffineDimExpr(static_cast<unsigned>(v));
}));		}));
newExprs.append(expandedExprs.begin(), expandedExprs.end());		newExprs.append(expandedExprs.begin(), expandedExprs.end());
}		}
return AffineMap::get(expansionInfo.getExpandedOpNumDims(),		return AffineMap::get(expansionInfo.getExpandedOpNumDims(),
indexingMap.getNumSymbols(), newExprs,		indexingMap.getNumSymbols(), newExprs,
builder.getContext());		builder.getContext());
}		}

/// Return the type of the operand/result to use in the expanded op given the		/// Return the type of the operand/result to use in the expanded op given the
/// type in the original op.		/// type in the original op.
static RankedTensorType getExpandedType(RankedTensorType originalType,		static RankedTensorType getExpandedType(RankedTensorType originalType,
AffineMap indexingMap,		AffineMap indexingMap,
const ExpansionInfo &expansionInfo) {		const ExpansionInfo &expansionInfo) {
SmallVector<int64_t, 4> expandedShape;		SmallVector<int64_t> expandedShape;
for (AffineExpr expr : indexingMap.getResults()) {		for (AffineExpr expr : indexingMap.getResults()) {
unsigned dim = expr.cast<AffineDimExpr>().getPosition();		unsigned dim = expr.cast<AffineDimExpr>().getPosition();
auto dimExpansion = expansionInfo.getExpandedShapeOfDim(dim);		auto dimExpansion = expansionInfo.getExpandedShapeOfDim(dim);
expandedShape.append(dimExpansion.begin(), dimExpansion.end());		expandedShape.append(dimExpansion.begin(), dimExpansion.end());
}		}
return RankedTensorType::get(expandedShape, originalType.getElementType());		return RankedTensorType::get(expandedShape, originalType.getElementType());
}		}

/// Returns the reassociation maps to use in the `linalg.tensor_reshape`		/// Returns the reassociation maps to use in the `linalg.tensor_reshape`
/// operation to convert the operands of the original operation to operands of		/// operation to convert the operands of the original operation to operands of
/// the expanded operation. The same method is used to compute the		/// the expanded operation. The same method is used to compute the
/// `linalg.tensor_reshape` used to collapse the result of the expanded op to		/// `linalg.tensor_reshape` used to collapse the result of the expanded op to
/// get the value that can replace all uses of the results of the original op.		/// get the value that can replace all uses of the results of the original op.
static SmallVector<ReassociationIndices, 4>		static SmallVector<ReassociationIndices>
getReassociationForExpansion(AffineMap indexingMap,		getReassociationForExpansion(AffineMap indexingMap,
const ExpansionInfo &expansionInfo) {		const ExpansionInfo &expansionInfo) {
SmallVector<ReassociationIndices, 4> reassociation;		SmallVector<ReassociationIndices> reassociation;
unsigned numReshapeDims = 0;		unsigned numReshapeDims = 0;
for (AffineExpr expr : indexingMap.getResults()) {		for (AffineExpr expr : indexingMap.getResults()) {
unsigned dim = expr.cast<AffineDimExpr>().getPosition();		unsigned dim = expr.cast<AffineDimExpr>().getPosition();
auto numExpandedDims = expansionInfo.getExpandedDims(dim).size();		auto numExpandedDims = expansionInfo.getExpandedDims(dim).size();
auto indices = llvm::to_vector<2>(		SmallVector<int64_t, 2> indices = llvm::to_vector<2>(
llvm::seq<int64_t>(numReshapeDims, numReshapeDims + numExpandedDims));		llvm::seq<int64_t>(numReshapeDims, numReshapeDims + numExpandedDims));
reassociation.emplace_back(std::move(indices));		reassociation.emplace_back(std::move(indices));
numReshapeDims += numExpandedDims;		numReshapeDims += numExpandedDims;
}		}
return reassociation;		return reassociation;
}		}

/// Build the body of the expanded IndexedGenericOp. The arguments for the
/// induction variables of the original operation need to be recovered by
/// linearizing the arguments of the corresponding dimensions of the expanded
/// op. For now it is assumed that the shapes of the expanded op needed for
/// linearization are static.
static void buildExpandedIndexedGenericOpRegion(
PatternRewriter &rewriter, Location loc, Region &originalOpRegion,
Region &fusedOpRegion, const ExpansionInfo &expansionInfo) {
assert(fusedOpRegion.empty() && "expected fused op to have empty region");
// Create an entry block in the fused region with same number of arguments
// as the fused op
Block *fusedEntryBlock = new Block;
fusedOpRegion.push_back(fusedEntryBlock);
rewriter.cloneRegionBefore(originalOpRegion, fusedOpRegion,
fusedOpRegion.end());

// Merge the entry block of the fused op with the cloned blocks. For this
// compute the value for arguments of the region in the original operation
// in terms of the arguments of the fused op. Since the original operation
// is expanded, the expanded dimensions need to be folded back to get the
// replacement value for the arguments corresponding to iteration index.
// For now this expects that all the loop ranges are constants, which is
// true if the shapes are all static. This has already been checked in the
// precondition.
using namespace edsc::op;
using namespace edsc::intrinsics;
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<Value, 4> argReplacements(originalOpRegion.getNumArguments());
rewriter.setInsertionPointToStart(fusedEntryBlock);
edsc::ScopedContext scopedContext(rewriter, loc);
IndexType indexType = rewriter.getIndexType();
for (auto i : llvm::seq<unsigned>(0, expansionInfo.getOrigOpNumDims())) {
Value linearizedIndex = fusedEntryBlock->addArgument(indexType);
ArrayRef<int64_t> expandedDimsShape =
expansionInfo.getExpandedShapeOfDim(i).drop_front();
for (unsigned shape : expandedDimsShape) {
assert(!ShapedType::isDynamic(shape));
linearizedIndex = linearizedIndex * std_constant_index(shape);
linearizedIndex =
linearizedIndex + fusedEntryBlock->addArgument(indexType);
}
argReplacements[i] = linearizedIndex;
}
for (auto i : llvm::seq<unsigned>(expansionInfo.getOrigOpNumDims(),
argReplacements.size())) {
argReplacements[i] =
fusedEntryBlock->addArgument(originalOpRegion.getArgument(i).getType());
}
rewriter.mergeBlocks(fusedEntryBlock->getNextNode(), fusedEntryBlock,
argReplacements);
}

/// Update the body of an expanded linalg operation having index semantics. The		/// Update the body of an expanded linalg operation having index semantics. The
/// indices of the original operation need to be recovered by linearizing the		/// indices of the original operation need to be recovered by linearizing the
/// indices of the corresponding dimensions of the expanded operation. For now it		/// indices of the corresponding dimensions of the expanded operation. For now it
/// is assumed that the shapes of the expanded operation needed for		/// is assumed that the shapes of the expanded operation needed for
/// linearization are static.		/// linearization are static.
static void updateExpandedIndexOpRegion(PatternRewriter &rewriter, Location loc,		static void updateExpandedGenericOpRegion(PatternRewriter &rewriter,
Region &fusedRegion,		Location loc, Region &fusedRegion,
const ExpansionInfo &expansionInfo) {		const ExpansionInfo &expansionInfo) {
// Replace the original indices by the linearization of the expanded indices.		// Replace the original indices by the linearization of the expanded indices.
for (IndexOp indexOp :		for (IndexOp indexOp :
llvm::make_early_inc_range(fusedRegion.front().getOps<IndexOp>())) {		llvm::make_early_inc_range(fusedRegion.front().getOps<IndexOp>())) {
ArrayRef<int64_t> expandedDims =		ArrayRef<int64_t> expandedDims =
expansionInfo.getExpandedDims(indexOp.dim());		expansionInfo.getExpandedDims(indexOp.dim());
assert(!expandedDims.empty() && "expected valid expansion info");		assert(!expandedDims.empty() && "expected valid expansion info");

// Skip index operations that are not affected by the expansion.		// Skip index operations that are not affected by the expansion.
Show All 19 Lines	for (auto it : llvm::zip(expandedDimsShape, expandedIndices)) {
newIndex = rewriter.create<AffineApplyOp>(		newIndex = rewriter.create<AffineApplyOp>(
indexOp.getLoc(), idx + acc * std::get<0>(it),		indexOp.getLoc(), idx + acc * std::get<0>(it),
ValueRange{std::get<1>(it), newIndex});		ValueRange{std::get<1>(it), newIndex});
}		}
rewriter.replaceOp(indexOp, newIndex);		rewriter.replaceOp(indexOp, newIndex);
}		}
}		}

/// Implements the fusion of a tensor_reshape op and a generic/indexed_generic		/// Implements the fusion of a tensor_reshape op and a generic op as explained
/// op as explained in `isFusableWithReshapeByExpansion`. Assumes that those		/// in `isFusableWithReshapeByExpansion`. Assumes that those conditions have
/// conditions have been satisfied.		/// been satisfied.
static Optional<SmallVector<Value, 1>>		static Optional<SmallVector<Value>>
fuseWithReshapeByExpansion(LinalgOp linalgOp, TensorReshapeOp reshapeOp,		fuseWithReshapeByExpansion(GenericOp genericOp, TensorReshapeOp reshapeOp,
unsigned fusedTensorIndex,		unsigned fusedTensorIndex,
PatternRewriter &rewriter) {		PatternRewriter &rewriter) {
assert(isFusableWithReshapeByDimExpansion(linalgOp, fusedTensorIndex) &&		assert(isFusableWithReshapeByDimExpansion(genericOp, fusedTensorIndex) &&
"preconditions for fuse operation failed");		"preconditions for fuse operation failed");
// Check if reshape is expanding or collapsing.		// Check if reshape is expanding or collapsing.
bool isExpanding =		bool isExpanding =
reshapeOp.getSrcType().getRank() < reshapeOp.getResultType().getRank();		reshapeOp.getSrcType().getRank() < reshapeOp.getResultType().getRank();
RankedTensorType expandedType =		RankedTensorType expandedType =
isExpanding ? reshapeOp.getResultType() : reshapeOp.getSrcType();		isExpanding ? reshapeOp.getResultType() : reshapeOp.getSrcType();
bool hasIndexSemantics = linalgOp.hasIndexSemantics() \|\|
isa<IndexedGenericOp>(linalgOp.getOperation());

ExpansionInfo expansionInfo;		ExpansionInfo expansionInfo;
if (failed(expansionInfo.compute(linalgOp, fusedTensorIndex,		if (failed(expansionInfo.compute(genericOp, fusedTensorIndex,
reshapeOp.getReassociationMaps(),		reshapeOp.getReassociationMaps(),
expandedType.getShape())))		expandedType.getShape(), rewriter)))
return llvm::None;		return llvm::None;

if (hasIndexSemantics &&		if (failed(isGenericOpExpandable(genericOp, expansionInfo, rewriter)))
failed(isIndexedOpExpandable(linalgOp, expansionInfo)))
return llvm::None;		return llvm::None;

SmallVector<AffineMap, 4> expandedOpIndexingMaps = llvm::to_vector<4>(		SmallVector<AffineMap, 4> expandedOpIndexingMaps = llvm::to_vector<4>(
llvm::map_range(linalgOp.getIndexingMaps(), [&](AffineMap m) {		llvm::map_range(genericOp.getIndexingMaps(), [&](AffineMap m) {
return getIndexingMapInExpandedOp(rewriter, m, expansionInfo);		return getIndexingMapInExpandedOp(rewriter, m, expansionInfo);
}));		}));

SmallVector<Value, 4> expandedOpOperands;		SmallVector<Value> expandedOpOperands;
for (auto operand : llvm::enumerate(linalgOp.getInputs())) {		for (auto operand : llvm::enumerate(genericOp.getInputs())) {
if (operand.index() == fusedTensorIndex) {		if (operand.index() == fusedTensorIndex) {
expandedOpOperands.push_back(reshapeOp.src());		expandedOpOperands.push_back(reshapeOp.src());
continue;		continue;
}		}
AffineMap indexingMap = linalgOp.getInputIndexingMap(operand.index());		AffineMap indexingMap = genericOp.getInputIndexingMap(operand.index());
RankedTensorType expandedOperandType =		RankedTensorType expandedOperandType =
getExpandedType(operand.value().getType().cast<RankedTensorType>(),		getExpandedType(operand.value().getType().cast<RankedTensorType>(),
indexingMap, expansionInfo);		indexingMap, expansionInfo);
if (expandedOperandType != operand.value().getType()) {		if (expandedOperandType != operand.value().getType()) {
// Reshape the operand to get the right type.		// Reshape the operand to get the right type.
SmallVector<ReassociationIndices, 4> reassociation =		SmallVector<ReassociationIndices> reassociation =
getReassociationForExpansion(indexingMap, expansionInfo);		getReassociationForExpansion(indexingMap, expansionInfo);
expandedOpOperands.push_back(rewriter.create<TensorReshapeOp>(		expandedOpOperands.push_back(rewriter.create<TensorReshapeOp>(
linalgOp.getLoc(), expandedOperandType, operand.value(),		genericOp.getLoc(), expandedOperandType, operand.value(),
reassociation));		reassociation));
continue;		continue;
}		}
expandedOpOperands.push_back(operand.value());		expandedOpOperands.push_back(operand.value());
}		}

Location loc = linalgOp.getLoc();		Location loc = genericOp.getLoc();
SmallVector<Value, 1> outputs;		SmallVector<Value> outputs;
for (auto result : llvm::enumerate(linalgOp.getOutputs())) {		for (auto result : llvm::enumerate(genericOp.getOutputs())) {
AffineMap indexingMap = linalgOp.getOutputIndexingMap(result.index());		AffineMap indexingMap = genericOp.getOutputIndexingMap(result.index());
RankedTensorType expandedOutputType =		RankedTensorType expandedOutputType =
getExpandedType(result.value().getType().cast<RankedTensorType>(),		getExpandedType(result.value().getType().cast<RankedTensorType>(),
indexingMap, expansionInfo);		indexingMap, expansionInfo);
if (expandedOutputType != result.value().getType()) {		if (expandedOutputType != result.value().getType()) {
SmallVector<ReassociationIndices, 4> reassociation =		SmallVector<ReassociationIndices> reassociation =
getReassociationForExpansion(indexingMap, expansionInfo);		getReassociationForExpansion(indexingMap, expansionInfo);
outputs.push_back(rewriter.create<TensorReshapeOp>(		outputs.push_back(rewriter.create<TensorReshapeOp>(
linalgOp.getLoc(), expandedOutputType, result.value(),		genericOp.getLoc(), expandedOutputType, result.value(),
reassociation));		reassociation));
}		}
}		}

// The iterator types of the expanded op are all parallel.		// The iterator types of the expanded op are all parallel.
SmallVector<StringRef, 4> iteratorTypes(expansionInfo.getExpandedOpNumDims(),		SmallVector<StringRef> iteratorTypes(expansionInfo.getExpandedOpNumDims(),
getParallelIteratorTypeName());		getParallelIteratorTypeName());

TypeRange resultTypes = ValueRange(outputs).getTypes();		TypeRange resultTypes = ValueRange(outputs).getTypes();
LinalgOp fusedOp = createLinalgOpOfSameType(		auto fusedOp =
linalgOp, rewriter, linalgOp.getLoc(), resultTypes,		rewriter.create<GenericOp>(genericOp.getLoc(), resultTypes,
/inputs=/expandedOpOperands, outputs, expandedOpIndexingMaps,		/inputs=/expandedOpOperands, outputs,
iteratorTypes);		expandedOpIndexingMaps, iteratorTypes);
Region &fusedRegion = fusedOp->getRegion(0);		Region &fusedRegion = fusedOp->getRegion(0);
Region &originalRegion = linalgOp->getRegion(0);		Region &originalRegion = genericOp->getRegion(0);
		rewriter.cloneRegionBefore(originalRegion, fusedRegion, fusedRegion.begin());
if (isa<GenericOp>(linalgOp.getOperation())) {
rewriter.cloneRegionBefore(originalRegion, fusedRegion,
fusedRegion.begin());
} else {
assert(isa<IndexedGenericOp>(linalgOp.getOperation()));
buildExpandedIndexedGenericOpRegion(rewriter, loc, originalRegion,
fusedRegion, expansionInfo);
}

// Update the index accesses after the expansion.		// Update the index accesses after the expansion.
if (linalgOp.hasIndexSemantics())		updateExpandedGenericOpRegion(rewriter, loc, fusedRegion, expansionInfo);
updateExpandedIndexOpRegion(rewriter, loc, fusedRegion, expansionInfo);

// Reshape the result values to their original shape if this is a collapsing		// Reshape the result values to their original shape if this is a collapsing
// reshape folded into its consumer.		// reshape folded into its consumer.
SmallVector<Value, 1> resultVals;		SmallVector<Value> resultVals;
for (auto result : llvm::enumerate(linalgOp->getResults())) {		for (auto result : llvm::enumerate(genericOp->getResults())) {
if (!isExpanding &&		if (!isExpanding &&
resultTypes[result.index()] != result.value().getType()) {		resultTypes[result.index()] != result.value().getType()) {
SmallVector<ReassociationIndices, 4> reassociation =		SmallVector<ReassociationIndices> reassociation =
getReassociationForExpansion(		getReassociationForExpansion(
linalgOp.getOutputIndexingMap(result.index()), expansionInfo);		genericOp.getOutputIndexingMap(result.index()), expansionInfo);
resultVals.push_back(rewriter.create<TensorReshapeOp>(		resultVals.push_back(rewriter.create<TensorReshapeOp>(
linalgOp.getLoc(), result.value().getType(),		genericOp.getLoc(), result.value().getType(),
fusedOp->getResult(result.index()), reassociation));		fusedOp->getResult(result.index()), reassociation));
} else {		} else {
resultVals.push_back(fusedOp->getResult(result.index()));		resultVals.push_back(fusedOp->getResult(result.index()));
}		}
}		}
// Assuming a single result.		// Assuming a single result.
return resultVals;		return resultVals;
}		}
Show All 19 Lines
///		///
/// can be folded into		/// can be folded into
///		///
/// #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1 * 4 + d2, d3)>		/// #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1 * 4 + d2, d3)>
/// #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>		/// #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
/// %0 = linalg.generic { indexing_maps = [#map0, #map1, #map1] ... }		/// %0 = linalg.generic { indexing_maps = [#map0, #map1, #map1] ... }
/// ins(%arg0, %arg1 : tensor<?x?x?xf32>, tensor<?x?x4x?xf32>) ...		/// ins(%arg0, %arg1 : tensor<?x?x?xf32>, tensor<?x?x4x?xf32>) ...
/// -> tensor<?x?x4x?xf32>		/// -> tensor<?x?x4x?xf32>
template <typename LinalgOpTy, bool foldUnitDimReshapesOnly>		template <bool foldUnitDimReshapesOnly>
struct FoldProducerReshapeOpByLinearization		struct FoldProducerReshapeOpByLinearization
: public OpRewritePattern<LinalgOpTy> {		: public OpRewritePattern<GenericOp> {
using OpRewritePattern<LinalgOpTy>::OpRewritePattern;		using OpRewritePattern<GenericOp>::OpRewritePattern;

LogicalResult matchAndRewrite(LinalgOpTy op,		LogicalResult matchAndRewrite(GenericOp genericOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
if (!op.hasTensorSemantics())		if (!genericOp.hasTensorSemantics())
return failure();		return failure();
LinalgOp linalgOp = cast<LinalgOp>(op.getOperation());		for (auto operand : llvm::enumerate(genericOp.getInputs())) {
for (auto operand : llvm::enumerate(linalgOp.getInputs())) {
TensorReshapeOp reshapeOp =		TensorReshapeOp reshapeOp =
operand.value().getDefiningOp<TensorReshapeOp>();		operand.value().getDefiningOp<TensorReshapeOp>();
if (!reshapeOp \|\|		if (!reshapeOp \|\|
!isTensorReshapeOpFoldableByLinearization(		!isTensorReshapeOpFoldableByLinearization(
reshapeOp, linalgOp.getInputIndexingMap(operand.index()),		reshapeOp, genericOp.getInputIndexingMap(operand.index()),
/asProducer =/true) \|\|		/asProducer =/true) \|\|
(foldUnitDimReshapesOnly &&		(foldUnitDimReshapesOnly &&
!isUnitDimExpansionOnly(reshapeOp.getResultType().getShape(),		!isUnitDimExpansionOnly(reshapeOp.getResultType().getShape(),
reshapeOp.getReassociationMaps())))		reshapeOp.getReassociationMaps())))
continue;		continue;

// Compute the fused operands list,		// Compute the fused operands list,
SmallVector<Value, 2> fusedOperands(linalgOp.getInputs());		SmallVector<Value> fusedOperands(genericOp.getInputs());
fusedOperands[operand.index()] = reshapeOp.src();		fusedOperands[operand.index()] = reshapeOp.src();
fusedOperands.append(linalgOp.getOutputs().begin(),		fusedOperands.append(genericOp.getOutputs().begin(),
linalgOp.getOutputs().end());		genericOp.getOutputs().end());

// Compute indexing_maps for the fused operation. The indexing_maps for		// Compute indexing_maps for the fused operation. The indexing_maps for
// the operands of the consumers that aren't fused are the same.		// the operands of the consumers that aren't fused are the same.
SmallVector<AffineMap, 4> fusedIndexMaps = llvm::to_vector<4>(		SmallVector<AffineMap, 4> fusedIndexMaps = llvm::to_vector<4>(
op.indexing_maps().template getAsValueRange<AffineMapAttr>());		genericOp.indexing_maps().template getAsValueRange<AffineMapAttr>());

// Accepted consumer maps are either identity or permutation.		// Accepted consumer maps are either identity or permutation.
auto invMap = inversePermutation(fusedIndexMaps[operand.index()]);		auto invMap = inversePermutation(fusedIndexMaps[operand.index()]);

// Compute the indexing map to use for the result of the producer.		// Compute the indexing map to use for the result of the producer.
AffineMap modifiedMap =		AffineMap modifiedMap =
linearizeCollapsedDims(invMap, reshapeOp.getResultType().getShape(),		linearizeCollapsedDims(invMap, reshapeOp.getResultType().getShape(),
reshapeOp.getReassociationMaps());		reshapeOp.getReassociationMaps());
for (AffineExpr expr : modifiedMap.getResults()) {		for (AffineExpr expr : modifiedMap.getResults()) {
if (!expr.isPureAffine())		if (!expr.isPureAffine())
return failure();		return failure();
}		}
fusedIndexMaps[operand.index()] = modifiedMap;		fusedIndexMaps[operand.index()] = modifiedMap;

// Further check that the resulting index maps can be fused and		// Further check that the resulting index maps can be fused and
// inverted. Without this the resultant op is not legal.		// inverted. Without this the resultant op is not legal.
if (!inversePermutation(concatAffineMaps(fusedIndexMaps))) {		if (!inversePermutation(concatAffineMaps(fusedIndexMaps))) {
return rewriter.notifyMatchFailure(		return rewriter.notifyMatchFailure(
op, "fused op loop bound computation failed");		genericOp, "fused op loop bound computation failed");
}		}

rewriter.startRootUpdate(op);		rewriter.startRootUpdate(genericOp);
op->setOperands(fusedOperands);		genericOp->setOperands(fusedOperands);
op.indexing_mapsAttr(rewriter.getAffineMapArrayAttr(fusedIndexMaps));		genericOp.indexing_mapsAttr(
rewriter.finalizeRootUpdate(op);		rewriter.getAffineMapArrayAttr(fusedIndexMaps));
		rewriter.finalizeRootUpdate(genericOp);
return success();		return success();
}		}
return failure();		return failure();
}		}
};		};

static SmallVector<ReassociationIndices>		static SmallVector<ReassociationIndices>
getReassociationIndices(ArrayRef<AffineMap> maps) {		getReassociationIndices(ArrayRef<AffineMap> maps) {
SmallVector<ReassociationIndices> reassociation;		SmallVector<ReassociationIndices> reassociation;
for (AffineMap map : maps) {		for (AffineMap map : maps) {
ReassociationIndices indices;		ReassociationIndices indices;
for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {		for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
unsigned pos = map.getResult(i).cast<AffineDimExpr>().getPosition();		unsigned pos = map.getResult(i).cast<AffineDimExpr>().getPosition();
indices.push_back(pos);		indices.push_back(pos);
}		}
reassociation.push_back(indices);		reassociation.push_back(indices);
}		}
return reassociation;		return reassociation;
}		}

/// Pattern to move rank reducing reshape after an elementwise linalg generic		/// Pattern to move rank reducing reshape after an elementwise linalg generic
/// op. This is useful to expose more fusion opportunities between named ops and		/// op. This is useful to expose more fusion opportunities between named ops and
/// generic op. This can only be done if there is no broadcast or permutation		/// generic ops. This can only be done if there is no broadcast or permutation
/// within the dimensions we need to merge.		/// within the dimensions we need to merge.
///		///
/// For example,		/// For example,
///		///
/// %0 = linalg.tensor_reshape %A [		/// %0 = linalg.tensor_reshape %A [
/// affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>]		/// affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>]
/// : tensor<12544x16xf32> into tensor<112x112x16xf32>		/// : tensor<12544x16xf32> into tensor<112x112x16xf32>
/// %2 = linalg.generic {indexing_maps = [		/// %2 = linalg.generic {indexing_maps = [
Show All 10 Lines
/// affine_map<(d0, d1) -> (d1)>,		/// affine_map<(d0, d1) -> (d1)>,
/// affine_map<(d0, d1) -> (d0, d1)>],		/// affine_map<(d0, d1) -> (d0, d1)>],
/// iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1		/// iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1
/// : tensor<12544x16xf32>, tensor<16xf32>) outs(%1 : tensor<12544x16xf32>) {		/// : tensor<12544x16xf32>, tensor<16xf32>) outs(%1 : tensor<12544x16xf32>) {
/// } -> tensor<12544x16xf32>		/// } -> tensor<12544x16xf32>
/// %3 = linalg.tensor_reshape %2 [		/// %3 = linalg.tensor_reshape %2 [
/// #affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>]		/// #affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>]
/// : tensor<12544x16xf32> into tensor<112x112x16xf32>		/// : tensor<12544x16xf32> into tensor<112x112x16xf32>
template <typename GenericOpTy>		struct PushExpandingReshape : public OpRewritePattern<GenericOp> {
struct PushExpandingReshape : public OpRewritePattern<GenericOpTy> {		using OpRewritePattern<GenericOp>::OpRewritePattern;
using OpRewritePattern<GenericOpTy>::OpRewritePattern;

LogicalResult matchAndRewrite(GenericOpTy op,		LogicalResult matchAndRewrite(GenericOp genericOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
// Only apply to elementwise linalg on tensor.		// Only apply to elementwise linalg on tensor.
if (!op.hasTensorSemantics() \|\|		if (!genericOp.hasTensorSemantics() \|\|
op.getNumParallelLoops() != op.getNumLoops())		genericOp.getNumParallelLoops() != genericOp.getNumLoops())
return failure();		return failure();
// Only support identity output maps. It could be extended to permutations if		// Only support identity output maps. It could be extended to permutations if
// needed.		// needed.
if (llvm::any_of(op.getOutputIndexingMaps(),		if (llvm::any_of(genericOp.getOutputIndexingMaps(),
[](AffineMap map) { return !map.isIdentity(); }))		[](AffineMap map) { return !map.isIdentity(); }))
return failure();		return failure();
int64_t destRank = op.getNumParallelLoops();		int64_t destRank = genericOp.getNumParallelLoops();
SmallVector<Value, 4> newOperands = llvm::to_vector<4>(op.getInputs());		SmallVector<Value, 4> newOperands =
		llvm::to_vector<4>(genericOp.getInputs());
TensorReshapeOp reshapeFound;		TensorReshapeOp reshapeFound;
// 1. Look for tensor_reshape operands and save the dimensions		// 1. Look for tensor_reshape operands and save the dimensions
// merged.		// merged.
for (auto operand : llvm::enumerate(op.getInputs())) {		for (auto operand : llvm::enumerate(genericOp.getInputs())) {
TensorReshapeOp reshapeOp =		TensorReshapeOp reshapeOp =
operand.value().template getDefiningOp<TensorReshapeOp>();		operand.value().template getDefiningOp<TensorReshapeOp>();
if (!reshapeOp \|\| reshapeOp.getSrcType().getRank() >		if (!reshapeOp \|\| reshapeOp.getSrcType().getRank() >
reshapeOp.getResultType().getRank()) {		reshapeOp.getResultType().getRank()) {
continue;		continue;
}		}
// TODO: We could support non-identity map as long as the merged		// TODO: We could support non-identity map as long as the merged
// dimensions are still contiguous.		// dimensions are still contiguous.
if (!op.getIndexingMaps()[operand.index()].isIdentity())		if (!genericOp.getIndexingMaps()[operand.index()].isIdentity())
continue;		continue;
if (reshapeFound) {		if (reshapeFound) {
// Only support a second reshape op if it has the same reassociate maps.		// Only support a second reshape op if it has the same reassociate maps.
if (reshapeFound.getReassociationMaps() ==		if (reshapeFound.getReassociationMaps() ==
reshapeOp.getReassociationMaps())		reshapeOp.getReassociationMaps())
newOperands[operand.index()] = reshapeOp.src();		newOperands[operand.index()] = reshapeOp.src();
continue;		continue;
}		}
reshapeFound = reshapeOp;		reshapeFound = reshapeOp;
newOperands[operand.index()] = reshapeOp.src();		newOperands[operand.index()] = reshapeOp.src();
}		}
if (!reshapeFound)		if (!reshapeFound)
return failure();		return failure();

// Calculate the reassociation indices and rassociated reverse map.		// Calculate the reassociation indices and rassociated reverse map.
SmallVector<ReassociationIndices> reassociation =		SmallVector<ReassociationIndices> reassociation =
getReassociationIndices(reshapeFound.getReassociationMaps());		getReassociationIndices(reshapeFound.getReassociationMaps());
SmallVector<unsigned, 4> remap(destRank);		SmallVector<unsigned> remap(destRank);
for (auto &indices : llvm::enumerate(reassociation)) {		for (auto &indices : llvm::enumerate(reassociation)) {
for (int64_t index : indices.value()) {		for (int64_t index : indices.value()) {
remap[index] = indices.index();		remap[index] = indices.index();
}		}
}		}
// 2. Verify that we can merge the dimensions in the linalg and that we		// 2. Verify that we can merge the dimensions in the linalg and that we
// don't need to create new reshapes operands. Inserting new reshape		// don't need to create new reshapes operands. Inserting new reshape
// operands would defeat the purpose of the transformation.		// operands would defeat the purpose of the transformation.
for (auto operand : llvm::enumerate(op.getInputs())) {		for (auto operand : llvm::enumerate(genericOp.getInputs())) {
if (operand.value() == newOperands[operand.index()]) {		if (operand.value() == newOperands[operand.index()]) {
AffineMap map = op.getIndexingMaps()[operand.index()];		AffineMap map = genericOp.getIndexingMaps()[operand.index()];
for (unsigned i : llvm::seq(unsigned(0), map.getNumResults())) {		for (unsigned i : llvm::seq(unsigned(0), map.getNumResults())) {
if (reassociation[remap[map.getDimPosition(i)]].size() > 1)		if (reassociation[remap[map.getDimPosition(i)]].size() > 1)
return failure();		return failure();
}		}
}		}
}		}

// 3. Calculate the affine map remapping and the reassociation to apply to		// 3. Calculate the affine map remapping and the reassociation to apply to
// output tensors.		// output tensors.
SmallVector<AffineMap, 4> newMaps;		SmallVector<AffineMap> newMaps;
unsigned newRank = reassociation.size();		unsigned newRank = reassociation.size();
for (auto map : op.getIndexingMaps()) {		for (auto map : genericOp.getIndexingMaps()) {
SmallVector<AffineExpr> newExprs;		SmallVector<AffineExpr> newExprs;
for (auto expr : map.getResults()) {		for (auto expr : map.getResults()) {
unsigned position = expr.template cast<AffineDimExpr>().getPosition();		unsigned position = expr.template cast<AffineDimExpr>().getPosition();
// Skip dimension merged except for the last of the group.		// Skip dimension merged except for the last of the group.
if (reassociation[remap[position]].back() == position) {		if (reassociation[remap[position]].back() == position) {
newExprs.push_back(		newExprs.push_back(
getAffineDimExpr(remap[position], op.getContext()));		getAffineDimExpr(remap[position], genericOp.getContext()));
}		}
}		}
newMaps.push_back(AffineMap::get(newRank, 0, newExprs, op.getContext()));		newMaps.push_back(
		AffineMap::get(newRank, 0, newExprs, genericOp.getContext()));
}		}

// 4. Reshape the output tensors.		// 4. Reshape the output tensors.
SmallVector<Value> newOutputs;		SmallVector<Value> newOutputs;
SmallVector<Type> newOutputTypes;		SmallVector<Type> newOutputTypes;
for (auto output : op.outputs()) {		for (auto output : genericOp.outputs()) {
auto newOutputType = RankedTensorType::get(		auto newOutputType = RankedTensorType::get(
reshapeFound.getSrcType().getShape(),		reshapeFound.getSrcType().getShape(),
output.getType().template cast<RankedTensorType>().getElementType());		output.getType().template cast<RankedTensorType>().getElementType());
Value newOutput = rewriter.create<TensorReshapeOp>(		Value newOutput = rewriter.create<TensorReshapeOp>(
op->getLoc(), newOutputType, output, reassociation);		genericOp->getLoc(), newOutputType, output, reassociation);
newOutputTypes.push_back(newOutputType);		newOutputTypes.push_back(newOutputType);
newOutputs.push_back(newOutput);		newOutputs.push_back(newOutput);
}		}
// 5. Create a new generic op with lowerer rank.		// 5. Create a new generic op with lowerer rank.
SmallVector<StringRef, 4> iteratorTypes(newRank,		SmallVector<StringRef> iteratorTypes(newRank,
getParallelIteratorTypeName());		getParallelIteratorTypeName());
auto newOp =		auto newOp = rewriter.create<GenericOp>(genericOp->getLoc(), newOutputTypes,
rewriter.create<GenericOpTy>(op->getLoc(), newOutputTypes, newOperands,		newOperands, newOutputs, newMaps,
newOutputs, newMaps, iteratorTypes);		iteratorTypes);
rewriter.inlineRegionBefore(op.region(), newOp.region(),		rewriter.inlineRegionBefore(genericOp.region(), newOp.region(),
newOp.region().begin());		newOp.region().begin());
// 6. Reshape the so that the type matches the uses.		// 6. Reshape the so that the type matches the uses.
SmallVector<Value> newResults;		SmallVector<Value> newResults;
for (auto result : llvm::enumerate(newOp->getResults())) {		for (auto result : llvm::enumerate(newOp->getResults())) {
newResults.push_back(rewriter.create<TensorReshapeOp>(		newResults.push_back(rewriter.create<TensorReshapeOp>(
op->getLoc(), op.getOutputTensorTypes()[result.index()],		genericOp->getLoc(), genericOp.getOutputTensorTypes()[result.index()],
result.value(), reassociation));		result.value(), reassociation));
}		}
rewriter.replaceOp(op, newResults);		rewriter.replaceOp(genericOp, newResults);
return success();		return success();
}		}
};		};

/// Pattern to fuse a tensor_reshape op with its consumer		/// Pattern to fuse a tensor_reshape op with its consumer generic op, when the
/// generic/indexed_generic op, when the reshape op is collapsing		/// reshape op is collapsing dimensions. The dimensionality of the loop in the
/// dimensions. The dimensionality of the loop in the consumer is expanded.		/// consumer is expanded.
template <typename GenericOpTy>
class FoldWithProducerReshapeOpByExpansion		class FoldWithProducerReshapeOpByExpansion
: public OpRewritePattern<GenericOpTy> {		: public OpRewritePattern<GenericOp> {
public:		public:
FoldWithProducerReshapeOpByExpansion(		FoldWithProducerReshapeOpByExpansion(
MLIRContext *context, ControlElementwiseOpsFusionFn foldReshapes,		MLIRContext *context, ControlElementwiseOpsFusionFn foldReshapes,
PatternBenefit benefit = 1)		PatternBenefit benefit = 1)
: OpRewritePattern<GenericOpTy>(context, benefit),		: OpRewritePattern<GenericOp>(context, benefit),
controlFoldingReshapes(foldReshapes) {}		controlFoldingReshapes(foldReshapes) {}

LogicalResult matchAndRewrite(GenericOpTy genericOp,		LogicalResult matchAndRewrite(GenericOp genericOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
LinalgOp linalgOp = cast<LinalgOp>(genericOp.getOperation());		for (auto operand : llvm::enumerate(genericOp.getInputs())) {
for (auto operand : llvm::enumerate(linalgOp.getInputs())) {
TensorReshapeOp reshapeOp =		TensorReshapeOp reshapeOp =
operand.value().getDefiningOp<TensorReshapeOp>();		operand.value().getDefiningOp<TensorReshapeOp>();
if (!reshapeOp)		if (!reshapeOp)
continue;		continue;
// Fold only if		// Fold only if
// - The tensor reshape op is folding.		// - The tensor reshape op is folding.
// - All constraints of fusing with reshape by expansion are met.		// - All constraints of fusing with reshape by expansion are met.
if (reshapeOp.getSrcType().getRank() <		if (reshapeOp.getSrcType().getRank() <
reshapeOp.getResultType().getRank() \|\|		reshapeOp.getResultType().getRank() \|\|
!isFusableWithReshapeByDimExpansion(linalgOp, operand.index()) \|\|		!isFusableWithReshapeByDimExpansion(genericOp, operand.index()) \|\|
(!controlFoldingReshapes(		(!controlFoldingReshapes(
reshapeOp->getResult(0),		reshapeOp->getResult(0),
linalgOp.getInputOpOperands()[operand.index()])))		genericOp.getInputOpOperands()[operand.index()])))
continue;		continue;

Optional<SmallVector<Value, 1>> replacementValues =		Optional<SmallVector<Value>> replacementValues =
fuseWithReshapeByExpansion(linalgOp, reshapeOp, operand.index(),		fuseWithReshapeByExpansion(genericOp, reshapeOp, operand.index(),
rewriter);		rewriter);
if (!replacementValues)		if (!replacementValues)
return failure();		return failure();
rewriter.replaceOp(genericOp, replacementValues.getValue());		rewriter.replaceOp(genericOp, replacementValues.getValue());
return success();		return success();
}		}
return failure();		return failure();
}		}

private:		private:
ControlElementwiseOpsFusionFn controlFoldingReshapes;		ControlElementwiseOpsFusionFn controlFoldingReshapes;
};		};

/// Pattern to fold tensor_reshape op with its producer. The corresponding index		/// Pattern to fold tensor_reshape op with its producer. The corresponding index
/// map in the consumer needs to be modified to linearize the folded dimension.		/// map in the consumer needs to be modified to linearize the folded dimension.
template <bool foldUnitDimReshapesOnly>		template <bool foldUnitDimReshapesOnly>
struct FoldConsumerReshapeOpByLinearization		struct FoldConsumerReshapeOpByLinearization
: public OpRewritePattern<TensorReshapeOp> {		: public OpRewritePattern<TensorReshapeOp> {
using OpRewritePattern<TensorReshapeOp>::OpRewritePattern;		using OpRewritePattern<TensorReshapeOp>::OpRewritePattern;

LogicalResult matchAndRewrite(TensorReshapeOp reshapeOp,		LogicalResult matchAndRewrite(TensorReshapeOp reshapeOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
LinalgOp producer = reshapeOp.src().getDefiningOp<LinalgOp>();		GenericOp producer = reshapeOp.src().getDefiningOp<GenericOp>();
if (!producer \|\|		if (!producer \|\| !producer.hasTensorSemantics() \|\|
!isa<GenericOp, IndexedGenericOp>(producer.getOperation()) \|\|		producer.getNumOutputs() != 1 \|\|
!producer.hasTensorSemantics() \|\| producer.getNumOutputs() != 1 \|\|
!isTensorReshapeOpFoldableByLinearization(		!isTensorReshapeOpFoldableByLinearization(
reshapeOp, producer.getOutputIndexingMap(0),		reshapeOp, producer.getOutputIndexingMap(0),
/asProducer =/false) \|\|		/asProducer =/false) \|\|
(foldUnitDimReshapesOnly &&		(foldUnitDimReshapesOnly &&
!isUnitDimExpansionOnly(reshapeOp.getSrcType().getShape(),		!isUnitDimExpansionOnly(reshapeOp.getSrcType().getShape(),
reshapeOp.getReassociationMaps())))		reshapeOp.getReassociationMaps())))
return failure();		return failure();
// The indexing_maps for the operands of the fused operation are same as		// The indexing_maps for the operands of the fused operation are same as
Show All 20 Lines	LogicalResult matchAndRewrite(TensorReshapeOp reshapeOp,
if (!inversePermutation(concatAffineMaps(fusedIndexMaps))) {		if (!inversePermutation(concatAffineMaps(fusedIndexMaps))) {
return rewriter.notifyMatchFailure(		return rewriter.notifyMatchFailure(
producer, "fused op loop bound computation failed");		producer, "fused op loop bound computation failed");
}		}

Location loc = producer.getLoc();		Location loc = producer.getLoc();
Value output = rewriter.create<TensorReshapeOp>(		Value output = rewriter.create<TensorReshapeOp>(
loc, producer.getOutputs()[0], reshapeOp.getReassociationExprs());		loc, producer.getOutputs()[0], reshapeOp.getReassociationExprs());
LinalgOp fusedOp = createLinalgOpOfSameType(		auto fusedOp = rewriter.create<GenericOp>(
producer, rewriter, loc, reshapeOp.getResultType(),		loc, reshapeOp.getResultType(),
/inputs=/producer.getInputs(),		/inputs=/producer.getInputs(),
// TODO: handle outputs.		// TODO: handle outputs.
/outputs=/output, rewriter.getAffineMapArrayAttr(fusedIndexMaps),		/outputs=/output, rewriter.getAffineMapArrayAttr(fusedIndexMaps),
producer.iterator_types(),		producer.iterator_types(),
/doc=/nullptr,		/doc=/nullptr,
/library_call=/nullptr);		/library_call=/nullptr);
auto &fusedRegion = fusedOp->getRegion(0);		auto &fusedRegion = fusedOp->getRegion(0);
rewriter.cloneRegionBefore(producer->getRegion(0), fusedRegion,		rewriter.cloneRegionBefore(producer->getRegion(0), fusedRegion,
Show All 11 Lines	struct FoldReshapeWithGenericOpByExpansion
using OpRewritePattern<TensorReshapeOp>::OpRewritePattern;		using OpRewritePattern<TensorReshapeOp>::OpRewritePattern;
LogicalResult matchAndRewrite(TensorReshapeOp reshapeOp,		LogicalResult matchAndRewrite(TensorReshapeOp reshapeOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
// Fold only if		// Fold only if
// - The tensor reshape op is a expanding case.		// - The tensor reshape op is a expanding case.
// - All constraints of fusing with reshape by expansion are met.		// - All constraints of fusing with reshape by expansion are met.
if (reshapeOp.getSrcType().getRank() > reshapeOp.getResultType().getRank())		if (reshapeOp.getSrcType().getRank() > reshapeOp.getResultType().getRank())
return failure();		return failure();
LinalgOp producer = reshapeOp.src().getDefiningOp<LinalgOp>();		GenericOp producer = reshapeOp.src().getDefiningOp<GenericOp>();
if (!producer \|\| producer.getNumOutputs() != 1 \|\|		if (!producer \|\| producer.getNumOutputs() != 1 \|\|
!isFusableWithReshapeByDimExpansion(producer,		!isFusableWithReshapeByDimExpansion(producer,
producer.getNumInputs()) \|\|		producer.getNumInputs()) \|\|
isUnitDimExpansionOnly(reshapeOp.getResultType().getShape(),		isUnitDimExpansionOnly(reshapeOp.getResultType().getShape(),
reshapeOp.getReassociationMaps()))		reshapeOp.getReassociationMaps()))
return failure();		return failure();
Optional<SmallVector<Value, 1>> replacementValues =		Optional<SmallVector<Value>> replacementValues = fuseWithReshapeByExpansion(
fuseWithReshapeByExpansion(producer, reshapeOp, producer.getNumInputs(),		producer, reshapeOp, producer.getNumInputs(), rewriter);
rewriter);
if (!replacementValues)		if (!replacementValues)
return failure();		return failure();
rewriter.replaceOp(reshapeOp, replacementValues.getValue());		rewriter.replaceOp(reshapeOp, replacementValues.getValue());
return success();		return success();
}		}
};		};

/// Pattern to fold a GenericOp/IndexedGenericOp with a splat constant.		/// Pattern to fold a generic op with a splat constant.
template <typename LinalgOpTy>		class FoldSplatConstants : public OpRewritePattern<GenericOp> {
class FoldSplatConstants : public OpRewritePattern<LinalgOpTy> {
public:		public:
FoldSplatConstants(MLIRContext *context, ControlElementwiseOpsFusionFn &fun,		FoldSplatConstants(MLIRContext *context, ControlElementwiseOpsFusionFn &fun,
PatternBenefit benefit = 1)		PatternBenefit benefit = 1)
: OpRewritePattern<LinalgOpTy>(context, benefit), controlFn(fun) {}		: OpRewritePattern<GenericOp>(context, benefit), controlFn(fun) {}

LogicalResult matchAndRewrite(LinalgOpTy op,		LogicalResult matchAndRewrite(GenericOp genericOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
if (!op.hasTensorSemantics())		if (!genericOp.hasTensorSemantics())
return failure();		return failure();
LinalgOp linalgOp = cast<LinalgOp>(op.getOperation());		for (auto operand : llvm::enumerate(genericOp.getInputOpOperands())) {
for (auto operand : llvm::enumerate(linalgOp.getInputOpOperands())) {
Operation *def = operand.value().get().getDefiningOp();		Operation *def = operand.value().get().getDefiningOp();
DenseElementsAttr constantAttr;		DenseElementsAttr constantAttr;
if (!def \|\|		if (!def \|\|
!matchPattern(def, m_Constant<DenseElementsAttr>(&constantAttr)) \|\|		!matchPattern(def, m_Constant<DenseElementsAttr>(&constantAttr)) \|\|
!constantAttr.isSplat() \|\|		!constantAttr.isSplat() \|\|
!controlFn(def->getResult(0), operand.value()))		!controlFn(def->getResult(0), operand.value()))
continue;		continue;

// The indexing_maps for the operands of the fused operation are same as		// The indexing_maps for the operands of the fused operation are same as
// those for the operands of the linalgOp without the indexing map at		// those for the operands of the genericOp without the indexing map at
// operand.index()		// operand.index()
SmallVector<AffineMap, 4> fusedIndexMaps = llvm::to_vector<4>(		SmallVector<AffineMap, 4> fusedIndexMaps = llvm::to_vector<4>(
linalgOp.indexing_maps().getAsValueRange<AffineMapAttr>());		genericOp.indexing_maps().getAsValueRange<AffineMapAttr>());
		mravishankar (inline comment; Unsubmitted, Done): Flagging this as a potential `copy` assignment.
fusedIndexMaps.erase(std::next(fusedIndexMaps.begin(), operand.index()));		fusedIndexMaps.erase(std::next(fusedIndexMaps.begin(), operand.index()));

// Check if the operation shapes to loops map is computable.		// Check if the operation shapes to loops map is computable.
if (!inversePermutation(concatAffineMaps(fusedIndexMaps))) {		if (!inversePermutation(concatAffineMaps(fusedIndexMaps))) {
return rewriter.notifyMatchFailure(		return rewriter.notifyMatchFailure(
linalgOp, "fused op loop bound computation failed");		genericOp, "fused op loop bound computation failed");
}		}

// The operands list is same as the linalgOp with the argument for		// The operands list is same as the genericOp with the argument for
// constant index dropped.		// constant index dropped.
SmallVector<Value, 4> fusedOperands(linalgOp.getInputs());		SmallVector<Value> fusedOperands(genericOp.getInputs());
fusedOperands.erase(std::next(fusedOperands.begin(), operand.index()));		fusedOperands.erase(std::next(fusedOperands.begin(), operand.index()));

// Create a constant scalar value from the splat constant.		// Create a constant scalar value from the splat constant.
Value scalarConstant = rewriter.create<ConstantOp>(		Value scalarConstant = rewriter.create<ConstantOp>(
def->getLoc(), constantAttr.getSplatValue());		def->getLoc(), constantAttr.getSplatValue());

LinalgOp fusedOp = createLinalgOpOfSameType(		auto fusedOp = rewriter.create<GenericOp>(
linalgOp, rewriter, rewriter.getUnknownLoc(),		rewriter.getUnknownLoc(), genericOp->getResultTypes(),
linalgOp->getResultTypes(),
/inputs=/fusedOperands,		/inputs=/fusedOperands,
/outputs=/linalgOp.getOutputs(),		/outputs=/genericOp.getOutputs(),
rewriter.getAffineMapArrayAttr(fusedIndexMaps),		rewriter.getAffineMapArrayAttr(fusedIndexMaps),
linalgOp.iterator_types(),		genericOp.iterator_types(),
/doc=/nullptr,		/doc=/nullptr,
/library_call=/nullptr);		/library_call=/nullptr);

// Map the block argument corresponding to the replaced argument with the		// Map the block argument corresponding to the replaced argument with the
// scalar constant.		// scalar constant.
Region &linalgOpRegion = linalgOp->getRegion(0);		Region &region = genericOp->getRegion(0);
Block &entryBlock = *linalgOpRegion.begin();		Block &entryBlock = *region.begin();
		mravishankar (inline comment; Unsubmitted, Done): Looking at this, we need to revisit this one because it seems buggy. We need to check whether all the arguments "after" the dropped operand need to be remapped as well, since we now have a smaller list of arguments.
		gysit (Author, inline comment; Unsubmitted, Done): We can probably delete `entryBlock.getNumArguments() - genericOp.getNumShapedOperands()`, since this is always zero now that we have no index arguments! I will search for more such patterns tomorrow. Regarding the mapping, I am not sure; I believe the mapper will take care of remapping all the remaining arguments, but it is one thing I am never 100% sure about myself...
unsigned argIndex = entryBlock.getNumArguments() -
linalgOp.getNumShapedOperands() + operand.index();
BlockAndValueMapping mapping;		BlockAndValueMapping mapping;
mapping.map(entryBlock.getArgument(argIndex), scalarConstant);		mapping.map(entryBlock.getArgument(operand.index()), scalarConstant);
Region &fusedRegion = fusedOp->getRegion(0);		Region &fusedRegion = fusedOp->getRegion(0);
rewriter.cloneRegionBefore(linalgOpRegion, fusedRegion,		rewriter.cloneRegionBefore(region, fusedRegion, fusedRegion.begin(),
fusedRegion.begin(), mapping);		mapping);
rewriter.replaceOp(linalgOp, fusedOp->getResults());		rewriter.replaceOp(genericOp, fusedOp->getResults());
return success();		return success();
}		}
return failure();		return failure();
}		}

private:		private:
ControlElementwiseOpsFusionFn controlFn;		ControlElementwiseOpsFusionFn controlFn;
};		};
} // namespace		} // namespace

static Optional<SmallVector<Value, 1>>		static Optional<SmallVector<Value>>
fuseElementwiseOps(PatternRewriter &rewriter, OpOperand &consumerOpOperand,		fuseElementwiseOps(PatternRewriter &rewriter, OpOperand &consumerOpOperand,
		GenericOp producer,
const ControlElementwiseOpsFusionFn &controlFn) {		const ControlElementwiseOpsFusionFn &controlFn) {
Operation *producer = consumerOpOperand.get().getDefiningOp();		if (producer->getNumResults() != 1)
if (!producer \|\| producer->getNumResults() != 1)
return llvm::None;		return llvm::None;

// Fuse when consumer is GenericOp or IndexedGenericOp.		return fuseElementwiseOpsImpl(producer, consumerOpOperand, controlFn,
if (!isa<GenericOp, IndexedGenericOp>(consumerOpOperand.getOwner()) \|\|		rewriter);
!isa<GenericOp, IndexedGenericOp>(producer))
return llvm::None;

return fuseElementwiseOpsImpl(cast<LinalgOp>(producer), consumerOpOperand,
controlFn, rewriter);
}		}

bool mlir::linalg::skipUnitDimReshape(const OpResult &producer,		bool mlir::linalg::skipUnitDimReshape(const OpResult &producer,
const OpOperand &consumer) {		const OpOperand &consumer) {
auto reshapeOp = producer.getDefiningOp<linalg::TensorReshapeOp>();		auto reshapeOp = producer.getDefiningOp<linalg::TensorReshapeOp>();
return !isUnitDimExpansionOnly(reshapeOp.getSrcType().getShape(),		return !isUnitDimExpansionOnly(reshapeOp.getSrcType().getShape(),
reshapeOp.getReassociationMaps());		reshapeOp.getReassociationMaps());
}		}

namespace {		namespace {
/// Patterns to fuse a generic op, with the producer of its operands.		/// Patterns to fuse a generic op, with the producer of its operands.
template <typename LinalgOpTy>		class FuseElementwiseOps : public OpRewritePattern<GenericOp> {
class FuseElementwiseOps : public OpRewritePattern<LinalgOpTy> {
public:		public:
FuseElementwiseOps(MLIRContext *context, ControlElementwiseOpsFusionFn &fun,		FuseElementwiseOps(MLIRContext *context, ControlElementwiseOpsFusionFn &fun,
PatternBenefit benefit = 1)		PatternBenefit benefit = 1)
: OpRewritePattern<LinalgOpTy>(context, benefit), controlFn(fun) {}		: OpRewritePattern<GenericOp>(context, benefit), controlFn(fun) {}

LogicalResult matchAndRewrite(LinalgOpTy op,		LogicalResult matchAndRewrite(GenericOp genericOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
// Find the first operand that is defined by another generic op on tensors.		// Find the first operand that is defined by another generic op on tensors.
for (OpOperand &opOperand : op.getShapedOpOperands()) {		for (OpOperand &opOperand : genericOp.getShapedOpOperands()) {
LinalgOp producerOp =		auto producer =
dyn_cast_or_null<LinalgOp>(opOperand.get().getDefiningOp());		dyn_cast_or_null<GenericOp>(opOperand.get().getDefiningOp());
if (!producerOp \|\| !producerOp.hasTensorSemantics())		if (!producer \|\| !producer.hasTensorSemantics())
continue;		continue;
Optional<SmallVector<Value, 1>> fusedOpResults =		Optional<SmallVector<Value>> fusedOpResults =
fuseElementwiseOps(rewriter, opOperand, controlFn);		fuseElementwiseOps(rewriter, opOperand, producer, controlFn);
if (fusedOpResults) {		if (fusedOpResults) {
rewriter.replaceOp(op, *fusedOpResults);		rewriter.replaceOp(genericOp, *fusedOpResults);
return success();		return success();
}		}
}		}
return failure();		return failure();
}		}

private:		private:
ControlElementwiseOpsFusionFn controlFn;		ControlElementwiseOpsFusionFn controlFn;
Show All 12 Lines	void runOnOperation() override {
populateElementwiseOpsFusionPatterns(		populateElementwiseOpsFusionPatterns(
patterns,		patterns,
LinalgElementwiseFusionOptions().setControlFoldingReshapes(		LinalgElementwiseFusionOptions().setControlFoldingReshapes(
allowFoldingUnitDimReshapes ? allowFoldingFn : skipUnitDimReshape));		allowFoldingUnitDimReshapes ? allowFoldingFn : skipUnitDimReshape));
(void)applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns));		(void)applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns));
}		}
};		};

/// Pass to test folding of reshape op with generic/indexed_generic ops by		/// Pass to test folding of reshape ops with generic ops by linearization.
/// linearization.
struct FoldReshapeOpsByLinearizationPass		struct FoldReshapeOpsByLinearizationPass
: public LinalgFoldReshapeOpsByLinearizationBase<		: public LinalgFoldReshapeOpsByLinearizationBase<
FoldReshapeOpsByLinearizationPass> {		FoldReshapeOpsByLinearizationPass> {
void runOnOperation() override {		void runOnOperation() override {
Operation *op = getOperation();		Operation *op = getOperation();
RewritePatternSet patterns(op->getContext());		RewritePatternSet patterns(op->getContext());
populateFoldReshapeOpsByLinearizationPatterns(patterns);		populateFoldReshapeOpsByLinearizationPatterns(patterns);
(void)applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns));		(void)applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns));
}		}
};		};

} // namespace		} // namespace

void mlir::linalg::populateFoldReshapeOpsByLinearizationPatterns(		void mlir::linalg::populateFoldReshapeOpsByLinearizationPatterns(
RewritePatternSet &patterns) {		RewritePatternSet &patterns) {
patterns.add<FoldProducerReshapeOpByLinearization<GenericOp, false>,		patterns.add<FoldProducerReshapeOpByLinearization<false>,
FoldProducerReshapeOpByLinearization<IndexedGenericOp, false>,
FoldConsumerReshapeOpByLinearization<false>>(		FoldConsumerReshapeOpByLinearization<false>>(
patterns.getContext());		patterns.getContext());
}		}

void mlir::linalg::populateFoldUnitDimsReshapeOpsByLinearizationPatterns(		void mlir::linalg::populateFoldUnitDimsReshapeOpsByLinearizationPatterns(
RewritePatternSet &patterns) {		RewritePatternSet &patterns) {
patterns.add<FoldProducerReshapeOpByLinearization<GenericOp, true>,		patterns.add<FoldProducerReshapeOpByLinearization<true>,
FoldProducerReshapeOpByLinearization<IndexedGenericOp, true>,
FoldConsumerReshapeOpByLinearization<true>>(		FoldConsumerReshapeOpByLinearization<true>>(
patterns.getContext());		patterns.getContext());
}		}

void mlir::linalg::populateFoldReshapeOpsByExpansionPatterns(		void mlir::linalg::populateFoldReshapeOpsByExpansionPatterns(
RewritePatternSet &patterns,		RewritePatternSet &patterns,
ControlElementwiseOpsFusionFn controlFoldingReshapes) {		ControlElementwiseOpsFusionFn controlFoldingReshapes) {
patterns.add<FoldReshapeWithGenericOpByExpansion>(patterns.getContext());		patterns.add<FoldReshapeWithGenericOpByExpansion>(patterns.getContext());
patterns.add<FoldWithProducerReshapeOpByExpansion<GenericOp>,		patterns.add<FoldWithProducerReshapeOpByExpansion>(patterns.getContext(),
FoldWithProducerReshapeOpByExpansion<IndexedGenericOp>>(		controlFoldingReshapes);
patterns.getContext(), controlFoldingReshapes);
}		}

void mlir::linalg::populateElementwiseOpsFusionPatterns(		void mlir::linalg::populateElementwiseOpsFusionPatterns(
RewritePatternSet &patterns, LinalgElementwiseFusionOptions options) {		RewritePatternSet &patterns, LinalgElementwiseFusionOptions options) {
auto *context = patterns.getContext();		auto *context = patterns.getContext();
patterns		patterns.add<FuseElementwiseOps, FoldSplatConstants>(
.add<FuseElementwiseOps<GenericOp>, FuseElementwiseOps<IndexedGenericOp>,
FoldSplatConstants<GenericOp>, FoldSplatConstants<IndexedGenericOp>>(
context, options.controlElementwiseOpsFusionFn);		context, options.controlElementwiseOpsFusionFn);
populateFoldReshapeOpsByExpansionPatterns(patterns,		populateFoldReshapeOpsByExpansionPatterns(patterns,
options.controlFoldingReshapesFn);		options.controlFoldingReshapesFn);
AffineApplyOp::getCanonicalizationPatterns(patterns, context);		AffineApplyOp::getCanonicalizationPatterns(patterns, context);
GenericOp::getCanonicalizationPatterns(patterns, context);		GenericOp::getCanonicalizationPatterns(patterns, context);
IndexedGenericOp::getCanonicalizationPatterns(patterns, context);		IndexedGenericOp::getCanonicalizationPatterns(patterns, context);
TensorReshapeOp::getCanonicalizationPatterns(patterns, context);		TensorReshapeOp::getCanonicalizationPatterns(patterns, context);
}		}

void mlir::linalg::populatePushReshapeOpsPatterns(RewritePatternSet &patterns) {		void mlir::linalg::populatePushReshapeOpsPatterns(RewritePatternSet &patterns) {
auto *context = patterns.getContext();		auto *context = patterns.getContext();
patterns.add<PushExpandingReshape<GenericOp>,		patterns.add<PushExpandingReshape>(context);
PushExpandingReshape<IndexedGenericOp>>(context);
}		}

std::unique_ptr<Pass> mlir::createLinalgFusionOfTensorOpsPass() {		std::unique_ptr<Pass> mlir::createLinalgFusionOfTensorOpsPass() {
return std::make_unique<FusionOfTensorOpsPass>();		return std::make_unique<FusionOfTensorOpsPass>();
}		}

std::unique_ptr<Pass> mlir::createFoldReshapeOpsByLinearizationPass() {		std::unique_ptr<Pass> mlir::createFoldReshapeOpsByLinearizationPass() {
return std::make_unique<FoldReshapeOpsByLinearizationPass>();		return std::make_unique<FoldReshapeOpsByLinearizationPass>();
}		}

mlir/test/Dialect/Linalg/fusion-tensor.mlir

	Show First 20 Lines • Show All 199 Lines • ▼ Show 20 Lines
	// CHECK-LABEL: func @generic_op_constant_fusion			// CHECK-LABEL: func @generic_op_constant_fusion
	// CHECK: %[[CST:.*]] = constant {{.*}} : f32			// CHECK: %[[CST:.*]] = constant {{.*}} : f32
	// CHECK: linalg.generic			// CHECK: linalg.generic
	// CHECK: ^{{.+}}(%[[ARG1:[a-zA-Z0-9_]+]]: f32, %{{.+}}: f32):			// CHECK: ^{{.+}}(%[[ARG1:[a-zA-Z0-9_]+]]: f32, %{{.+}}: f32):
	// CHECK: mulf %[[CST]], %[[ARG1]]			// CHECK: mulf %[[CST]], %[[ARG1]]

	// -----			// -----

	#map0 = affine_map<(d0, d1, d2) -> (d0)>			#map0 = affine_map<(d0, d1, d2) -> (d0)>
	mravishankarUnsubmitted Done Reply Inline Actions I would prefer not dropping these tests. Instead change them to use the `index` operation in the body and make sure that the generated op is as expected. The argument to the `index` operation should be verified. mravishankar: I would prefer not dropping these tests. Instead change them to use the `index` operation in…
	gysitAuthorUnsubmitted Done Reply Inline Actions Are you referring to the two constant fusion tests here? I believe for the other tests there should be an index op counterpart. At the moment, the constant fusion tests do not make use of the indices and there are already generic op constant fusion tests. But I could enhance the test to, for example, add one of the indices to the fused constant? gysit: Are you referring to the two constant fusion tests here? I believe for the other tests there…
	mravishankarUnsubmitted Done Reply Inline Actions Sorry, I just left a blanket comment here and was not specific enough. I see these tests are not really working on `indexed_generic` operations. They are really `generic` operations (they don't use the index bb args). But there are tests below that do use them (see below). mravishankar: Sorry, I just left a blanket comment here and was not specific enough. I see these tests are not…
	gysitAuthorUnsubmitted Done Reply Inline Actions Ah, now I understand... These tests have already existed for some time. I added them when I introduced index op support (https://reviews.llvm.org/D100479), meaning they unfortunately do not show up in the diff! But they are already there (the index op counterpart should always be right below the deleted indexed_generic test): generic_op_indexed_generic_op_fusion -> producer_indexed_consumer_fusion; indexed_generic_op_generic_op_fusion -> indexed_producer_consumer_fusion; indexed_generic_op_fusion -> indexed_producer_indexed_consumer_fusion. Sorry, I should have mentioned that as a review comment. gysit: Ah, now I understand... These tests have already existed for some time. I added them when I introduced…
	mravishankarUnsubmitted Not Done Reply Inline Actions Ah ok! I see these now. Thanks for the clarifications. Looking at these tests now, they seem to be only testing the case where the `linalg.index` values are the same before and after fusion. It would be good to add one more test of this form:
	  #map1 = affine_map<(d0) -> (d0)>
	  #map2 = affine_map<(d0, d1) -> (d0, d1)>
	  #map3 = affine_map<(d0, d1) -> (d1)>
	  func (%arg0 : tensor<?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
	    %c0 = constant 0 : index
	    %c1 = constant 1 : index
	    %d0 = memref.dim %arg0, %c0 : tensor<?xf32>
	    %0 = linalg.init_tensor [%d0] : tensor<?xf32>
	    %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel"]}
	        ins(%arg0 : tensor<?xf32>) outs(%0 : tensor<?xf32>) {
	      ^bb0(%arg2 : f32, %arg3 : f32):
	        %2 = linalg.index 0 : index
	        %3 = index_cast %2 : index to i32
	        %4 = sitofp %3 : i32 to f32
	        %5 = addf %arg2, %4 : f32
	        linalg.yield %5 : f32
	    } -> tensor<?xf32>
	    %2 = memref.dim %arg1, %c0 : tensor<?x?xf32>
	    %3 = memref.dim %arg1, %c1 : tensor<?x?xf32>
	    %4 = linalg.init_tensor [%2, %3] : tensor<?x?xf32>
	    %5 = linalg.generic {indexing_maps = [#map2, #map3, #map2], iterator_types = ["parallel", "parallel"]}
	        ins(%arg1, %1 : tensor<?x?xf32>, tensor<?xf32>) outs(%4 : tensor<?x?xf32>) {
	      ^bb0(%arg2 : f32, %arg3 : f32, %arg4: f32):
	        %6 = addf %arg2, %arg3 : f32
	        linalg.yield %6 : f32
	    } -> tensor<?x?xf32>
	    return %5 : tensor<?x?xf32>
	  }
	If my understanding is right, the fused op should have `linalg.index 1`. I am sure it works, but this is more to make sure no changes break it. I appreciate that this test is not there already and it should have been, but I only noticed the missing coverage now while looking at all the tests. Not blocking on this, but I really hope you can just add this test. mravishankar: Ah ok! I see these now. Thanks for the clarifications. Looking at these tests now, they seem…
	#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>			#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
	func @indexed_generic_op_constant_fusion(%arg0 : tensor<5x?x?xf32>)			func @generic_indexed_op_constant_fusion(%arg0 : tensor<5x?x?xindex>)
	-> tensor<5x?x?xf32>			-> tensor<5x?x?xindex>
	{			{
	%c0 = constant 0 : index			%c0 = constant 0 : index
	%c1 = constant 1 : index			%c1 = constant 1 : index
	%c2 = constant 2 : index			%c2 = constant 2 : index
	%cst = constant dense<42.0> : tensor<5xf32>			%cst = constant dense<42> : tensor<5xindex>
	%0 = memref.dim %arg0, %c1 : tensor<5x?x?xf32>			%0 = memref.dim %arg0, %c1 : tensor<5x?x?xindex>
	%1 = memref.dim %arg0, %c2 : tensor<5x?x?xf32>			%1 = memref.dim %arg0, %c2 : tensor<5x?x?xindex>
	%2 = linalg.init_tensor [5, %0, %1] : tensor<5x?x?xf32>			%2 = linalg.init_tensor [5, %0, %1] : tensor<5x?x?xindex>
	%3 = linalg.indexed_generic {			%3 = linalg.generic {
	indexing_maps = [#map0, #map1, #map1],			indexing_maps = [#map0, #map1, #map1],
	iterator_types = ["parallel", "parallel", "parallel"]}			iterator_types = ["parallel", "parallel", "parallel"]}
	ins(%cst, %arg0 : tensor<5xf32>, tensor<5x?x?xf32>)			ins(%cst, %arg0 : tensor<5xindex>, tensor<5x?x?xindex>)
	outs(%2 : tensor<5x?x?xf32>) {			outs(%2 : tensor<5x?x?xindex>) {
	^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: f32, %arg5 : f32, %arg6 : f32):			^bb0(%arg1: index, %arg2 : index, %arg3 : index):
	%4 = mulf %arg4, %arg5 : f32			%idx = linalg.index 0 : index
	linalg.yield %4 : f32			%4 = subi %arg1, %arg2 : index
	} -> tensor<5x?x?xf32>			%5 = addi %4, %idx : index
	return %3 : tensor<5x?x?xf32>			linalg.yield %5 : index
				} -> tensor<5x?x?xindex>
				return %3 : tensor<5x?x?xindex>
	}			}
	// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>			// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
	// CHECK-LABEL: func @indexed_generic_op_constant_fusion			// CHECK-LABEL: func @generic_indexed_op_constant_fusion
	// CHECK: %[[CST:.*]] = constant {{.*}} : f32			// CHECK: %[[CST:.*]] = constant 42 : index
	// CHECK: linalg.generic			// CHECK: linalg.generic
	// CHECK: ^{{[a-zA-Z0-9_]*}}			// CHECK: ^{{[a-zA-Z0-9_]*}}
	// CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]*]]: f32, %{{.*}}: f32)			// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: index, %{{.*}}: index)
	// CHECK: mulf %[[CST]], %[[ARG4]]			// CHECK: %[[IDX:.+]] = linalg.index 0 : index
				// CHECK: %[[VAL1:.+]] = subi %[[CST]], %[[ARG2]]
				// CHECK: %{{.*}} = addi %[[VAL1]], %[[IDX]]

	// -----			// -----

	#map0 = affine_map<(d0, d1, d2) -> ()>			#map0 = affine_map<(d0, d1, d2) -> ()>
	#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>			#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
	func @generic_op_zero_dim_constant_fusion(%arg0 : tensor<5x?x?xf32>)			func @generic_op_zero_dim_constant_fusion(%arg0 : tensor<5x?x?xf32>)
	-> tensor<5x?x?xf32>			-> tensor<5x?x?xf32>
	{			{
	Show All 21 Lines
	// CHECK: linalg.generic			// CHECK: linalg.generic
	// CHECK: ^{{.*}}(%[[ARG1:[a-zA-Z0-9_]*]]: f32, %{{.*}}: f32)			// CHECK: ^{{.*}}(%[[ARG1:[a-zA-Z0-9_]*]]: f32, %{{.*}}: f32)
	// CHECK: mulf %[[CST]], %[[ARG1]]			// CHECK: mulf %[[CST]], %[[ARG1]]

	// -----			// -----

	#map0 = affine_map<(d0, d1, d2) -> ()>			#map0 = affine_map<(d0, d1, d2) -> ()>
	#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>			#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
	func @indexed_generic_op_zero_dim_constant_fusion			func @generic_indexed_op_zero_dim_constant_fusion
	(%arg0 : tensor<5x?x?xf32>) -> tensor<5x?x?xf32>			(%arg0 : tensor<5x?x?xindex>) -> tensor<5x?x?xindex>
	{			{
	%c0 = constant 0 : index			%c0 = constant 0 : index
	%c1 = constant 1 : index			%c1 = constant 1 : index
	%c2 = constant 2 : index			%c2 = constant 2 : index
	%cst = constant dense<42.0> : tensor<f32>			%cst = constant dense<42> : tensor<index>
	%0 = memref.dim %arg0, %c1 : tensor<5x?x?xf32>			%0 = memref.dim %arg0, %c1 : tensor<5x?x?xindex>
	%1 = memref.dim %arg0, %c2 : tensor<5x?x?xf32>			%1 = memref.dim %arg0, %c2 : tensor<5x?x?xindex>
	%2 = linalg.init_tensor [5, %0, %1] : tensor<5x?x?xf32>			%2 = linalg.init_tensor [5, %0, %1] : tensor<5x?x?xindex>
	%3 = linalg.indexed_generic {			%3 = linalg.generic {
	indexing_maps = [#map0, #map1, #map1],			indexing_maps = [#map0, #map1, #map1],
	iterator_types = ["parallel", "parallel", "parallel"]}			iterator_types = ["parallel", "parallel", "parallel"]}
	ins(%cst, %arg0 : tensor<f32>, tensor<5x?x?xf32>)			ins(%cst, %arg0 : tensor<index>, tensor<5x?x?xindex>)
	outs(%2 : tensor<5x?x?xf32>) {			outs(%2 : tensor<5x?x?xindex>) {
	^bb0(%arg1 : index, %arg2 : index, %arg3 : index, %arg4: f32, %arg5: f32, %arg6: f32):			^bb0(%arg1: index, %arg2: index, %arg3: index):
	%4 = mulf %arg4, %arg5 : f32			%idx = linalg.index 0 : index
	linalg.yield %4 : f32			%4 = subi %arg1, %arg2 : index
	} -> tensor<5x?x?xf32>			%5 = addi %4, %idx : index
	return %3 : tensor<5x?x?xf32>			linalg.yield %5 : index
				} -> tensor<5x?x?xindex>
				return %3 : tensor<5x?x?xindex>
	}			}
	// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>			// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
	// CHECK-LABEL: func @indexed_generic_op_zero_dim_constant_fusion			// CHECK-LABEL: func @generic_indexed_op_zero_dim_constant_fusion
	// CHECK: %[[CST:.*]] = constant {{.*}} : f32			// CHECK: %[[CST:.*]] = constant 42 : index
	// CHECK: linalg.generic
	// CHECK: ^{{[a-zA-Z0-9_]*}}
	// CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]*]]: f32, %{{.*}}: f32)
	// CHECK: mulf %[[CST]], %[[ARG4]]

	// -----

	#map0 = affine_map<(d0, d1) -> (d0, d1)>
	func @generic_op_indexed_generic_op_fusion(%arg0: tensor<?x?xi32>,
	mravishankarUnsubmitted Done Reply Inline Actions This is the test I am talking about. The consumer here is using `index_cast %arg` etc., i.e. actually using the index basic block arguments. These need to be kept so that the transformation of the `linalg.index` can be verified. mravishankar: This is the test I am talking about. The consumer here is using `index_cast %arg` etc., i.e.
	%arg1: tensor<?x?xi32>) -> tensor<?x?xi32> {
	%c0 = constant 0 : index
	%c1 = constant 1 : index
	%0 = memref.dim %arg0, %c0 : tensor<?x?xi32>
	%1 = memref.dim %arg0, %c1 : tensor<?x?xi32>
	%2 = linalg.init_tensor [%0, %1] : tensor<?x?xi32>
	%3 = linalg.generic {
	indexing_maps = [#map0, #map0, #map0],
	iterator_types = ["parallel", "parallel"] }
	ins(%arg0, %arg1 : tensor<?x?xi32>, tensor<?x?xi32>)
	outs(%2 : tensor<?x?xi32>) {
	^bb0(%arg2: i32, %arg3: i32, %arg4: i32): // no predecessors
	%10 = addi %arg2, %arg3 : i32
	linalg.yield %10 : i32
	} -> tensor<?x?xi32>
	%4 = linalg.indexed_generic {
	indexing_maps = [#map0, #map0],
	iterator_types = ["parallel", "parallel"] }
	ins(%3 : tensor<?x?xi32>)
	outs(%2 : tensor<?x?xi32>) {
	^bb0(%arg2: index, %arg3: index, %arg4: i32, %arg5: i32): // no predecessors
	%5 = index_cast %arg2 : index to i32
	%6 = index_cast %arg3 : index to i32
	%7 = addi %arg4, %5 : i32
	%8 = subi %7, %6 : i32
	linalg.yield %8 : i32
	} -> tensor<?x?xi32>
	return %4 : tensor<?x?xi32>
	}
	// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
	// CHECK-LABEL: func @generic_op_indexed_generic_op_fusion
	// CHECK-NOT: linalg.indexed_generic
	// CHECK: linalg.generic			// CHECK: linalg.generic
	// CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP0]], #[[$MAP0]]]
	// CHECK: ^{{[a-zA-Z0-9_]*}}			// CHECK: ^{{[a-zA-Z0-9_]*}}
	// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: i32			// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: index, %{{.*}}: index)
	// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]*]]: i32			// CHECK: %[[IDX:.+]] = linalg.index 0 : index
	// CHECK: %[[ARG0:.+]] = linalg.index 0 : index			// CHECK: %[[VAL1:.+]] = subi %[[CST]], %[[ARG2]]
	// CHECK: %[[ARG1:.+]] = linalg.index 1 : index			// CHECK: %{{.*}} = addi %[[VAL1]], %[[IDX]]
	// CHECK: %[[VAL1:.+]] = addi %[[ARG2]], %[[ARG3]] : i32
	// CHECK: %[[ADD_OPERAND:.+]] = index_cast %[[ARG0]] : index to i32
	// CHECK: %[[SUB_OPERAND:.+]] = index_cast %[[ARG1]] : index to i32
	// CHECK: %[[VAL2:.+]] = addi %[[VAL1]], %[[ADD_OPERAND]] : i32
	// CHECK: %[[VAL3:.+]] = subi %[[VAL2]], %[[SUB_OPERAND]] : i32
	// CHECK: linalg.yield %[[VAL3]] : i32

	// -----			// -----

	#map0 = affine_map<(d0, d1) -> (d0, d1)>			#map0 = affine_map<(d0, d1) -> (d0, d1)>
	func @producer_indexed_consumer_fusion(%arg0: tensor<?x?xi32>,			func @producer_indexed_consumer_fusion(%arg0: tensor<?x?xi32>,
	%arg1: tensor<?x?xi32>) -> tensor<?x?xi32> {			%arg1: tensor<?x?xi32>) -> tensor<?x?xi32> {
	%c0 = constant 0 : index			%c0 = constant 0 : index
	%c1 = constant 1 : index			%c1 = constant 1 : index
	Show All 40 Lines
	// CHECK: %[[VAL2:.+]] = addi %[[VAL1]], %[[ADD_OPERAND]] : i32			// CHECK: %[[VAL2:.+]] = addi %[[VAL1]], %[[ADD_OPERAND]] : i32
	// CHECK: %[[VAL3:.+]] = subi %[[VAL2]], %[[SUB_OPERAND]] : i32			// CHECK: %[[VAL3:.+]] = subi %[[VAL2]], %[[SUB_OPERAND]] : i32
	// CHECK: linalg.yield %[[VAL3]] : i32			// CHECK: linalg.yield %[[VAL3]] : i32
	// CHECK-NOT: linalg.generic			// CHECK-NOT: linalg.generic

	// -----			// -----

	#map0 = affine_map<(d0, d1) -> (d0, d1)>			#map0 = affine_map<(d0, d1) -> (d0, d1)>
	func @indexed_generic_op_generic_op_fusion(%arg0: tensor<?x?xi32>,
	mravishankarUnsubmitted Done Reply Inline Actions Same here. This needs to be kept as well. mravishankar: Same here. This needs to be kept as well.
	%arg1: tensor<?x?xi32>) -> tensor<?x?xi32> {
	%c0 = constant 0 : index
	%c1 = constant 1 : index
	%0 = memref.dim %arg0, %c0 : tensor<?x?xi32>
	%1 = memref.dim %arg0, %c1 : tensor<?x?xi32>
	%2 = linalg.init_tensor [%0, %1] : tensor<?x?xi32>
	%3 = linalg.indexed_generic {
	indexing_maps = [#map0, #map0],
	iterator_types = ["parallel", "parallel"] }
	ins(%arg0 : tensor<?x?xi32>)
	outs(%2 : tensor<?x?xi32>) {
	^bb0(%arg2: index, %arg3: index, %arg4: i32, %arg5: i32): // no predecessors
	%4 = index_cast %arg2 : index to i32
	%5 = index_cast %arg3 : index to i32
	%6 = addi %arg4, %4 : i32
	%7 = subi %6, %5 : i32
	linalg.yield %7 : i32
	} -> tensor<?x?xi32>
	%4 = linalg.generic {
	indexing_maps = [#map0, #map0, #map0],
	iterator_types = ["parallel", "parallel"] }
	ins(%3, %arg1 : tensor<?x?xi32>, tensor<?x?xi32>)
	outs(%2 : tensor<?x?xi32>) {
	^bb0(%arg2: i32, %arg3: i32, %arg4: i32): // no predecessors
	%10 = addi %arg2, %arg3 : i32
	linalg.yield %10 : i32
	} -> tensor<?x?xi32>
	return %4 : tensor<?x?xi32>
	}
	// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
	// CHECK-LABEL: func @indexed_generic_op_generic_op_fusion
	// CHECK: linalg.generic
	// CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP0]], #[[$MAP0]]]
	// CHECK: ^{{[a-zA-Z0-9_]*}}
	// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: i32
	// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]*]]: i32
	// CHECK: %[[ARG0:.+]] = linalg.index 0 : index
	// CHECK: %[[ARG1:.+]] = linalg.index 1 : index
	// CHECK: %[[ADD_OPERAND:.+]] = index_cast %[[ARG0]] : index to i32
	// CHECK: %[[SUB_OPERAND:.+]] = index_cast %[[ARG1]] : index to i32
	// CHECK: %[[VAL1:.+]] = addi %[[ARG2]], %[[ADD_OPERAND]] : i32
	// CHECK: %[[VAL2:.+]] = subi %[[VAL1]], %[[SUB_OPERAND]] : i32
	// CHECK: %[[VAL3:.+]] = addi %[[VAL2]], %[[ARG3]] : i32
	// CHECK: linalg.yield %[[VAL3]] : i32
	// CHECK-NOT: linalg.generic

	// -----

	#map0 = affine_map<(d0, d1) -> (d0, d1)>
	func @indexed_producer_consumer_fusion(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {			func @indexed_producer_consumer_fusion(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
	%c0 = constant 0 : index			%c0 = constant 0 : index
	%c1 = constant 1 : index			%c1 = constant 1 : index
	%0 = memref.dim %arg0, %c0 : tensor<?x?xi32>			%0 = memref.dim %arg0, %c0 : tensor<?x?xi32>
	%1 = memref.dim %arg0, %c1 : tensor<?x?xi32>			%1 = memref.dim %arg0, %c1 : tensor<?x?xi32>
	%2 = linalg.init_tensor [%0, %1] : tensor<?x?xi32>			%2 = linalg.init_tensor [%0, %1] : tensor<?x?xi32>
	%3 = linalg.generic {			%3 = linalg.generic {
	indexing_maps = [#map0, #map0],			indexing_maps = [#map0, #map0],
	Show All 34 Lines
	// CHECK: %[[VAL1:.+]] = addi %[[ARG0]], %[[ADD_OPERAND]] : i32			// CHECK: %[[VAL1:.+]] = addi %[[ARG0]], %[[ADD_OPERAND]] : i32
	// CHECK: %[[VAL2:.+]] = subi %[[VAL1]], %[[SUB_OPERAND]] : i32			// CHECK: %[[VAL2:.+]] = subi %[[VAL1]], %[[SUB_OPERAND]] : i32
	// CHECK: %[[VAL3:.+]] = addi %[[VAL2]], %[[ARG0]] : i32			// CHECK: %[[VAL3:.+]] = addi %[[VAL2]], %[[ARG0]] : i32
	// CHECK: linalg.yield %[[VAL3]] : i32			// CHECK: linalg.yield %[[VAL3]] : i32
	// CHECK-NOT: linalg.generic			// CHECK-NOT: linalg.generic

	// -----			// -----

	// The indices of the first indexed_generic op are swapped after fusion.			// The indices of the first generic op are swapped after fusion.
	mravishankarUnsubmitted Done Reply Inline Actions This needs to be kept too. Any tests that you see using the `index` bb args need to be kept to verify that the fusion is working as expected. mravishankar: This needs to be kept too. Any tests that you see using the `index` bb args need to be kept to…
	#map0 = affine_map<(d0, d1) -> (d1, d0)>
	#map1 = affine_map<(d0, d1) -> (d0, d1)>
	func @indexed_generic_op_fusion(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
	%c0 = constant 0 : index
	%c1 = constant 1 : index
	%0 = memref.dim %arg0, %c0 : tensor<?x?xi32>
	%1 = memref.dim %arg0, %c1 : tensor<?x?xi32>
	%2 = linalg.init_tensor [%0, %1] : tensor<?x?xi32>
	%3 = linalg.indexed_generic {
	indexing_maps = [#map0, #map0],
	iterator_types = ["parallel", "parallel"] }
	ins(%arg0 : tensor<?x?xi32>)
	outs(%2 : tensor<?x?xi32>) {
	^bb0(%arg2: index, %arg3: index, %arg4: i32, %arg5: i32): // no predecessors
	%4 = index_cast %arg2 : index to i32
	%5 = index_cast %arg3 : index to i32
	%6 = addi %arg4, %4 : i32
	%7 = subi %5, %6 : i32
	linalg.yield %7 : i32
	} -> tensor<?x?xi32>
	%4= linalg.indexed_generic {
	indexing_maps = [#map1, #map1],
	iterator_types = ["parallel", "parallel"] }
	ins(%3 : tensor<?x?xi32>)
	outs(%2 : tensor<?x?xi32>) {
	^bb0(%arg2: index, %arg3: index, %arg4: i32, %arg5: i32): // no predecessors
	%5 = index_cast %arg2 : index to i32
	%6 = index_cast %arg3 : index to i32
	%7 = addi %arg4, %5 : i32
	%8 = subi %7, %6 : i32
	linalg.yield %8 : i32
	} -> tensor<?x?xi32>
	return %4 : tensor<?x?xi32>
	}
	// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
	// CHECK-LABEL: func @indexed_generic_op_fusion
	// CHECK: linalg.generic
	// CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP0]]]
	// CHECK: ^{{[a-zA-Z0-9_]*}}
	// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: i32
	// CHECK: %[[ARG0:.+]] = linalg.index 0 : index
	// CHECK: %[[ARG1:.+]] = linalg.index 1 : index
	// CHECK: %[[ADD_OPERAND1:.+]] = index_cast %[[ARG1]] : index to i32
	// CHECK: %[[SUB_OPERAND1:.+]] = index_cast %[[ARG0]] : index to i32
	// CHECK: %[[VAL1:.+]] = addi %[[ARG2]], %[[ADD_OPERAND1]] : i32
	// CHECK: %[[VAL2:.+]] = subi %[[SUB_OPERAND1]], %[[VAL1]] : i32
	// CHECK: %[[ADD_OPERAND2:.+]] = index_cast %[[ARG0]] : index to i32
	// CHECK: %[[SUB_OPERAND2:.+]] = index_cast %[[ARG1]] : index to i32
	// CHECK: %[[VAL3:.+]] = addi %[[VAL2]], %[[ADD_OPERAND2]] : i32
	// CHECK: %[[VAL4:.+]] = subi %[[VAL3]], %[[SUB_OPERAND2]] : i32
	// CHECK: linalg.yield %[[VAL4]] : i32
	// CHECK-NOT: linalg.generic

	// -----

	// The indices of the first indexed_generic op are swapped after fusion.
	#map0 = affine_map<(d0, d1) -> (d1, d0)>			#map0 = affine_map<(d0, d1) -> (d1, d0)>
	#map1 = affine_map<(d0, d1) -> (d0, d1)>			#map1 = affine_map<(d0, d1) -> (d0, d1)>
	func @indexed_producer_indexed_consumer_fusion(%arg0: tensor<?x?xi32>)			func @indexed_producer_indexed_consumer_fusion(%arg0: tensor<?x?xi32>)
	-> tensor<?x?xi32> {			-> tensor<?x?xi32> {
	%c0 = constant 0 : index			%c0 = constant 0 : index
	%c1 = constant 1 : index			%c1 = constant 1 : index
	%0 = memref.dim %arg0, %c0 : tensor<?x?xi32>			%0 = memref.dim %arg0, %c0 : tensor<?x?xi32>
	%1 = memref.dim %arg0, %c1 : tensor<?x?xi32>			%1 = memref.dim %arg0, %c1 : tensor<?x?xi32>
	▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines
	// CHECK: %[[SUB_OPERAND2:.+]] = index_cast %[[IDX3]] : index to i32			// CHECK: %[[SUB_OPERAND2:.+]] = index_cast %[[IDX3]] : index to i32
	// CHECK: %[[VAL3:.+]] = addi %[[VAL2]], %[[ADD_OPERAND2]] : i32			// CHECK: %[[VAL3:.+]] = addi %[[VAL2]], %[[ADD_OPERAND2]] : i32
	// CHECK: %[[VAL4:.+]] = subi %[[VAL3]], %[[SUB_OPERAND2]] : i32			// CHECK: %[[VAL4:.+]] = subi %[[VAL3]], %[[SUB_OPERAND2]] : i32
	// CHECK: linalg.yield %[[VAL4]] : i32			// CHECK: linalg.yield %[[VAL4]] : i32
	// CHECK-NOT: linalg.generic			// CHECK-NOT: linalg.generic

	// -----			// -----

	func @scalar_indexed_generic_fusion
	(%arg0: tensor<5x1x1xf32>, %arg1 : tensor<i32>) -> tensor<10xf32>
	{
	%c0 = constant 0 : index
	%cst = constant dense<1.000000e+00> : tensor<10xf32>
	%0 = linalg.init_tensor [] : tensor<f32>
	%1 = linalg.indexed_generic
	{indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>],
	iterator_types = []}
	ins(%arg1 : tensor<i32>) outs(%0 : tensor<f32>) {
	^bb0(%arg2: i32, %arg3: f32): // no predecessors
	%3 = index_cast %arg2 : i32 to index
	%4 = tensor.extract %arg0[%3, %c0, %c0] : tensor<5x1x1xf32>
	linalg.yield %4 : f32
	} -> tensor<f32>
	%2 = linalg.init_tensor [10] : tensor<10xf32>
	%3 = linalg.generic
	{indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>,
	affine_map<(d0) -> (d0)>],
	iterator_types = ["parallel"]}
	ins(%1, %cst : tensor<f32>, tensor<10xf32>) outs(%2 : tensor<10xf32>) {
	^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
	%4 = mulf %arg2, %arg3 : f32
	linalg.yield %4 : f32
	} -> tensor<10xf32>
	return %3 : tensor<10xf32>
	}
	// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> ()>
	// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0)>
	// CHECK: func @scalar_indexed_generic_fusion
	// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<5x1x1xf32>
	// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<i32>
	// CHECK: %[[T0:.+]] = linalg.generic
	// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]]
	// CHECK-SAME: iterator_types = ["parallel"]
	// CHECK-SAME: ins(%[[ARG1]] : tensor<i32>)
	// CHECK: tensor.extract %[[ARG0]]
	// CHECK: linalg.yield
	// CHECK return %[[T0]]

	// -----

	func @scalar_generic_fusion			func @scalar_generic_fusion
	(%arg0: tensor<5x1x1xf32>, %arg1 : tensor<i32>) -> tensor<10xf32>			(%arg0: tensor<5x1x1xf32>, %arg1 : tensor<i32>) -> tensor<10xf32>
	{			{
	%c0 = constant 0 : index			%c0 = constant 0 : index
	%cst = constant dense<1.000000e+00> : tensor<10xf32>			%cst = constant dense<1.000000e+00> : tensor<10xf32>
	%0 = linalg.init_tensor [] : tensor<f32>			%0 = linalg.init_tensor [] : tensor<f32>
	%1 = linalg.generic			%1 = linalg.generic
	{indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>],			{indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>],
	▲ Show 20 Lines • Show All 202 Lines • Show Last 20 Lines

mlir/test/Dialect/Linalg/reshape_fusion.mlir

	Show First 20 Lines • Show All 160 Lines • ▼ Show 20 Lines
	// CHECK-SAME: ins(%[[T0]] : tensor<8x33x4xf32>)			// CHECK-SAME: ins(%[[T0]] : tensor<8x33x4xf32>)
	// CHECK-SAME: outs(%[[T1]] : tensor<8x33x4xf32>)			// CHECK-SAME: outs(%[[T1]] : tensor<8x33x4xf32>)
	// CHECK: return %[[T2]] : tensor<8x33x4xf32>			// CHECK: return %[[T2]] : tensor<8x33x4xf32>

	// -----			// -----

	#map0 = affine_map<(d0, d1, d2) -> (d2, d0, d1)>			#map0 = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
	#map1 = affine_map<(d0, d1, d2) -> (d1, d2, d0)>			#map1 = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
	func @indexed_generic_op_reshape_producer_fusion(%arg0 : tensor<?x?x4x?xi32>,
	%arg1 : tensor<?x?x?xi32>) ->
	tensor<?x?x?xi32>
	{
	%0 = linalg.tensor_reshape %arg0 [[0], [1, 2], [3]] :
	tensor<?x?x4x?xi32> into tensor<?x?x?xi32>
	%1 = linalg.indexed_generic {
	indexing_maps = [#map0, #map1, #map1],
	iterator_types = ["parallel", "parallel", "parallel"]}
	ins(%0, %arg1 : tensor<?x?x?xi32>, tensor<?x?x?xi32>)
	outs(%0 : tensor<?x?x?xi32>) {
	^bb0(%arg3 : index, %arg4 : index, %arg5 : index, %arg6: i32, %arg7: i32, %s: i32):
	%1 = muli %arg6, %arg7 : i32
	%2 = index_cast %arg3 : index to i32
	%3 = addi %1, %2 : i32
	%4 = index_cast %arg4 : index to i32
	%5 = addi %3, %4 : i32
	%6 = index_cast %arg5 : index to i32
	%7 = addi %5, %6 : i32
	linalg.yield %7 : i32
	} -> tensor<?x?x?xi32>
	return %1 : tensor<?x?x?xi32>
	}

	// The generic op version of the test check for the op structure. Only
	// checking the op body here.
	// CHECK: #[[MAP:.+]] = affine_map<(d0, d1) -> (d0 + d1 * 4)>
	// CHECK: func @indexed_generic_op_reshape_producer_fusion
	// CHECK: linalg.generic
	// CHECK: ^{{.*}}(
	// CHECK-SAME: %[[ARG6:[a-zA-Z0-9]+]]: i32, %[[ARG7:[a-zA-Z0-9]+]]: i32,
	// CHECK-SAME: %[[ARG8:[a-zA-Z0-9]+]]: i32)
	// CHECK: %[[ARG2:.+]] = linalg.index 0 : index
	// CHECK: %[[ARG3:.+]] = linalg.index 1 : index
	// CHECK: %[[ARG4:.+]] = linalg.index 2 : index
	// CHECK: %[[ARG5:.+]] = linalg.index 3 : index
	// CHECK: %[[T3:.+]] = affine.apply #[[MAP]](%[[ARG3]], %[[ARG2]])
	// CHECK: %[[T4:.+]] = muli %[[ARG6]], %[[ARG7]]
	// CHECK: %[[T5:.+]] = index_cast %[[T3]]
	// CHECK: %[[T6:.+]] = addi %[[T4]], %[[T5]]
	// CHECK: %[[T7:.+]] = index_cast %[[ARG4]]
	// CHECK: %[[T8:.+]] = addi %[[T6]], %[[T7]]
	// CHECK: %[[T9:.+]] = index_cast %[[ARG5]]
	// CHECK: %[[T10:.+]] = addi %[[T8]], %[[T9]]
	// CHECK: linalg.yield %[[T10]]

	// -----

	#map0 = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
	#map1 = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
	func @indexed_consumer_reshape_producer_fusion(%arg0 : tensor<?x?x4x?xi32>,			func @indexed_consumer_reshape_producer_fusion(%arg0 : tensor<?x?x4x?xi32>,
	%arg1 : tensor<?x?x?xi32>) ->			%arg1 : tensor<?x?x?xi32>) ->
	tensor<?x?x?xi32>			tensor<?x?x?xi32>
	{			{
	%0 = linalg.tensor_reshape %arg0 [[0], [1, 2], [3]]:			%0 = linalg.tensor_reshape %arg0 [[0], [1, 2], [3]]:
	tensor<?x?x4x?xi32> into tensor<?x?x?xi32>			tensor<?x?x4x?xi32> into tensor<?x?x?xi32>
	%1 = linalg.generic {			%1 = linalg.generic {
	indexing_maps = [#map0, #map1, #map1],			indexing_maps = [#map0, #map1, #map1],
	Show All 35 Lines
	// CHECK: %[[T8:.+]] = addi %[[T6]], %[[T7]]			// CHECK: %[[T8:.+]] = addi %[[T6]], %[[T7]]
	// CHECK: %[[T9:.+]] = index_cast %[[IDX3]]			// CHECK: %[[T9:.+]] = index_cast %[[IDX3]]
	// CHECK: %[[T10:.+]] = addi %[[T8]], %[[T9]]			// CHECK: %[[T10:.+]] = addi %[[T8]], %[[T9]]
	// CHECK: linalg.yield %[[T10]]			// CHECK: linalg.yield %[[T10]]

	// -----			// -----

	#map0 = affine_map<(d0, d1) -> (d0, d1)>			#map0 = affine_map<(d0, d1) -> (d0, d1)>
	func @indexed_generic_op_reshape_consumer_fusion(%arg0 : tensor<?x?xi32>,
	%arg1 : tensor<?x?xi32>) ->
	tensor<?x?x4x5xi32>
	{
	%0 = linalg.indexed_generic {
	indexing_maps = [#map0, #map0, #map0],
	iterator_types = ["parallel", "parallel"]}
	ins(%arg0, %arg1 : tensor<?x?xi32>, tensor<?x?xi32>)
	outs(%arg0 : tensor<?x?xi32>) {
	^bb0(%arg3 : index, %arg4 : index, %arg5: i32, %arg6: i32, %s: i32): // no predecessors
	%1 = muli %arg5, %arg6 : i32
	%2 = index_cast %arg3 : index to i32
	%3 = addi %1, %2 : i32
	%4 = index_cast %arg4 : index to i32
	%5 = addi %3, %4 : i32
	linalg.yield %5 : i32
	} -> tensor<?x?xi32>
	%1 = linalg.tensor_reshape %0 [[0], [1, 2, 3]] :
	tensor<?x?xi32> into tensor<?x?x4x5xi32>
	return %1 : tensor<?x?x4x5xi32>
	}
	// The generic op version of the test check for the op structure. Only
	// checking the op body here.
	// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0 + d1 * 5 + d2 * 20)>
	// CHECK: func @indexed_generic_op_reshape_consumer_fusion
	// CHECK: linalg.generic
	// CHECK: ^{{.*}}(
	// CHECK-SAME: %[[ARG6:[a-zA-Z0-9]+]]: i32, %[[ARG7:[a-zA-Z0-9]+]]: i32,
	// CHECK-SAME: %[[ARG8:[a-zA-Z0-9]+]]: i32)
	// CHECK: %[[ARG2:.+]] = linalg.index 0 : index
	// CHECK: %[[ARG3:.+]] = linalg.index 1 : index
	// CHECK: %[[ARG4:.+]] = linalg.index 2 : index
	// CHECK: %[[ARG5:.+]] = linalg.index 3 : index
	// CHECK: %[[T3:.+]] = affine.apply #[[MAP]](%[[ARG5]], %[[ARG4]], %[[ARG3]])
	// CHECK: %[[T4:.+]] = muli %[[ARG6]], %[[ARG7]]
	// CHECK: %[[T5:.+]] = index_cast %[[ARG2]]
	// CHECK: %[[T6:.+]] = addi %[[T4]], %[[T5]]
	// CHECK: %[[T7:.+]] = index_cast %[[T3]]
	// CHECK: %[[T8:.+]] = addi %[[T6]], %[[T7]]
	// CHECK: linalg.yield %[[T8]]

	// -----

	#map0 = affine_map<(d0, d1) -> (d0, d1)>
	func @indexed_producer_reshape_consumer_fusion(%arg0 : tensor<?x?xi32>,			func @indexed_producer_reshape_consumer_fusion(%arg0 : tensor<?x?xi32>,
	%arg1 : tensor<?x?xi32>) ->			%arg1 : tensor<?x?xi32>) ->
	tensor<?x?x4x5xi32>			tensor<?x?x4x5xi32>
	{			{
	%0 = linalg.generic {			%0 = linalg.generic {
	indexing_maps = [#map0, #map0, #map0],			indexing_maps = [#map0, #map0, #map0],
	iterator_types = ["parallel", "parallel"]}			iterator_types = ["parallel", "parallel"]}
	ins(%arg0, %arg1 : tensor<?x?xi32>, tensor<?x?xi32>)			ins(%arg0, %arg1 : tensor<?x?xi32>, tensor<?x?xi32>)
	Show All 33 Lines
	// CHECK: linalg.yield %[[T8]]			// CHECK: linalg.yield %[[T8]]

	// -----			// -----

	func @reshape_as_consumer_permutation			func @reshape_as_consumer_permutation
	(%a : tensor<210x6x4xi32>, %b : tensor<210x4xi32>)			(%a : tensor<210x6x4xi32>, %b : tensor<210x4xi32>)
	-> tensor<2x3x4x5x6x7xi32> {			-> tensor<2x3x4x5x6x7xi32> {
	%shape = linalg.init_tensor [6, 4, 210] : tensor<6x4x210xi32>			%shape = linalg.init_tensor [6, 4, 210] : tensor<6x4x210xi32>
	%c = linalg.indexed_generic {
	indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>,
	affine_map<(d0, d1, d2) -> (d1, d2)>,
	affine_map<(d0, d1, d2) -> (d0, d2, d1)>],
	iterator_types = ["parallel", "parallel", "parallel"]}
	ins(%a, %b : tensor<210x6x4xi32>, tensor<210x4xi32>)
	outs(%shape : tensor<6x4x210xi32>) {
	^bb0(%arg0 : index, %arg1 : index, %arg2 : index, %arg3 : i32, %arg4: i32, %s: i32):
	%1 = addi %arg3, %arg4 : i32
	%2 = index_cast %arg0 : index to i32
	%3 = addi %1, %2 : i32
	%4 = index_cast %arg1 : index to i32
	%5 = addi %3, %4 : i32
	%6 = index_cast %arg2 : index to i32
	%7 = addi %5, %6 : i32
	linalg.yield %7 : i32
	} -> tensor<6x4x210xi32>
	%d = linalg.tensor_reshape %c [[0, 1], [2], [3, 4, 5]]
	: tensor<6x4x210xi32> into tensor<2x3x4x5x6x7xi32>
	return %d : tensor<2x3x4x5x6x7xi32>
	}
	// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d3, d4, d0, d1, d5)>
	// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d3, d4, d5)>
	// CHECK-DAG: #[[MAP7:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d2, d3, d4)>
	// CHECK-DAG: #[[MAP8:.+]] = affine_map<(d0, d1) -> (d0 + d1 * 3)>
	// CHECK-DAG: #[[MAP9:.+]] = affine_map<(d0, d1, d2) -> (d0 + d1 * 7 + d2 * 42)>
	// CHECK: func @reshape_as_consumer_permutation
	// CHECK-SAME: %[[ARG0:.+]]: tensor<210x6x4xi32>
	// CHECK-SAME: %[[ARG1:.+]]: tensor<210x4xi32>
	// CHECK-DAG: %[[T1:.+]] = linalg.tensor_reshape %[[ARG0]]
	// CHECK-SAME: [0, 1, 2], [3, 4], [5]
	// CHECK-DAG: %[[T2:.+]] = linalg.tensor_reshape %[[ARG1]]
	// CHECK-SAME: [0, 1, 2], [3]
	// CHECK-DAG: %[[T0:.+]] = linalg.init_tensor [2, 3, 4, 5, 6, 7]
	// CHECK: %[[T4:.+]] = linalg.generic
	// CHECK-SAME: indexing_maps = [#[[MAP5]], #[[MAP6]], #[[MAP7]]]
	// CHECK-SAME: ins(%[[T1]], %[[T2]] : tensor<5x6x7x2x3x4xi32>, tensor<5x6x7x4xi32>)
	// CHECK-SAME: outs(%[[T0]] : tensor<2x3x4x5x6x7xi32>)
	// CHECK: ^{{.+}}(
	// CHECK-SAME: %[[ARG8:[a-zA-Z0-9]+]]: i32, %[[ARG9:[a-zA-Z0-9]+]]: i32,
	// CHECK-SAME: %[[ARG10:[a-zA-Z0-9]+]]: i32)
	// CHECK: %[[ARG2:.+]] = linalg.index 0 : index
	// CHECK: %[[ARG3:.+]] = linalg.index 1 : index
	// CHECK: %[[ARG4:.+]] = linalg.index 2 : index
	// CHECK: %[[ARG5:.+]] = linalg.index 3 : index
	// CHECK: %[[ARG6:.+]] = linalg.index 4 : index
	// CHECK: %[[ARG7:.+]] = linalg.index 5 : index
	// CHECK-DAG: %[[T5:.+]] = affine.apply #[[MAP8]](%[[ARG3]], %[[ARG2]])
	// CHECK-DAG: %[[T6:.+]] = affine.apply #[[MAP9]](%[[ARG6]], %[[ARG5]], %[[ARG4]])
	// CHECK-DAG: %[[T7:.+]] = addi %[[ARG8]], %[[ARG9]]
	// CHECK: %[[T8:.+]] = index_cast %[[T5]]
	// CHECK: %[[T9:.+]] = addi %[[T7]], %[[T8]]
	// CHECK: %[[T10:.+]] = index_cast %[[T6]]
	// CHECK: %[[T11:.+]] = addi %[[T9]], %[[T10]]
	// CHECK: %[[T12:.+]] = index_cast %[[ARG7]]
	// CHECK: %[[T13:.+]] = addi %[[T11]], %[[T12]]

	// -----

	func @reshape_as_consumer_permutation
	(%a : tensor<210x6x4xi32>, %b : tensor<210x4xi32>)
	-> tensor<2x3x4x5x6x7xi32> {
	%shape = linalg.init_tensor [6, 4, 210] : tensor<6x4x210xi32>
	%c = linalg.generic {			%c = linalg.generic {
	indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>,			indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>,
	affine_map<(d0, d1, d2) -> (d1, d2)>,			affine_map<(d0, d1, d2) -> (d1, d2)>,
	affine_map<(d0, d1, d2) -> (d0, d2, d1)>],			affine_map<(d0, d1, d2) -> (d0, d2, d1)>],
	iterator_types = ["parallel", "parallel", "parallel"]}			iterator_types = ["parallel", "parallel", "parallel"]}
	ins(%a, %b : tensor<210x6x4xi32>, tensor<210x4xi32>)			ins(%a, %b : tensor<210x6x4xi32>, tensor<210x4xi32>)
	outs(%shape : tensor<6x4x210xi32>) {			outs(%shape : tensor<6x4x210xi32>) {
	^bb0(%arg3 : i32, %arg4: i32, %s: i32):			^bb0(%arg3 : i32, %arg4: i32, %s: i32):
	▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines

	// -----			// -----

	func @reshape_as_producer_projected_permutation(			func @reshape_as_producer_projected_permutation(
	%arg0 : tensor<33x8x?xi32>, %shape : tensor<264x?x4xi32>) -> tensor<264x?x4xi32>			%arg0 : tensor<33x8x?xi32>, %shape : tensor<264x?x4xi32>) -> tensor<264x?x4xi32>
	{			{
	%0 = linalg.tensor_reshape %arg0 [[0, 1], [2]]			%0 = linalg.tensor_reshape %arg0 [[0, 1], [2]]
	: tensor<33x8x?xi32> into tensor<264x?xi32>			: tensor<33x8x?xi32> into tensor<264x?xi32>
	%1 = linalg.indexed_generic
	{indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>,
	affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
	iterator_types = ["parallel", "parallel", "parallel"]}
	ins(%0 : tensor<264x?xi32>)
	outs(%shape : tensor<264x?x4xi32>) {
	^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: i32, %s: i32): // no predecessors
	%2 = index_cast %arg1 : index to i32
	%3 = addi %arg4, %2 : i32
	%4 = index_cast %arg2 : index to i32
	%5 = addi %3, %4 : i32
	%6 = index_cast %arg3 : index to i32
	%7 = addi %5, %6 : i32
	linalg.yield %7 : i32
	} -> tensor<264x?x4xi32>
	return %1 : tensor<264x?x4xi32>
	}

	// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
	// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0 + d1 * 8)>
	// CHECK: @reshape_as_producer_projected_permutation
	// CHECK-SAME: %[[ARG0:.+]]: tensor<33x8x?xi32>
	// CHECK: %[[RES:.+]] = linalg.generic
	// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]]
	// CHECK-SAME: ins(%[[ARG0]] : tensor<33x8x?xi32>)
	// CHECK: ^{{.+}}(
	// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: i32,
	// CHECK-SAME: %[[ARG7:[a-zA-Z0-9]+]]: i32)
	// CHECK: %[[ARG1:.+]] = linalg.index 0 : index
	// CHECK: %[[ARG2:.+]] = linalg.index 1 : index
	// CHECK: %[[ARG3:.+]] = linalg.index 2 : index
	// CHECK: %[[ARG4:.+]] = linalg.index 3 : index
	// CHECK: %[[T0:.+]] = affine.apply #[[MAP2]](%[[ARG2]], %[[ARG1]])
	// CHECK: %[[T1:.+]] = index_cast %[[T0]] : index to i32
	// CHECK: %[[T2:.+]] = addi %[[ARG5]], %[[T1]] : i32
	// CHECK: %[[T3:.+]] = index_cast %[[ARG3]] : index to i32
	// CHECK: %[[T4:.+]] = addi %[[T2]], %[[T3]] : i32
	// CHECK: %[[T5:.+]] = index_cast %[[ARG4]] : index to i32
	// CHECK: %[[T6:.+]] = addi %[[T4]], %[[T5]] : i32
	// CHECK: linalg.yield %[[T6]] : i32
	// CHECK: %[[RES2:.+]] = linalg.tensor_reshape %[[RES]]
	// CHECK-SAME: [0, 1], [2], [3]
	// CHECK-SAME: : tensor<33x8x?x4xi32> into tensor<264x?x4xi32>
	// CHECK: return %[[RES2]] : tensor<264x?x4xi32>

	// -----

	func @reshape_as_producer_projected_permutation(
	%arg0 : tensor<33x8x?xi32>, %shape : tensor<264x?x4xi32>) -> tensor<264x?x4xi32>
	{
	%0 = linalg.tensor_reshape %arg0 [[0, 1], [2]]
	: tensor<33x8x?xi32> into tensor<264x?xi32>
	%1 = linalg.generic			%1 = linalg.generic
	{indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>,			{indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>,
	affine_map<(d0, d1, d2) -> (d0, d1, d2)>],			affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
	iterator_types = ["parallel", "parallel", "parallel"]}			iterator_types = ["parallel", "parallel", "parallel"]}
	ins(%0 : tensor<264x?xi32>)			ins(%0 : tensor<264x?xi32>)
	outs(%shape : tensor<264x?x4xi32>) {			outs(%shape : tensor<264x?x4xi32>) {
	^bb0(%arg1: i32, %s: i32): // no predecessors			^bb0(%arg1: i32, %s: i32): // no predecessors
	%idx0 = linalg.index 0 : index			%idx0 = linalg.index 0 : index
	▲ Show 20 Lines • Show All 163 Lines • Show Last 20 Lines

mlir/test/Dialect/Linalg/reshape_linearization_fusion.mlir

	// RUN: mlir-opt -split-input-file -linalg-fold-reshape-ops-by-linearization %s | FileCheck %s			// RUN: mlir-opt -split-input-file -linalg-fold-reshape-ops-by-linearization %s | FileCheck %s

	#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>			#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	func @generic_op_reshape_producer_fusion(%arg0 : tensor<?x?x?xf32>,			func @generic_op_reshape_producer_fusion(%arg0 : tensor<?x?x?xi32>)
	%arg1 : tensor<?x?x4x?xf32>) -> tensor<?x?x4x?xf32> {
	%0 = linalg.tensor_reshape %arg0 [[0], [1, 2], [3]] :
	tensor<?x?x?xf32> into tensor<?x?x4x?xf32>
	%1 = linalg.generic {
	indexing_maps = [#map0, #map0, #map0],
	iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
	ins(%0, %arg1 : tensor<?x?x4x?xf32>, tensor<?x?x4x?xf32>)
	outs(%0 : tensor<?x?x4x?xf32>) {
	^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors
	%1 = mulf %arg3, %arg4 : f32
	linalg.yield %1 : f32
	} -> tensor<?x?x4x?xf32>
	return %1 : tensor<?x?x4x?xf32>
	}
	// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1 * 4 + d2, d3)>
	// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	// CHECK: func @generic_op_reshape_producer_fusion
	// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xf32>
	// CHECK: %[[T0:.+]] = linalg.tensor_reshape %[[ARG0]]
	// CHECK-SAME: [0], [1, 2], [3]
	// CHECK: linalg.generic
	// CHECK-SAME: indexing_maps = [#[[MAP3]], #[[MAP4]], #[[MAP4]]]
	// CHECK-SAME: ins(%[[ARG0]], %{{.+}} : tensor<?x?x?xf32>, tensor<?x?x4x?xf32>)
	// CHECK-SAME: outs(%[[T0]] : tensor<?x?x4x?xf32>)

	// -----

	#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	func @generic_op_reshape_consumer_fusion(%arg0 : tensor<?x?x4x5xf32>,
	%arg1 : tensor<?x?x4x5xf32>) -> tensor<?x?xf32> {
	%0 = linalg.generic {
	indexing_maps = [#map0, #map0, #map0],
	iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
	ins(%arg0, %arg1 : tensor<?x?x4x5xf32>, tensor<?x?x4x5xf32>)
	outs(%arg0 : tensor<?x?x4x5xf32>){
	^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors
	%1 = mulf %arg3, %arg4 : f32
	linalg.yield %1 : f32
	} -> tensor<?x?x4x5xf32>
	%1 = linalg.tensor_reshape %0 [[0], [1, 2, 3]] :
	tensor<?x?x4x5xf32> into tensor<?x?xf32>
	return %1 : tensor<?x?xf32>
	}

	// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1 * 20 + d2 * 5 + d3)>
	// CHECK: func @generic_op_reshape_consumer_fusion
	// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x4x5xf32>
	// CHECK: %[[T0:.+]] = linalg.tensor_reshape %[[ARG0]]
	// CHECK-SAME: [0], [1, 2, 3]
	// CHECK: linalg.generic
	// CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP2]], #[[MAP3]]]
	// CHECK-SAME: outs(%[[T0]] : tensor<?x?xf32>)

	// -----

	#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	func @indexed_generic_op_reshape_producer_fusion(%arg0 : tensor<?x?x?xi32>)
	-> tensor<?x?x4x?xi32> {			-> tensor<?x?x4x?xi32> {
	%0 = linalg.tensor_reshape %arg0 [[0], [1, 2], [3]] :			%0 = linalg.tensor_reshape %arg0 [[0], [1, 2], [3]] :
	tensor<?x?x?xi32> into tensor<?x?x4x?xi32>			tensor<?x?x?xi32> into tensor<?x?x4x?xi32>
	%1 = linalg.indexed_generic {			%1 = linalg.generic {
	indexing_maps = [#map0, #map0],			indexing_maps = [#map0, #map0],
	iterator_types = ["parallel", "parallel", "parallel", "parallel"] }			iterator_types = ["parallel", "parallel", "parallel", "parallel"] }
	ins(%0 : tensor<?x?x4x?xi32>)			ins(%0 : tensor<?x?x4x?xi32>)
	outs(%0 : tensor<?x?x4x?xi32>) {			outs(%0 : tensor<?x?x4x?xi32>) {
	^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: i32, %arg7 : i32): // no predecessors			^bb0(%arg6: i32, %arg7 : i32): // no predecessors
	%2 = index_cast %arg2 : index to i32			%idx = linalg.index 0 : index
				%2 = index_cast %idx : index to i32
	%3 = addi %arg6, %2 : i32			%3 = addi %arg6, %2 : i32
	linalg.yield %3 : i32			linalg.yield %3 : i32
	} -> tensor<?x?x4x?xi32>			} -> tensor<?x?x4x?xi32>
	return %1 : tensor<?x?x4x?xi32>			return %1 : tensor<?x?x4x?xi32>
	}			}
	// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1 * 4 + d2, d3)>			// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1 * 4 + d2, d3)>
	// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>			// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	// CHECK: func @indexed_generic_op_reshape_producer_fusion			// CHECK: func @generic_op_reshape_producer_fusion
	// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xi32>			// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xi32>
	// CHECK: %[[T0:.+]] = linalg.tensor_reshape %[[ARG0]]			// CHECK: %[[T0:.+]] = linalg.tensor_reshape %[[ARG0]]
	// CHECK-SAME: [0], [1, 2], [3]			// CHECK-SAME: [0], [1, 2], [3]
	// CHECK: linalg.indexed_generic			// CHECK: linalg.generic
	// CHECK-SAME: indexing_maps = [#[[MAP3]], #[[MAP4]]]			// CHECK-SAME: indexing_maps = [#[[MAP3]], #[[MAP4]]]
	// CHECK-SAME: ins(%[[ARG0]] : tensor<?x?x?xi32>)			// CHECK-SAME: ins(%[[ARG0]] : tensor<?x?x?xi32>)
	// CHECK-SAME: outs(%[[T0]] : tensor<?x?x4x?xi32>)			// CHECK-SAME: outs(%[[T0]] : tensor<?x?x4x?xi32>)
				// CHECK: %[[IDX:.+]] = linalg.index 0 : index
				// CHECK-NEXT: %[[IDX_CASTED:.+]] = index_cast %[[IDX]] : index to i32

	// -----			// -----

	#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>			#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	func @indexed_generic_op_reshape_consumer_fusion(%arg0 : tensor<?x?x4x5xi32>)			func @generic_op_reshape_consumer_fusion(%arg0 : tensor<?x?x4x5xi32>)
	-> tensor<?x?xi32> {			-> tensor<?x?xi32> {
	%0 = linalg.indexed_generic {			%0 = linalg.generic {
	indexing_maps = [#map0, #map0],			indexing_maps = [#map0, #map0],
	iterator_types = ["parallel", "parallel", "parallel", "parallel"] }			iterator_types = ["parallel", "parallel", "parallel", "parallel"] }
	ins(%arg0 : tensor<?x?x4x5xi32>) outs(%arg0 : tensor<?x?x4x5xi32>) {			ins(%arg0 : tensor<?x?x4x5xi32>) outs(%arg0 : tensor<?x?x4x5xi32>) {
	^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: i32, %arg7: i32): // no predecessors			^bb0(%arg6: i32, %arg7: i32): // no predecessors
	%2 = index_cast %arg2 : index to i32			%idx = linalg.index 0 : index
				%2 = index_cast %idx : index to i32
	%3 = addi %arg6, %2 : i32			%3 = addi %arg6, %2 : i32
	linalg.yield %3 : i32			linalg.yield %3 : i32
	} -> tensor<?x?x4x5xi32>			} -> tensor<?x?x4x5xi32>
	%1 = linalg.tensor_reshape %0 [[0], [1, 2, 3]] :			%1 = linalg.tensor_reshape %0 [[0], [1, 2, 3]] :
	tensor<?x?x4x5xi32> into tensor<?x?xi32>			tensor<?x?x4x5xi32> into tensor<?x?xi32>
	return %1 : tensor<?x?xi32>			return %1 : tensor<?x?xi32>
	}			}
	// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>			// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1 * 20 + d2 * 5 + d3)>			// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1 * 20 + d2 * 5 + d3)>
	// CHECK: func @indexed_generic_op_reshape_consumer_fusion			// CHECK: func @generic_op_reshape_consumer_fusion
	// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x4x5xi32>			// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x4x5xi32>
	// CHECK: %[[T0:.+]] = linalg.tensor_reshape %[[ARG0]]			// CHECK: %[[T0:.+]] = linalg.tensor_reshape %[[ARG0]]
	// CHECK-SAME: [0], [1, 2, 3]			// CHECK-SAME: [0], [1, 2, 3]
	// CHECK: linalg.indexed_generic			// CHECK: linalg.generic
	// CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP3]]]			// CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP3]]]
	// CHECK-SAME: outs(%[[T0]] : tensor<?x?xi32>)			// CHECK-SAME: outs(%[[T0]] : tensor<?x?xi32>)
				// CHECK: %[[IDX:.+]] = linalg.index 0 : index
				// CHECK-NEXT: %[[IDX_CASTED:.+]] = index_cast %[[IDX]] : index to i32
	// CHECK-NOT: linalg.tensor_reshape			// CHECK-NOT: linalg.tensor_reshape

	// -----			// -----

	#map2 = affine_map<(d0, d1, d2) -> (d0, d2, d1)>			#map2 = affine_map<(d0, d1, d2) -> (d0, d2, d1)>
	#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>			#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
	func @generic_op_021_permultation_reshape_producer_fusion(%arg0 : tensor<3x35xf32>) -> tensor<3x7x5xf32> {			func @generic_op_021_permultation_reshape_producer_fusion(%arg0 : tensor<3x35xf32>) -> tensor<3x7x5xf32> {
	%0 = linalg.tensor_reshape %arg0 [[0], [1, 2]]			%0 = linalg.tensor_reshape %arg0 [[0], [1, 2]]
	▲ Show 20 Lines • Show All 130 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][linalg] Remove IndexedGenericOp support from FusionOnTensors...
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 344332

mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp

mlir/test/Dialect/Linalg/fusion-tensor.mlir

mlir/test/Dialect/Linalg/reshape_fusion.mlir

mlir/test/Dialect/Linalg/reshape_linearization_fusion.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][linalg] Remove IndexedGenericOp support from FusionOnTensors...
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 344332

mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp

mlir/test/Dialect/Linalg/fusion-tensor.mlir

mlir/test/Dialect/Linalg/reshape_fusion.mlir

mlir/test/Dialect/Linalg/reshape_linearization_fusion.mlir

[mlir][linalg] Remove IndexedGenericOp support from FusionOnTensors...
ClosedPublic