Diff 500804

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

Show First 20 Lines • Show All 1,719 Lines • ▼ Show 20 Lines	let description = [{
Hoist vector.transfer_read / vector.transfer_write pairs out of immediately		Hoist vector.transfer_read / vector.transfer_write pairs out of immediately
enclosing scf::ForOp iteratively, if the following conditions are true:		enclosing scf::ForOp iteratively, if the following conditions are true:
1. The 2 ops access the same memref with the same indices.		1. The 2 ops access the same memref with the same indices.
2. All operands are invariant under the enclosing scf::ForOp.		2. All operands are invariant under the enclosing scf::ForOp.
3. No uses of the memref either dominate the transfer_read or are		3. No uses of the memref either dominate the transfer_read or are
dominated by the transfer_write (i.e. no aliasing between the write and		dominated by the transfer_write (i.e. no aliasing between the write and
the read across the loop)		the read across the loop)

		WARNING: This hoisting does not model parallelism and is generally incorrect
		when used on distributed loops with memref semantics!
		TODO: obsolete and should be retired.

#### Return modes:		#### Return modes:

The operation always succeeds and returns a handle to the transformed		The operation always succeeds and returns a handle to the transformed
function op.		function op.
}];		}];

let arguments = (ins TransformHandleTypeInterface:$target);		let arguments = (ins TransformHandleTypeInterface:$target);
let results = (outs TransformHandleTypeInterface:$transformed);		let results = (outs TransformHandleTypeInterface:$transformed);
▲ Show 20 Lines • Show All 82 Lines • ▼ Show 20 Lines	def ConvertConv2DToImg2ColOp : Op<Transform_Dialect,
let extraClassDeclaration = [{		let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(		::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::linalg::LinalgOp target,		::mlir::linalg::LinalgOp target,
::mlir::transform::ApplyToEachResultList &results,		::mlir::transform::ApplyToEachResultList &results,
::mlir::transform::TransformState &state);		::mlir::transform::TransformState &state);
}];		}];
}		}

		//===----------------------------------------------------------------------===//
		// HoistRedundantTensorSubsetsOp
		//===----------------------------------------------------------------------===//

		def HoistRedundantTensorSubsetsOp :
		Op<Transform_Dialect, "structured.hoist_redundant_tensor_subsets",
		[FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
		TransformEachOpTrait, TransformOpInterface]> {
		let description = [{
		Hoists supported tensor subset extract/insert operation pairs out of
		immediately enclosing loop iteratively, if the following conditions
		are true:
		1. The 2 ops access the same tensor subset.
		2. All operands are invariant under the enclosing loop.

		The supported subset extract/insert operation pairs currently comprise:
		- tensor.extract_slice / tensor.insert_slice
		- vector.transfer_read / vector.transfer_write on tensors

		Only scf.for loops are currently supported.

		When applied to:
		1. an scf.for loop, hoist out of this loop only.
		2. a non-loop op, apply hoisting to all the contained loop ops.

		#### Return modes:

		The operation always succeeds and returns a handle to the transformed
		function op.
		}];

		let arguments = (ins TransformHandleTypeInterface:$target);
		let results = (outs TransformHandleTypeInterface:$transformed);

		let assemblyFormat = "$target attr-dict `:` functional-type(operands, results) ";

		let builders = [
		OpBuilder<(ins "Value":$target)>,
		];
		let extraClassDeclaration = [{
		::mlir::DiagnosedSilenceableFailure applyToOne(
		::mlir::Operation *target,
		::mlir::transform::ApplyToEachResultList &results,
		::mlir::transform::TransformState &state);
		}];
		}

#endif // LINALG_TRANSFORM_OPS		#endif // LINALG_TRANSFORM_OPS

mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h

	//===- Hoisting.h - Linalg hoisting transformations -------------- C++ --===//			//===- Hoisting.h - Linalg hoisting transformations -------------- C++ --===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_			#ifndef MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_
	#define MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_			#define MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_

	namespace mlir {			namespace mlir {
				class RewriterBase;
	namespace func {			namespace func {
	class FuncOp;			class FuncOp;
	} // namespace func			} // namespace func
				namespace scf {
				class ForOp;
				} // namespace scf

	namespace linalg {			namespace linalg {

	/// Hoist vector.transfer_read/vector.transfer_write on buffers pairs out of			/// Hoist vector.transfer_read/vector.transfer_write on buffers pairs out of
	/// immediately enclosing scf::ForOp iteratively, if the following conditions			/// immediately enclosing scf::ForOp iteratively, if the following conditions
	/// are true:			/// are true:
	/// 1. The two ops access the same memref with the same indices.			/// 1. The two ops access the same memref with the same indices.
	/// 2. All operands are invariant under the enclosing scf::ForOp.			/// 2. All operands are invariant under the enclosing scf::ForOp.
	/// 3. No uses of the memref either dominate the transfer_read or are			/// 3. No uses of the memref either dominate the transfer_read or are
	/// dominated by the transfer_write (i.e. no aliasing between the write and			/// dominated by the transfer_write (i.e. no aliasing between the write and
	/// the read across the loop)			/// the read across the loop)
	/// To improve hoisting opportunities, call the `moveLoopInvariantCode` helper			/// To improve hoisting opportunities, call the `moveLoopInvariantCode` helper
	/// function on the candidate loop above which to hoist. Hoisting the transfers			/// function on the candidate loop above which to hoist. Hoisting the transfers
	/// results in scf::ForOp yielding the value that originally transited through			/// results in scf::ForOp yielding the value that originally transited through
	/// memory.			/// memory.
	// TODO: generalize on a per-need basis.			///
				/// WARNING: This hoisting does not model parallelism and is generally incorrect
				/// when used on distributed loops with memref semantics!
				ThomasRaouxUnsubmitted Done Reply Inline Actions The other implementation won't work on buffers right? I don't think we can retire that until we have an alternative. ThomasRaoux: The other implementation won't work on buffers right? I don't think we can retire that until we…
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions ack It's still broken in the parallel case though. nicolasvasilache: ack It's still broken in the parallel case though.
	void hoistRedundantVectorTransfers(func::FuncOp func);			void hoistRedundantVectorTransfers(func::FuncOp func);

	/// Same behavior as `hoistRedundantVectorTransfers` but works on tensors			/// Greedily hoist redundant subset extract/insert operations on tensors outside
	/// instead of buffers.			/// of `forOp`. The logic follows:
				/// 1. Look for a write walking back from the `forOp` yield.
				/// 2. Check the uses of the matching block argument and look for a matching
				springermUnsubmitted Done Reply Inline Actions Will this break if the loop yields two iter_args in a different order? springerm: Will this break if the loop yields two iter_args in a different order?
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions The algorithm only considers bbArg and matching yield, if the input IR ping-pongs, it will not apply. nicolasvasilache: The algorithm only considers bbArg and matching yield, if the input IR ping-pongs, it will not…
				/// read (i.e. extract_slice of transfer_read) with matching indices.
				/// 3. In the case of a transfer_write, we can bypass other non-conflicting
				/// operations and find more hoisting opportunities.
				springermUnsubmitted Done Reply Inline Actions It's a bit unclear what a WAW conflict on tensors is. springerm: It's a bit unclear what a WAW conflict on tensors is.
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions a WAW appears by virtue of reordering ops and using iter_args. Dropping WAW though because it is indeed biased towards "memory". nicolasvasilache: a WAW appears by virtue of reordering ops and using iter_args. Dropping WAW though because it…
				/// 4. Hoist the read/write pair and update the tensor SSA links.
				///
				/// Return the unmodified `forOp` if no hoisting occured.
				/// Return a new scf::ForOp if hoisting on tensors occured.
				///
				springermUnsubmitted Done Reply Inline Actions An example would be nice here. springerm: An example would be nice here.
				/// After this transformation the returned scf::ForOp may have unused arguments
				springermUnsubmitted Done Reply Inline Actions nit: `returned` springerm: nit: `returned`
				/// that can be removed by application of canonicalization patterns.
				///
				/// Example:
				/// ========
				/// IR Resembling:
				///
				/// ```
				/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0)->(tensor<10xf32>) {
				/// %1 = scf.for %j = %l to %u step %s iter_args(%a6 = %a0)->(tensor<10xf32>) {
				/// %e = tensor.extract_slice %a6[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
				/// %r = vector.transfer_read %e[%c0], %cst: tensor<?xf32>, vector<4xf32>
				/// %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
				/// %w = vector.transfer_write %u, %e[%c0] : vector<4xf32>, tensor<?xf32>
				/// %st = tensor.insert_slice %w into %a6[%i][%sz][1]
				/// : tensor<?xf32> into tensor<10xf32>
				/// scf.yield %st: tensor<10xf32>
				/// }
				/// scf.yield %1: tensor<10xf32>
				/// }
				/// ```
				///
				/// Progressively hoists to:
				///
				/// ```
				/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>){
				/// %e = tensor.extract_slice %a0[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
				/// %1:2 = scf.for %j = %l to %u step %s iter_args(%a6 = a0, %a7 = %e)
				/// -> (tensor<10xf32>, tensor<?xf32>) {
				/// %r = vector.transfer_read %a7[%c0], %cst: tensor<?xf32>, vector<4xf32>
				/// %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
				/// %w = vector.transfer_write %u, %a7[%c0] : vector<4xf32>, tensor<?xf32>
				/// scf.yield %a6, %w: tensor<10xf32>, tensor<?xf32>
				/// }
				/// %st = tensor.insert_slice %1#1 into %1#0[%i][%sz][1]
				/// : tensor<?xf32> into tensor<10xf32>
				/// scf.yield %1: tensor<10xf32>
				/// }
				/// ```
				///
				/// and
				///
				/// ```
				/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>){
				/// %e = tensor.extract_slice %a0[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
				/// %r = vector.transfer_read %a7[%c0], %cst: tensor<?xf32>, vector<4xf32>
				/// %1:3 = scf.for %j = %l to %u step %s iter_args(%a6 = a0, %a7 = %e, %a7 = r)
				/// -> (tensor<10xf32>, tensor<?xf32>, vector<4xf32>) {
				/// %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
				/// scf.yield %a6, %a7, %u: tensor<10xf32>, tensor<?xf32>, vector<4xf32>
				/// }
				/// %w = vector.transfer_write %1#2, %1#1[%c0] : vector<4xf32>, tensor<?xf32>
				/// %st = tensor.insert_slice %w into %1#0[%i][%sz][1]
				/// : tensor<?xf32> into tensor<10xf32>
				/// scf.yield %1: tensor<10xf32>
				/// }
				/// ```
				///
				/// It can then canonicalize to:
				///
				/// ```
				/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>){
				/// %e = tensor.extract_slice %a0[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
				/// %r = vector.transfer_read %a7[%c0], %cst: tensor<?xf32>, vector<4xf32>
				/// %1 = scf.for %j = %l to %u step %s iter_args(%a7 = r)
				/// -> (tensor<10xf32>, tensor<?xf32>, vector<4xf32>) {
				/// %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
				/// scf.yield %u: vector<4xf32>
				/// }
				/// %w = vector.transfer_write %1, %e[%c0] : vector<4xf32>, tensor<?xf32>
				/// %st = tensor.insert_slice %w into %a0[%i][%sz][1]
				/// : tensor<?xf32> into tensor<10xf32>
				/// scf.yield %1: tensor<10xf32>
				/// }
				/// ```
				///
				// TODO: This should be further generalized along a few different axes:
				// - Other loops than scf.ForOp that operate on tensors (both sequential and
				// parallel loops).
				// - Other subset extract/insert pairs than tensor.extract/insert_slice and
				// vector.transfer_read/write.
				// - More general areSubsetDisjoint analysis/interface to work across all
				// subset op types and allow bypassing non-WAW-conflicting operations in
				// more cases.
				scf::ForOp hoistRedundantSubsetExtractInsert(RewriterBase &rewriter,
				scf::ForOp forOp);

				/// Call into `hoistRedundantSubsetInsertExtract` without a RewriterBase.
				// TODO: obsolete and should be retired
				springermUnsubmitted Done Reply Inline Actions "redundant" sounds like they would fold away. springerm: "redundant" sounds like they would fold away.
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions They are redundant within the loop: out of N iterations of the loop, N-1 read/write are redundant. It can be viewed as a form of folding: to 1 read/write outside of the loop, nicolasvasilache: They are redundant within the loop: out of N iterations of the loop, N-1 read/write are…
	void hoistRedundantVectorTransfersOnTensor(func::FuncOp func);			void hoistRedundantVectorTransfersOnTensor(func::FuncOp func);

	} // namespace linalg			} // namespace linalg
	} // namespace mlir			} // namespace mlir

	#endif // MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_			#endif // MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_

mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td

Show All 24 Lines	class Tensor_Op<string mnemonic, list<Trait> traits = []>
: Op<Tensor_Dialect, mnemonic, traits>;		: Op<Tensor_Dialect, mnemonic, traits>;

// Base class for ops with static/dynamic offset, sizes and strides		// Base class for ops with static/dynamic offset, sizes and strides
// attributes/arguments.		// attributes/arguments.
class Tensor_OpWithOffsetSizesAndStrides<string mnemonic,		class Tensor_OpWithOffsetSizesAndStrides<string mnemonic,
list<Trait> traits = []>		list<Trait> traits = []>
: Tensor_Op<mnemonic, traits> {		: Tensor_Op<mnemonic, traits> {
code extraBaseClassDeclaration = [{		code extraBaseClassDeclaration = [{
/// Returns the dynamic sizes for this subview operation if specified.		/// Return the type of the base tensor operand.
		::mlir::RankedTensorType getSourceType() {
		return getSource().getType().cast<RankedTensorType>();
		}

		/// Return the type of the result tensor.
		::mlir::RankedTensorType getResultType() {
		return getResult().getType().cast<RankedTensorType>();
		}

		/// Return the dynamic sizes for this subview operation if specified.
::mlir::Operation::operand_range getDynamicSizes() { return getSizes(); }		::mlir::Operation::operand_range getDynamicSizes() { return getSizes(); }

/// Return the list of Range (i.e. offset, size, stride). Each		/// Return the list of Range (i.e. offset, size, stride). Each
/// Range entry contains either the dynamic value or a ConstantIndexOp		/// Range entry contains either the dynamic value or a ConstantIndexOp
/// constructed with `b` at location `loc`.		/// constructed with `b` at location `loc`.
::mlir::SmallVector<::mlir::Range, 8> getOrCreateRanges(		::mlir::SmallVector<::mlir::Range, 8> getOrCreateRanges(
::mlir::OpBuilder &b, ::mlir::Location loc) {		::mlir::OpBuilder &b, ::mlir::Location loc) {
return ::mlir::getOrCreateRanges(*this, b, loc);		return ::mlir::getOrCreateRanges(*this, b, loc);
▲ Show 20 Lines • Show All 58 Lines • ▼ Show 20 Lines	let description = [{

Example:		Example:

```mlir		```mlir
// Always returns 4, can be constant folded:		// Always returns 4, can be constant folded:
%c0 = arith.constant 0 : index		%c0 = arith.constant 0 : index
%x = tensor.dim %A, %c0 : tensor<4x?xf32>		%x = tensor.dim %A, %c0 : tensor<4x?xf32>

// Returns the dynamic dimension of %A.		// Return the dynamic dimension of %A.
%c1 = arith.constant 1 : index		%c1 = arith.constant 1 : index
%y = tensor.dim %A, %c1 : memref<4x?xf32>		%y = tensor.dim %A, %c1 : memref<4x?xf32>

// Equivalent generic form:		// Equivalent generic form:
%x = "tensor.dim"(%A, %c0) : (memref<4x?xf32>, index) -> index		%x = "tensor.dim"(%A, %c0) : (memref<4x?xf32>, index) -> index
%y = "tensor.dim"(%A, %c1) : (memref<4x?xf32>, index) -> index		%y = "tensor.dim"(%A, %c1) : (memref<4x?xf32>, index) -> index
```		```
}];		}];
▲ Show 20 Lines • Show All 239 Lines • ▼ Show 20 Lines	OpBuilder<(ins "RankedTensorType":$resultType, "Value":$source,
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,		CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
// Build an ExtractSliceOp with mixed static and dynamic entries packed in		// Build an ExtractSliceOp with mixed static and dynamic entries packed in
// a Range vector.		// a Range vector.
OpBuilder<(ins "Value":$source, "ArrayRef<Range>":$ranges,		OpBuilder<(ins "Value":$source, "ArrayRef<Range>":$ranges,
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,		CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
];		];

let extraClassDeclaration = extraBaseClassDeclaration # [{		let extraClassDeclaration = extraBaseClassDeclaration # [{
/// Returns the type of the base tensor operand.
RankedTensorType getSourceType() {
return getSource().getType().cast<RankedTensorType>();
}

/// The result of an extract_slice is always a tensor.		/// The result of an extract_slice is always a tensor.
		// TODO: deprecate
RankedTensorType getType() {		RankedTensorType getType() {
return getResult().getType().cast<RankedTensorType>();		return getResultType();
}		}

/// Compute the rank-reduction mask that can be applied to map the source		/// Compute the rank-reduction mask that can be applied to map the source
/// tensor type to the result tensor type by dropping unit dims.		/// tensor type to the result tensor type by dropping unit dims.
std::optional<llvm::SmallDenseSet<unsigned>>		std::optional<llvm::SmallDenseSet<unsigned>>
computeRankReductionMask() {		computeRankReductionMask() {
return ::mlir::computeRankReductionMask(getSourceType().getShape(),		return ::mlir::computeRankReductionMask(getSourceType().getShape(),
getType().getShape());		getType().getShape());
▲ Show 20 Lines • Show All 449 Lines • ▼ Show 20 Lines	let builders = [
// Build an InsertSliceOp with mixed static and dynamic entries packed in		// Build an InsertSliceOp with mixed static and dynamic entries packed in
// a Range vector.		// a Range vector.
OpBuilder<(ins "Value":$source, "Value":$dest,		OpBuilder<(ins "Value":$source, "Value":$dest,
"ArrayRef<Range>":$ranges,		"ArrayRef<Range>":$ranges,
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>		CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
];		];

let extraClassDeclaration = extraBaseClassDeclaration # [{		let extraClassDeclaration = extraBaseClassDeclaration # [{
/// Returns the type of the base tensor operand.
RankedTensorType getSourceType() {
return getSource().getType().cast<RankedTensorType>();
}

/// The result of a insert_slice is always a tensor.		/// The result of a insert_slice is always a tensor.
		// TODO: Deprecate this method.
RankedTensorType getType() {		RankedTensorType getType() {
return getResult().getType().cast<RankedTensorType>();		return getResultType();
}		}

/// The `dest` type is the same as the result type.		/// The `dest` type is the same as the result type.
RankedTensorType getDestType() {		RankedTensorType getDestType() {
return getType();		return getResultType();
}		}

/// Return the expected rank of each of the`static_offsets`, `static_sizes`		/// Return the expected rank of each of the`static_offsets`, `static_sizes`
/// and `static_strides` attributes.		/// and `static_strides` attributes.
std::array<unsigned, 3> getArrayAttrMaxRanks() {		std::array<unsigned, 3> getArrayAttrMaxRanks() {
unsigned rank = getType().getRank();		unsigned rank = getResultType().getRank();
return {rank, rank, rank};		return {rank, rank, rank};
}		}

/// Return the number of leading operands before the `offsets`, `sizes` and		/// Return the number of leading operands before the `offsets`, `sizes` and
/// and `strides` operands.		/// and `strides` operands.
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 2; }		static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 2; }

std::pair<int64_t, int64_t> getDpsInitsPositionRange() {		std::pair<int64_t, int64_t> getDpsInitsPositionRange() {
▲ Show 20 Lines • Show All 1,042 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/Utils/StaticValueUtils.h

	Show First 20 Lines • Show All 76 Lines • ▼ Show 20 Lines
	/// Return true if `ofr` is constant integer equal to `value`.			/// Return true if `ofr` is constant integer equal to `value`.
	bool isConstantIntValue(OpFoldResult ofr, int64_t value);			bool isConstantIntValue(OpFoldResult ofr, int64_t value);

	/// Return true if ofr1 and ofr2 are the same integer constant attribute			/// Return true if ofr1 and ofr2 are the same integer constant attribute
	/// values or the same SSA value. Ignore integer bitwitdh and type mismatch			/// values or the same SSA value. Ignore integer bitwitdh and type mismatch
	/// that come from the fact there is no IndexAttr and that IndexType have no			/// that come from the fact there is no IndexAttr and that IndexType have no
	/// bitwidth.			/// bitwidth.
	bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2);			bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2);
				bool isEqualConstantIntOrValueArray(ArrayRef<OpFoldResult> ofrs1,
				ArrayRef<OpFoldResult> ofrs2);

	/// Helper function to convert a vector of `OpFoldResult`s into a vector of			/// Helper function to convert a vector of `OpFoldResult`s into a vector of
	/// `Value`s. For each `OpFoldResult` in `valueOrAttrVec` return the fold			/// `Value`s. For each `OpFoldResult` in `valueOrAttrVec` return the fold
	/// result if it casts to a `Value` or create an index-type constant if it			/// result if it casts to a `Value` or create an index-type constant if it
	/// casts to `IntegerAttr`. No other attribute types are supported.			/// casts to `IntegerAttr`. No other attribute types are supported.
	SmallVector<Value> getAsValues(OpBuilder &b, Location loc,			SmallVector<Value> getAsValues(OpBuilder &b, Location loc,
	ArrayRef<OpFoldResult> valueOrAttrVec);			ArrayRef<OpFoldResult> valueOrAttrVec);

	Show All 16 Lines

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

Show First 20 Lines • Show All 3,091 Lines • ▼ Show 20 Lines
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// HoistRedundantVectorTransfersOp		// HoistRedundantVectorTransfersOp
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

DiagnosedSilenceableFailure		DiagnosedSilenceableFailure
transform::HoistRedundantVectorTransfersOp::applyToOne(		transform::HoistRedundantVectorTransfersOp::applyToOne(
func::FuncOp target, transform::ApplyToEachResultList &results,		func::FuncOp target, transform::ApplyToEachResultList &results,
transform::TransformState &state) {		transform::TransformState &state) {
		// WARNING: This hoisting does not model parallelism and is generally
		// incorrect when used on distributed loops with memref semantics!
		// TODO: obsolete and should be retired.
linalg::hoistRedundantVectorTransfers(target);		linalg::hoistRedundantVectorTransfers(target);
linalg::hoistRedundantVectorTransfersOnTensor(target);
results.push_back(target);		results.push_back(target);
return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// ConvertConv2DToImg2ColOp.		// ConvertConv2DToImg2ColOp.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

Show All 22 Lines	DiagnosedSilenceableFailure transform::ConvertConv2DToImg2ColOp::applyToOne(
// Handle to the operation producing the img2col tensor.		// Handle to the operation producing the img2col tensor.
results.push_back(maybeTransformed->first);		results.push_back(maybeTransformed->first);
// Handle to the operation that replaces the original convolution.		// Handle to the operation that replaces the original convolution.
results.push_back(maybeTransformed->second);		results.push_back(maybeTransformed->second);
return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		// HoistRedundantTensorSubsetsOp
		//===----------------------------------------------------------------------===//

		DiagnosedSilenceableFailure
		transform::HoistRedundantTensorSubsetsOp::applyToOne(
		Operation *target, transform::ApplyToEachResultList &results,
		transform::TransformState &state) {
		IRRewriter rewriter(target->getContext());
		auto forOp = dyn_cast<scf::ForOp>(target);
		if (forOp) {
		scf::ForOp newForOp =
		linalg::hoistRedundantSubsetExtractInsert(rewriter, forOp);
		results.push_back(newForOp);
		return DiagnosedSilenceableFailure::success();
		}

		// TODO: walking in some reverse / inside-out order would be more efficient
		// and would capture more cases.
		target->walk([&](scf::ForOp forOp) {
		hoistRedundantSubsetExtractInsert(rewriter, forOp);
		});
		results.push_back(target);
		return DiagnosedSilenceableFailure::success();
		}

		//===----------------------------------------------------------------------===//
// Transform op registration		// Transform op registration
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

namespace {		namespace {
/// Registers new ops and declares PDL as dependent dialect since the		/// Registers new ops and declares PDL as dependent dialect since the
/// additional ops are using PDL types for operands and results.		/// additional ops are using PDL types for operands and results.
class LinalgTransformDialectExtension		class LinalgTransformDialectExtension
: public transform::TransformDialectExtension<		: public transform::TransformDialectExtension<
Show All 30 Lines

mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt

Show All 19 Lines	add_mlir_dialect_library(MLIRLinalgTransforms
HoistPadding.cpp		HoistPadding.cpp
InlineScalarOperands.cpp		InlineScalarOperands.cpp
Interchange.cpp		Interchange.cpp
Loops.cpp		Loops.cpp
NamedOpConversions.cpp		NamedOpConversions.cpp
Promotion.cpp		Promotion.cpp
Split.cpp		Split.cpp
SplitReduction.cpp		SplitReduction.cpp
		SubsetHoisting.cpp
SwapExtractSliceWithFillPatterns.cpp		SwapExtractSliceWithFillPatterns.cpp
Tiling.cpp		Tiling.cpp
TilingInterfaceImpl.cpp		TilingInterfaceImpl.cpp
Transforms.cpp		Transforms.cpp
Vectorization.cpp		Vectorization.cpp

ADDITIONAL_HEADER_DIRS		ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Linalg		${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Linalg
▲ Show 20 Lines • Show All 41 Lines • Show Last 20 Lines

mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp

	Show All 37 Lines

	#define DEBUG_TYPE "linalg-hoisting"			#define DEBUG_TYPE "linalg-hoisting"

	#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")			#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

	using namespace mlir;			using namespace mlir;
	using namespace mlir::linalg;			using namespace mlir::linalg;

	namespace {
	/// Represents a unit of hoistable TransferWriteOp. This may comprise other
	/// instructions that need to be hoisted too.
	struct HoistableWrite {
	vector::TransferWriteOp transferWriteOp;
	tensor::InsertSliceOp insertSliceOp;
	};
	/// Represents a unit of hoistable TransferReadOp. This may comprise other
	/// instructions that need to be hoisted too.
	struct HoistableRead {
	vector::TransferReadOp transferReadOp;
	tensor::ExtractSliceOp extractSliceOp;
	};
	} // namespace

	/// Return true if op1 and op2 are the same constant or the same SSA value.
	static bool isEqualOffsetSizeOrStride(OpFoldResult op1, OpFoldResult op2) {
	auto getConstantIntValue = [](OpFoldResult ofr) -> std::optional<int64_t> {
	Attribute attr = ofr.dyn_cast<Attribute>();
	// Note: isa+cast-like pattern allows writing the condition below as 1 line.
	if (!attr && ofr.get<Value>().getDefiningOp<arith::ConstantOp>())
	attr = ofr.get<Value>().getDefiningOp<arith::ConstantOp>().getValue();
	if (auto intAttr = attr.dyn_cast_or_null<IntegerAttr>())
	return intAttr.getValue().getSExtValue();
	return std::nullopt;
	};
	auto cst1 = getConstantIntValue(op1), cst2 = getConstantIntValue(op2);
	if (cst1 && cst2 && cst1 == cst2)
	return true;
	auto v1 = op1.dyn_cast<Value>(), v2 = op2.dyn_cast<Value>();
	return v1 && v2 && v1 == v2;
	}

	/// Return true is all offsets, sizes and strides are equal.
	static bool sameOffsetsSizesAndStrides(tensor::ExtractSliceOp s,
	tensor::InsertSliceOp si) {
	if (s.getStaticOffsets().size() != si.getStaticOffsets().size())
	return false;
	if (s.getStaticSizes().size() != si.getStaticSizes().size())
	return false;
	if (s.getStaticStrides().size() != si.getStaticStrides().size())
	return false;
	for (auto it : llvm::zip(s.getMixedOffsets(), si.getMixedOffsets()))
	if (!isEqualOffsetSizeOrStride(std::get<0>(it), std::get<1>(it)))
	return false;
	for (auto it : llvm::zip(s.getMixedSizes(), si.getMixedSizes()))
	if (!isEqualOffsetSizeOrStride(std::get<0>(it), std::get<1>(it)))
	return false;
	for (auto it : llvm::zip(s.getMixedStrides(), si.getMixedStrides()))
	if (!isEqualOffsetSizeOrStride(std::get<0>(it), std::get<1>(it)))
	return false;
	return true;
	}

	/// Look for a HoistableRead, in the given tensor uses, accessing the same
	/// offset as the HoistableWrite.
	static HoistableRead findMatchingTransferRead(HoistableWrite write,
	Value srcTensor) {
	assert(write.transferWriteOp &&
	"expected hoistable write to have a .transfer_write");

	LLVM_DEBUG(DBGS() << "findMatchingTransferRead for: "
	<< *write.transferWriteOp.getOperation() << "\n");
	if (write.insertSliceOp)
	LLVM_DEBUG(DBGS() << "findMatchingTransferRead inserSliceOp: "
	<< *write.insertSliceOp.getOperation() << "\n");
	SmallVector<Operation *> users(srcTensor.getUsers().begin(),
	srcTensor.getUsers().end());
	while (!users.empty()) {
	Operation *user = users.pop_back_val();
	LLVM_DEBUG(DBGS() << "findMatchingTransferRead inspect user: " << *user
	<< "\n");

	// If HoistableWrite involves a InsertSliceOp, we need to find a
	// matching ExtractSliceOp.
	tensor::ExtractSliceOp sliceOp;
	Operation *maybeTransferReadUser = user;
	if (write.insertSliceOp) {
	sliceOp = dyn_cast<tensor::ExtractSliceOp>(user);
	if (!sliceOp \|\| sliceOp.getResult().getType() !=
	write.insertSliceOp.getSource().getType())
	continue;

	LLVM_DEBUG(DBGS() << "check whether sameOffsetsSizesAndStrides: "
	<< sliceOp << " vs " << write.insertSliceOp << "\n");
	if (!sameOffsetsSizesAndStrides(sliceOp, write.insertSliceOp))
	continue;

	LLVM_DEBUG(DBGS() << "sameOffsetsSizesAndStrides: SUCCESS\n");
	// If we got here, sliceOp is hoistable iff it has exactly 2 uses:
	// 1. the transfer_write we want to hoist.
	// 2. a matching transfer_read.
	// Anything else, we skip.
	bool skip = false;
	Operation *otherUser = nullptr;
	for (Operation *u : sliceOp->getUsers()) {
	if (u == write.transferWriteOp)
	continue;
	if (otherUser) {
	skip = true;
	break;
	}
	otherUser = u;
	}
	if (skip \|\| !otherUser)
	continue;
	maybeTransferReadUser = otherUser;
	}

	LLVM_DEBUG(DBGS() << "maybeTransferReadUser: " << *maybeTransferReadUser
	<< "\n");
	auto read = dyn_cast<vector::TransferReadOp>(maybeTransferReadUser);
	if (read && read.getIndices() == write.transferWriteOp.getIndices() &&
	read.getVectorType() == write.transferWriteOp.getVectorType())
	return HoistableRead{read, sliceOp};

	if (isa<vector::TransferWriteOp>(user)) {
	// If we find a write with disjoint indices recurse through its uses.
	if (vector::isDisjointTransferIndices(
	cast<VectorTransferOpInterface>(user),
	cast<VectorTransferOpInterface>(
	write.transferWriteOp.getOperation()))) {
	users.append(user->getUsers().begin(), user->getUsers().end());
	}
	}
	}
	return HoistableRead();
	}

	/// Check if the chunk of data inserted by the HoistableWrite are read by any
	/// other op than the HoistableRead candidate.
	static bool tensorChunkAccessedByUnknownOp(HoistableWrite write,
	HoistableRead candidateRead,
	BlockArgument tensorArg) {
	// Make sure none of the other uses read the part of the tensor modified
	// by the transfer_write.
	llvm::SmallVector<Value::use_range, 1> uses;
	uses.push_back(tensorArg.getUses());
	while (!uses.empty()) {
	for (OpOperand &use : uses.pop_back_val()) {
	Operation *user = use.getOwner();
	// Skip the candidate use, only inspect the "other" uses.
	if (user == candidateRead.transferReadOp \|\|
	user == candidateRead.extractSliceOp \|\|
	user == write.transferWriteOp \|\| user == write.insertSliceOp)
	continue;
	// Consider all transitive uses through a extract_slice / insert_slice.
	// TODO: atm we just bail because a stronger analysis is needed for these
	// cases.
	if (isa<tensor::ExtractSliceOp, tensor::InsertSliceOp>(user))
	return true;
	// Consider all transitive uses through a vector.transfer_write.
	if (auto writeUser = dyn_cast<vector::TransferWriteOp>(user)) {
	uses.push_back(writeUser->getResult(0).getUses());
	continue;
	}
	// Consider all nested uses through an scf::ForOp. We may have
	// pass-through tensor arguments left from previous level of
	// hoisting.
	if (auto forUser = dyn_cast<scf::ForOp>(user)) {
	Value arg = forUser.getLoopBody().getArgument(
	use.getOperandNumber() - forUser.getNumControlOperands() +
	/iv value/ 1);
	uses.push_back(arg.getUses());
	continue;
	}
	// Follow the use yield as long as it doesn't escape the original
	// region.
	scf::YieldOp yieldUser = dyn_cast<scf::YieldOp>(user);
	if (yieldUser && write.transferWriteOp->getParentOp()->isAncestor(
	yieldUser->getParentOp())) {
	Value ret = yieldUser->getParentOp()->getResult(use.getOperandNumber());
	uses.push_back(ret.getUses());
	continue;
	}
	auto read = dyn_cast<vector::TransferReadOp>(user);
	if (!read \|\| !vector::isDisjointTransferIndices(
	cast<VectorTransferOpInterface>(read.getOperation()),
	cast<VectorTransferOpInterface>(
	write.transferWriteOp.getOperation()))) {
	return true;
	}
	}
	}
	return false;
	}

	/// Return the `forOp`-invariant HoistableWrite that produces `yieldOperand`.
	/// Return the null HoistableWrite() if it is not comprised of a
	/// vector.transfer_write + optional insert_slice or if any of the indexings
	/// is `forOp`-dependent.
	static HoistableWrite
	getLoopInvariantTransferWriteOpDefining(scf::ForOp forOp,
	OpOperand &yieldOperand) {
	Value v = yieldOperand.get();
	if (auto write = v.getDefiningOp<vector::TransferWriteOp>()) {
	// Indexing must not depend on `forOp`.
	for (Value operand : write.getIndices())
	if (!forOp.isDefinedOutsideOfLoop(operand))
	return HoistableWrite();

	return HoistableWrite{write, nullptr};
	}

	if (auto insertSliceOp = v.getDefiningOp<tensor::InsertSliceOp>()) {
	// Inserted slice must come from vector.transfer_write.
	auto write =
	insertSliceOp.getSource().getDefiningOp<vector::TransferWriteOp>();
	if (!write)
	return HoistableWrite();

	// Tensor inserted into must be a BBArg at position matching yieldOperand's.
	auto bbArg = insertSliceOp.getDest().dyn_cast<BlockArgument>();
	if (!bbArg \|\| bbArg.getOwner()->getParentOp() != forOp \|\|
	bbArg.getArgNumber() != /num iv=/1 + yieldOperand.getOperandNumber())
	return HoistableWrite();

	// Indexing inserted into must not depend on `forOp`.
	for (Value operand : insertSliceOp->getOperands().drop_front(
	tensor::InsertSliceOp::getOffsetSizeAndStrideStartOperandIndex()))
	if (!forOp.isDefinedOutsideOfLoop(operand))
	return HoistableWrite();

	return HoistableWrite{write, insertSliceOp};
	}

	return HoistableWrite();
	}

	/// Mechanical hoisting of a matching HoistableRead / HoistableWrite pair.
	static void hoistReadWrite(HoistableRead read, HoistableWrite write,
	BlockArgument tensorBBArg) {
	scf::ForOp forOp = cast<scf::ForOp>(tensorBBArg.getOwner()->getParentOp());
	assert(read.transferReadOp && write.transferWriteOp &&
	"expected transfer_read and transfer_write ops to be set");
	assert(((read.extractSliceOp && write.insertSliceOp) \|\|
	(!read.extractSliceOp && !write.insertSliceOp)) &&
	"expected matching extract_slice / insert_slice");
	LLVM_DEBUG(DBGS() << "In forOp:\n"
	<< *forOp.getOperation()
	<< "\nHoist: " << *read.transferReadOp.getOperation()
	<< "\nHoist: " << *write.transferWriteOp.getOperation()
	<< "\nInvolving: " << tensorBBArg << "\n");

	// If a read slice is present, hoist it.
	if (read.extractSliceOp)
	forOp.moveOutOfLoop(read.extractSliceOp);

	// Hoist the transfer_read op.
	forOp.moveOutOfLoop(read.transferReadOp);

	// TODO: don't hardcode /numIvs=/1.
	assert(tensorBBArg.getArgNumber() >= /numIvs=/1);
	unsigned initArgNumber = tensorBBArg.getArgNumber() - /numIvs=/1;

	// Update the source tensor.
	if (read.extractSliceOp)
	read.extractSliceOp.getSourceMutable().assign(
	forOp.getInitArgs()[initArgNumber]);
	else
	read.transferReadOp.getSourceMutable().assign(
	forOp.getInitArgs()[initArgNumber]);

	// Hoist write after.
	if (write.insertSliceOp)
	write.insertSliceOp->moveAfter(forOp);
	write.transferWriteOp->moveAfter(forOp);

	// Update the yield.
	auto yieldOp = cast<scf::YieldOp>(forOp.getRegion().front().getTerminator());
	if (write.insertSliceOp)
	yieldOp->setOperand(initArgNumber, write.insertSliceOp.getDest());
	else
	yieldOp->setOperand(initArgNumber, write.transferWriteOp.getSource());

	// Rewrite `loop` with additional new yields.
	OpBuilder b(read.transferReadOp);
	NewYieldValueFn yieldFn = [&](OpBuilder &b, Location loc,
	ArrayRef<BlockArgument> newBBArgs) {
	return SmallVector<Value>{write.transferWriteOp.getVector()};
	};
	auto newForOp = replaceLoopWithNewYields(
	b, forOp, read.transferReadOp.getVector(), yieldFn);

	// Transfer write has been hoisted, need to update the vector and tensor
	// source. Replace the result of the loop to use the new tensor created
	// outside the loop.
	// Depending on whether a insert_slice is present or not, it carries the
	// update on the tensor operands.
	if (write.insertSliceOp) {
	newForOp.getResult(initArgNumber)
	.replaceAllUsesWith(write.insertSliceOp.getResult());
	write.transferWriteOp.getSourceMutable().assign(
	read.extractSliceOp.getResult());
	write.insertSliceOp.getDestMutable().assign(
	read.extractSliceOp.getSource());
	} else {
	newForOp.getResult(initArgNumber)
	.replaceAllUsesWith(write.transferWriteOp.getResult());
	write.transferWriteOp.getSourceMutable().assign(
	newForOp.getResult(initArgNumber));
	}

	// Always update with the newly yield tensor and vector.
	write.transferWriteOp.getVectorMutable().assign(newForOp.getResults().back());
	}

	// To hoist transfer op on tensor the logic can be significantly simplified
	// compared to the case on buffer. The transformation follows this logic:
	// 1. Look for transfer_write with a single use from ForOp yield
	// 2. Check the uses of the matching block argument and look for a transfer_read
	// with the same indices.
	// 3. Check that all the other uses of the tensor argument are either disjoint
	// tensor_read or transfer_write. For transfer_write uses recurse to make sure
	// the new tensor has the same restrictions on its uses.
	// 4. Hoist the tensor_read/tensor_write and update the tensor SSA links.
	// After this transformation the scf.forOp may have unused arguments that can be
	// remove by the canonicalization pass.
	void mlir::linalg::hoistRedundantVectorTransfersOnTensor(func::FuncOp func) {			void mlir::linalg::hoistRedundantVectorTransfersOnTensor(func::FuncOp func) {
	bool changed = true;			IRRewriter rewriter(func->getContext());
	while (changed) {			// TODO: walking in some reverse / inside-out order would be more efficient
	changed = false;			// and would capture more cases.
	func.walk([&](scf::ForOp forOp) {			func.walk([&](scf::ForOp forOp) {
	Operation *yield = forOp.getBody()->getTerminator();			hoistRedundantSubsetExtractInsert(rewriter, forOp);
	for (const auto &it : llvm::enumerate(forOp.getRegionIterArgs())) {
	OpOperand &ret = yield->getOpOperand(it.index());
	HoistableWrite write =
	getLoopInvariantTransferWriteOpDefining(forOp, ret);
	if (!write.transferWriteOp \|\| !write.transferWriteOp->hasOneUse())
	continue;
	LLVM_DEBUG(dbgs() << "\n";
	DBGS() << "Candidate write for hoisting: "
	<< *write.transferWriteOp.getOperation() << "\n");
	if (write.insertSliceOp)
	LLVM_DEBUG(DBGS() << "Candidate insert_slice for hoisting: "
	<< *write.insertSliceOp.getOperation() << "\n");
	if (llvm::any_of(write.transferWriteOp.getIndices(),
	[&forOp](Value index) {
	return !forOp.isDefinedOutsideOfLoop(index);
	}))
	continue;
	// Find a read with the same type and indices.
	HoistableRead matchingRead =
	findMatchingTransferRead(write, it.value());
	// Make sure none of the other uses read the part of the tensor modified
	// by the transfer_write.
	if (!matchingRead.transferReadOp \|\|
	tensorChunkAccessedByUnknownOp(write, matchingRead, it.value()))
	continue;

	LLVM_DEBUG(DBGS() << "Start hoisting\n");
	hoistReadWrite(matchingRead, write, it.value());
	changed = true;
	forOp.erase();

	// Need to interrupt and restart: erasing the loop messes up the walk.
	return WalkResult::interrupt();
	}
	return WalkResult::advance();
	});			});
	// Apply canonicalization so the newForOp + yield folds immediately, thus
	// cleaning up the IR and potentially enabling more hoisting.
	if (changed) {
	RewritePatternSet patterns(func->getContext());
	scf::ForOp::getCanonicalizationPatterns(patterns, func->getContext());
	(void)applyPatternsAndFoldGreedily(func, std::move(patterns));
	}
	}
	}			}

	void mlir::linalg::hoistRedundantVectorTransfers(func::FuncOp func) {			void mlir::linalg::hoistRedundantVectorTransfers(func::FuncOp func) {
	bool changed = true;			bool changed = true;
	while (changed) {			while (changed) {
	changed = false;			changed = false;
	// First move loop invariant ops outside of their loop. This needs to be			// First move loop invariant ops outside of their loop. This needs to be
	// done before as we cannot move ops without interrupting the function walk.			// done before as we cannot move ops without interrupting the function walk.
	▲ Show 20 Lines • Show All 137 Lines • Show Last 20 Lines

mlir/lib/Dialect/Linalg/Transforms/SubsetHoisting.cpp

This file was added.

				//===- SubsetHoisting.cpp - Linalg hoisting transformations----------------===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This file implements functions concerned with hoisting invariant subset
				// operations in the context of Linalg transformations.
				//
				//===----------------------------------------------------------------------===//

				#include "mlir/Dialect/Func/IR/FuncOps.h"
				#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
				#include "mlir/Dialect/SCF/IR/SCF.h"
				#include "mlir/Dialect/SCF/Utils/Utils.h"
				#include "mlir/Dialect/Tensor/IR/Tensor.h"
				#include "mlir/Dialect/Utils/StaticValueUtils.h"
				#include "mlir/Dialect/Vector/IR/VectorOps.h"
				#include "mlir/IR/BuiltinOps.h"
				#include "mlir/IR/PatternMatch.h"
				#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
				#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
				#include "llvm/Support/Debug.h"
				#include "llvm/Support/ErrorHandling.h"

				#define DEBUG_TYPE "subset-hoisting"

				#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")

				using namespace mlir;
				using namespace mlir::linalg;

				/// Return true if the location of the subset defined by the op is invariant of
				/// the loop iteration.
				static bool
				isSubsetLocationLoopInvariant(scf::ForOp forOp,
				springermUnsubmitted Done Reply Inline Actions I think you can use `ViewLikeInterface::isSameAs`. springerm: I think you can use `ViewLikeInterface::isSameAs`.
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions nice ,thanks ! nicolasvasilache: nice ,thanks !
				vector::TransferWriteOp transferWriteOp) {
				for (Value operand : transferWriteOp.getIndices())
				if (!forOp.isDefinedOutsideOfLoop(operand))
				return false;
				return true;
				}

				/// Return true if the location of the subset defined by the op is invariant of
				/// the loop iteration.
				static bool isSubsetLocationLoopInvariant(scf::ForOp forOp,
				tensor::InsertSliceOp insertSliceOp) {
				for (Value operand : insertSliceOp->getOperands().drop_front(
				tensor::InsertSliceOp::getOffsetSizeAndStrideStartOperandIndex()))
				if (!forOp.isDefinedOutsideOfLoop(operand))
				return false;
				return true;
				springermUnsubmitted Done Reply Inline Actions Note: Does not capture cases where the operand is an `affine.apply` etc. that's defined inside the loop but does not use the IV. springerm: Note: Does not capture cases where the operand is an `affine.apply` etc. that's defined inside…
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions It does not: this is the job of the classical loop hoisting that I am not planning on duplicating here. nicolasvasilache: It does not: this is the job of the classical loop hoisting that I am not planning on…
				}

				/// Given an `srcTensor` that is a block argument belong to a loop.
				/// Greedily look for the first read that can be hoisted out of the loop (i.e.
				/// that satisfied the conditions):
				/// - The read is of type `tensor.extract_slice`.
				/// - The read is one of the uses of `srcTensor`.
				/// - The read is to the same subset that `tensor.insert_slice` writes.
				// TODO: Unify implementations once the "bypassing behavior" is the same.
				static FailureOr<tensor::ExtractSliceOp>
				findHoistableMatchingExtractSlice(RewriterBase &rewriter,
				tensor::InsertSliceOp insertSliceOp,
				BlockArgument srcTensor) {
				assert(srcTensor.getType().isa<RankedTensorType>() && "not a ranked tensor");

				auto forOp = cast<scf::ForOp>(srcTensor.getOwner()->getParentOp());

				LLVM_DEBUG(DBGS() << "--find matching read for: " << insertSliceOp << "\n";
				DBGS() << "--amongst users of: " << srcTensor << "\n");

				SmallVector<Operation *> users(srcTensor.getUsers());
				if (forOp.isDefinedOutsideOfLoop(insertSliceOp.getDest()))
				springermUnsubmitted Done Reply Inline Actions Should this be called `findHoistableMatchingExtractSlice`? springerm: Should this be called `findHoistableMatchingExtractSlice`?
				springermUnsubmitted Done Reply Inline Actions It's not clear to me what this function is doing. In particular, `isDefinedOutsideOfLoop` is used in the function but loops are not mentioned in the description. An example could be useful. springerm: It's not clear to me what this function is doing. In particular, `isDefinedOutsideOfLoop` is…
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions clarified nicolasvasilache: clarified
				llvm::append_range(users, insertSliceOp.getDest().getUsers());

				for (Operation *user : users) {
				LLVM_DEBUG(DBGS() << "----inspect user: " << *user << "\n");
				auto extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(user);
				// Skip ops other than extract_slice with an exact matching of their tensor
				// subset.
				if (extractSliceOp) {
				auto isSame = [](OpFoldResult a, OpFoldResult b) { return a == b; };
				if (extractSliceOp.getResultType() != insertSliceOp.getSourceType() \|\|
				springermUnsubmitted Done Reply Inline Actions Naming: The comment says - The read is to the same subset that `tensor.insert_slice` writes. So I expected something like `SmallVector<Operation > users(destTensor.getUsers());` springerm:* Naming: The comment says ``` - The read is to the same subset that `tensor.insert_slice`…
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions clarified nicolasvasilache: clarified
				!extractSliceOp.isSameAs(insertSliceOp, isSame)) {
				LLVM_DEBUG(DBGS() << "------not a matching extract_slice\n";
				DBGS() << user << " vs " << insertSliceOp << "\n");
				continue;
				}

				// Skip insert_slice whose vector is defined within the loop: we need to
				// hoist that definition first otherwise dominance violations trigger.
				if (!extractSliceOp.getSource().isa<BlockArgument>() &&
				!forOp.isDefinedOutsideOfLoop(extractSliceOp.getSource())) {
				LLVM_DEBUG(DBGS() << "------transfer_read vector is loop-dependent\n");
				continue;
				}
				return extractSliceOp;
				}

				// TODO: Look through disjoint subsets, similar to vector.transfer_write
				// and unify implementations.
				}

				LLVM_DEBUG(DBGS() << "----no matching extract_slice");
				return failure();
				}

				/// Given an `srcTensor` that is a block argument belong to a loop.
				/// Greedily look for the first read that can be hoisted out of the loop (i.e.
				/// that satisfied the conditions):
				/// - The read is of type `tensor.transfer_read`.
				/// - The read is one of the uses of `srcTensor`.
				/// - The read is to the same subset that `tensor.transfer_write` writes.
				// TODO: Unify implementations once the "bypassing behavior" is the same.
				static FailureOr<vector::TransferReadOp>
				findHoistableMatchingTransferRead(RewriterBase &rewriter,
				vector::TransferWriteOp transferWriteOp,
				BlockArgument srcTensor) {
				if (!srcTensor.getType().isa<RankedTensorType>())
				return failure();

				auto forOp = cast<scf::ForOp>(srcTensor.getOwner()->getParentOp());

				LLVM_DEBUG(DBGS() << "--find matching read for: " << transferWriteOp << "\n";
				DBGS() << "--amongst users of: " << srcTensor << "\n";);

				// vector.transfer_write is a bit peculiar: we look through dependencies
				// to disjoint tensor subsets. This requires a while loop.
				// TODO: Look through disjoint subsets for tensor.insert_slice and unify
				// implementations.
				SmallVector<Operation *> users(srcTensor.getUsers());
				// TODO: transferWriteOp.getSource is actually the destination tensor!!
				if (forOp.isDefinedOutsideOfLoop(transferWriteOp.getSource()))
				llvm::append_range(users, transferWriteOp.getSource().getUsers());
				while (!users.empty()) {
				Operation *user = users.pop_back_val();
				LLVM_DEBUG(DBGS() << "----inspect user: " << *user << "\n");
				auto read = dyn_cast<vector::TransferReadOp>(user);
				if (read) {
				// Skip ops other than transfer_read with an exact matching subset.
				if (read.getIndices() != transferWriteOp.getIndices() \|\|
				read.getVectorType() != transferWriteOp.getVectorType()) {
				LLVM_DEBUG(DBGS() << "------not a transfer_read that matches the "
				"transfer_write: "
				<< user << "\n\t(vs " << transferWriteOp << ")\n");
				continue;
				}

				// transfer_read may be of a vector that is defined within the loop: we
				// traverse it by virtue of bypassing disjoint subset operations rooted at
				// a bbArg and yielding a matching yield.
				if (!read.getSource().isa<BlockArgument>() &&
				!forOp.isDefinedOutsideOfLoop(read.getSource())) {
				LLVM_DEBUG(DBGS() << "------transfer_read vector appears loop "
				"dependent but will be tested for disjointness as "
				"part of the bypass analysis\n");
				}
				LLVM_DEBUG(DBGS() << "------found match\n");
				return read;
				}

				// As an optimization, we look further through dependencies to disjoint
				// tensor subsets. This creates more opportunities to find a matching read.
				if (isa<vector::TransferWriteOp>(user)) {
				// If we find a write with disjoint indices append all its uses.
				// TODO: Generalize areSubsetsDisjoint and allow other bypass than
				// just vector.transfer_write - vector.transfer_write.
				if (vector::isDisjointTransferIndices(
				cast<VectorTransferOpInterface>(user),
				cast<VectorTransferOpInterface>(
				transferWriteOp.getOperation()))) {
				LLVM_DEBUG(DBGS() << "----follow through disjoint write\n");
				users.append(user->getUsers().begin(), user->getUsers().end());
				} else {
				LLVM_DEBUG(DBGS() << "----skip non-disjoint write\n");
				}
				}
				}

				LLVM_DEBUG(DBGS() << "--no matching transfer_read\n");
				return rewriter.notifyMatchFailure(transferWriteOp,
				"no matching transfer_read");
				}

				/// Return the `vector.transfer_write` that produces `yieldOperand`, if:
				/// - The write operates on tensors.
				/// - All indices are defined outside of the loop.
				/// Return failure otherwise.
				///
				/// This is sufficient condition to hoist the `vector.transfer_write`; other
				/// operands can always be yielded by the loop where needed.
				// TODO: generalize beyond scf::ForOp.
				// TODO: Unify implementations once the "bypassing behavior" is the same.
				static FailureOr<vector::TransferWriteOp>
				getLoopInvariantTransferWriteDefining(RewriterBase &rewriter, scf::ForOp forOp,
				BlockArgument bbArg,
				OpOperand &yieldOperand) {
				assert(bbArg.getArgNumber() ==
				forOp.getNumInductionVars() + yieldOperand.getOperandNumber() &&
				"bbArg and yieldOperand must match");
				assert(isa<scf::YieldOp>(yieldOperand.getOwner()) && "must be an scf.yield");

				Value v = yieldOperand.get();
				auto transferWriteOp = v.getDefiningOp<vector::TransferWriteOp>();
				if (!transferWriteOp)
				return rewriter.notifyMatchFailure(v.getLoc(), "not a transfer_write");

				if (transferWriteOp->getNumResults() == 0) {
				return rewriter.notifyMatchFailure(v.getLoc(),
				"unsupported transfer_write on buffers");
				}

				// We do not explicitly check that the destination is a BBarg that matches the
				// yield operand as this would prevent us from bypassing other non-conflicting
				// writes.

				// Indexing must not depend on `forOp`.
				if (!isSubsetLocationLoopInvariant(forOp, transferWriteOp))
				return rewriter.notifyMatchFailure(
				v.getLoc(), "transfer_write indexing is loop-dependent");

				return transferWriteOp;
				}

				/// Return the `tensor.insert_slice` that produces `yieldOperand`, if:
				/// 1. Its destination tensor is a block argument of the `forOp`.
				/// 2. The unique use of its result is a yield with operand number matching
				/// the block argument.
				/// 3. All indices are defined outside of the loop.
				/// Return failure otherwise.
				///
				/// This is sufficient condition to hoist the `tensor.insert_slice`; other
				/// operands can always be yielded by the loop where needed.
				/// Note: 1. + 2. ensure that the yield / iter_args cycle results in proper
				/// semantics (i.e. no ping-ping between iter_args across iterations).
				// TODO: generalize beyond scf::ForOp.
				// TODO: Unify implementations once the "bypassing behavior" is the same.
				static FailureOr<tensor::InsertSliceOp>
				getLoopInvariantInsertSliceDefining(RewriterBase &rewriter, scf::ForOp forOp,
				BlockArgument bbArg,
				OpOperand &yieldOperand) {
				assert(bbArg.getArgNumber() ==
				forOp.getNumInductionVars() + yieldOperand.getOperandNumber() &&
				"bbArg and yieldOperand must match");
				assert(isa<scf::YieldOp>(yieldOperand.getOwner()) && "must be an scf.yield");

				Value v = yieldOperand.get();
				auto insertSliceOp = v.getDefiningOp<tensor::InsertSliceOp>();
				if (!insertSliceOp)
				return rewriter.notifyMatchFailure(v.getLoc(), "not an insert_slice");

				// Tensor inserted into must be a BBArg at position matching yield operand.
				// TODO: In the future we should not perform this check if we want to bypass
				// other non-conflicting writes.
				if (bbArg != insertSliceOp.getDest())
				return rewriter.notifyMatchFailure(v.getLoc(), "not a matching bbarg");

				// Indexing inserted into must not depend on `forOp`.
				if (!isSubsetLocationLoopInvariant(forOp, insertSliceOp))
				return rewriter.notifyMatchFailure(
				v.getLoc(), "insert_slice indexing is loop-dependent");

				return insertSliceOp;
				}

				/// Check if the chunk of data inserted by the `writeOp` is read by any other
				/// op than the candidateReadOp. This conflicting operation prevents hoisting,
				/// return it or nullptr if none is found.
				// TODO: Generalize subset disjunction analysis/interface.
				// TODO: Support more subset op types.
				static Operation isTensorChunkAccessedByUnknownOp(Operation writeOp,
				Operation *candidateReadOp,
				BlockArgument tensorArg) {
				// Make sure none of the other uses read the part of the tensor modified
				// by the transfer_write.
				llvm::SmallVector<Value::use_range, 1> uses;
				uses.push_back(tensorArg.getUses());
				while (!uses.empty()) {
				for (OpOperand &use : uses.pop_back_val()) {
				Operation *user = use.getOwner();
				// Skip the candidate use, only inspect the "other" uses.
				if (user == candidateReadOp \|\| user == writeOp)
				continue;

				// TODO: Consider all transitive uses through
				// extract_slice/insert_slice. Atm we just bail because a stronger
				// analysis is needed for these cases.
				if (isa<tensor::ExtractSliceOp, tensor::InsertSliceOp>(user))
				return user;

				// Consider all transitive uses through a vector.transfer_write.
				if (isa<vector::TransferWriteOp>(writeOp)) {
				if (auto writeUser = dyn_cast<vector::TransferWriteOp>(user)) {
				uses.push_back(writeUser->getResult(0).getUses());
				continue;
				}
				}

				// Consider all nested uses through an scf::ForOp. We may have
				// pass-through tensor arguments left from previous level of
				// hoisting.
				if (auto forUser = dyn_cast<scf::ForOp>(user)) {
				Value arg = forUser.getLoopBody().getArgument(
				use.getOperandNumber() - forUser.getNumControlOperands() +
				/iv value/ 1);
				uses.push_back(arg.getUses());
				continue;
				}

				// Follow the use yield, only if it doesn't escape the original region.
				scf::YieldOp yieldUser = dyn_cast<scf::YieldOp>(user);
				if (yieldUser &&
				writeOp->getParentOp()->isAncestor(yieldUser->getParentOp())) {
				Value ret = yieldUser->getParentOp()->getResult(use.getOperandNumber());
				uses.push_back(ret.getUses());
				continue;
				}

				// If the write is a vector::TransferWriteOp, it may have been bypassed
				// and we need to check subset disjunction
				if (isa<vector::TransferWriteOp>(writeOp)) {
				auto read = dyn_cast<vector::TransferReadOp>(user);
				if (!read \|\| !vector::isDisjointTransferIndices(
				cast<VectorTransferOpInterface>(read.getOperation()),
				cast<VectorTransferOpInterface>(writeOp))) {
				return user;
				}
				}
				}
				}
				return nullptr;
				}

				/// Mechanical hoisting of a matching read / write pair.
				/// Return the newly created scf::ForOp with an extra yields.
				// TODO: Unify implementations once the "bypassing behavior" is the same.
				static scf::ForOp hoistTransferReadWrite(
				RewriterBase &rewriter, vector::TransferReadOp transferReadOp,
				vector::TransferWriteOp transferWriteOp, BlockArgument tensorBBArg) {
				scf::ForOp forOp = cast<scf::ForOp>(tensorBBArg.getOwner()->getParentOp());
				LLVM_DEBUG(DBGS() << "--Start hoisting\n";
				DBGS() << "--Hoist read : " << transferReadOp << "\n";
				DBGS() << "--Hoist write: " << transferWriteOp << "\n";
				DBGS() << "--Involving : " << tensorBBArg << "\n");

				// TODO: don't hardcode /numIvs=/1.
				assert(tensorBBArg.getArgNumber() >= /numIvs=/1);
				int64_t initArgNumber = tensorBBArg.getArgNumber() - /numIvs=/1;

				// 1. Hoist the read op. Thanks to our previous checks we know this will not
				// trigger dominance violations once BBArgs are updated.
				// TODO: should the rewriter ever want to track this move ?
				transferReadOp->moveBefore(forOp);
				if (!forOp.isDefinedOutsideOfLoop(transferReadOp.getSource())) {
				rewriter.startRootUpdate(transferReadOp);
				transferReadOp.getSourceMutable().assign(
				forOp.getInitArgs()[initArgNumber]);
				rewriter.finalizeRootUpdate(transferReadOp);
				}

				// 2. Rewrite `loop` with an additional yield. This is the quantity that is
				// computed iteratively but whose storage has become loop-invariant.
				NewYieldValueFn yieldFn = [&](OpBuilder &b, Location loc,
				ArrayRef<BlockArgument> newBBArgs) {
				return SmallVector<Value>{transferWriteOp.getVector()};
				};
				auto newForOp = replaceLoopWithNewYields(
				rewriter, forOp, {transferReadOp.getVector()}, yieldFn);
				rewriter.eraseOp(forOp);

				// 3. Update the yield. Invariant: initArgNumber is the destination tensor.
				auto yieldOp =
				cast<scf::YieldOp>(newForOp.getRegion().front().getTerminator());
				// TODO: transferWriteOp.getSource is actually the destination tensor!!
				rewriter.startRootUpdate(yieldOp);
				yieldOp->setOperand(initArgNumber, transferWriteOp.getSource());
				rewriter.finalizeRootUpdate(yieldOp);

				// 4. Hoist write after and make uses of newForOp.getResult(initArgNumber)
				// flow through it.
				// TODO: should the rewriter ever want to track this move ?
				transferWriteOp->moveAfter(newForOp);
				rewriter.startRootUpdate(transferWriteOp);
				transferWriteOp.getVectorMutable().assign(newForOp.getResults().back());
				// TODO: transferWriteOp.getSource is actually the destination tensor!!
				transferWriteOp.getSourceMutable().assign(newForOp.getResult(initArgNumber));
				rewriter.finalizeRootUpdate(transferWriteOp);
				rewriter.replaceAllUsesExcept(newForOp.getResult(initArgNumber),
				transferWriteOp.getResult(), transferWriteOp);
				return newForOp;
				}

				/// Mechanical hoisting of a matching read / write pair.
				/// Return the newly created scf::ForOp with an extra yields.
				// TODO: Unify implementations once the "bypassing behavior" is the same.
				static scf::ForOp hoistExtractInsertSlice(RewriterBase &rewriter,
				tensor::ExtractSliceOp extractSliceOp,
				tensor::InsertSliceOp insertSliceOp,
				BlockArgument tensorBBArg) {
				scf::ForOp forOp = cast<scf::ForOp>(tensorBBArg.getOwner()->getParentOp());
				LLVM_DEBUG(DBGS() << "--Start hoisting\n";
				DBGS() << "--Hoist read : " << extractSliceOp << "\n";
				DBGS() << "--Hoist write: " << insertSliceOp << "\n";
				DBGS() << "--Involving : " << tensorBBArg << "\n");

				// TODO: don't hardcode /numIvs=/1.
				assert(tensorBBArg.getArgNumber() >= /numIvs=/1);
				int64_t initArgNumber = tensorBBArg.getArgNumber() - /numIvs=/1;

				// 1. Hoist the read op. Thanks to our previous checks we know this will not
				// trigger dominance violations once BBArgs are updated.
				// TODO: should the rewriter ever want to track this move ?
				extractSliceOp->moveBefore(forOp);
				if (!forOp.isDefinedOutsideOfLoop(extractSliceOp.getSource())) {
				assert(extractSliceOp.getSource() == tensorBBArg &&
				"extractSlice source not defined above must be the tracked bbArg");
				rewriter.startRootUpdate(extractSliceOp);
				extractSliceOp.getSourceMutable().assign(
				forOp.getInitArgs()[initArgNumber]);
				rewriter.finalizeRootUpdate(extractSliceOp);
				}

				// 2. Rewrite `loop` with an additional yield. This is the quantity that is
				// computed iteratively but whose storage has become loop-invariant.
				NewYieldValueFn yieldFn = [&](OpBuilder &b, Location loc,
				ArrayRef<BlockArgument> newBBArgs) {
				return SmallVector<Value>{insertSliceOp.getSource()};
				};
				auto newForOp = replaceLoopWithNewYields(rewriter, forOp,
				extractSliceOp.getResult(), yieldFn);
				rewriter.eraseOp(forOp);

				// 3. Update the yield. Invariant: initArgNumber is the destination tensor.
				auto yieldOp =
				cast<scf::YieldOp>(newForOp.getRegion().front().getTerminator());
				// TODO: should the rewriter ever want to track this ?
				rewriter.startRootUpdate(yieldOp);
				yieldOp->setOperand(initArgNumber, insertSliceOp.getDest());
				rewriter.finalizeRootUpdate(yieldOp);

				// 4. Hoist write after and make uses of newForOp.getResult(initArgNumber)
				// flow through it.
				// TODO: should the rewriter ever want to track this move ?
				insertSliceOp->moveAfter(newForOp);
				rewriter.startRootUpdate(insertSliceOp);
				insertSliceOp.getSourceMutable().assign(newForOp.getResults().back());
				insertSliceOp.getDestMutable().assign(newForOp.getResult(initArgNumber));
				rewriter.finalizeRootUpdate(insertSliceOp);
				rewriter.replaceAllUsesExcept(newForOp.getResult(initArgNumber),
				insertSliceOp.getResult(), insertSliceOp);
				return newForOp;
				}

				/// Greedily hoist redundant subset extract/insert operations on tensors
				/// outside `forOp`.
				/// Return the unmodified `forOp` if no hoisting occurred.
				/// Return a new scf::ForOp if hoisting on tensors occurred.
				scf::ForOp
				mlir::linalg::hoistRedundantSubsetExtractInsert(RewriterBase &rewriter,
				scf::ForOp forOp) {
				LLVM_DEBUG(DBGS() << "Enter hoistRedundantSubsetExtractInsert scf.for\n");
				Operation *yield = forOp.getBody()->getTerminator();

				LLVM_DEBUG(DBGS() << "\n"; DBGS() << "Consider " << forOp << "\n");

				scf::ForOp newForOp = forOp;
				do {
				forOp = newForOp;
				for (const auto &it : llvm::enumerate(forOp.getRegionIterArgs())) {
				LLVM_DEBUG(DBGS() << "Consider " << it.value() << "\n");

				// 1. Find a loop invariant subset write yielding `ret` that we can
				// consider for hoisting.
				// TODO: TypeSwitch when we add more cases.
				OpOperand &ret = yield->getOpOperand(it.index());
				FailureOr<vector::TransferWriteOp> transferWriteOp =
				getLoopInvariantTransferWriteDefining(rewriter, forOp, it.value(),
				ret);
				FailureOr<tensor::InsertSliceOp> insertSliceOp =
				getLoopInvariantInsertSliceDefining(rewriter, forOp, it.value(), ret);
				if (failed(transferWriteOp) && failed(insertSliceOp)) {
				LLVM_DEBUG(DBGS() << "no loop invariant write defining iter_args "
				<< it.value() << "\n");
				continue;
				}

				Operation *writeOp = succeeded(transferWriteOp)
				? transferWriteOp->getOperation()
				: insertSliceOp->getOperation();

				// 2. Only accept writes with a single use (i.e. the yield).
				if (!writeOp->hasOneUse()) {
				LLVM_DEBUG(DBGS() << "write with more than 1 use " << *writeOp << "\n");
				continue;
				}

				LLVM_DEBUG(DBGS() << "Write to hoist: " << *writeOp << "\n");

				// 3. Find a matching read that can also be hoisted.
				Operation *matchingReadOp = nullptr;
				// TODO: TypeSwitch.
				if (succeeded(transferWriteOp)) {
				auto maybeTransferRead = findHoistableMatchingTransferRead(
				rewriter, *transferWriteOp, it.value());
				if (succeeded(maybeTransferRead))
				matchingReadOp = maybeTransferRead->getOperation();
				} else if (succeeded(insertSliceOp)) {
				auto maybeExtractSlice = findHoistableMatchingExtractSlice(
				rewriter, *insertSliceOp, it.value());
				if (succeeded(maybeExtractSlice))
				matchingReadOp = maybeExtractSlice->getOperation();
				} else {
				llvm_unreachable("unexpected case");
				}
				if (!matchingReadOp) {
				LLVM_DEBUG(DBGS() << "No matching read\n");
				continue;
				}

				// 4. Make sure no other use reads the part of the modified tensor.
				// This is necessary to guard against hazards when non-conflicting subset
				// ops are bypassed.
				Operation *maybeUnknownOp =
				isTensorChunkAccessedByUnknownOp(writeOp, matchingReadOp, it.value());
				if (maybeUnknownOp) {
				LLVM_DEBUG(DBGS() << "Tensor chunk accessed by unknown op, skip: "
				<< *maybeUnknownOp << "\n");
				continue;
				}

				// 5. Perform the actual mechanical hoisting.
				// TODO: TypeSwitch.
				LLVM_DEBUG(DBGS() << "Read to hoist: " << *matchingReadOp << "\n");
				if (succeeded(transferWriteOp)) {
				newForOp = hoistTransferReadWrite(
				rewriter, cast<vector::TransferReadOp>(matchingReadOp),
				*transferWriteOp, it.value());
				} else if (succeeded(insertSliceOp)) {
				newForOp = hoistExtractInsertSlice(
				rewriter, cast<tensor::ExtractSliceOp>(matchingReadOp),
				*insertSliceOp, it.value());
				} else {
				llvm_unreachable("unexpected case");
				}
				break;
				}
				} while (forOp != newForOp);

				return newForOp;
				}

mlir/lib/Dialect/Utils/StaticValueUtils.cpp

	Show First 20 Lines • Show All 130 Lines • ▼ Show 20 Lines
	bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2) {			bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2) {
	auto cst1 = getConstantIntValue(ofr1), cst2 = getConstantIntValue(ofr2);			auto cst1 = getConstantIntValue(ofr1), cst2 = getConstantIntValue(ofr2);
	if (cst1 && cst2 && cst1 == cst2)			if (cst1 && cst2 && cst1 == cst2)
	return true;			return true;
	auto v1 = ofr1.dyn_cast<Value>(), v2 = ofr2.dyn_cast<Value>();			auto v1 = ofr1.dyn_cast<Value>(), v2 = ofr2.dyn_cast<Value>();
	return v1 && v1 == v2;			return v1 && v1 == v2;
	}			}

				bool isEqualConstantIntOrValueArray(ArrayRef<OpFoldResult> ofrs1,
				ArrayRef<OpFoldResult> ofrs2) {
				if (ofrs1.size() != ofrs2.size())
				return false;
				for (auto [ofr1, ofr2] : llvm::zip_equal(ofrs1, ofrs2))
				if (!isEqualConstantIntOrValue(ofr1, ofr2))
				return false;
				return true;
				}

	/// Helper function to convert a vector of `OpFoldResult`s into a vector of			/// Helper function to convert a vector of `OpFoldResult`s into a vector of
	/// `Value`s. For each `OpFoldResult` in `valueOrAttrVec` return the fold result			/// `Value`s. For each `OpFoldResult` in `valueOrAttrVec` return the fold result
	/// if it casts to a `Value` or create an index-type constant if it casts to			/// if it casts to a `Value` or create an index-type constant if it casts to
	/// `IntegerAttr`. No other attribute types are supported.			/// `IntegerAttr`. No other attribute types are supported.
	SmallVector<Value> getAsValues(OpBuilder &b, Location loc,			SmallVector<Value> getAsValues(OpBuilder &b, Location loc,
	ArrayRef<OpFoldResult> valueOrAttrVec) {			ArrayRef<OpFoldResult> valueOrAttrVec) {
	return llvm::to_vector<4>(			return llvm::to_vector<4>(
	llvm::map_range(valueOrAttrVec, [&](OpFoldResult value) -> Value {			llvm::map_range(valueOrAttrVec, [&](OpFoldResult value) -> Value {
	▲ Show 20 Lines • Show All 41 Lines • Show Last 20 Lines

mlir/test/Dialect/Linalg/hoisting.mlir

// RUN: mlir-opt -test-transform-dialect-interpreter --split-input-file --allow-unregistered-dialect %s \| FileCheck %s		// RUN: mlir-opt -test-transform-dialect-interpreter -canonicalize --split-input-file --allow-unregistered-dialect %s \| FileCheck %s

// CHECK-LABEL: func @hoist_vector_transfer_pairs(		// CHECK-LABEL: func @hoist_vector_transfer_pairs(
// CHECK-SAME: %[[MEMREF0:[a-zA-Z0-9]*]]: memref<?x?xf32>,		// CHECK-SAME: %[[MEMREF0:[a-zA-Z0-9]*]]: memref<?x?xf32>,
// CHECK-SAME: %[[MEMREF1:[a-zA-Z0-9]*]]: memref<?x?xf32>,		// CHECK-SAME: %[[MEMREF1:[a-zA-Z0-9]*]]: memref<?x?xf32>,
// CHECK-SAME: %[[MEMREF2:[a-zA-Z0-9]*]]: memref<?x?xf32>,		// CHECK-SAME: %[[MEMREF2:[a-zA-Z0-9]*]]: memref<?x?xf32>,
// CHECK-SAME: %[[MEMREF3:[a-zA-Z0-9]*]]: memref<?x?xf32>,		// CHECK-SAME: %[[MEMREF3:[a-zA-Z0-9]*]]: memref<?x?xf32>,
// CHECK-SAME: %[[MEMREF4:[a-zA-Z0-9]*]]: memref<?x?xf32>,		// CHECK-SAME: %[[MEMREF4:[a-zA-Z0-9]*]]: memref<?x?xf32>,
// CHECK-SAME: %[[MEMREF5:[a-zA-Z0-9]*]]: memref<?x?xf32>,		// CHECK-SAME: %[[MEMREF5:[a-zA-Z0-9]*]]: memref<?x?xf32>,
Show All 14 Lines
// CHECK: vector.transfer_read %{{.*}} : memref<?x?xf32>, vector<2xf32>		// CHECK: vector.transfer_read %{{.*}} : memref<?x?xf32>, vector<2xf32>
// CHECK: scf.for %[[J:.]] = %[[LB]] to %[[UB]] step %[[STEP]] iter_args({{.}}) -> (vector<1xf32>, vector<2xf32>) {		// CHECK: scf.for %[[J:.]] = %[[LB]] to %[[UB]] step %[[STEP]] iter_args({{.}}) -> (vector<1xf32>, vector<2xf32>) {
// CHECK: vector.transfer_read %{{.*}} : memref<?x?xf32>, vector<3xf32>		// CHECK: vector.transfer_read %{{.*}} : memref<?x?xf32>, vector<3xf32>
// CHECK: vector.transfer_read %{{.*}} : memref<?x?xf32>, vector<4xf32>		// CHECK: vector.transfer_read %{{.*}} : memref<?x?xf32>, vector<4xf32>
// CHECK: "some_crippling_use"(%[[MEMREF4]]) : (memref<?x?xf32>) -> ()		// CHECK: "some_crippling_use"(%[[MEMREF4]]) : (memref<?x?xf32>) -> ()
// CHECK: vector.transfer_read %{{.*}} : memref<?x?xf32>, vector<5xf32>		// CHECK: vector.transfer_read %{{.*}} : memref<?x?xf32>, vector<5xf32>
// CHECK: "some_use"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32>		// CHECK: "some_use"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32>
// CHECK: "some_use"(%{{.*}}) : (vector<2xf32>) -> vector<2xf32>		// CHECK: "some_use"(%{{.*}}) : (vector<2xf32>) -> vector<2xf32>
// CHECK: "some_use"(%[[MEMREF2]]) : (memref<?x?xf32>) -> vector<3xf32>		// CHECK: "some_use"(%[[MEMREF2]], %{{.*}}) : (memref<?x?xf32>, vector<3xf32>) -> vector<3xf32>
// CHECK: "some_use"(%{{.*}}) : (vector<4xf32>) -> vector<4xf32>		// CHECK: "some_use"(%{{.*}}) : (vector<4xf32>) -> vector<4xf32>
// CHECK: "some_use"(%{{.*}}) : (vector<5xf32>) -> vector<5xf32>		// CHECK: "some_use"(%{{.*}}) : (vector<5xf32>) -> vector<5xf32>
// CHECK: vector.transfer_write %{{.*}} : vector<3xf32>, memref<?x?xf32>		// CHECK: vector.transfer_write %{{.*}} : vector<3xf32>, memref<?x?xf32>
// CHECK: vector.transfer_write %{{.*}} : vector<4xf32>, memref<?x?xf32>		// CHECK: vector.transfer_write %{{.*}} : vector<4xf32>, memref<?x?xf32>
// CHECK: vector.transfer_write %{{.*}} : vector<5xf32>, memref<?x?xf32>		// CHECK: vector.transfer_write %{{.*}} : vector<5xf32>, memref<?x?xf32>
// CHECK: "some_crippling_use"(%[[MEMREF3]]) : (memref<?x?xf32>) -> ()		// CHECK: "some_crippling_use"(%[[MEMREF3]]) : (memref<?x?xf32>) -> ()
// CHECK: scf.yield {{.*}} : vector<1xf32>, vector<2xf32>		// CHECK: scf.yield {{.*}} : vector<1xf32>, vector<2xf32>
// CHECK: }		// CHECK: }
Show All 10 Lines	scf.for %j = %lb to %ub step %step {
%r2 = vector.transfer_read %memref2[%c0, %c0], %cst: memref<?x?xf32>, vector<3xf32>		%r2 = vector.transfer_read %memref2[%c0, %c0], %cst: memref<?x?xf32>, vector<3xf32>
%r3 = vector.transfer_read %memref3[%c0, %c0], %cst: memref<?x?xf32>, vector<4xf32>		%r3 = vector.transfer_read %memref3[%c0, %c0], %cst: memref<?x?xf32>, vector<4xf32>
"some_crippling_use"(%memref4) : (memref<?x?xf32>) -> ()		"some_crippling_use"(%memref4) : (memref<?x?xf32>) -> ()
%r4 = vector.transfer_read %memref4[%c0, %c0], %cst: memref<?x?xf32>, vector<5xf32>		%r4 = vector.transfer_read %memref4[%c0, %c0], %cst: memref<?x?xf32>, vector<5xf32>
%r5 = vector.transfer_read %memref5[%c0, %c0], %cst: memref<?x?xf32>, vector<6xf32>		%r5 = vector.transfer_read %memref5[%c0, %c0], %cst: memref<?x?xf32>, vector<6xf32>
"some_crippling_use"(%memref5) : (memref<?x?xf32>) -> ()		"some_crippling_use"(%memref5) : (memref<?x?xf32>) -> ()
%u0 = "some_use"(%r0) : (vector<1xf32>) -> vector<1xf32>		%u0 = "some_use"(%r0) : (vector<1xf32>) -> vector<1xf32>
%u1 = "some_use"(%r1) : (vector<2xf32>) -> vector<2xf32>		%u1 = "some_use"(%r1) : (vector<2xf32>) -> vector<2xf32>
%u2 = "some_use"(%memref2) : (memref<?x?xf32>) -> vector<3xf32>		%u2 = "some_use"(%memref2, %r2) : (memref<?x?xf32>, vector<3xf32>) -> vector<3xf32>
%u3 = "some_use"(%r3) : (vector<4xf32>) -> vector<4xf32>		%u3 = "some_use"(%r3) : (vector<4xf32>) -> vector<4xf32>
%u4 = "some_use"(%r4) : (vector<5xf32>) -> vector<5xf32>		%u4 = "some_use"(%r4) : (vector<5xf32>) -> vector<5xf32>
%u5 = "some_use"(%r5) : (vector<6xf32>) -> vector<6xf32>		%u5 = "some_use"(%r5) : (vector<6xf32>) -> vector<6xf32>
vector.transfer_write %u0, %memref1[%c0, %c0] : vector<1xf32>, memref<?x?xf32>		vector.transfer_write %u0, %memref1[%c0, %c0] : vector<1xf32>, memref<?x?xf32>
vector.transfer_write %u1, %memref0[%i, %i] : vector<2xf32>, memref<?x?xf32>		vector.transfer_write %u1, %memref0[%i, %i] : vector<2xf32>, memref<?x?xf32>
vector.transfer_write %u2, %memref2[%c0, %c0] : vector<3xf32>, memref<?x?xf32>		vector.transfer_write %u2, %memref2[%c0, %c0] : vector<3xf32>, memref<?x?xf32>
vector.transfer_write %u3, %memref3[%c0, %c0] : vector<4xf32>, memref<?x?xf32>		vector.transfer_write %u3, %memref3[%c0, %c0] : vector<4xf32>, memref<?x?xf32>
vector.transfer_write %u4, %memref4[%c0, %c0] : vector<5xf32>, memref<?x?xf32>		vector.transfer_write %u4, %memref4[%c0, %c0] : vector<5xf32>, memref<?x?xf32>
▲ Show 20 Lines • Show All 100 Lines • ▼ Show 20 Lines	^bb1(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["func.func"]} in %arg1		%0 = transform.structured.match ops{["func.func"]} in %arg1
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
transform.structured.hoist_redundant_vector_transfers %0		transform.structured.hoist_redundant_vector_transfers %0
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
}		}

// -----		// -----

		// CHECK-LABEL: func @hoist_vector_transfer_pairs_in_affine_loops(
		// CHECK-SAME: %[[MEMREF0:[a-zA-Z0-9]+]]: memref<64x64xi32>,
		// CHECK-SAME: %[[MEMREF1:[a-zA-Z0-9]+]]: memref<64x64xi32>,
		// CHECK-SAME: %[[MEMREF2:[a-zA-Z0-9]+]]: memref<64x64xi32>) {
		// CHECK: %[[C0:.*]] = arith.constant 0 : i32
		// CHECK: affine.for %[[I:.*]] = 0 to 64 {
		// CHECK: affine.for %[[J:.*]] = 0 to 64 step 16 {
		// CHECK: %[[R0:.*]] = vector.transfer_read %[[MEMREF2]][%[[I]], %[[J]]], %[[C0]] : memref<64x64xi32>, vector<16xi32>
		// CHECK: %[[R:.]] = affine.for %[[K:.]] = 0 to 64 iter_args(%[[ACC:.*]] = %[[R0]]) -> (vector<16xi32>) {
		// CHECK: %[[AV:.]] = vector.transfer_read %[[MEMREF0]][%[[I]], %[[K]]], %[[C0]] {{.}}: memref<64x64xi32>, vector<16xi32>
		// CHECK: %[[BV:.]] = vector.transfer_read %[[MEMREF1]][%[[K]], %[[J]]], %[[C0]] {{.}}: memref<64x64xi32>, vector<16xi32>
		// CHECK: %[[T0:.*]] = arith.muli %[[AV]], %[[BV]] : vector<16xi32>
		// CHECK: %[[T1:.*]] = arith.addi %[[ACC]], %[[T0]] : vector<16xi32>
		// CHECK: affine.yield %[[T1]] : vector<16xi32>
		// CHECK: }
		// CHECK: vector.transfer_write %[[R]], %[[MEMREF2]][%[[I]], %[[J]]] : vector<16xi32>, memref<64x64xi32>
		// CHECK: }
		// CHECK: }
		func.func @hoist_vector_transfer_pairs_in_affine_loops(%memref0: memref<64x64xi32>, %memref1: memref<64x64xi32>, %memref2: memref<64x64xi32>) {
		%c0_i32 = arith.constant 0 : i32
		affine.for %arg3 = 0 to 64 {
		affine.for %arg4 = 0 to 64 step 16 {
		affine.for %arg5 = 0 to 64 {
		%0 = vector.transfer_read %memref0[%arg3, %arg5], %c0_i32 {permutation_map = affine_map<(d0, d1) -> (0)>} : memref<64x64xi32>, vector<16xi32>
		%1 = vector.transfer_read %memref1[%arg5, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32>
		%2 = vector.transfer_read %memref2[%arg3, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32>
		%3 = arith.muli %0, %1 : vector<16xi32>
		%4 = arith.addi %2, %3 : vector<16xi32>
		vector.transfer_write %4, %memref2[%arg3, %arg4] : vector<16xi32>, memref<64x64xi32>
		}
		}
		}
		return
		}

		transform.sequence failures(propagate) {
		^bb1(%arg1: !pdl.operation):
		%0 = transform.structured.match ops{["func.func"]} in %arg1
		: (!pdl.operation) -> !pdl.operation
		transform.structured.hoist_redundant_vector_transfers %0
		: (!pdl.operation) -> !pdl.operation
		}

		// -----

// CHECK-LABEL: func @hoist_vector_transfer_pairs_tensor		// CHECK-LABEL: func @hoist_vector_transfer_pairs_tensor
func.func @hoist_vector_transfer_pairs_tensor(		func.func @hoist_vector_transfer_pairs_tensor(
%tensor0: tensor<?x?xf32>, %tensor1: tensor<?x?xf32>, %tensor2: tensor<?x?xf32>,		%tensor0: tensor<?x?xf32>, %tensor1: tensor<?x?xf32>, %tensor2: tensor<?x?xf32>,
%tensor3: tensor<?x?xf32>, %tensor4: tensor<?x?xf32>, %tensor5: tensor<?x?xf32>,		%tensor3: tensor<?x?xf32>, %tensor4: tensor<?x?xf32>, %tensor5: tensor<?x?xf32>,
%val: index, %lb : index, %ub : index, %step: index) ->		%val: index, %lb : index, %ub : index, %step: index) ->
(tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>,		(tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>,
tensor<?x?xf32>, tensor<?x?xf32>) {		tensor<?x?xf32>, tensor<?x?xf32>) {
%c0 = arith.constant 0 : index		%c0 = arith.constant 0 : index
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines	return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 :
tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>,		tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>,
tensor<?x?xf32>, tensor<?x?xf32>		tensor<?x?xf32>, tensor<?x?xf32>
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg1: !pdl.operation):		^bb1(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["func.func"]} in %arg1		%0 = transform.structured.match ops{["func.func"]} in %arg1
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
transform.structured.hoist_redundant_vector_transfers %0		transform.structured.hoist_redundant_tensor_subsets %0
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
}		}

// -----		// -----

// CHECK-LABEL: func @hoist_vector_transfer_pairs_disjoint_tensor(		// CHECK-LABEL: func @hoist_vector_transfer_pairs_disjoint_tensor(
// CHECK-SAME: %[[TENSOR0:[a-zA-Z0-9]*]]: tensor<?x?xf32>,		// CHECK-SAME: %[[TENSOR0:[a-zA-Z0-9]*]]: tensor<?x?xf32>,
// CHECK-SAME: %[[TENSOR1:[a-zA-Z0-9]*]]: tensor<?x?xf32>,		// CHECK-SAME: %[[TENSOR1:[a-zA-Z0-9]*]]: tensor<?x?xf32>,
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines	// CHECK: vector.transfer_write %[[R]]#2, %[[TENSOR5]]{{.*}} : vector<3xf32>, tensor<?x?xf32>
}		}
return %0#0, %0#1, %0#2, %0#3 : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>		return %0#0, %0#1, %0#2, %0#3 : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg1: !pdl.operation):		^bb1(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["func.func"]} in %arg1		%0 = transform.structured.match ops{["func.func"]} in %arg1
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
transform.structured.hoist_redundant_vector_transfers %0		transform.structured.hoist_redundant_tensor_subsets %0
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
}		}

// -----		// -----

// CHECK-LABEL: func @hoist_vector_transfer_pairs_tensor_and_slices		// CHECK-LABEL: func @hoist_vector_transfer_pairs_tensor_and_slices
// CHECK-SAME: %[[TENSOR0:[a-zA-Z0-9]*]]: tensor<?x?xf32>,		// CHECK-SAME: %[[TENSOR0:[a-zA-Z0-9]*]]: tensor<?x?xf32>,
// CHECK-SAME: %[[TENSOR1:[a-zA-Z0-9]*]]: tensor<?x?xf32>,		// CHECK-SAME: %[[TENSOR1:[a-zA-Z0-9]*]]: tensor<?x?xf32>,
▲ Show 20 Lines • Show All 100 Lines • ▼ Show 20 Lines	func.func @hoist_vector_transfer_pairs_tensor_and_slices(
}		}
return %0#0, %0#1, %0#2 : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>		return %0#0, %0#1, %0#2 : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg1: !pdl.operation):		^bb1(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["func.func"]} in %arg1		%0 = transform.structured.match ops{["func.func"]} in %arg1
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
transform.structured.hoist_redundant_vector_transfers %0		transform.structured.hoist_redundant_tensor_subsets %0
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
}		}

// -----		// -----

// CHECK-LABEL: func @hoist_vector_transfer_write_pairs_disjoint_tensor(		// CHECK-LABEL: func @hoist_vector_transfer_write_pairs_disjoint_tensor(
// CHECK-SAME: %[[T:.*]]: tensor<?x?xf32>,		// CHECK-SAME: %[[T:.*]]: tensor<?x?xf32>,
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index		// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
Show All 16 Lines	func.func @hoist_vector_transfer_write_pairs_disjoint_tensor(
%c1 = arith.constant 1 : index		%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index		%c3 = arith.constant 3 : index
%cst = arith.constant 0.0 : f32		%cst = arith.constant 0.0 : f32
%1 = scf.for %j = %lb to %ub step %step iter_args(%arg5 = %tensor)		%1 = scf.for %j = %lb to %ub step %step iter_args(%arg5 = %tensor)
-> (tensor<?x?xf32>) {		-> (tensor<?x?xf32>) {
%r00 = vector.transfer_read %arg5[%c0, %c0], %cst: tensor<?x?xf32>, vector<2xf32>		%r00 = vector.transfer_read %arg5[%c0, %c0], %cst: tensor<?x?xf32>, vector<2xf32>
%u00 = "some_use"(%r00) : (vector<2xf32>) -> vector<2xf32>		%u00 = "some_use"(%r00) : (vector<2xf32>) -> vector<2xf32>
%w10 = vector.transfer_write %u00, %arg5[%c0, %c0] : vector<2xf32>, tensor<?x?xf32>		%w10 = vector.transfer_write %u00, %arg5[%c0, %c0] : vector<2xf32>, tensor<?x?xf32>

		// Hoist by properly bypassing the disjoint write %w10.
%r01 = vector.transfer_read %w10[%c0, %c3], %cst: tensor<?x?xf32>, vector<2xf32>		%r01 = vector.transfer_read %w10[%c0, %c3], %cst: tensor<?x?xf32>, vector<2xf32>
%u01 = "some_use"(%r01) : (vector<2xf32>) -> vector<2xf32>		%u01 = "some_use"(%r01) : (vector<2xf32>) -> vector<2xf32>
%w11 = vector.transfer_write %u01, %w10[%c0, %c3] : vector<2xf32>, tensor<?x?xf32>		%w11 = vector.transfer_write %u01, %w10[%c0, %c3] : vector<2xf32>, tensor<?x?xf32>
scf.yield %w11 : tensor<?x?xf32>		scf.yield %w11 : tensor<?x?xf32>
}		}
return %1 : tensor<?x?xf32>		return %1 : tensor<?x?xf32>
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg1: !pdl.operation):		^bb1(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["func.func"]} in %arg1		%0 = transform.structured.match ops{["func.func"]} in %arg1
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
transform.structured.hoist_redundant_vector_transfers %0		transform.structured.hoist_redundant_tensor_subsets %0
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
}		}

// -----		// -----

// CHECK-LABEL: func @hoist_vector_transfer_pairs_in_affine_loops(		// CHECK-LABEL: func @hoist_vector_transfer_pairs_tensor_and_slices_static_large_tensor
// CHECK-SAME: %[[MEMREF0:[a-zA-Z0-9]+]]: memref<64x64xi32>,		// CHECK-SAME: %[[TENSOR0:[a-zA-Z0-9]*]]: tensor<100x100xf32>,
// CHECK-SAME: %[[MEMREF1:[a-zA-Z0-9]+]]: memref<64x64xi32>,		// CHECK-SAME: %[[TENSOR1:[a-zA-Z0-9]*]]: tensor<200x200xf32>,
// CHECK-SAME: %[[MEMREF2:[a-zA-Z0-9]+]]: memref<64x64xi32>) {		// CHECK-SAME: %[[TENSOR2:[a-zA-Z0-9]*]]: tensor<300x300xf32>
// CHECK: %[[C0:.*]] = arith.constant 0 : i32		func.func @hoist_vector_transfer_pairs_tensor_and_slices_static_large_tensor(
// CHECK: affine.for %[[I:.*]] = 0 to 64 {		%tensor0: tensor<100x100xf32>, %tensor1: tensor<200x200xf32>, %tensor2: tensor<300x300xf32>,
// CHECK: affine.for %[[J:.*]] = 0 to 64 step 16 {		%val: index, %lb : index, %ub : index, %step: index) ->
// CHECK: %[[R0:.*]] = vector.transfer_read %[[MEMREF2]][%[[I]], %[[J]]], %[[C0]] : memref<64x64xi32>, vector<16xi32>		(
// CHECK: %[[R:.]] = affine.for %[[K:.]] = 0 to 64 iter_args(%[[ACC:.*]] = %[[R0]]) -> (vector<16xi32>) {		tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>
// CHECK: %[[AV:.]] = vector.transfer_read %[[MEMREF0]][%[[I]], %[[K]]], %[[C0]] {{.}}: memref<64x64xi32>, vector<16xi32>		) {
// CHECK: %[[BV:.]] = vector.transfer_read %[[MEMREF1]][%[[K]], %[[J]]], %[[C0]] {{.}}: memref<64x64xi32>, vector<16xi32>		%c0 = arith.constant 0 : index
// CHECK: %[[T0:.*]] = arith.muli %[[AV]], %[[BV]] : vector<16xi32>		%cst = arith.constant 0.0 : f32
// CHECK: %[[T1:.*]] = arith.addi %[[ACC]], %[[T0]] : vector<16xi32>
// CHECK: affine.yield %[[T1]] : vector<16xi32>		// CHECK: scf.for %[[I:.]] = {{.}} iter_args(
// CHECK: }		// CHECK-SAME: %[[TENSOR0_ARG:[0-9a-zA-Z]+]] = %[[TENSOR0]],
// CHECK: vector.transfer_write %[[R]], %[[MEMREF2]][%[[I]], %[[J]]] : vector<16xi32>, memref<64x64xi32>		// CHECK-SAME: %[[TENSOR1_ARG:[0-9a-zA-Z]+]] = %[[TENSOR1]],
// CHECK: }		// CHECK-SAME: %[[TENSOR2_ARG:[0-9a-zA-Z]+]] = %[[TENSOR2]]
		// CHECK-SAME: ) ->
		// CHECK-SAME: (tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>
		%0:3 = scf.for %i = %lb to %ub step %step
		iter_args(%arg0 = %tensor0, %arg1 = %tensor1, %arg2 = %tensor2)
		-> (tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>) {

		// Hoisted
		// CHECK: %[[ST0:.]] = tensor.extract_slice %[[TENSOR0_ARG]][%[[I]], %[[I]]]{{.}}: tensor<100x100xf32> to tensor<?x?xf32>
		// CHECK: %[[V0:.]] = vector.transfer_read %[[ST0]]{{.}} : tensor<?x?xf32>, vector<1xf32>

		// CHECK: %[[R:.]]:3 = scf.for %[[J:.]] = {{.*}} iter_args(
		// CHECK-SAME: %[[TENSOR1_ARG_L2:[0-9a-zA-Z]+]] = %[[TENSOR1_ARG]]
		// CHECK-SAME: %[[TENSOR2_ARG_L2:[0-9a-zA-Z]+]] = %[[TENSOR2_ARG]]
		// CHECK-SAME: %[[V0_ARG_L2:[0-9a-zA-Z]+]] = %[[V0]]
		// CHECK-SAME: ) ->
		// CHECK-SAME: (tensor<200x200xf32>, tensor<300x300xf32>, vector<1xf32>
		%1:3 = scf.for %j = %lb to %ub step %step
		iter_args(%arg6 = %arg0, %arg7 = %arg1, %arg8 = %arg2)
		-> (tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>) {
		// Hoists.
		%st0 = tensor.extract_slice %arg6[%i, %i][%step, %step][1, 1] : tensor<100x100xf32> to tensor<?x?xf32>
		%r0 = vector.transfer_read %st0[%c0, %c0], %cst: tensor<?x?xf32>, vector<1xf32>

		// CHECK: %[[ST1:.]] = tensor.extract_slice %[[TENSOR1_ARG_L2]][%[[J]],{{.}}: tensor<200x200xf32> to tensor<?x?xf32>
		// CHECK: %[[V1:.]] = vector.transfer_read %[[ST1]]{{.}} : tensor<?x?xf32>, vector<2xf32>
		// Does not hoist (slice depends on %j)
		%st1 = tensor.extract_slice %arg7[%j, %c0][%step, %step][1, 1] : tensor<200x200xf32> to tensor<?x?xf32>
		%r1 = vector.transfer_read %st1[%c0, %c0], %cst: tensor<?x?xf32>, vector<2xf32>

		// CHECK: %[[ST2:.]] = tensor.extract_slice %[[TENSOR2_ARG_L2]][%[[I]],{{.}}: tensor<300x300xf32> to tensor<?x?xf32>
		// CHECK: %[[V2:.]] = vector.transfer_read %[[ST2]]{{.}} : tensor<?x?xf32>, vector<3xf32>
		// Does not hoist, 2 slice %arg8.
		%st2 = tensor.extract_slice %arg8[%i, %c0][%step, %step][1, 1] : tensor<300x300xf32> to tensor<?x?xf32>
		%r2 = vector.transfer_read %st2[%c0, %c0], %cst: tensor<?x?xf32>, vector<3xf32>

		// CHECK: %[[U0:.*]] = "some_use"(%[[V0_ARG_L2]]) : (vector<1xf32>) -> vector<1xf32>
		// CHECK: %[[U1:.*]] = "some_use"(%[[V1]]) : (vector<2xf32>) -> vector<2xf32>
		// CHECK: %[[U2:.*]] = "some_use"(%[[V2]]) : (vector<3xf32>) -> vector<3xf32>
		%u0 = "some_use"(%r0) : (vector<1xf32>) -> vector<1xf32>
		%u1 = "some_use"(%r1) : (vector<2xf32>) -> vector<2xf32>
		%u2 = "some_use"(%r2) : (vector<3xf32>) -> vector<3xf32>

		// Hoists
		%w0 = vector.transfer_write %u0, %st0[%c0, %c0] : vector<1xf32>, tensor<?x?xf32>

		// CHECK-DAG: %[[STI1:.]] = vector.transfer_write %[[U1]], %{{.}} : vector<2xf32>, tensor<?x?xf32>
		// Does not hoist (associated slice depends on %j).
		%w1 = vector.transfer_write %u1, %st1[%i, %i] : vector<2xf32>, tensor<?x?xf32>

		// CHECK-DAG: %[[STI2:.]] = vector.transfer_write %[[U2]], %{{.}} : vector<3xf32>, tensor<?x?xf32>
		// Does not hoist, 2 slice / insert_slice for %arg8.
		%w2 = vector.transfer_write %u2, %st2[%c0, %c0] : vector<3xf32>, tensor<?x?xf32>

		// Hoists.
		%sti0 = tensor.insert_slice %w0 into %arg6[%i, %i][%step, %step][1, 1] : tensor<?x?xf32> into tensor<100x100xf32>

		// CHECK-DAG: tensor.insert_slice %[[STI1]] into %[[TENSOR1_ARG_L2]][%[[J]],{{.*}}: tensor<?x?xf32> into tensor<200x200xf32>
		// Does not hoist (depends on %j).
		%sti1 = tensor.insert_slice %w1 into %arg7[%j, %c0][%step, %step][1, 1] : tensor<?x?xf32> into tensor<200x200xf32>

		// CHECK-DAG: tensor.insert_slice %[[STI2]] into %[[TENSOR2_ARG_L2]][%[[I]],{{.*}}: tensor<?x?xf32> into tensor<300x300xf32>
		// Does not hoist, 2 slice / insert_slice for %arg8.
		%sti2 = tensor.insert_slice %w2 into %arg8[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> into tensor<300x300xf32>
		// Extract with a different stride to make sure we cannot fold this extract with the above insert.
		%st22 = tensor.extract_slice %sti2[%i, %c0][%step, %step][2, 1] : tensor<300x300xf32> to tensor<?x?xf32>
		%sti22 = tensor.insert_slice %st22 into %arg8[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> into tensor<300x300xf32>

		// CHECK: scf.yield {{.*}} : tensor<200x200xf32>, tensor<300x300xf32>, vector<1xf32>
// CHECK: }		// CHECK: }
func.func @hoist_vector_transfer_pairs_in_affine_loops(%memref0: memref<64x64xi32>, %memref1: memref<64x64xi32>, %memref2: memref<64x64xi32>) {		scf.yield %sti0, %sti1, %sti22:
%c0_i32 = arith.constant 0 : i32		tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>
affine.for %arg3 = 0 to 64 {
affine.for %arg4 = 0 to 64 step 16 {
affine.for %arg5 = 0 to 64 {
%0 = vector.transfer_read %memref0[%arg3, %arg5], %c0_i32 {permutation_map = affine_map<(d0, d1) -> (0)>} : memref<64x64xi32>, vector<16xi32>
%1 = vector.transfer_read %memref1[%arg5, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32>
%2 = vector.transfer_read %memref2[%arg3, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32>
%3 = arith.muli %0, %1 : vector<16xi32>
%4 = arith.addi %2, %3 : vector<16xi32>
vector.transfer_write %4, %memref2[%arg3, %arg4] : vector<16xi32>, memref<64x64xi32>
}
}		}

		// Hoisted
		// CHECK: %[[STI0:.]] = vector.transfer_write %[[R]]#2, %[[ST0]]{{.}} : vector<1xf32>, tensor<?x?xf32>
		// CHECK: tensor.insert_slice %[[STI0]] into %[[TENSOR0_ARG]][%[[I]], %[[I]]]{{.*}} : tensor<?x?xf32> into tensor<100x100xf32>

		// CHECK: scf.yield {{.*}} : tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>
		scf.yield %1#0, %1#1, %1#2 :
		tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>

		// CHECK: }
}		}
return		return %0#0, %0#1, %0#2 : tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg1: !pdl.operation):		^bb1(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["func.func"]} in %arg1		%0 = transform.structured.match ops{["func.func"]} in %arg1
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
transform.structured.hoist_redundant_vector_transfers %0		transform.structured.hoist_redundant_tensor_subsets %0
: (!pdl.operation) -> !pdl.operation		: (!pdl.operation) -> !pdl.operation
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Linalg] Reimplement hoisting on tensors as a subset-based transformation
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 500804

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h

mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td

mlir/include/mlir/Dialect/Utils/StaticValueUtils.h

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt

mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp

mlir/lib/Dialect/Linalg/Transforms/SubsetHoisting.cpp

mlir/lib/Dialect/Utils/StaticValueUtils.cpp

mlir/test/Dialect/Linalg/hoisting.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Linalg] Reimplement hoisting on tensors as a subset-based transformationClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 500804

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h

mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td

mlir/include/mlir/Dialect/Utils/StaticValueUtils.h

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt

mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp

mlir/lib/Dialect/Linalg/Transforms/SubsetHoisting.cpp

mlir/lib/Dialect/Utils/StaticValueUtils.cpp

mlir/test/Dialect/Linalg/hoisting.mlir

[mlir][Linalg] Reimplement hoisting on tensors as a subset-based transformation
ClosedPublic