Add a transpose option to hoist padding that transposes the padded tensor before storing it into the packed tensor. The early transpose improves the memory access patterns of the actual compute kernel. The patch introduces a transpose right after the hoisted pad operation and a second transpose inside the compute loop. The second transpose can either be fused into the compute operation or canonicalizes away when lowering to vector instructions.
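To make the claim about memory access patterns concrete, here is a plain C++ analogue (a sketch only: the matmul shape, sizes, and names are illustrative and not taken from the patch). One operand is packed once, outside the compute loops, in transposed layout, so the innermost reduction loop reads both operands with unit stride, which is the effect the early transpose aims for on the packed tensor.

```cpp
// Illustrative analogue only; not the MLIR implementation.
#include <cstdio>
#include <vector>

constexpr int M = 4, N = 4, K = 8;

int main() {
  std::vector<float> A(M * K, 1.0f), B(K * N, 2.0f), C(M * N, 0.0f);

  // Analogue of "hoisted pad + transpose": pack B once, outside the compute
  // loops, in transposed layout (outer dimension n, inner dimension k).
  std::vector<float> Bpacked(N * K);
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n)
      Bpacked[n * K + k] = B[k * N + n];

  // Compute kernel: the innermost k-loop now walks both A and Bpacked with
  // stride 1; the analogue of the second transpose is folded into the
  // packed layout.
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k)
        acc += A[m * K + k] * Bpacked[n * K + k];
      C[m * N + n] = acc;
    }

  std::printf("C[0] = %f\n", C[0]); // expect 16.000000
  return 0;
}
```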
Event Timeline
mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp

| Line | Comment |
|---|---|
| 160 | You could use `RankedTensorType::Builder`, up to you. |
| 165 | Can we add this as a helper somewhere in linalgops.cpp / .h or utils? |
| 396–398 | Please add some comment on the fact that we do not modify the loop order but just the tensor order, and so the analysis is the same but the contiguous accesses end up being different. |

mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp

| Line | Comment |
|---|---|
| 114 | Can we make the individual strings `:`-separated (i.e. `1:0,0,0`) until we can find a better list-of-list option? |
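Regarding the `1:0,0,0` suggestion, here is a standalone C++ sketch of one possible way to split such a `:`-separated string into a list of integer lists. It is not the actual test-pass option handling, and the meaning of the individual fields is not taken from the review; only the splitting scheme is illustrated.

```cpp
// Hypothetical parser for strings like "1:0,0,0": split on ':' into fields,
// then split each field on ',' into integers.
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::vector<int>> parseColonSeparated(const std::string &s) {
  std::vector<std::vector<int>> result;
  std::stringstream fields(s);
  std::string field;
  while (std::getline(fields, field, ':')) {
    std::vector<int> values;
    std::stringstream items(field);
    std::string item;
    while (std::getline(items, item, ','))
      values.push_back(std::stoi(item));
    result.push_back(values);
  }
  return result;
}

int main() {
  // "1:0,0,0" -> {{1}, {0, 0, 0}}
  for (const auto &list : parseColonSeparated("1:0,0,0")) {
    for (int v : list)
      std::printf("%d ", v);
    std::printf("\n");
  }
  return 0;
}
```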