
[mlir][linalg] Improve the padding packing loop computation.
ClosedPublic

Authored by gysit on Oct 28 2021, 6:11 AM.

Details

Summary

The revision updates the packing-loop search in hoist padding. Instead of considering all loops in the backward slice, we now compute a separate backward slice containing only the index computations. This modification ensures we do not add packing loops that, due to spurious dependencies, are not actually used to index the packed buffer. One instance where such spurious dependencies can appear is the extract slice operation introduced between the tile loops of a double tiling.

Depends On D112412
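The effect of restricting the slice to the index computations can be illustrated with a toy model (plain Python, not the actual MLIR implementation; all op names and the use-def graph below are hypothetical):

```python
# Toy sketch: ops are nodes in a use-def graph; a backward slice
# follows only the edges accepted by a predicate, mirroring the idea
# of restricting the packing-loop search to the index computations
# of the padded operand.

def backward_slice(op, operands, follow):
    """Collect all transitive definers of `op`, following only
    definers accepted by the `follow` predicate."""
    slice_ops, worklist = set(), [op]
    while worklist:
        current = worklist.pop()
        for definer in operands.get(current, []):
            if definer not in slice_ops and follow(definer):
                slice_ops.add(definer)
                worklist.append(definer)
    return slice_ops

# Hypothetical use-def edges: the pad op indexes with %i and %j; an
# extract_slice between the tile loops adds a spurious dependency on
# loop %k that is unrelated to the indexing.
operands = {
    "pad": ["index_i", "index_j", "extract_slice"],
    "index_i": ["loop_i"],
    "index_j": ["loop_j"],
    "extract_slice": ["loop_k"],  # spurious: not an index computation
}
index_ops = {"index_i", "index_j", "loop_i", "loop_j", "loop_k"}

# The unrestricted slice pulls in loop_k via the extract_slice; the
# index-only slice does not.
full = backward_slice("pad", operands, lambda op: True)
index_only = backward_slice("pad", operands, lambda op: op in index_ops)
```

With the unrestricted filter, the spurious extract_slice edge pulls loop_k into the slice, which would wrongly turn it into a packing loop; the index-only slice leaves it out.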

Diff Detail

Event Timeline

gysit created this revision.Oct 28 2021, 6:11 AM
gysit requested review of this revision.Oct 28 2021, 6:11 AM
gysit added inline comments.Oct 28 2021, 6:16 AM
mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
171

The assertion triggers for the double tiling test case since an affine min computation that belongs to the body of the outermost loop is actually placed in front of the loop. This is OK since the two operations are independent, meaning both orders are valid topological sorts.

Additionally, I think it may be possible that the outermost loop is not in the backward slice (no data dependency) and we may still want to hoist out of it.
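The topological-sort point above can be made concrete with a minimal sketch (hypothetical op names, not tied to the MLIR code):

```python
# Two operations with no dependence between them admit more than one
# valid topological order, so an affine.min placed before the loop and
# one placed inside its body can both be consistent with the same
# use-def graph.
from itertools import permutations

# Dependence edges: op -> set of ops that must come first.
deps = {
    "affine_min": set(),
    "outer_loop": set(),
    "use": {"affine_min", "outer_loop"},  # depends on both
}

def is_topological(order):
    seen = set()
    for op in order:
        if not deps[op] <= seen:
            return False
        seen.add(op)
    return True

valid = [p for p in permutations(deps) if is_topological(p)]
# Both ("affine_min", "outer_loop", "use") and
# ("outer_loop", "affine_min", "use") are valid orders.
```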

gysit updated this revision to Diff 383443.Oct 29 2021, 11:47 AM

Rename IndexEdge to IndexEdges.

gysit updated this revision to Diff 384693.Nov 4 2021, 3:09 AM

Improve tests and comments.

nicolasvasilache accepted this revision.Nov 4 2021, 6:58 AM

Let's proceed with this, modulo refactoring and at least trying to implement a side-effect check that would work correctly in the scf::ForOp slice-computation case.

mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
226–227

seems this has reached a natural limit and would benefit from its own function?

227

A loop

231

sounds even more like an interface for slicing

233

Could you rephrase "since" and "with the exception of"?

234

can we have some comment on what this represents?
If this is to work around the fact that the backward slice is too coarse, and we want finer control over which values used in scf::For should propagate or not, please explain so in the broader context of proper documentation about the behavior of the slice algorithm that you are using.

241

A more contained and precise explanation plus an example in pseudo-IR would be useful here.

The backward slice computation algorithm takes a filter function that ... 
This has the effect of ..
When applied to ... turns all enclosing loops into packing loops regardless of whether they are actually used

etc

Example:
for i
  for j
    for k
      ..

247

are these actual assertions that can never happen or failure conditions where we should abort the transformation?

260

The way you are using this seems to hint at an interface for performing slice analysis that would be more precise.
The trickiness seems related to ops that contain other ops with side effects.

Is the restriction you are making here valid in the presence of, say, load/store on memref within the scf::ForOp?
Maybe that is part of what needs to be computed here: is there a nested op with side effects (which in practice we know doesn't happen in the cases we are interested in, but we need to be conservative)?
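The conservative check suggested here could look roughly like the following sketch (hypothetical helper and op names, not the actual MLIR API):

```python
# Before hoisting across a loop, walk the ops nested under it and
# conservatively abort if any of them may read or write memory, since
# hoisting could then reorder those effects.

def is_safe_to_hoist_across(nested_ops, has_side_effects):
    """Allow hoisting only if no nested op touches memory."""
    return not any(has_side_effects(op) for op in nested_ops)

# Illustrative set of side-effecting op names (hypothetical model).
side_effecting = {"memref.load", "memref.store"}

print(is_safe_to_hoist_across(
    ["arith.addi", "affine.min"],
    lambda op: op in side_effecting))  # True
print(is_safe_to_hoist_across(
    ["arith.addi", "memref.store"],
    lambda op: op in side_effecting))  # False
```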

This revision is now accepted and ready to land.Nov 4 2021, 6:58 AM
gysit added inline comments.Nov 4 2021, 10:50 AM
mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
260

@nicolasvasilache
After reading this I think we should consider switching to an explicit model where we encode the ops that can appear in the backward slice and how we handle them. For example, if the induction variable of a for op is part of the backward slice, we only add the bounds and the step to the backward slice but not the iteration argument, which is usually completely unrelated.

For the ForOp, the fact that it has a body and side effects is not the only problem. It is more the fact that it has complex semantics and we are not interested in the iteration-argument part. For example, a hypothetical cast_tensor_and_affine_apply op would also be problematic and would require special treatment depending on which of its results is found in the backward slice. I.e., we would continue the backward slice computation with either the cast-specific or the apply-specific operands.

Loads and stores on memrefs are a different beast. They would usually not appear in the backward slice but can, at a later point in time, indeed make hoisting invalid. We can probably handle this by adding a check on which operations may be part of the hoisted / cloned loop nest?
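The explicit model proposed above could be sketched as a per-operation rule that selects which operands the slice traversal follows (hypothetical names, not the actual implementation):

```python
# Instead of a generic filter, a per-operation rule decides which
# operands the backward slice continues with. For a hypothetical `for`
# op we follow the lower bound, upper bound, and step, but not the
# iteration arguments, which are usually unrelated to the indexing.

def operands_to_follow(op_kind, operands):
    """Return the operands the slice traversal should continue with."""
    rules = {
        # for(lb, ub, step, *iter_args): skip the iteration arguments.
        "for": lambda ops: ops[:3],
        # No entry: follow every operand (the default below).
    }
    return rules.get(op_kind, lambda ops: ops)(operands)

print(operands_to_follow("for", ["lb", "ub", "step", "iter_arg0"]))
# ['lb', 'ub', 'step']
print(operands_to_follow("addi", ["a", "b"]))
# ['a', 'b']
```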

gysit updated this revision to Diff 385062.Nov 5 2021, 7:18 AM

Address comments and use a more fail-safe strategy.

gysit updated this revision to Diff 385408.Nov 8 2021, 12:10 AM
gysit marked an inline comment as done.

Rebase and minor fixes.

nicolasvasilache accepted this revision.Nov 8 2021, 1:12 AM
This revision was automatically updated to reflect the committed changes.