Diff 439123

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

Show First 20 Lines • Show All 216 Lines • ▼ Show 20 Lines	def SplitReductionOp : Op<Transform_Dialect, "structured.split_reduction",
let assemblyFormat = "$target attr-dict";		let assemblyFormat = "$target attr-dict";

let extraClassDeclaration = [{		let extraClassDeclaration = [{
::mlir::FailureOr<::llvm::SmallVector<::mlir::Operation *>> applyToOne(		::mlir::FailureOr<::llvm::SmallVector<::mlir::Operation *>> applyToOne(
::mlir::linalg::LinalgOp target, TransformState &state);		::mlir::linalg::LinalgOp target, TransformState &state);
}];		}];
}		}

		// Transform-dialect op `structured.split_reduction_by_scaling`. It takes a
		// handle to a `target` op and requests the `splitReductionByScaling` rewrite
		// on it; the long-form explanation lives in the `description` below.
		def SplitReductionByScalingOp :
		Op<Transform_Dialect, "structured.split_reduction_by_scaling",
		[FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
		TransformEachOpTrait, TransformOpInterface]> {
		let description = [{
		Indicates that the given `target` op should be transformed with the
		`splitReductionByScaling` transformation and split factor provided as
		attribute.

		Instead of introducing an ExpandShapeOp, this scaling-based implementation
		rewrites a reduction dimension `k` into `k * split_factor + kk`.
		The dimension `kk` is added as an extra parallel dimension to the
		intermediate output tensor at position `insert_split_dimension`.

		Consider a minimal example where `k` is reduced:
		O(i, j) += I(i, j, k)
		Assume i=3, j=5, k=128, split_factor=16 and insert_split_dimension=0.
		The compute is rewritten as:
		a. O_i(kk, i, j) += I(i, j, 16 * k + kk)
		b. O(i, j) += O_i(kk, i, j)
		The intermediate tensor O_i is of shape (128/16)x3x5 == 8x3x5.

		Example:

		```
		%0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>)
		outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
		```

		Is transformed to:

		```
		#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2 * 4 + d3)>
		#map1 = affine_map<(d0, d1, d2, d3) -> (d2 * 4 + d3, d1)>
		#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
		#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
		#map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
		#map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
		%0 = linalg.init_tensor [16, 32, 64] : tensor<16x32x64xf32>
		springermUnsubmitted Done Reply Inline Actions %arg5 is not used in the computation. Is this intentional? springerm: %arg5 is not used in the computation. Is this intentional?
		%cst = arith.constant 0.000000e+00 : f32
		%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x32x64xf32>) ->
		tensor<16x32x64xf32>
		%2 = linalg.init_tensor [64, 4] : tensor<64x4xi1>

		%3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3],
		iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
		ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>, tensor<64x4xi1>)
		outs(%1 : tensor<16x32x64xf32>) {
		^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32):
		%5 = arith.mulf %arg3, %arg4 : f32
		%6 = arith.addf %arg6, %5 : f32
		linalg.yield %6 : f32
		} -> tensor<16x32x64xf32>

		%4 = linalg.generic {indexing_maps = [#map4, #map5],
		iterator_types = ["parallel", "parallel", "reduction"]}
		ins(%3 : tensor<16x32x64xf32>)
		outs(%C : tensor<16x32xf32>) {
		^bb0(%arg3: f32, %arg4: f32):
		%5 = arith.addf %arg3, %arg4 : f32
		linalg.yield %5 : f32
		springermUnsubmitted Done Reply Inline Actions Purpose of this arg is not mentioned in the op description springerm: Purpose of this arg is not mentioned in the op description
		} -> tensor<16x32xf32>

		return %4 : tensor<16x32xf32>
		```

		}];

		// Operands/attributes: the `target` handle plus two optional I64 attributes,
		// `split_factor` and `insert_split_dimension`, each declared with the C++
		// default expression `{}` (presumably 0 -- confirm against the ODS docs).
		let arguments = (ins PDL_Operation:$target,
		DefaultValuedAttr<I64Attr, "{}">:$split_factor,
		DefaultValuedAttr<I64Attr, "{}">:$insert_split_dimension);
		// Three result handles, in this order: the fill op, the split linalg op and
		// the combining linalg op.
		let results = (outs PDL_Operation:$fill_op,
		PDL_Operation:$split_linalg_op,
		PDL_Operation:$combining_linalg_op);

		// Only `$target` is spelled out in the custom syntax; both attributes are
		// carried through `attr-dict`.
		let assemblyFormat = "$target attr-dict";

		// Declares the C++ hook `applyToOne`, which receives a single `LinalgOp`
		// target; its definition is not part of this file.
		let extraClassDeclaration = [{
		::mlir::FailureOr<::llvm::SmallVector<::mlir::Operation *>> applyToOne(
		::mlir::linalg::LinalgOp target, TransformState &state);
		}];
		}

def TileOp : Op<Transform_Dialect, "structured.tile",		def TileOp : Op<Transform_Dialect, "structured.tile",
[DeclareOpInterfaceMethods<TransformOpInterface>,		[DeclareOpInterfaceMethods<TransformOpInterface>,
FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface]> {		FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface]> {
let description = [{		let description = [{
Indicates that the given `target` op should be tiled with the options		Indicates that the given `target` op should be tiled with the options
provided as attributes. This transform generates a loop nest with a smaller		provided as attributes. This transform generates a loop nest with a smaller
("tiled") target operation in its body. Currently limited to LinalgOps.		("tiled") target operation in its body. Currently limited to LinalgOps.

▲ Show 20 Lines • Show All 51 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

Show First 20 Lines • Show All 1,526 Lines • ▼ Show 20 Lines	struct SplitReductionResult {
FillOp fillOp;		FillOp fillOp;
LinalgOp splitLinalgOp;		LinalgOp splitLinalgOp;
LinalgOp resultCombiningLinalgOp;		LinalgOp resultCombiningLinalgOp;
};		};
FailureOr<SplitReductionResult>		FailureOr<SplitReductionResult>
splitReduction(PatternRewriter &b, LinalgOp op,		splitReduction(PatternRewriter &b, LinalgOp op,
const ControlSplitReductionFn &controlSplitReductionFn);		const ControlSplitReductionFn &controlSplitReductionFn);

		/// Scaling-based implementation of the split reduction transformation.
		/// Instead of introducing an ExpandShapeOp, this rewrites a reduction dimension
		/// `k` into `k * scale + kk`.
		///
		/// Example:
		/// ```
		/// %0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>)
		/// outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
		/// ```
		///
		/// Is transformed to:
		///
		/// ```
		/// #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2 * 4 + d3)>
		/// #map1 = affine_map<(d0, d1, d2, d3) -> (d2 * 4 + d3, d1)>
		/// #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
		/// #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
		/// #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
		/// #map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
		/// %0 = linalg.init_tensor [16, 32, 64] : tensor<16x32x64xf32>
		/// %cst = arith.constant 0.000000e+00 : f32
		/// %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x32x64xf32>) ->
		/// tensor<16x32x64xf32>
		/// %2 = linalg.init_tensor [64, 4] : tensor<64x4xi1>
		///
		/// %3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3],
		/// iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
		/// ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>, tensor<64x4xi1>)
		/// outs(%1 : tensor<16x32x64xf32>) {
		/// ^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32):
		/// %5 = arith.mulf %arg3, %arg4 : f32
		/// %6 = arith.addf %arg6, %5 : f32
		/// linalg.yield %6 : f32
		/// } -> tensor<16x32x64xf32>
		///
		/// %4 = linalg.generic {indexing_maps = [#map4, #map5],
		/// iterator_types = ["parallel", "parallel", "reduction"]}
		/// ins(%3 : tensor<16x32x64xf32>)
		/// outs(%C : tensor<16x32xf32>) {
		/// ^bb0(%arg3: f32, %arg4: f32):
		/// %5 = arith.addf %arg3, %arg4 : f32
		/// linalg.yield %5 : f32
		/// } -> tensor<16x32xf32>
		///
		/// return %4 : tensor<16x32xf32>
		/// ```
		FailureOr<SplitReductionResult>
		splitReductionByScaling(PatternRewriter &b, LinalgOp op,
		const ControlSplitReductionFn &controlSplitReductionFn);

} // namespace linalg		} // namespace linalg
} // namespace mlir		} // namespace mlir

#endif // MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H		#endif // MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H

mlir/include/mlir/IR/AffineMap.h

Show First 20 Lines • Show All 234 Lines • ▼ Show 20 Lines	return AffineMap::get(getNumDims(), getNumSymbols() + shift,
getResults(),		getResults(),
[&](AffineExpr e) {		[&](AffineExpr e) {
return e.shiftSymbols(getNumSymbols(), shift,		return e.shiftSymbols(getNumSymbols(), shift,
offset);		offset);
})),		})),
getContext());		getContext());
}		}

		/// Returns a new AffineMap with the same number of dims and symbols as this
		/// map but with one result fewer: the result at index `pos` is dropped.
		/// No bounds check is performed here, so `pos` must be a valid result index
		/// (`pos < getNumResults()`). This map itself is left untouched, hence the
		/// method is `const`.
		AffineMap dropResult(unsigned pos) const {
		  // Take an owned copy of the result expressions so one can be erased.
		  auto exprs = llvm::to_vector<4>(getResults());
		  exprs.erase(exprs.begin() + pos);
		  return AffineMap::get(getNumDims(), getNumSymbols(), exprs, getContext());
		}

		/// Returns a new AffineMap with the same number of dims and symbols as this
		/// map and one extra result: `expr` is inserted so that it becomes the result
		/// at index `pos`. No bounds check is performed here, so `pos` must satisfy
		/// `pos <= getNumResults()` (`pos == getNumResults()` appends). This map
		/// itself is left untouched, hence the method is `const`.
		AffineMap insertResult(AffineExpr expr, unsigned pos) const {
		springermUnsubmitted Done Reply Inline Actions nit: Any particular reason why the default insertion point is at the beginning instead of at the end? springerm: nit: Any particular reason why the default insertion point is at the beginning instead of at…
		  // Take an owned copy of the result expressions so one can be inserted.
		  auto exprs = llvm::to_vector<4>(getResults());
		  exprs.insert(exprs.begin() + pos, expr);
		  return AffineMap::get(getNumDims(), getNumSymbols(), exprs, getContext());
		}

/// Folds the results of the application of an affine map on the provided		/// Folds the results of the application of an affine map on the provided
/// operands to a constant if possible.		/// operands to a constant if possible.
LogicalResult constantFold(ArrayRef<Attribute> operandConstants,		LogicalResult constantFold(ArrayRef<Attribute> operandConstants,
SmallVectorImpl<Attribute> &results) const;		SmallVectorImpl<Attribute> &results) const;

/// Propagates the constant operands into this affine map. Operands are		/// Propagates the constant operands into this affine map. Operands are
/// allowed to be null, at which point they are treated as non-constant. This		/// allowed to be null, at which point they are treated as non-constant. This
/// does not change the number of symbols and dimensions. Returns a new map,		/// does not change the number of symbols and dimensions. Returns a new map,
▲ Show 20 Lines • Show All 346 Lines • Show Last 20 Lines

mlir/include/mlir/IR/BuiltinTypes.h

Show First 20 Lines • Show All 243 Lines • ▼ Show 20 Lines	Builder &dropDim(unsigned pos) {
assert(pos < shape.size() && "overflow");		assert(pos < shape.size() && "overflow");
if (storage.empty())		if (storage.empty())
storage.append(shape.begin(), shape.end());		storage.append(shape.begin(), shape.end());
storage.erase(storage.begin() + pos);		storage.erase(storage.begin() + pos);
shape = {storage.data(), storage.size()};		shape = {storage.data(), storage.size()};
return *this;		return *this;
}		}

		/// Insert `val` into the shape at index `pos` (`pos == shape.size()`
		/// appends) and return this builder so that calls can be chained.
		Builder &insertDim(int64_t val, unsigned pos) {
		  assert(pos <= shape.size() && "overflow");
		  // Nothing owned yet: copy the currently viewed shape into `storage` before
		  // modifying it.
		  if (storage.empty())
		    storage.assign(shape.begin(), shape.end());
		  int64_t *insertionPoint = storage.begin() + pos;
		  storage.insert(insertionPoint, val);
		  // Point the view at the owned buffer, which now holds the updated shape.
		  shape = ArrayRef<int64_t>(storage.data(), storage.size());
		  return *this;
		}

operator RankedTensorType() {		operator RankedTensorType() {
return RankedTensorType::get(shape, elementType, encoding);		return RankedTensorType::get(shape, elementType, encoding);
}		}

private:		private:
ArrayRef<int64_t> shape;		ArrayRef<int64_t> shape;
// Owning shape data for copy-on-write operations.		// Owning shape data for copy-on-write operations.
SmallVector<int64_t> storage;		SmallVector<int64_t> storage;
▲ Show 20 Lines • Show All 241 Lines • Show Last 20 Lines

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

Show First 20 Lines • Show All 416 Lines • ▼ Show 20 Lines	transform::SplitReductionOp::applyToOne(LinalgOp target,
if (failed(splitResult))		if (failed(splitResult))
return getOperation()->emitError("failed to apply");		return getOperation()->emitError("failed to apply");
return SmallVector<Operation *>{splitResult->fillOp,		return SmallVector<Operation *>{splitResult->fillOp,
splitResult->splitLinalgOp,		splitResult->splitLinalgOp,
splitResult->resultCombiningLinalgOp};		splitResult->resultCombiningLinalgOp};
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		// SplitReductionByScalingOp
		//===----------------------------------------------------------------------===//

		/// Runs `splitReductionByScaling` on `target`, driven by this op's split
		/// factor and insert-split-dimension attributes. On success returns the fill
		/// op, the split linalg op and the result-combining linalg op, in that order;
		/// otherwise emits the error "failed to apply" on this transform op.
		FailureOr<SmallVector<Operation *>>
		transform::SplitReductionByScalingOp::applyToOne(LinalgOp target,
		                                                 TransformState &state) {
		  // The control callback ignores the op it is handed and always answers with
		  // the (split factor, insertion dimension) pair read from this op.
		  ControlSplitReductionFn controlFn = [&](LinalgOp /*unused*/) {
		    return std::pair<int64_t, unsigned>(getSplitFactor(),
		                                        getInsertSplitDimension());
		  };

		  // New ops are created right before the op being rewritten.
		  SimpleRewriter rewriter(getContext());
		  rewriter.setInsertionPoint(target);

		  FailureOr<SplitReductionResult> outcome =
		      splitReductionByScaling(rewriter, target, controlFn);
		  if (succeeded(outcome)) {
		    return SmallVector<Operation *>{outcome->fillOp, outcome->splitLinalgOp,
		                                    outcome->resultCombiningLinalgOp};
		  }
		  return getOperation()->emitError("failed to apply");
		}

		//===----------------------------------------------------------------------===//
// TileOp		// TileOp
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

DiagnosedSilenceableFailure		DiagnosedSilenceableFailure
transform::TileOp::apply(TransformResults &transformResults,		transform::TileOp::apply(TransformResults &transformResults,
TransformState &state) {		TransformState &state) {
LinalgTilingOptions tilingOptions;		LinalgTilingOptions tilingOptions;
SmallVector<int64_t> tileSizes = extractI64Array(getSizes());		SmallVector<int64_t> tileSizes = extractI64Array(getSizes());
▲ Show 20 Lines • Show All 89 Lines • Show Last 20 Lines

mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp

Show All 13 Lines
#include <utility>		#include <utility>

#include "mlir/Analysis/SliceAnalysis.h"		#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"		#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"		#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"		#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"		#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"		#include "mlir/Dialect/Tensor/IR/Tensor.h"
		#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/IR/PatternMatch.h"		#include "mlir/IR/PatternMatch.h"

using namespace mlir;		using namespace mlir;
using namespace mlir::linalg;		using namespace mlir::linalg;

/// Return the identity numeric value associated to the give op.		/// Return the identity numeric value associated to the give op.
static Optional<Attribute> getIdentity(Operation *op) {		static Attribute getNeutralElement(Operation *op) {
// Builder only used as helper for attribute creation.		// Builder only used as helper for attribute creation.
OpBuilder b(op->getContext());		OpBuilder b(op->getContext());
Type resultType = op->getResult(0).getType();		Type resultType = op->getResult(0).getType();
if (auto floatType = resultType.dyn_cast<FloatType>()) {		if (auto floatType = resultType.dyn_cast<FloatType>()) {
const llvm::fltSemantics &semantic = floatType.getFloatSemantics();		const llvm::fltSemantics &semantic = floatType.getFloatSemantics();
if (isa<arith::AddFOp>(op))		if (isa<arith::AddFOp>(op))
return b.getFloatAttr(resultType, llvm::APFloat::getZero(semantic));		return b.getFloatAttr(resultType, llvm::APFloat::getZero(semantic));
if (isa<arith::MulFOp>(op))		if (isa<arith::MulFOp>(op))
return b.getFloatAttr(resultType, llvm::APFloat(semantic, 1));		return b.getFloatAttr(resultType, llvm::APFloat(semantic, 1));
if (isa<arith::MaxFOp>(op))		if (isa<arith::MaxFOp>(op))
return b.getFloatAttr(resultType,		return b.getFloatAttr(resultType,
llvm::APFloat::getLargest(semantic, true));		llvm::APFloat::getLargest(semantic, true));
if (isa<arith::MinFOp>(op))		if (isa<arith::MinFOp>(op))
return b.getFloatAttr(resultType,		return b.getFloatAttr(resultType,
llvm::APFloat::getLargest(semantic, true));		llvm::APFloat::getLargest(semantic, true));
return llvm::None;		return Attribute();
}		}
if (isa<arith::AddIOp, arith::OrIOp, arith::XOrIOp>(op))		if (isa<arith::AddIOp, arith::OrIOp, arith::XOrIOp>(op))
return b.getIntegerAttr(resultType, 0);		return b.getIntegerAttr(resultType, 0);
if (isa<arith::AndIOp>(op))		if (isa<arith::AndIOp>(op))
return b.getIntegerAttr(resultType, -1);		return b.getIntegerAttr(resultType, -1);
if (isa<arith::MaxSIOp>(op))		if (isa<arith::MaxSIOp>(op))
return b.getIntegerAttr(resultType, std::numeric_limits<int64_t>::min());		return b.getIntegerAttr(resultType, std::numeric_limits<int64_t>::min());
if (isa<arith::MinSIOp>(op))		if (isa<arith::MinSIOp>(op))
return b.getIntegerAttr(resultType, std::numeric_limits<int64_t>::max());		return b.getIntegerAttr(resultType, std::numeric_limits<int64_t>::max());
if (isa<arith::MulIOp>(op))		if (isa<arith::MulIOp>(op))
return b.getIntegerAttr(resultType, 1);		return b.getIntegerAttr(resultType, 1);
return llvm::None;		return Attribute();
}		}

FailureOr<LinalgOp> mlir::linalg::splitReduction(		FailureOr<LinalgOp> mlir::linalg::splitReduction(
PatternRewriter &b, LinalgOp op,		PatternRewriter &b, LinalgOp op,
const ControlSplitReductionFn &controlSplitReductionFn,		const ControlSplitReductionFn &controlSplitReductionFn,
const LinalgTransformationFilter &filter) {		const LinalgTransformationFilter &filter) {
if (failed(filter.checkAndNotify(b, op)) \|\| !op.hasTensorSemantics() \|\|		if (failed(filter.checkAndNotify(b, op)) \|\| !op.hasTensorSemantics() \|\|
op.getNumReductionLoops() != 1 \|\| op.getNumOutputs() != 1 \|\|		op.getNumReductionLoops() != 1 \|\| op.getNumOutputs() != 1 \|\|
Show All 14 Lines
FailureOr<SplitReductionResult> mlir::linalg::splitReduction(		FailureOr<SplitReductionResult> mlir::linalg::splitReduction(
PatternRewriter &b, LinalgOp op,		PatternRewriter &b, LinalgOp op,
const ControlSplitReductionFn &controlSplitReductionFn) {		const ControlSplitReductionFn &controlSplitReductionFn) {
OpBuilder::InsertionGuard guard(b);		OpBuilder::InsertionGuard guard(b);
b.setInsertionPoint(op);		b.setInsertionPoint(op);

std::pair<int64_t, unsigned> control = controlSplitReductionFn(op);		std::pair<int64_t, unsigned> control = controlSplitReductionFn(op);
int64_t ratio = control.first;		int64_t ratio = control.first;
unsigned insertDimIndex = control.second;		unsigned insertSplitDimension = control.second;
if (ratio <= 1)		if (ratio <= 1)
return b.notifyMatchFailure(op, "split ratio needs to be greater than 1");		return b.notifyMatchFailure(op, "split ratio needs to be greater than 1");

SmallVector<unsigned> dims;		SmallVector<unsigned> dims;
op.getReductionDims(dims);		op.getReductionDims(dims);
assert(dims.size() == 1);		assert(dims.size() == 1);
unsigned reductionDim = dims[0];		unsigned reductionDim = dims[0];
SmallVector<int64_t, 4> loopRanges = op.getStaticLoopRanges();		SmallVector<int64_t, 4> loopRanges = op.getStaticLoopRanges();
int64_t reductionDimSize = loopRanges[reductionDim];		int64_t reductionDimSize = loopRanges[reductionDim];
if (reductionDimSize == ShapedType::kDynamicSize \|\|		if (reductionDimSize == ShapedType::kDynamicSize \|\|
reductionDimSize % ratio != 0 \|\| insertDimIndex >= loopRanges.size())		reductionDimSize % ratio != 0 \|\|
		insertSplitDimension >= loopRanges.size())
return b.notifyMatchFailure(		return b.notifyMatchFailure(
op, "Reduction dimension not divisible by split ratio");		op, "Reduction dimension not divisible by split ratio");

SmallVector<Operation *, 4> combinerOps;		SmallVector<Operation *, 4> combinerOps;
if (!matchReduction(op.getRegionOutputArgs(), 0, combinerOps) \|\|		if (!matchReduction(op.getRegionOutputArgs(), 0, combinerOps) \|\|
combinerOps.size() != 1)		combinerOps.size() != 1)
return b.notifyMatchFailure(op, "Cannot match the reduction pattern");		return b.notifyMatchFailure(op, "Cannot match the reduction pattern");

Operation *reductionOp = combinerOps[0];		Operation *reductionOp = combinerOps[0];
Optional<Attribute> identity = getIdentity(reductionOp);		Attribute identity = getNeutralElement(reductionOp);
if (!identity)		if (!identity)
return b.notifyMatchFailure(op, "Unknown identity value for the reduction");		return b.notifyMatchFailure(op, "Unknown identity value for the reduction");

Location loc = op->getLoc();		Location loc = op->getLoc();
SmallVector<Value> newInputs;		SmallVector<Value> newInputs;
SmallVector<AffineMap> newMaps;		SmallVector<AffineMap> newMaps;
// Calculate the new shapes and indexing maps of the input operands.		// Calculate the new shapes and indexing maps of the input operands.
for (OpOperand *operand : op.getInputOperands()) {		for (OpOperand *operand : op.getInputOperands()) {
AffineMap map = op.getTiedIndexingMap(operand);		AffineMap map = op.getTiedIndexingMap(operand);
SmallVector<int64_t> newShape;		SmallVector<int64_t> newShape;
SmallVector<AffineExpr> exprs;		SmallVector<AffineExpr> exprs;
SmallVector<ReassociationIndices> reassociation;		SmallVector<ReassociationIndices> reassociation;
unsigned index = 0;		unsigned index = 0;
for (unsigned idx : llvm::seq<unsigned>(0, map.getNumResults())) {		for (unsigned idx : llvm::seq<unsigned>(0, map.getNumResults())) {
unsigned dim = map.getDimPosition(idx);		unsigned dim = map.getDimPosition(idx);
if (reductionDim == dim) {		if (reductionDim == dim) {
newShape.push_back(ratio);		newShape.push_back(ratio);
newShape.push_back(op.getShape(operand)[idx] / ratio);		newShape.push_back(op.getShape(operand)[idx] / ratio);
reassociation.push_back({index++, index++});		reassociation.push_back({index++, index++});
exprs.push_back(b.getAffineDimExpr(insertDimIndex));		exprs.push_back(b.getAffineDimExpr(insertSplitDimension));
exprs.push_back(		exprs.push_back(
b.getAffineDimExpr(dim < insertDimIndex ? dim : dim + 1));		b.getAffineDimExpr(dim < insertSplitDimension ? dim : dim + 1));
continue;		continue;
}		}
newShape.push_back(op.getShape(operand)[idx]);		newShape.push_back(op.getShape(operand)[idx]);
exprs.push_back(b.getAffineDimExpr(dim < insertDimIndex ? dim : dim + 1));		exprs.push_back(
		b.getAffineDimExpr(dim < insertSplitDimension ? dim : dim + 1));
reassociation.push_back({index++});		reassociation.push_back({index++});
}		}
newMaps.push_back(		newMaps.push_back(
AffineMap::get(map.getNumDims() + 1, 0, exprs, op.getContext()));		AffineMap::get(map.getNumDims() + 1, 0, exprs, op.getContext()));
// If the shape is unchanged the input doesn't change.		// If the shape is unchanged the input doesn't change.
if (newShape == op.getShape(operand)) {		if (newShape == op.getShape(operand)) {
newInputs.push_back(operand->get());		newInputs.push_back(operand->get());
continue;		continue;
Show All 9 Lines	FailureOr<SplitReductionResult> mlir::linalg::splitReduction(
// Calculate the new output map and shape, we insert the new dimension based		// Calculate the new output map and shape, we insert the new dimension based
// on the index returned by `controlSplitReductionFn`.		// on the index returned by `controlSplitReductionFn`.
SmallVector<int64_t> newOutputShape;		SmallVector<int64_t> newOutputShape;
AffineMap oldOutputMap = op.getTiedIndexingMap(op.getOutputOperand(0));		AffineMap oldOutputMap = op.getTiedIndexingMap(op.getOutputOperand(0));
ArrayRef<int64_t> oldShape = op.getShape(op.getOutputOperand(0));		ArrayRef<int64_t> oldShape = op.getShape(op.getOutputOperand(0));
SmallVector<AffineExpr> outputExpr;		SmallVector<AffineExpr> outputExpr;
for (unsigned idx :		for (unsigned idx :
llvm::seq<unsigned>(0, oldOutputMap.getNumResults() + 1)) {		llvm::seq<unsigned>(0, oldOutputMap.getNumResults() + 1)) {
if (idx == insertDimIndex) {		if (idx == insertSplitDimension) {
newOutputShape.push_back(ratio);		newOutputShape.push_back(ratio);
outputExpr.push_back(b.getAffineDimExpr(insertDimIndex));		outputExpr.push_back(b.getAffineDimExpr(insertSplitDimension));
continue;		continue;
}		}
unsigned oldDim = idx < insertDimIndex ? idx : idx - 1;		unsigned oldDim = idx < insertSplitDimension ? idx : idx - 1;
newOutputShape.push_back(oldShape[oldDim]);		newOutputShape.push_back(oldShape[oldDim]);
unsigned dim = oldOutputMap.getDimPosition(oldDim);		unsigned dim = oldOutputMap.getDimPosition(oldDim);
outputExpr.push_back(		outputExpr.push_back(
b.getAffineDimExpr(dim < insertDimIndex ? dim : dim + 1));		b.getAffineDimExpr(dim < insertSplitDimension ? dim : dim + 1));
}		}
Value initTensor = b.create<linalg::InitTensorOp>(		Value initTensor = b.create<linalg::InitTensorOp>(
loc, newOutputShape, op.getRegionOutputArgs()[0].getType());		loc, newOutputShape, op.getRegionOutputArgs()[0].getType());
Value constantOp = b.create<arith::ConstantOp>(loc, *identity);		Value constantOp = b.create<arith::ConstantOp>(loc, identity);
Value identityTensor =		Value identityTensor =
b.create<linalg::FillOp>(op->getLoc(), constantOp, initTensor)		b.create<linalg::FillOp>(op->getLoc(), constantOp, initTensor)
.getResult(0);		.getResult(0);

newMaps.push_back(AffineMap::get(oldOutputMap.getNumDims() + 1, 0, outputExpr,		newMaps.push_back(AffineMap::get(oldOutputMap.getNumDims() + 1, 0, outputExpr,
op.getContext()));		op.getContext()));
SmallVector<StringRef> newIteratorTypes;		SmallVector<StringRef> newIteratorTypes;
for (auto &it : llvm::enumerate(op.iterator_types())) {		for (auto &it : llvm::enumerate(op.iterator_types())) {
if (insertDimIndex == it.index())		if (insertSplitDimension == it.index())
newIteratorTypes.push_back(getParallelIteratorTypeName());		newIteratorTypes.push_back(getParallelIteratorTypeName());
newIteratorTypes.push_back(it.value().cast<StringAttr>().getValue());		newIteratorTypes.push_back(it.value().cast<StringAttr>().getValue());
}		}
// Create the new op matching the original op with an extra parallel		// Create the new op matching the original op with an extra parallel
// dimension.		// dimension.
GenericOp genericOp = b.create<GenericOp>(		GenericOp genericOp = b.create<GenericOp>(
loc, TypeRange({initTensor.getType()}), newInputs,		loc, TypeRange({initTensor.getType()}), newInputs,
ValueRange({identityTensor}), newMaps, newIteratorTypes);		ValueRange({identityTensor}), newMaps, newIteratorTypes);
b.inlineRegionBefore(op->getRegion(0), genericOp.region(),		b.inlineRegionBefore(op->getRegion(0), genericOp.region(),
genericOp.region().begin());		genericOp.region().begin());

// Then create a new reduction that only reduce the newly added dimension		// Then create a new reduction that only reduce the newly added dimension
// from the previous op.		// from the previous op.
unsigned intermRank = newOutputShape.size();		unsigned intermRank = newOutputShape.size();
AffineMap inputMap = b.getMultiDimIdentityMap(intermRank);		AffineMap inputMap = b.getMultiDimIdentityMap(intermRank);
SmallVector<Value> outputOperands = op.getOutputOperands();		SmallVector<Value> outputOperands = op.getOutputOperands();
SmallVector<StringRef> reductionIteratorTypes;		SmallVector<StringRef> reductionIteratorTypes;
SmallVector<AffineExpr> exprs;		SmallVector<AffineExpr> exprs;
for (unsigned i : llvm::seq<unsigned>(0, intermRank)) {		for (unsigned i : llvm::seq<unsigned>(0, intermRank)) {
if (insertDimIndex == i) {		if (insertSplitDimension == i) {
reductionIteratorTypes.push_back(getReductionIteratorTypeName());		reductionIteratorTypes.push_back(getReductionIteratorTypeName());
} else {		} else {
exprs.push_back(b.getAffineDimExpr(i));		exprs.push_back(b.getAffineDimExpr(i));
reductionIteratorTypes.push_back(getParallelIteratorTypeName());		reductionIteratorTypes.push_back(getParallelIteratorTypeName());
}		}
}		}
AffineMap outputMap = AffineMap::get(intermRank, 0, exprs, op.getContext());		AffineMap outputMap = AffineMap::get(intermRank, 0, exprs, op.getContext());
SmallVector<AffineMap> reductionMaps = {inputMap, outputMap};		SmallVector<AffineMap> reductionMaps = {inputMap, outputMap};
Show All 9 Lines	auto reduction = b.create<GenericOp>(
});		});
b.replaceOp(op, reduction.getResults());		b.replaceOp(op, reduction.getResults());

return SplitReductionResult{identityTensor.getDefiningOp<FillOp>(),		return SplitReductionResult{identityTensor.getDefiningOp<FillOp>(),
cast<LinalgOp>(genericOp.getOperation()),		cast<LinalgOp>(genericOp.getOperation()),
reduction};		reduction};
}		}

		/// Rewrite f(i, j, k, ...) into f(i, j, k * ratio + kk, ...)
		///
		/// Builds the indexing map of `opOperand` for the split op: the loop space
		/// gains one extra dimension right after `reductionDimPos`, and every use of
		/// the reduction dimension `k` in the operand's original map is replaced by
		/// `k * reductionRatio + kk`, where `kk` is the newly introduced dimension at
		/// position `reductionDimPos + 1`.
		///
		/// \param op              Linalg op owning `opOperand`; also provides the
		///                        MLIRContext.
		/// \param opOperand       Operand whose tied indexing map is rewritten.
		/// \param reductionDimPos Position of the reduction loop being split.
		/// \param reductionRatio  Multiplier applied to the reduction dimension.
		/// \returns The operand's indexing map composed with the scaling map.
		///
		/// TODO: Additional pattern to rewrite f(i, j, k * ratio + kk, ...) into
		/// f(i, j, k, kk, ...) with a proper ExpandShapeOp. This is probably better
		/// done as a transform to enable better vectorization.
		static AffineMap scaleReductionDim(LinalgOp op, OpOperand &opOperand,
		                                   unsigned reductionDimPos,
		                                   int64_t reductionRatio) {
		  auto reductionDim = getAffineDimExpr(reductionDimPos, op.getContext());
		  auto reductionDimP1 = getAffineDimExpr(reductionDimPos + 1, op.getContext());
		  AffineMap map = op.getTiedIndexingMap(&opOperand);
		  AffineMap idMap =
		      AffineMap::getMultiDimIdentityMap(map.getNumDims(), op.getContext());
		  // Make room for the new dimension `kk` just after the reduction dimension.
		  // NOTE(review): presumably shiftDims only renumbers dims at positions
		  // >= offset -- confirm against the AffineMap::shiftDims declaration.
		  AffineMap shiftedIdMap = idMap.shiftDims(1, /*offset=*/reductionDimPos + 1);
		  // Substitute k -> k * reductionRatio + kk in the shifted identity map.
		  AffineMap composeMap = shiftedIdMap.replace(
		      reductionDim, reductionDim * reductionRatio + reductionDimP1,
		      shiftedIdMap.getNumDims(), /*numSymbols=*/0);
		  return map.compose(composeMap);
		}

		/// Build the indexing map of an output operand for the split op.
		///
		/// The operand's original map is composed with an identity map whose
		/// dimensions have been shifted to make room for one extra loop after
		/// `reductionDimPos`; the dimension expression `d<reductionDimPos>` is then
		/// inserted as an additional result at result position `reductionDimPos`.
		///
		/// \param op              Linalg op owning `opOperand`; also provides the
		///                        MLIRContext.
		/// \param opOperand       Operand whose tied indexing map is expanded.
		/// \param reductionDimPos Position of the reduction loop being split; used
		///                        both as the inserted dim expression and as the
		///                        result position at which it is inserted.
		/// \param size            Currently unused by the body; kept so existing
		///                        callers remain valid.
		/// \returns The expanded indexing map.
		static AffineMap insertParallelDim(LinalgOp op, OpOperand &opOperand,
		                                   unsigned reductionDimPos, int64_t size) {
		  auto reductionDim = getAffineDimExpr(reductionDimPos, op.getContext());
		  AffineMap map = op.getTiedIndexingMap(&opOperand);
		  AffineMap idMap =
		      AffineMap::getMultiDimIdentityMap(map.getNumDims(), op.getContext());
		  AffineMap shiftedIdMap = idMap.shiftDims(1, /*offset=*/reductionDimPos + 1);
		  return map.compose(shiftedIdMap).insertResult(reductionDim, reductionDimPos);
		}

		/// Core rewrite implementation.
		///
		/// Splits the first reduction dimension `k` of `op` into `k * splitFactor + kk`
		/// without introducing an ExpandShapeOp. The rewrite emits:
		///   1. a FillOp initializing an intermediate output with the neutral element
		///      of the reduction,
		///   2. a GenericOp carrying the original region, with one extra parallel
		///      loop and an extra shape-only `i1` input tensor,
		///   3. a final GenericOp reducing the extra dimension into the original
		///      output.
		///
		/// \param b                       Rewriter used to create and replace ops.
		/// \param op                      Linalg op to split.
		/// \param controlSplitReductionFn Callback returning the pair
		///                                (split factor, insert split dimension).
		/// \returns A SplitReductionResult holding the fill op, the split generic op
		///          and the final reduction op. Returns a match failure when the
		///          split factor is <= 1, `op` has no reduction dimension, the first
		///          reduction dimension is dynamic or not divisible by the split
		///          factor, the insert dimension is out of range, the combiner cannot
		///          be matched or has no known neutral element, or the number of
		///          outputs differs from the number of matched combiners.
		FailureOr<SplitReductionResult> mlir::linalg::splitReductionByScaling(
		    PatternRewriter &b, LinalgOp op,
		    const ControlSplitReductionFn &controlSplitReductionFn) {
		  OpBuilder::InsertionGuard guard(b);
		  b.setInsertionPoint(op);

		  // Matcher part, enforce preconditions.
		  std::pair<int64_t, unsigned> control = controlSplitReductionFn(op);
		  int64_t splitFactor = control.first;
		  unsigned insertSplitDimension = control.second;
		  if (splitFactor <= 1)
		    return b.notifyMatchFailure(op, "split factor needs to be greater than 1");

		  SmallVector<unsigned> dims;
		  op.getReductionDims(dims);
		  if (dims.empty())
		    return b.notifyMatchFailure(op, "needs at least 1 reduction dimension");

		  unsigned reductionDimPos = dims[0];
		  SmallVector<int64_t> loopRanges = op.getStaticLoopRanges();
		  int64_t reductionDimSize = loopRanges[reductionDimPos];
		  // NOTE(review): the message below is also emitted when the reduction
		  // dimension is dynamic or when `insertSplitDimension` is out of range.
		  if (reductionDimSize == ShapedType::kDynamicSize ||
		      reductionDimSize % splitFactor != 0 ||
		      insertSplitDimension >= loopRanges.size())
		    return b.notifyMatchFailure(
		        op, "first reduction dimension not divisible by split factor");

		  SmallVector<Operation *> combinerOps;
		  if (!matchReduction(op.getRegionOutputArgs(), 0, combinerOps))
		    return b.notifyMatchFailure(op, "cannot match a reduction pattern");

		  SmallVector<Attribute> neutralElements = llvm::to_vector<4>(
		      llvm::map_range(combinerOps, [&](Operation *reductionOp) {
		        return getNeutralElement(reductionOp);
		      }));
		  if (!llvm::all_of(neutralElements, [](Attribute attr) { return attr; }))
		    return b.notifyMatchFailure(op, "unknown reduction neutral");

		  // TODO: relax this when multi-reduction support is available.
		  if (op.getNumOutputs() != neutralElements.size())
		    return b.notifyMatchFailure(op, "expect one reduction per output");

		  // Rewrite part.
		  // Step 1. Build the intermediate outputs filled with the proper
		  // neutralElements. Such outputs are of the same shape with an extra dimension
		  // inserted at `insertSplitDimension`.
		  //
		  // Consider a minimal example where `k` is reduced:
		  //     O(i, j) += I(i, j, k)
		  // Assume i=3, j=5, k=128, splitFactor=16 and insertSplitDimension=0.
		  // The compute is rewritten as:
		  //   a. O_i(kk, i, j) += I(i, j, 16 * k + kk)
		  //   b. O(i, j) += O_i(kk, i, j)
		  // The intermediate tensor O_i is of shape (128/16)x3x5 == 8x3x5.
		  Location loc = op->getLoc();
		  MLIRContext *context = op.getContext();
		  // For now assume outputs are 1-1 with reduction neutralElements.
		  // TODO: generalize when multi-reduction support is available.
		  SmallVector<Value> newOutputs;
		  newOutputs.reserve(op.getNumOutputs());
		  SmallVector<linalg::FillOp> fillOps;
		  fillOps.reserve(op.getNumOutputs());
		  for (auto it : llvm::zip(op.outputs(), neutralElements)) {
		    Value rankedTensor = std::get<0>(it);
		    auto t = rankedTensor.getType().cast<RankedTensorType>();
		    RankedTensorType newT = RankedTensorType::Builder(t).insertDim(
		        reductionDimSize / splitFactor, insertSplitDimension);
		    // Named distinctly from the outer `dims` (reduction loop positions) to
		    // avoid shadowing.
		    SmallVector<Value> dynamicDims =
		        tensor::createDynamicDimValues(b, loc, rankedTensor);
		    Value initTensor = b.create<linalg::InitTensorOp>(
		        loc, dynamicDims, newT.getShape(), t.getElementType());
		    Value constantOp = b.create<arith::ConstantOp>(loc, std::get<1>(it));
		    fillOps.push_back(
		        b.create<linalg::FillOp>(op->getLoc(), constantOp, initTensor));
		    newOutputs.push_back(fillOps.back().getResult(0));
		  }

		  // Step 2. Reindex / expand indexing maps.
		  // Reindex existing input indexings: k -> k * splitFactor + k'.
		  SmallVector<AffineMap> newMaps;
		  newMaps.reserve(op.getNumInputsAndOutputs() + 1);
		  for (OpOperand *o : op.getInputOperands())
		    newMaps.push_back(scaleReductionDim(op, *o, reductionDimPos, splitFactor));
		  // Provision a new indexing for the shape-only tensor.
		  auto nDims = op.getNumLoops() + 1;
		  auto redDim = getAffineDimExpr(reductionDimPos, context);
		  auto redDimP1 = getAffineDimExpr(reductionDimPos + 1, context);
		  newMaps.push_back(AffineMap::get(nDims, 0, {redDim, redDimP1}, context));
		  // Expand existing output indexings.
		  // TODO: a subset of these may not reduce along reducePos and should be
		  // reindexed: k -> k * splitFactor + k', when multi-reduction support is
		  // available.
		  for (OpOperand *o : op.getOutputOperands())
		    newMaps.push_back(insertParallelDim(op, *o, reductionDimPos,
		                                        reductionDimSize / splitFactor));

		  // Step 3. Handle operands.
		  // Compute the new input tensors.
		  auto newInputs = llvm::to_vector<4>(op.inputs());
		  // Add a single shape-only tensor to carry the dimensions without resorting to
		  // more complex inversions.
		  newInputs.push_back(b.create<linalg::InitTensorOp>(
		      loc, ArrayRef<int64_t>{reductionDimSize / splitFactor, splitFactor},
		      b.getIntegerType(1)));
		  // Output tensors are already good to go.

		  // Step 4. Create the new op matching the original op with an extra parallel
		  // dimension.
		  SmallVector<StringRef> iteratorTypes =
		      llvm::to_vector<4>(op.getIteratorTypes().getAsValueRange<StringAttr>());
		  iteratorTypes.insert(iteratorTypes.begin() + reductionDimPos,
		                       getParallelIteratorTypeName());
		  GenericOp genericOp =
		      b.create<GenericOp>(loc, ValueRange(newOutputs).getTypes(), newInputs,
		                          newOutputs, newMaps, iteratorTypes);
		  b.inlineRegionBefore(op->getRegion(0), genericOp.region(),
		                       genericOp.region().begin());
		  // Add the block argument corresponding to the shape-only i1 input.
		  // NOTE(review): the shape-only tensor was appended after all inputs, yet
		  // the argument is inserted at `reductionDimPos`; these coincide for matmul
		  // (2 inputs, reduction loop at position 2) -- confirm for other ops.
		  genericOp.region().front().insertArgument(reductionDimPos,
		                                            b.getIntegerType(1), loc);

		  // Step 5. Create new reduction ops that only reduce the newly added
		  // dimensions from the previous op.
		  // For now assume outputs are 1-1 with reduction ops.
		  // TODO: a subset of these may not reduce in the first place and do not
		  // require a new op, when multi-reduction support is available.
		  // TODO: all results can be handled in a single GenericOp, when
		  // multi-reduction support is available.
		  SmallVector<LinalgOp> results;
		  for (auto it :
		       llvm::zip(genericOp->getResults(), op.outputs(), combinerOps)) {
		    Value reindexedOutput = std::get<0>(it);
		    Value originalOutput = std::get<1>(it);
		    auto originalOutputType = originalOutput.getType().cast<RankedTensorType>();
		    Operation *combinerOp = std::get<2>(it);

		    // The input is read with an identity map; the output drops the split
		    // dimension, which becomes the only reduction iterator.
		    AffineMap map = b.getMultiDimIdentityMap(originalOutputType.getRank() + 1);
		    SmallVector<AffineMap> indexingMaps = {
		        map, map.dropResult(insertSplitDimension)};
		    SmallVector<StringRef> reductionIteratorTypes(
		        originalOutputType.getRank() + 1, getParallelIteratorTypeName());
		    reductionIteratorTypes[insertSplitDimension] =
		        getReductionIteratorTypeName();

		    // clang-format off
		    auto reductionOp = b.create<GenericOp>(
		        loc,
		        originalOutputType,
		        reindexedOutput,
		        originalOutput,
		        indexingMaps,
		        reductionIteratorTypes,
		        [combinerOp](OpBuilder &b, Location loc, ValueRange bbArgs) {
		          // Clone the matched combiner and rewire it to the new block args.
		          Operation *clonedReductionOp = b.clone(*combinerOp);
		          clonedReductionOp->setOperand(0, bbArgs[0]);
		          clonedReductionOp->setOperand(1, bbArgs[1]);
		          b.create<linalg::YieldOp>(loc, clonedReductionOp->getResult(0));
		        });
		    // clang-format on

		    results.push_back(reductionOp);
		  }

		  // TODO: extend when multi-reduction support is available.
		  assert(fillOps.size() == results.size() && results.size() == 1);
		  b.replaceOp(op, results.front()->getResults());
		  return SplitReductionResult{fillOps.front(),
		                              cast<LinalgOp>(genericOp.getOperation()),
		                              results.front()};
		}

namespace {		namespace {

struct LinalgSplitReduction : public OpInterfaceRewritePattern<LinalgOp> {		struct LinalgSplitReduction : public OpInterfaceRewritePattern<LinalgOp> {
/// Construct a generic pattern applied to all LinalgOp that verify `filter`.		/// Construct a generic pattern applied to all LinalgOp that verify `filter`.
LinalgSplitReduction(MLIRContext *context,		LinalgSplitReduction(MLIRContext *context,
ControlSplitReductionFn controlSplitReductionFn,		ControlSplitReductionFn controlSplitReductionFn,
LinalgTransformationFilter f, PatternBenefit benefit = 1)		LinalgTransformationFilter f, PatternBenefit benefit = 1)
: OpInterfaceRewritePattern<LinalgOp>(context, benefit),		: OpInterfaceRewritePattern<LinalgOp>(context, benefit),
Show All 22 Lines

mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir

This file was added.

				// RUN: mlir-opt --test-transform-dialect-interpreter %s | FileCheck %s

				// Verifies that split_reduction_by_scaling rewrites a matmul whose reduction
				// dimension has size 256 into a generic op with an extra parallel dimension
				// (split_factor = 4, hence 256 / 4 = 64) followed by a generic op that
				// reduces that extra dimension.
				// CHECK-LABEL: func.func @matmul_split
				func.func @matmul_split(%A : tensor<?x256xf32>, %B: tensor<256x32xf32>, %C: tensor<?x32xf32>) -> tensor<?x32xf32> {

				  // CHECK: linalg.generic
				  // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
				  // CHECK-SAME: ins(%{{[a-zA-Z0-9]*}}, %{{[a-zA-Z0-9]*}}, %{{[a-zA-Z0-9]*}} : tensor<?x256xf32>, tensor<256x32xf32>, tensor<64x4xi1>)
				  // CHECK-SAME: outs(%{{[a-zA-Z0-9]*}} : tensor<?x32x64xf32>) {

				  // CHECK: linalg.generic
				  // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"]
				  // CHECK-SAME: ins(%{{[a-zA-Z0-9]*}} : tensor<?x32x64xf32>)
				  // CHECK-SAME: outs(%{{[a-zA-Z0-9]*}} : tensor<?x32xf32>) {
				  %0 = linalg.matmul ins(%A, %B: tensor<?x256xf32>, tensor<256x32xf32>)
				                     outs(%C: tensor<?x32xf32>) -> tensor<?x32xf32>
				  return %0: tensor<?x32xf32>
				}

				// Transform script driving the test: a PDL pattern selects the payload ops
				// and a transform sequence applies the split-reduction transform to them.
				transform.with_pdl_patterns {
				^bb0(%arg0: !pdl.operation):
				// Matches any "linalg.matmul" operation, whatever its operands and result
				// types are.
				pdl.pattern @pdl_target : benefit(1) {
				%args = operands
				%results = types
				%0 = pdl.operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
				// TODO: we don't want this, but it is the required terminator for pdl.pattern
				rewrite %0 with "transform.dialect"
				}

				// Applies split_reduction_by_scaling to the ops matched by @pdl_target,
				// using split_factor = 4 and insert_split_dimension = 2. The transform op
				// is bound to three results (%1:3).
				transform.sequence %arg0 {
				^bb1(%arg1: !pdl.operation):
				%0 = pdl_match @pdl_target in %arg1
				%1:3 = transform.structured.split_reduction_by_scaling %0 { split_factor = 4, insert_split_dimension = 2}
				}
				}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Linalg] SplitReduction implementation without tensor::ExpandShapeOp
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 439123

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/include/mlir/IR/AffineMap.h

mlir/include/mlir/IR/BuiltinTypes.h

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp

mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Linalg] SplitReduction implementation without tensor::ExpandShapeOp
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 439123

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/include/mlir/IR/AffineMap.h

mlir/include/mlir/IR/BuiltinTypes.h

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp

mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir

[mlir][Linalg] SplitReduction implementation without tensor::ExpandShapeOp
ClosedPublic