Diff 462401

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

Show First 20 Lines • Show All 840 Lines • ▼ Show 20 Lines	def MapNestedForeachThreadToGpuThreads :
let extraClassDeclaration = [{		let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(		::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,		::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,		::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state);		::mlir::transform::TransformState &state);
}];		}];
}		}

		def MapNestedForeachThreadToGpuBlocks : Op<Transform_Dialect,
		"structured.map_nested_foreach_thread_to_gpu_blocks",
		[FunctionalStyleTransformOpTrait,
		MemoryEffectsOpInterface,
		TransformOpInterface,
		TransformEachOpTrait]> {
		let description = [{
		Target the gpu_launch op and rewrite the top level `scf.foreach_thread`
		to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute
		is set, then first generates `gpu_launch` and moves the top level
		`scf.foreach_thread` inside.

		The operation searches top level `scf.foreach_thread` ops under
		`gpu_launch` and maps each such op to GPU blocks. Mapping is
		one-to-one and the induction variables of `scf.foreach_thread` are
		rewritten to gpu.block_id according to the `thread_dim_mapping` attribute.

		Dynamic, `scf.foreach_thread` trip counts are currently not supported.
		Dynamic block dim sizes are currently not supported.

		Only bufferized scf.foreach_thread are currently supported.
		Only scf.foreach_thread distributed to at most 3 dimensions are
		currently supported.

		The operation alters the grid size of the given gpu_launch using the
		`gridDim` argument.

		Return modes:
		=============
		ftynseUnsubmitted Not Done Reply Inline Actions Please do not use `===` that indicates the page title. Use the same-level header as the operation operations in this file. ftynse: Please do not use `===` that indicates the page title. Use the same-level header as the…
		This operation ignores non-gpu_launch ops and drops them in the return.

		If any scf.foreach_thread with tensors is found, the transform definitely
		fails.

		If all the scf.foreach_thread operations contained within the LaunchOp
		referred to by the `target` PDLOperation lower to GPU properly, the
		transform succeeds. Otherwise the transform definitely fails.

		The returned handle points to the same LaunchOp operand, consuming it and
		producing a new SSA value to satisfy chaining and linearity of the IR
		properties.
		}];

		let arguments = (ins PDL_Operation:$target,
		DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
		UnitAttr:$generate_gpu_launch);
		let results = (outs PDL_Operation:$result);

		let assemblyFormat = "$target attr-dict";
		let extraClassDeclaration = [{
		::mlir::DiagnosedSilenceableFailure applyToOne(
		::mlir::Operation *target,
		::llvm::SmallVectorImpl<::mlir::Operation *> &results,
		::mlir::transform::TransformState &state);
		}];
		}

def VectorizeOp : Op<Transform_Dialect, "structured.vectorize",		def VectorizeOp : Op<Transform_Dialect, "structured.vectorize",
[FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,		[FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
TransformEachOpTrait, TransformOpInterface]> {		TransformEachOpTrait, TransformOpInterface]> {
let description = [{		let description = [{
Indicates that the given `target` op all the ops it contains should be		Indicates that the given `target` op all the ops it contains should be
vectorized with the configuration specified by the attributes of this op.		vectorized with the configuration specified by the attributes of this op.
This vectorization only handles structured ops that operate on shaped types		This vectorization only handles structured ops that operate on shaped types
and does not vectorize loops or straight-line. Internally, it applies a		and does not vectorize loops or straight-line. Internally, it applies a
▲ Show 20 Lines • Show All 47 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

	Show First 20 Lines • Show All 119 Lines • ▼ Show 20 Lines
	bool areElementwiseOpsFusable(OpOperand *fusedOperand);			bool areElementwiseOpsFusable(OpOperand *fusedOperand);

	/// Fuse two `linalg.generic` operations that have a producer-consumer			/// Fuse two `linalg.generic` operations that have a producer-consumer
	/// relationship captured through `fusedOperand`. The method expects			/// relationship captured through `fusedOperand`. The method expects
	/// that `areElementwiseOpsFusable` returns true for the given `fusedOperand`.			/// that `areElementwiseOpsFusable` returns true for the given `fusedOperand`.
	FailureOr<Operation *> fuseElementwiseOps(RewriterBase &rewriter,			FailureOr<Operation *> fuseElementwiseOps(RewriterBase &rewriter,
	OpOperand *fusedOperand);			OpOperand *fusedOperand);

				/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is
				/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten
				/// to gpu.block_id according to the thread_dim_mapping attribute. Dynamic,
				/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block
				/// dim sizes are currently not supported.
				LogicalResult rewriteTopLevelForeachThreadToGpuBlocks(
				RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
				function_ref<void(Operation *, const SmallVector<int64_t> &, IndexType,
				SmallVector<Value> &)>
				nicolasvasilacheUnsubmitted Done Reply Inline Actions This will break in various ways at link time when used from outside, please use a proper type / typedef. E.g. something similar coming from scf: LoopNest mlir::scf::buildLoopNest( OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs, ValueRange steps, ValueRange iterArgs, function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)> bodyBuilder) { nicolasvasilache: This will break in various ways at link time when used from outside, please use a proper type /…
				ftynseUnsubmitted Not Done Reply Inline Actions Overall, when you need an immutable cheap view of a sequential construct such as vector, take an `ArrayRef`, when you need a mutable vector, take a `SmallVectorImpl` so it is compatible with any stack-size and not only the hardcoded/inferred one. ftynse: Overall, when you need an immutable cheap view of a sequential construct such as vector, take…
				blockIdGenerator,
				SmallVector<int64_t> &gridDims);

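				/// Finds the top-level `scf.foreach_thread` nested under `target`, i.e. the
				/// one that is not itself enclosed in another `scf.foreach_thread`. Emits an
				/// error on `target` and returns failure if more than one such op is found.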
				ftynseUnsubmitted Not Done Reply Inline Actions What does "top-level" mean here? Is this going considering ascendants or descendants of the given operation? ftynse: What does "top-level" mean here? Is this going considering ascendants or descendants of the…
				FailureOr<scf::ForeachThreadOp> findTopLevelForeachThreadOp(Operation *target);

	/// Searches `scf.foreach_thread` ops nested under `target` and maps each such			/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
	/// op to GPU threads. Mapping is one-to-one and the induction variables of			/// op to GPU threads. Mapping is one-to-one and the induction variables of
	/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the			/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
	/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in			/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in
	/// which case, the union of the number of threads is computed and may result in			/// which case, the union of the number of threads is computed and may result in
	/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not			/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not
	/// supported. Dynamic block dim sizes are currently not supported.			/// supported. Dynamic block dim sizes are currently not supported.
	mlir::WalkResult rewriteMapNestedForeachThreadToGpuThreads(			mlir::WalkResult rewriteMapNestedForeachThreadToGpuThreads(
	▲ Show 20 Lines • Show All 1,344 Lines • Show Last 20 Lines

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

Show All 12 Lines
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"		#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"		#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"		#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"		#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/PDL/IR/PDL.h"		#include "mlir/Dialect/PDL/IR/PDL.h"
#include "mlir/Dialect/PDL/IR/PDLTypes.h"		#include "mlir/Dialect/PDL/IR/PDLTypes.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"		#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"		#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
		#include "mlir/IR/Value.h"
		nicolasvasilacheUnsubmitted Done Reply Inline Actions please drop spurious include (that I imagine was insert automatically by e.g. vscode ?) nicolasvasilache: please drop spurious include (that I imagine was insert automatically by e.g. vscode ?)
#include "mlir/Interfaces/TilingInterface.h"		#include "mlir/Interfaces/TilingInterface.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"		#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/StringSet.h"		#include "llvm/ADT/StringSet.h"
		#include "llvm/Support/raw_ostream.h"
		nicolasvasilacheUnsubmitted Done Reply Inline Actions please drop spurious include (that I imagine was insert automatically by e.g. vscode ?) nicolasvasilache: please drop spurious include (that I imagine was insert automatically by e.g. vscode ?)

using namespace mlir;		using namespace mlir;
using namespace mlir::linalg;		using namespace mlir::linalg;
using namespace mlir::transform;		using namespace mlir::transform;

/// Extracts a vector of unsigned from an array attribute. Asserts if the		/// Extracts a vector of unsigned from an array attribute. Asserts if the
/// attribute contains values other than intergers. May truncate.		/// attribute contains values other than intergers. May truncate.
static SmallVector<unsigned> extractUIntArray(ArrayAttr attr) {		static SmallVector<unsigned> extractUIntArray(ArrayAttr attr) {
▲ Show 20 Lines • Show All 1,178 Lines • ▼ Show 20 Lines	for (int64_t idx : llvm::seq<int64_t>(0, blockDim.size())) {
threadOps.push_back(		threadOps.push_back(
rewriter.create<gpu::ThreadIdOp>(loc, indexType, gpuDims[idx]));		rewriter.create<gpu::ThreadIdOp>(loc, indexType, gpuDims[idx]));
}		}
// Step 2. Maybe create conditionals to predicate the region.		// Step 2. Maybe create conditionals to predicate the region.
Value predicate;		Value predicate;
for (auto [threadId, blockDim, globalBlockDim] :		for (auto [threadId, blockDim, globalBlockDim] :
llvm::zip(threadOps, blockDim, globalBlockDims)) {		llvm::zip(threadOps, blockDim, globalBlockDims)) {
if (blockDim > globalBlockDim) {		if (blockDim > globalBlockDim) {
return foreachThreadOp.emitOpError("blockDim size overflow: ")		return foreachThreadOp.emitOpError("blockDim size overflow: ")
		nicolasvasilacheUnsubmitted Done Reply Inline Actions nit double comment nicolasvasilache: nit double comment
<< blockDim << " > " << globalBlockDim;		<< blockDim << " > " << globalBlockDim;
}		}
if (blockDim == globalBlockDim)		if (blockDim == globalBlockDim)
continue;		continue;
Value tmpPredicate = rewriter.create<arith::CmpIOp>(		Value tmpPredicate = rewriter.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, threadId,		loc, arith::CmpIPredicate::ult, threadId,
rewriter.create<arith::ConstantIndexOp>(loc, blockDim));		rewriter.create<arith::ConstantIndexOp>(loc, blockDim));
predicate =		predicate =
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
if (failed(rewriteOneForeachThreadToGpuThreads(rewriter, foreachThreadOp,		if (failed(rewriteOneForeachThreadToGpuThreads(rewriter, foreachThreadOp,
blockDim, true)))		blockDim, true)))
return WalkResult::interrupt();		return WalkResult::interrupt();
return WalkResult::advance();		return WalkResult::advance();
});		});
return walkResult;		return walkResult;
}		}

// Alter blockDim of the given kernel		/// Alter grid or block dimensions of the given kernel
static LogicalResult alterGpuLaunchBlockDim(SimpleRewriter &rewriter,		static LogicalResult
gpu::LaunchOp gpuLaunch,		alterGpuLaunch(SimpleRewriter &rewriter, gpu::LaunchOp gpuLaunch,
SmallVector<int64_t> blockDim) {		unsigned int gridDimX = 1, unsigned int gridDimY = 1,
		nicolasvasilacheUnsubmitted Done Reply Inline Actions almost never use `unsigned` unless doing bit manipulations, just use `int64_t` everywhere plz. nicolasvasilache: almost never use `unsigned` unless doing bit manipulations, just use `int64_t` everywhere plz.
gpu::KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();		unsigned int gridDimZ = 1, unsigned int blockDimX = 1,
		nicolasvasilacheUnsubmitted Done Reply Inline Actions Please don't use the magic constant 1 to detect whether or not to modify, pass an explicit `Optional<int64_t>` nicolasvasilache: Please don't use the magic constant 1 to detect whether or not to modify, pass an explicit…
if (blockDim[0] < 1 \|\| blockDim[1] < 1 \|\| blockDim[2] < 1) {		unsigned int blockDimY = 1, unsigned int blockDimZ = 1) {
gpuLaunch->emitError() << "Given blockDim(" << blockDim[0] << ","		// TODO The limits should live in the gpu dialect, but it's not like that
<< blockDim[1] << "," << blockDim[2]		// right now. Read them in the common gpu dialect
<< ") is invalid";		if ((blockDimX * blockDimY * blockDimZ) > 1024 \|\| gridDimY > 65535 \|\|
		gridDimZ > 65535 \|\| gridDimX > 2147483647) {
		gpuLaunch->emitError("Given kernel configuration is larger than the limits");
return failure();		return failure();
}		}

		gpu::KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
rewriter.setInsertionPointAfterValue(currentBlockdim.x);		rewriter.setInsertionPointAfterValue(currentBlockdim.x);
		nicolasvasilacheUnsubmitted Done Reply Inline Actions Definitely use an OpBuilder::InsertionGuard here, never let a function leak a modification of insertion point nicolasvasilache: Definitely use an OpBuilder::InsertionGuard here, never let a function leak a modification of…
auto createBlockDimValue = [&](int64_t dim) {		auto createConstValue = [&](int dim) {
return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),		return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
dim);		dim);
};		};
gpuLaunch.blockSizeXMutable().assign(createBlockDimValue(blockDim[0]));		if (gridDimX != 1)
		nicolasvasilacheUnsubmitted Done Reply Inline Actions optional for less magic constant-y tests nicolasvasilache: optional for less magic constant-y tests
gpuLaunch.blockSizeYMutable().assign(createBlockDimValue(blockDim[1]));		gpuLaunch.gridSizeXMutable().assign(createConstValue(gridDimX));
gpuLaunch.blockSizeZMutable().assign(createBlockDimValue(blockDim[2]));		if (gridDimY != 1)
		gpuLaunch.gridSizeYMutable().assign(createConstValue(gridDimY));
		if (gridDimZ != 1)
		gpuLaunch.gridSizeZMutable().assign(createConstValue(gridDimZ));
		if (blockDimX != 1)
		gpuLaunch.blockSizeXMutable().assign(createConstValue(blockDimX));
		if (blockDimY != 1)
		gpuLaunch.blockSizeYMutable().assign(createConstValue(blockDimY));
		if (blockDimZ != 1)
		gpuLaunch.blockSizeZMutable().assign(createConstValue(blockDimZ));
return success();		return success();
}		}

DiagnosedSilenceableFailure		DiagnosedSilenceableFailure
transform::MapNestedForeachThreadToGpuThreads::applyToOne(		transform::MapNestedForeachThreadToGpuThreads::applyToOne(
Operation target, SmallVectorImpl<Operation > &results,		Operation target, SmallVectorImpl<Operation > &results,
transform::TransformState &state) {		transform::TransformState &state) {

gpu::LaunchOp gpuLaunch = dyn_cast<gpu::LaunchOp>(target);		gpu::LaunchOp gpuLaunch = dyn_cast<gpu::LaunchOp>(target);
if (!gpuLaunch) {		if (!gpuLaunch) {
target->emitError("Given target is not gpu.launch");		target->emitError("Given target is not gpu.launch");
return DiagnosedSilenceableFailure::definiteFailure();		return DiagnosedSilenceableFailure::definiteFailure();
}		}

SmallVector<int64_t> blockDim = extractFromI64ArrayAttr(getBlockDim());		SmallVector<int64_t> blockDim = extractFromI64ArrayAttr(getBlockDim());
blockDim.resize(/size=/3, /value=/1);		blockDim.resize(/size=/3, /value=/1);
SimpleRewriter rewriter(getContext());		SimpleRewriter rewriter(getContext());
rewriter.setInsertionPoint(target);		rewriter.setInsertionPoint(target);
		nicolasvasilacheUnsubmitted Done Reply Inline Actions Note: it is good practice OpBuilder::InsertionGuard each time you modify insertion points in a scoped piece of code. Here is does not matter much because it won't leak above, but it does matter a lot at function boundaries. nicolasvasilache: Note: it is good practice OpBuilder::InsertionGuard each time you modify insertion points in a…
auto walkResult = mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads(		auto walkResult = mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads(
rewriter, target, blockDim, true);		rewriter, target, blockDim, true);
if (walkResult.wasInterrupted())		if (walkResult.wasInterrupted())
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));		return DiagnosedSilenceableFailure(reportUnknownTransformError(target));

LogicalResult result = alterGpuLaunchBlockDim(rewriter, gpuLaunch, blockDim);		LogicalResult result = alterGpuLaunch(rewriter, gpuLaunch, 0, 0, 0,
		nicolasvasilacheUnsubmitted Done Reply Inline Actions 0, 0, 0, feels fishy here, what is the intent ? Use `None` if you want it to be optional. nicolasvasilache: 0, 0, 0, feels fishy here, what is the intent ? Use `None` if you want it to be optional.
		blockDim[0], blockDim[1], blockDim[2]);
if (failed(result))		if (failed(result))
return DiagnosedSilenceableFailure::definiteFailure();		return DiagnosedSilenceableFailure::definiteFailure();

results.assign({target});		results.assign({target});
return DiagnosedSilenceableFailure(success());		return DiagnosedSilenceableFailure(success());
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		// MapNestedForeachThreadToGpuBlocks
		//===----------------------------------------------------------------------===//

		/// Rewrites `foreachThreadOp` into straight-line code indexed by the values
		/// that `blockIdGenerator` produces.
		///
		/// Emits an error and fails when the op has results, has more than three
		/// thread counts, or when its permuted thread counts cannot be computed or
		/// are not all constant. On success the op body has been moved in front of
		/// the op, the uses of its thread indices have been replaced, and the op
		/// itself has been erased.
		///
		/// `gridDims` is an in/out argument: one constant per permuted thread count
		/// is appended to whatever the caller passed in.
		/// `blockIdGenerator` receives the op, `gridDims`, the index type and an
		/// empty vector that it is expected to fill with exactly three values (see
		/// the assert below).
		LogicalResult mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks(
		RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
		function_ref<void(Operation *, const SmallVector<int64_t> &, IndexType,
		SmallVector<Value> &)>
		blockIdGenerator,
		SmallVector<int64_t> &gridDims) {
		// An op that still has results is treated as not bufferized and rejected.
		if (foreachThreadOp.getNumResults() > 0)
		return foreachThreadOp->emitError(
		"only bufferized scf.foreach_thread lowers to gpu.block_id");
		// At most three thread counts are accepted.
		if (foreachThreadOp.getNumThreads().size() > 3)
		return foreachThreadOp->emitError(
		"scf.foreach_thread with rank > 3 does not lower to gpu.block_id");

		// Step 0. Compute the grid dimensions from the permuted thread counts; every
		// one of them must fold to a constant.
		auto potentialGridDim = foreachThreadOp.getPermutedNumThreads(rewriter);
		if (failed(potentialGridDim) \|\|
		llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) {
		return !getConstantIntValue(ofr).has_value();
		csiggUnsubmitted Done Reply Inline Actions I'm not familiar with transform ops, is this how this is done (rather than `rewriter.notifyMatchFailure()`)? csigg: I'm not familiar with transform ops, is this how this is done (rather than `rewriter.
		nicolasvasilacheUnsubmitted Done Reply Inline Actions Transform ops are more directive, the equivalent of the pattern driver is the tranform IR itself where, if things fail, to apply we want a louder signal (at least for now). nicolasvasilache: Transform ops are more directive, the equivalent of the pattern driver is the tranform IR…
		}))
		return foreachThreadOp->emitError("unsupported dynamic gridDim");

		// Appends to `gridDims`; entries already supplied by the caller are kept.
		for (OpFoldResult ofr : *potentialGridDim)
		gridDims.push_back(getConstantIntValue(ofr).value());

		// Let the callback materialize the values that replace the thread indices.
		IndexType indexType = rewriter.getIndexType();
		SmallVector<Value> blockOps;
		blockIdGenerator(foreachThreadOp, gridDims, indexType, blockOps);

		// Step 1. Move the body of foreachThreadOp.
		// Erase the terminator first, it will not be used since we are on buffers.
		rewriter.eraseOp(foreachThreadOp.getTerminator());
		// The body operations are spliced into the parent block right before the
		// op, so they are moved rather than cloned.
		Block *targetBlock = foreachThreadOp->getBlock();
		Block::iterator insertionPoint = Block::iterator(foreachThreadOp);
		Block &sourceBlock = foreachThreadOp.getRegion().front();
		targetBlock->getOperations().splice(insertionPoint,
		sourceBlock.getOperations());
		csiggUnsubmitted Done Reply Inline Actions Would it be cleaner to clone the region with a block arg mapping so that it can be rolled back? csigg: Would it be cleaner to clone the region with a block arg mapping so that it can be rolled back?
		nicolasvasilacheUnsubmitted Done Reply Inline Actions Rollback is a thing only for DialectConversion today, we do not use that. Transforms maintain operation handles, moving ops allows to keep the handles valid which is a behavior we want here. nicolasvasilache: Rollback is a thing only for DialectConversion today, we do not use that. Transforms maintain…

		// Step 2. Replace the uses of each thread index with the matching value
		// produced by `blockIdGenerator`.
		// NOTE(review): the result of getPermutedThreadIndices() is dereferenced
		// without a failure check; presumably safe because getPermutedNumThreads()
		// succeeded above - confirm.
		SmallVector<Value> threadIndices =
		*foreachThreadOp.getPermutedThreadIndices();
		assert(blockOps.size() == 3 && "3 block id ops are required");
		for (auto it : llvm::zip(threadIndices, blockOps)) {
		csiggUnsubmitted Done Reply Inline Actions This looks like a left-over. csigg: This looks like a left-over.
		Value val = std::get<0>(it);
		// Null entries have nothing to replace and are skipped.
		if (!val)
		continue;
		// Early-inc iteration: the use list of `val` changes while users are
		// being updated.
		for (Operation *user : llvm::make_early_inc_range(val.getUsers())) {
		rewriter.updateRootInPlace(
		user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); });
		}
		}

		// Step 3. Erase old op.
		rewriter.eraseOp(foreachThreadOp);

		return success();
		}

		/// Returns the unique top-level `scf.foreach_thread` nested under `target`.
		///
		/// An op counts as top-level when it has no enclosing `scf.foreach_thread`
		/// ancestor. Emits an error on `target` and returns failure when no such op
		/// exists or when more than one is found, so a successful result always
		/// holds a non-null op.
		FailureOr<scf::ForeachThreadOp>
		mlir::linalg::findTopLevelForeachThreadOp(Operation *target) {
		  scf::ForeachThreadOp topLevelForeachThreadOp;
		  auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
		    // Ops enclosed in another scf.foreach_thread are not top-level.
		    if (foreachThreadOp->getParentOfType<scf::ForeachThreadOp>())
		      return WalkResult::advance();
		    // A second top-level candidate makes the answer ambiguous: stop here.
		    // TODO Handle multiple foreach if there is no dependences between them
		    if (topLevelForeachThreadOp)
		      return WalkResult::interrupt();
		    topLevelForeachThreadOp = foreachThreadOp;
		    return WalkResult::advance();
		  });

		  // Fail both when the walk was interrupted (several candidates) and when it
		  // finished without a candidate; returning a null op as success would let
		  // callers dereference it.
		  if (walkResult.wasInterrupted() || !topLevelForeachThreadOp)
		    return target->emitError(
		        "could not find a unique topLevel scf.foreach_thread");

		  return topLevelForeachThreadOp;
		}

		/// Creates a `gpu.launch` op at the current insertion point of `rewriter`.
		///
		/// The six grid/block sizes are materialized as `arith.constant` index
		/// values at `loc`, and a `gpu.terminator` is appended to the first block of
		/// the launch body. The insertion point of `rewriter` is restored before
		/// returning, so callers must position the rewriter themselves to insert
		/// into the new body.
		static gpu::LaunchOp createGpuLaunch(RewriterBase &rewriter, Location loc,
		                                     int64_t gridDimX = 1, int64_t gridDimY = 1,
		nicolasvasilacheUnsubmitted Done Reply Inline Actions we use `int64_t` everywhere we can, please don't deviate from that nicolasvasilache: we use `int64_t` everywhere we can, please don't deviate from that
		                                     int64_t gridDimZ = 1, int64_t blockDimX = 1,
		                                     int64_t blockDimY = 1, int64_t blockDimZ = 1) {
		  // Do not leak the insertion point change made below to the caller.
		  OpBuilder::InsertionGuard guard(rewriter);
		  auto createConstant = [&](int64_t dim) {
		    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
		  };
		  Value gridSizeX = createConstant(gridDimX);
		  Value gridSizeY = createConstant(gridDimY);
		  Value gridSizeZ = createConstant(gridDimZ);
		  Value blockSizeX = createConstant(blockDimX);
		  Value blockSizeY = createConstant(blockDimY);
		  Value blockSizeZ = createConstant(blockDimZ);
		  auto launchOp = rewriter.create<gpu::LaunchOp>(
		      loc, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ);
		  // The terminator is added at the end of the first block of the body.
		  rewriter.setInsertionPointToEnd(&launchOp.body().front());
		nicolasvasilacheUnsubmitted Done Reply Inline Actions Definitely use an OpBuilder::InsertionGuard here, never let a function leak a modification of insertion point nicolasvasilache: Definitely use an OpBuilder::InsertionGuard here, never let a function leak a modification of…
		  rewriter.create<gpu::TerminatorOp>(loc);
		  return launchOp;
		}

		/// Applies the op to a single `target`.
		///
		/// Fails definitively when `target` is not a `gpu.launch` and
		/// `generate_gpu_launch` is not set. Otherwise it looks up the top-level
		/// `scf.foreach_thread` under `target`, optionally wraps it in a newly
		/// created `gpu.launch`, rewrites it through
		/// rewriteTopLevelForeachThreadToGpuBlocks, and forwards the first three
		/// entries of the grid dimension vector to alterGpuLaunch. On success the
		/// launch op is the single result.
		DiagnosedSilenceableFailure
		transform::MapNestedForeachThreadToGpuBlocks::applyToOne(
		Operation target, SmallVectorImpl<Operation > &results,
		transform::TransformState &state) {
		// Null when `target` is not a gpu.launch.
		gpu::LaunchOp gpuLaunch = dyn_cast<gpu::LaunchOp>(target);
		SimpleRewriter rewriter(getContext());

		// Without `generate_gpu_launch` there must already be a launch to work on.
		if (!getGenerateGpuLaunch() && !gpuLaunch) {
		target->emitError("Given target is not gpu.launch, set "
		"`generate_gpu_launch` attribute");
		return DiagnosedSilenceableFailure::definiteFailure();
		}

		auto res = mlir::linalg::findTopLevelForeachThreadOp(target);
		if (failed(res))
		return DiagnosedSilenceableFailure(reportUnknownTransformError(target));

		// NOTE(review): confirm `*res` cannot be a null op here (e.g. when `target`
		// contains no scf.foreach_thread) before it is used as insertion point.
		scf::ForeachThreadOp topLevelForeachThreadOp = *res;
		rewriter.setInsertionPoint(topLevelForeachThreadOp);
		nicolasvasilacheUnsubmitted Done Reply Inline Actions Note: it is good practice OpBuilder::InsertionGuard each time you modify insertion points in a scoped piece of code. Here is does not matter much because it won't leak above, but it does matter a lot at function boundaries. nicolasvasilache: Note: it is good practice OpBuilder::InsertionGuard each time you modify insertion points in a…

		// Generate gpu launch here and move the foreach_thread inside
		// The launch is created with the default sizes of createGpuLaunch; the
		// foreach_thread is cloned to the start of the launch body and the original
		// is erased.
		// NOTE(review): the dyn_cast result is not checked; the clone is presumably
		// always an scf.foreach_thread - confirm.
		if (getGenerateGpuLaunch()) {
		gpuLaunch = createGpuLaunch(rewriter, target->getLoc());
		rewriter.setInsertionPointToStart(&gpuLaunch.body().front());
		Operation newForeachThreadOp = rewriter.clone(topLevelForeachThreadOp);
		rewriter.eraseOp(topLevelForeachThreadOp);
		topLevelForeachThreadOp =
		dyn_cast<scf::ForeachThreadOp>(newForeachThreadOp);
		}

		// Callback for the rewrite: creates one gpu.block_id right before `op` for
		// each entry of `gridDims`, taking dimensions in x, y, z order.
		auto generateBlocks = [&](Operation *op, const SmallVector<int64_t> &gridDims,
		IndexType indexType, SmallVector<Value> &blockOps) {
		Location loc = op->getLoc();
		rewriter.setInsertionPoint(op);
		nicolasvasilacheUnsubmitted Done Reply Inline Actions Note: it is good practice OpBuilder::InsertionGuard each time you modify insertion points in a scoped piece of code. Here is does not matter much because it won't leak above, but it does matter a lot at function boundaries. I would still use one here given the fact that this lambda may have a chance of turning into a standalone function in the future. nicolasvasilache: Note: it is good practice OpBuilder::InsertionGuard each time you modify insertion points in a…
		SmallVector<gpu::Dimension> gpuDims{gpu::Dimension::x, gpu::Dimension::y,
		gpu::Dimension::z};
		for (int64_t idx : llvm::seq<int64_t>(0, gridDims.size())) {
		blockOps.push_back(
		rewriter.create<gpu::BlockIdOp>(loc, indexType, gpuDims[idx]));
		}
		};

		// Starts from the `gridDim` attribute; the rewrite appends further entries.
		SmallVector<int64_t> gridDim = extractFromI64ArrayAttr(getGridDim());
		if (failed(mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks(
		rewriter, topLevelForeachThreadOp, generateBlocks, gridDim)))
		return DiagnosedSilenceableFailure(reportUnknownTransformError(target));

		// NOTE(review): indexes gridDim[0..2] without a size check; this relies on
		// the rewrite above having left at least three entries - confirm.
		if (failed(alterGpuLaunch(rewriter, gpuLaunch, gridDim[0], gridDim[1],
		gridDim[2])))
		csiggUnsubmitted Done Reply Inline Actions Should there be some check somewhere that the current gridDims are 1 before rewriting them? csigg: Should there be some check somewhere that the current gridDims are 1 before rewriting them?
		nicolasvasilacheUnsubmitted Done Reply Inline Actions Yes, it would make sense to add an error at the top. nicolasvasilache: Yes, it would make sense to add an error at the top.
		return DiagnosedSilenceableFailure::definiteFailure();

		// The launch op (pre-existing or newly generated) is the single result.
		results.assign({gpuLaunch});
		return DiagnosedSilenceableFailure(success());
		}

		//===----------------------------------------------------------------------===//
// TileToForeachThreadOp		// TileToForeachThreadOp
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

DiagnosedSilenceableFailure transform::tileToForeachThreadOpImpl(		DiagnosedSilenceableFailure transform::tileToForeachThreadOpImpl(
RewriterBase &rewriter, transform::TransformState &state,		RewriterBase &rewriter, transform::TransformState &state,
TransformOpInterface transformOp, ArrayRef<Operation *> targets,		TransformOpInterface transformOp, ArrayRef<Operation *> targets,
ArrayRef<OpFoldResult> mixedNumThreads,		ArrayRef<OpFoldResult> mixedNumThreads,
ArrayRef<OpFoldResult> mixedTileSizes, Optional<ArrayAttr> threadDimMapping,		ArrayRef<OpFoldResult> mixedTileSizes, Optional<ArrayAttr> threadDimMapping,
▲ Show 20 Lines • Show All 210 Lines • ▼ Show 20 Lines	public:

void init() {		void init() {
declareDependentDialect<pdl::PDLDialect>();		declareDependentDialect<pdl::PDLDialect>();
declareDependentDialect<LinalgDialect>();		declareDependentDialect<LinalgDialect>();
declareGeneratedDialect<AffineDialect>();		declareGeneratedDialect<AffineDialect>();
declareGeneratedDialect<arith::ArithmeticDialect>();		declareGeneratedDialect<arith::ArithmeticDialect>();
declareGeneratedDialect<scf::SCFDialect>();		declareGeneratedDialect<scf::SCFDialect>();
declareGeneratedDialect<vector::VectorDialect>();		declareGeneratedDialect<vector::VectorDialect>();
		declareGeneratedDialect<gpu::GPUDialect>();

registerTransformOps<		registerTransformOps<
#define GET_OP_LIST		#define GET_OP_LIST
#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp.inc"		#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp.inc"
>();		>();
}		}
};		};
} // namespace		} // namespace
Show All 10 Lines

mlir/test/Dialect/Linalg/transform-gpu.mlir

	// RUN: mlir-opt --test-transform-dialect-interpreter --split-input-file %s \| FileCheck %s			// RUN: mlir-opt --test-transform-dialect-interpreter --split-input-file %s \| FileCheck %s

	!type = memref<2 x 32 x f32>			!type = memref<2 x 32 x f32>
	!type1d = memref<32 x f32>			!type1d = memref<32 x f32>

				// CHECK-LABEL: func.func @saxpy2dblock(
				// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
				// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
				// CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32>
				func.func @saxpy2dblock(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
				%c9 = arith.constant 9 : index
				%c7 = arith.constant 7 : index
				%one = arith.constant 1 : index
				// CHECK: gpu.launch
				// CHECK: %[[BLKX:.*]] = gpu.block_id x
				// CHECK: %[[BLKY:.*]] = gpu.block_id y
				// CHECK: memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]]]
				// CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]]]
				%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
				threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
				{
				scf.foreach_thread (%i, %j) in (%c7, %c9) {
				%4 = memref.load %x[%i, %j] : !type
				%5 = memref.load %y[%i, %j] : !type
				%6 = math.fma %alpha, %4, %5 : f32
				memref.store %6, %y[%i, %j] : !type
				} {thread_dim_mapping = [0, 1, 2]}
				gpu.terminator
				}
				return %y : !type
				}

				transform.with_pdl_patterns {
				^bb0(%arg0: !pdl.operation):
				transform.sequence %arg0 failures(propagate) {
				^bb1(%arg1: !pdl.operation):
				%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
				transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { blockDim = [12, 9, 1]}
				}
				}

				// -----

				!type = memref<2 x 32 x f32>
				!type1d = memref<32 x f32>

	// CHECK-LABEL: func.func @saxpy2d(			// CHECK-LABEL: func.func @saxpy2d(
	// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>			// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
	// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>			// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
	// CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32>			// CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32>
	func.func @saxpy2d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {			func.func @saxpy2d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
	%one = arith.constant 1 : index			%one = arith.constant 1 : index
	%c12 = arith.constant 12 : index			%c12 = arith.constant 12 : index
	%c9 = arith.constant 9 : index			%c9 = arith.constant 9 : index
	Show All 37 Lines
	^bb0(%arg0: !pdl.operation):			^bb0(%arg0: !pdl.operation):
	transform.sequence %arg0 failures(propagate) {			transform.sequence %arg0 failures(propagate) {
	^bb1(%arg1: !pdl.operation):			^bb1(%arg1: !pdl.operation):
	%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0			%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
	transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1] }			transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1] }
	}			}
	}			}

				// -----

				!type4d = memref<32x64x4x32xf32>

				// CHECK-LABEL: func.func @saxpy4d(
				// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x64x4x32xf32>
				// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<32x64x4x32xf32>
				func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d {
				%c32 = arith.constant 32 : index
				%c64 = arith.constant 64 : index
				%c4 = arith.constant 4 : index
				// CHECK: gpu.launch
				// CHECK: %[[BLKX:.*]] = gpu.block_id x
				// CHECK: %[[BLKY:.*]] = gpu.block_id y
				// CHECK: %[[TIDX:.*]] = gpu.thread_id x
				// CHECK: %[[TIDY:.*]] = gpu.thread_id y
				// CHECK: memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
				// CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
				scf.foreach_thread (%i, %j) in (%c32, %c64) {
				scf.foreach_thread (%k, %l) in (%c4, %c32) {
				%4 = memref.load %x[%i, %j, %k, %l] : !type4d
				%5 = memref.load %y[%i, %j, %k, %l] : !type4d
				%6 = math.fma %alpha, %4, %5 : f32
				memref.store %6, %y[%i, %j, %k, %l] : !type4d
				} {thread_dim_mapping = [1, 0, 2]}
				} {thread_dim_mapping = [0, 1, 2]}
				return %y : !type4d
				}

				transform.with_pdl_patterns {
				^bb0(%arg0: !pdl.operation):
				transform.sequence %arg0 failures(propagate) {
				^bb1(%arg1: !pdl.operation):
				%funcop = transform.structured.match ops{["func.func"]} in %arg0
				%gpuLaunch = transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { generate_gpu_launch }
				nicolasvasilacheUnsubmitted Done Reply Inline Actions nice! nicolasvasilache: nice!
				transform.structured.map_nested_foreach_thread_to_gpu_threads %gpuLaunch { blockDim = [32, 4, 1] }
				}
				}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Add map_nested_foreach_thread_to_gpu_blocks op to transform dialect
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 462401

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

mlir/test/Dialect/Linalg/transform-gpu.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Add map_nested_foreach_thread_to_gpu_blocks op to transform dialectClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 462401

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

mlir/test/Dialect/Linalg/transform-gpu.mlir

[mlir] Add map_nested_foreach_thread_to_gpu_blocks op to transform dialect
ClosedPublic