Diff 505892

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Show First 20 Lines • Show All 108 Lines • ▼ Show 20 Lines	let description = [{
```mlir		```mlir
%laneId = gpu.lane_id		%laneId = gpu.lane_id
```		```
}];		}];
let results = (outs Index:$result);		let results = (outs Index:$result);
let assemblyFormat = "attr-dict";		let assemblyFormat = "attr-dict";
}		}

		def GPU_LinearIdOp : GPU_Op<"linear_id", [
		Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
		let description = [{
		Returns the linearized id within the workgroup (block).

		Example:
		```mlir
		%linearId = gpu.linear_id
		bondhugulaUnsubmitted Done Reply Inline Actions This doesn't seem to have any test cases here. Is this PR ready for review? bondhugula: This doesn't seem to have any test cases here. Is this PR ready for review?
		nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions Good point, there is indeed no value in introducing new GPU ops that do not lower through the common pass for the purpose of the transformations I am implementing. Refactored the implementation to avoid relying on a new op. This op may make sense to be introduced separately and lowered through the existing schemes that are mindful of index type length. But this is indeed a separate concern and a separate PR. nicolasvasilache: Good point, there is indeed no value in introducing new GPU ops that do not lower through the…
		bondhugulaUnsubmitted Done Reply Inline Actions linear_id or GPU_LaneIdOp to be consistent. bondhugula: linear_id or GPU_LaneIdOp to be consistent.
		```
		}];
		let results = (outs Index:$result);
		let assemblyFormat = "attr-dict";
		}

def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [		def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,		Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
Arguments<(ins)>, Results<(outs Index:$result)> {		Arguments<(ins)>, Results<(outs Index:$result)> {
let description = [{		let description = [{
Returns the subgroup id, i.e. the index of the current subgroup within the		Returns the subgroup id, i.e. the index of the current subgroup within the
workgroup.		workgroup.

Example:		Example:
▲ Show 20 Lines • Show All 1,267 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td

Show First 20 Lines • Show All 58 Lines • ▼ Show 20 Lines	let description = [{

Warp (aka subgroup) are grouped into a grid where grid may be		Warp (aka subgroup) are grouped into a grid where grid may be
described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates		described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates
that thread block parallelism is desired. It can be consumed by lowering to		that thread block parallelism is desired. It can be consumed by lowering to
generate GPU code.		generate GPU code.
}];		}];
}		}

		def LinearIdEnum : I64EnumAttr<"LinearId", "threads for loop mapping", [
		DimX, DimY, DimZ]> {
		ThomasRaouxUnsubmitted Done Reply Inline Actions Doesn't have to be addressed in this patch but DimX, DimY and DimZ don't really make sense for linearID, I wonder if we could just have an index to give the order of distribution? ThomasRaoux: Doesn't have to be addressed in this patch but DimX, DimY and DimZ don't really make sense for…
		nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions ack, we can also go higher-D than 3 nicolasvasilache: ack, we can also go higher-D than 3
		let cppNamespace = "::mlir::gpu";
		}

		def GPULinearIdMapping : GPU_Attr<"GPULinearIdMapping", "linear", [
		DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
		let parameters = (ins
		EnumParameter<LinearIdEnum>:$linear_id
		);
		let assemblyFormat = "`<` params `>`";
		let description = [{
		An attribute that allows defining thread parallelism for GPU devices.
		bondhugulaUnsubmitted Done Reply Inline Actions This description needs to be completed (recommend focusing on "what" instead of what it allows in general). bondhugula: This description needs to be completed (recommend focusing on "what" instead of what it allows…
		}];
		}

def BlocksEnum : I64EnumAttr<"Blocks", "threads for loop mapping", [		def BlocksEnum : I64EnumAttr<"Blocks", "threads for loop mapping", [
DimX, DimY, DimZ]> {		DimX, DimY, DimZ]> {
let cppNamespace = "::mlir::gpu";		let cppNamespace = "::mlir::gpu";
}		}

def GPUBlockMappingAttr : GPU_Attr<"GPUBlockMapping", "block", [		def GPUBlockMappingAttr : GPU_Attr<"GPUBlockMapping", "block", [
DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {		DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
let parameters = (ins		let parameters = (ins
Show All 33 Lines

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h

	Show All 27 Lines
	#define GET_OP_CLASSES			#define GET_OP_CLASSES
	#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc"			#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc"

	namespace mlir {			namespace mlir {
	class DialectRegistry;			class DialectRegistry;
	namespace transform {			namespace transform {
	namespace gpu {			namespace gpu {

				/// Warp (subgroup) size, in threads, hard-coded to 32.
				/// NOTE(review): presumably used to derive warp ids from thread ids; confirm
				/// that a fixed value of 32 is valid for every target these transforms
				/// are expected to support.
				constexpr int64_t kWarpSize = 32;
				bondhugulaUnsubmitted Done Reply Inline Actions Doc comment. bondhugula: Doc comment.

				/// Result of a function that generates ids for the mapping of a
				/// scf.forall.
				struct IdBuilderResult {
				// Ops used to replace the forall induction variables.
				SmallVector<Value> mappingIdOps;
				// Actual mapping sizes used to predicate the forall body when they are
				// smaller than the availableMappingSizes.
				SmallVector<int64_t> predicateMappingSizes;
				// Ops used to predicate the forall body when predicateMappingSizes is smaller
				// than the availableMappingSizes.
				SmallVector<Value> predicateIdOps;
				};
				/// Helper type for functions that generate ids for the mapping of a
				/// scf.forall. The callable receives the rewriter, the scf.forall being
				/// mapped and two size lists (named `forallMappingSizes` and
				/// `availableMappingSizes` by the builders in GPUTransformOps.cpp), and
				/// returns an IdBuilderResult.
				using GpuIdBuilderFnType = llvm::function_ref<IdBuilderResult(
				RewriterBase &, scf::ForallOp, ArrayRef<int64_t>, ArrayRef<int64_t>)>;

				ThomasRaouxUnsubmitted Done Reply Inline Actions nit: replace `A the` -> `The`? ThomasRaoux: nit: replace `A the` -> `The`?
				/// Helper struct for passing the mapping attributes and id generator to the
				/// common forall rewriter.
				struct GpuIdBuilder {
				/// The mapping attributes targeted by this generator.
				SmallVector<DeviceMappingAttrInterface> mappingAttributes;
				/// The constructor that builds the concrete IR for mapping ids.
				/// NOTE(review): GpuIdBuilderFnType is an llvm::function_ref, which does
				/// not own the callable it refers to. The derived builders assign a lambda
				/// temporary to this member, so the reference outlives the callable;
				/// consider an owning type such as std::function. Confirm before relying
				/// on this member after construction.
				GpuIdBuilderFnType idBuilder;
				};

	/// Map the top level `scf.forall` op to GPU Thread Blocks.			/// Map the top level `scf.forall` op to GPU Thread Blocks.
	/// Mapping is one-to-one and the induction variables of `scf.forall` are			/// Mapping is one-to-one and the induction variables of `scf.forall` are
	/// rewritten to gpu.block_id according to the thread_dim_apping attribute.			/// rewritten to gpu.block_id according to the thread_dim_mapping attribute.
	/// Dynamic, `scf.forall` trip counts are currently not supported.			/// Dynamic, `scf.forall` trip counts are currently not supported.
	/// Dynamic block dim sizes are currently not supported.			/// Dynamic block dim sizes are currently not supported.
	DiagnosedSilenceableFailure mapForallToBlocksImpl(			DiagnosedSilenceableFailure
	RewriterBase &rewriter, TransformOpInterface transformOp,			mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp,
	scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,			scf::ForallOp forallOp,
	const ArrayRef<DeviceMappingAttrInterface> &mappingAttributes,			SmallVectorImpl<int64_t> &gridDims,
	function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>			const GpuIdBuilder &gpuIdBuilder);
	blockIdGenerator);
				/// Search `scf.forall` ops nested under `target` and map each such op to an
	/// Search `scf.forall` ops nested under `target` and map each such op to GPU			/// explicit GPU implementation along `availableMappingSizes`.
	/// threads. Mapping is one-to-one and the induction variables of `scf.forall`			/// The mapping is one-to-one and the induction variables of `scf.forall` are
	/// are rewritten to gpu.thread_id according to the thread_dim_mapping			/// rewritten to gpuIdBuilder.idBuilder according to the
	/// attribute.			/// gpuIdBuilder.mappingAttributes attribute.
	/// Sibling `scf.forall` are supported in which case, the union of the number of
	/// threads is computed and may result in predication.
	/// Dynamic, `scf.forall` trip counts are currently not supported.			/// Dynamic, `scf.forall` trip counts are currently not supported.
	/// Dynamic block dim sizes are currently not supported.			/// Dynamic `availableMappingSizes` sizes are currently not supported.
				/// `availableMappingSizes` is expected to be of size 3.
				DiagnosedSilenceableFailure mapOneForallToThreadsImpl(
				RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
				scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
				bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder);

				/// Search `scf.forall` ops nested under `target` and map each such op to an
				/// explicit GPU implementation along blockDims, warpDims and linearDims.
				/// The mapping is one-to-one and the induction variables of `scf.forall` are
				/// rewritten to threads/warps/linear ids accordingly.
				/// Dynamic, `scf.forall` trip counts are currently not supported.
				/// Dynamic `blockDims`, `warpDims` or `linearDims` sizes are currently not
				/// supported.
				/// `blockDims` is expected to be of size 3.
				/// `warpDims` is expected to be empty or of size 3.
				/// The insertion point is expected to be set at the beginning of the target
				/// body block and dominate all other blocks.
	DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(			DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
	RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,			RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
	Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,			Operation *target, ArrayRef<int64_t> blockDims, ArrayRef<int64_t> warpDims,
	bool syncAfterDistribute,			bool syncAfterDistribute);
	const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes,
	function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
	threadIdGenerator);

	/// Find the unique top level scf::ForallOp within a given target op.			/// Find the unique top level scf::ForallOp within a given target op.
	DiagnosedSilenceableFailure			DiagnosedSilenceableFailure
	findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,			findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
	TransformOpInterface transformOp);			TransformOpInterface transformOp);

	} // namespace gpu			} // namespace gpu
	} // namespace transform			} // namespace transform

	namespace gpu {			namespace gpu {
	void registerTransformDialectExtension(DialectRegistry &registry);			void registerTransformDialectExtension(DialectRegistry &registry);
	} // namespace gpu			} // namespace gpu
	} // namespace mlir			} // namespace mlir

	#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H			#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td

Show All 16 Lines

def MapNestedForallToThreads :		def MapNestedForallToThreads :
Op<Transform_Dialect, "gpu.map_nested_forall_to_threads",		Op<Transform_Dialect, "gpu.map_nested_forall_to_threads",
[FunctionalStyleTransformOpTrait,		[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,		MemoryEffectsOpInterface,
TransformEachOpTrait,		TransformEachOpTrait,
TransformOpInterface]> {		TransformOpInterface]> {
let description = [{		let description = [{
Target the `gpu.launch op` and rewrite all `scf.forall`		Target the `gpu.launch op` and rewrite all `scf.forall` nested in it to
nested in it to distributed `gpu.thread_id` attribute.		distributed `gpu.thread_id` attribute.

The operation searches for `scf.forall` ops nested under `target`		The operation searches for `scf.forall` ops nested under `target` and maps
and maps each such op to GPU threads. Mapping is one-to-one and the		each such op to GPU threads.
induction variables of `scf.forall` are rewritten to
`gpu.thread_id` according to the `mapping` attribute.		`scf.forall` induction variables are rewritten to `gpu.thread_id` according
		to the `mapping` attribute.
Sibling `scf.forall` are supported in which case, the union of
the number of threads is computed and may result in predication.		Different types of mapping attributes are supported:
		- the block_dims is a list of integers that specifies the number of
Multiple scf.forall are supported per `gpu.launch` in which case,		threads in each dimension. This is a mandatory attribute that is used
the max of all the threads is computed and taken for the global		to constrain the number of threads in each dimension. If an
`gpu.thread_id`. If necessary, `scf.forall` that do not use the		`scf.forall` op is mapped to fewer threads, predication occurs.
whole thread range result in predicated computations.		- the warp_dims is a list of integers that specifies the number of
		warps in each dimension. This is an optional attribute that is used
		to constrain the number of warps in each dimension. When present, this
		attribute must be specified in a way that is compatible with the
		block_dims attribute. If an `scf.forall` op is mapped to fewer warps,
		predication occurs.

Dynamic `scf.forall` trip counts are currently not supported.		Dynamic `scf.forall` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.		Dynamic block dim sizes are currently not supported.

Only bufferized `scf.forall` are currently supported.		Only bufferized `scf.forall` are currently supported.
Only `scf.forall` distributed to at most 3 dimensions are		Only `scf.forall` distributed to at most 3 dimensions are
currently supported.		currently supported.

Barriers are inserted after each scf.forall op for now.		The `sync_after_distribute` attribute controls whether a `gpu.barrier` is
		inserted after each scf.forall op. At this time, this is an all or nothing
		choice. This will need to be tightened in the future.

The operation alters the block size of the given gpu_launch using		The operation alters the block size of the given gpu_launch using the
blockDim argument.		mandatory block_dims argument.

#### Return modes:		#### Return modes:

This operation ignores non-gpu_launch ops and drops them in the return.		This operation ignores non-gpu_launch ops and drops them in the return.

If any scf.forall with tensors is found, the transform definitely		If any scf.forall with tensors is found, the transform definitely
fails.		fails.

Show All 18 Lines	let description = [{
... // body 1		... // body 1
} {mapping = [#gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z>]}		} {mapping = [#gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z>]}
scf.forall (%i) in (12) {		scf.forall (%i) in (12) {
... // body 2		... // body 2
} {mapping = [#gpu.thread<x>]}		} {mapping = [#gpu.thread<x>]}
gpu.terminator		gpu.terminator
}		}
```		```

is translated to:		is translated to:

```		```
%bdimX = arith.constant 12 : index		%bdimX = arith.constant 12 : index
%bdimY = arith.constant 9 : index		%bdimY = arith.constant 9 : index
gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)		gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {		threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
if (threadIdx.x < 9 && threadIdx.y < 7) {		if (threadIdx.x < 9 && threadIdx.y < 7) {
... // body 1		... // body 1
}		}
gpu.barrier		gpu.barrier
if (threadIdx.y < 1) {		if (threadIdx.y < 1) {
... // body 2		... // body 2
}		}
gpu.barrier		gpu.barrier
gpu.terminator		gpu.terminator
}		}
```		```
}];		}];

let arguments = (ins PDL_Operation:$target,		let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,		DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$block_dims,
DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);		DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
		DefaultValuedAttr<BoolAttr, "true">:$sync_after_distribute);
let results = (outs PDL_Operation:$result);		let results = (outs PDL_Operation:$result);

let assemblyFormat = "$target attr-dict";		let assemblyFormat = [{
		$target
		`block_dims` `=` $block_dims
		(`warp_dims` `=` $warp_dims^)?
		(`sync_after_distribute` `=` $sync_after_distribute^)?
		attr-dict
		}];
let extraClassDeclaration = [{		let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(		::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,		::mlir::Operation *target,
::mlir::transform::ApplyToEachResultList &results,		::mlir::transform::ApplyToEachResultList &results,
::mlir::transform::TransformState &state);		::mlir::transform::TransformState &state);
}];		}];
}		}


def MapForallToBlocks :		def MapForallToBlocks :
Op<Transform_Dialect, "gpu.map_forall_to_blocks",		Op<Transform_Dialect, "gpu.map_forall_to_blocks",
[FunctionalStyleTransformOpTrait,		[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,		MemoryEffectsOpInterface,
TransformOpInterface,		TransformOpInterface,
TransformEachOpTrait]> {		TransformEachOpTrait]> {
let description = [{		let description = [{
Target the gpu_launch op and rewrite the top level `scf.forall`		Target the gpu_launch op and rewrite the top level `scf.forall`
to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute		to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute
is set, then first generates `gpu_launch` and moves the top level		is set, then first generates `gpu_launch` and moves the top level
`scf.forall` inside.		`scf.forall` inside.

The operation searches top level `scf.forall` ops under		The operation searches top level `scf.forall` ops under
`gpu_launch` and maps each such op to GPU blocks. Mapping is		`gpu_launch` and maps each such op to GPU blocks. Mapping is
one-to-one and the induction variables of `scf.forall` are		one-to-one and the induction variables of `scf.forall` are
rewritten to gpu.block_id according to the `thread_dim_mapping` attribute.		rewritten to gpu.block_id according to the `thread_dim_mapping` attribute.

Dynamic, `scf.forall` trip counts are currently not supported.		Dynamic, `scf.forall` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.		Dynamic block dim sizes are currently not supported.

Only bufferized scf.forall are currently supported.		Only bufferized scf.forall are currently supported.
Only scf.forall distributed to at most 3 dimensions are		Only scf.forall distributed to at most 3 dimensions are
currently supported.		currently supported.

The operation alters the block size of the given gpu_launch using		The operation alters the block size of the given gpu_launch using the
gridDim argument.		grid_dims argument.

#### Return modes:		#### Return modes:

This operation ignores non-gpu_launch ops and drops them in the return.		This operation ignores non-gpu_launch ops and drops them in the return.

If any scf.forall with tensors is found, the transform definitely		If any scf.forall with tensors is found, the transform definitely
fails.		fails.

If all the scf.forall operations contained within the LaunchOp		If all the scf.forall operations contained within the LaunchOp
referred to by the `target` PDLOperation lower to GPU properly, the		referred to by the `target` PDLOperation lower to GPU properly, the
transform succeeds. Otherwise the transform definitely fails.		transform succeeds. Otherwise the transform definitely fails.

The returned handle points to the same LaunchOp operand, consuming it and		The returned handle points to the same LaunchOp operand, consuming it and
producing a new SSA value to satisfy chaining and linearity of the IR		producing a new SSA value to satisfy chaining and linearity of the IR
properties.		properties.
}];		}];

let arguments = (ins PDL_Operation:$target,		let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,		DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$grid_dims,
UnitAttr:$generate_gpu_launch);		UnitAttr:$generate_gpu_launch);
let results = (outs PDL_Operation:$result);		let results = (outs PDL_Operation:$result);

let assemblyFormat = "$target attr-dict";		let assemblyFormat = [{
		$target
		(`generate_gpu_launch` $generate_gpu_launch^)?
		(`grid_dims` `=` $grid_dims^)?
		attr-dict
		}];
let extraClassDeclaration = [{		let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(		::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,		::mlir::Operation *target,
::mlir::transform::ApplyToEachResultList &results,		::mlir::transform::ApplyToEachResultList &results,
::mlir::transform::TransformState &state);		::mlir::transform::TransformState &state);
}];		}];
}		}

#endif // GPU_TRANSFORM_OPS		#endif // GPU_TRANSFORM_OPS

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

	Show All 40 Lines
	int64_t GPUBlockMappingAttr::getMappingId() const {			int64_t GPUBlockMappingAttr::getMappingId() const {
	return static_cast<int64_t>(getBlock());			return static_cast<int64_t>(getBlock());
	}			}

	int64_t GPUWarpMappingAttr::getMappingId() const {			int64_t GPUWarpMappingAttr::getMappingId() const {
	return static_cast<int64_t>(getWarp());			return static_cast<int64_t>(getWarp());
	}			}

				/// Returns the mapping id of this attribute: the LinearId enumerator it
				/// holds, converted to a 64-bit integer.
				int64_t GPULinearIdMappingAttr::getMappingId() const {
				  const auto linearId = getLinearId();
				  return static_cast<int64_t>(linearId);
				}

	int64_t GPUThreadMappingAttr::getMappingId() const {			int64_t GPUThreadMappingAttr::getMappingId() const {
	return static_cast<int64_t>(getThread());			return static_cast<int64_t>(getThread());
	}			}

	int64_t GPUMemorySpaceMappingAttr::getMappingId() const {			int64_t GPUMemorySpaceMappingAttr::getMappingId() const {
	return static_cast<int64_t>(getAddressSpace());			return static_cast<int64_t>(getAddressSpace());
	}			}

	▲ Show 20 Lines • Show All 1,395 Lines • Show Last 20 Lines

mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp

Show First 20 Lines • Show All 112 Lines • ▼ Show 20 Lines	void ThreadIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
setResultRange(getResult(), getIndexRange(0, max - 1ULL));		setResultRange(getResult(), getIndexRange(0, max - 1ULL));
}		}

void LaneIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,		void LaneIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {		SetIntRangeFn setResultRange) {
setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1ULL));		setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1ULL));
}		}

		/// Infers the integer range of the result of `gpu.linear_id`.
		///
		/// The op returns the id linearized over the whole workgroup (block), so it
		/// is not bounded by the subgroup size the way a lane id is. Using the
		/// lane-id bound here would let range-based optimizations assume a result
		/// that is too small. The wider `kMaxDim` bound, also used for the
		/// subgroup id, is applied instead.
		/// NOTE(review): confirm `kMaxDim` covers the product of all block
		/// dimensions on the targets of interest.
		void LinearIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
		                                   SetIntRangeFn setResultRange) {
		  setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));
		}

void SubgroupIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,		void SubgroupIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {		SetIntRangeFn setResultRange) {
setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));		setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));
}		}

void GlobalIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,		void GlobalIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {		SetIntRangeFn setResultRange) {
uint64_t blockDimMax =		uint64_t blockDimMax =
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//		//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"		#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"

		#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"		#include "mlir/Dialect/Arith/IR/Arith.h"
		#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"		#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"		#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/PDL/IR/PDL.h"		#include "mlir/Dialect/PDL/IR/PDL.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"		#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"		#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"		#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"		#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
		#include "mlir/Dialect/Utils/IndexingUtils.h"
		#include "mlir/IR/AffineExpr.h"
		#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"		#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"		#include "mlir/IR/IRMapping.h"
		#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"		#include "mlir/IR/OpDefinition.h"
		#include "mlir/IR/Visitors.h"
#include "mlir/Support/LLVM.h"		#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"		#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"

using namespace mlir;		using namespace mlir;
using namespace mlir::gpu;		using namespace mlir::gpu;
using namespace mlir::transform;		using namespace mlir::transform;
		using namespace mlir::transform::gpu;

#define DEBUG_TYPE "gpu-transforms"		#define DEBUG_TYPE "gpu-transforms"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")		#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")		#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

namespace {		namespace {

/// Helper type for functions that generate ids for the mapping of a scf.forall.		/// Return a flattened thread id for the workgroup with given sizes.
using IdGeneratorFnType = llvm::function_ref<void(RewriterBase &, scf::ForallOp,		static OpFoldResult getStaticLinearThreadId(RewriterBase &rewriter,
SmallVectorImpl<Value> &)>;		Location loc,
		ArrayRef<OpFoldResult> blockDims) {
struct MappingToGpuHelper {		assert(blockDims.size() == 3 && "expected 3 workgroup sizes");
MappingToGpuHelper(SmallVector<DeviceMappingAttrInterface> mappingAttributes,		AffineExpr tx, ty, tz, BDX, BDY;
IdGeneratorFnType idGenerator)		bindDims(rewriter.getContext(), tx, ty, tz);
: mappingAttributes(mappingAttributes), idGenerator(idGenerator) {}		bindSymbols(rewriter.getContext(), BDX, BDY);
		IndexType indexType = rewriter.getIndexType();
SmallVector<DeviceMappingAttrInterface> mappingAttributes;		SmallVector<OpFoldResult> threadsAndWorkGroups{
IdGeneratorFnType idGenerator;		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x).getResult(),
};		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y).getResult(),
		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z).getResult()};
		threadsAndWorkGroups.push_back(blockDims[0]);
		threadsAndWorkGroups.push_back(blockDims[1]);
		return makeComposedFoldedAffineApply(
		rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups);
		}

struct MappingToGpuBlocksHelper : public MappingToGpuHelper {		struct GpuBlockIdBuilder : public GpuIdBuilder {

MappingToGpuBlocksHelper(MLIRContext *ctx)		GpuBlockIdBuilder(MLIRContext *ctx) : GpuIdBuilder() {
: MappingToGpuHelper(		mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
SmallVector<DeviceMappingAttrInterface>{
GPUBlockMappingAttr::get(ctx, Blocks::DimX),
GPUBlockMappingAttr::get(ctx, Blocks::DimY),		GPUBlockMappingAttr::get(ctx, Blocks::DimY),
GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},		GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,		idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
SmallVectorImpl<Value> &ids) {		ArrayRef<int64_t> forallMappingSizes,
		ArrayRef<int64_t> availableMappingSizes) {
OpBuilder::InsertionGuard guard(rewriter);		OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(forallOp);		rewriter.setInsertionPoint(forallOp);
IndexType indexType = rewriter.getIndexType();		IndexType indexType = rewriter.getIndexType();
auto loc = forallOp->getLoc();		auto loc = forallOp->getLoc();
ids.assign(		SmallVector<Value> ids{
{rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),		rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),		rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)});		rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
}}) {}		return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
		ids};
		};
		}
};		};

struct MappingToGpuThreadsHelper : public MappingToGpuHelper {		struct GpuThreadIdBuilder : public GpuIdBuilder {
MappingToGpuThreadsHelper(MLIRContext *ctx)		GpuThreadIdBuilder(MLIRContext *ctx) : GpuIdBuilder() {
: MappingToGpuHelper(		mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
SmallVector<DeviceMappingAttrInterface>{
GPUThreadMappingAttr::get(ctx, Threads::DimX),
GPUThreadMappingAttr::get(ctx, Threads::DimY),		GPUThreadMappingAttr::get(ctx, Threads::DimY),
GPUThreadMappingAttr::get(ctx, Threads::DimZ)},		GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,		idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
SmallVectorImpl<Value> &ids) {		ArrayRef<int64_t> forallMappingSizes,
		ArrayRef<int64_t> availableMappingSizes) {
OpBuilder::InsertionGuard guard(rewriter);		OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(forallOp);		rewriter.setInsertionPoint(forallOp);
IndexType indexType = rewriter.getIndexType();		IndexType indexType = rewriter.getIndexType();
auto loc = forallOp->getLoc();		auto loc = forallOp->getLoc();
ids.assign(		SmallVector<Value> ids{
{rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)});		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
}}) {}		return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
		ids};
		};
		}
		};

		struct GpuWarpIdBuilder : public GpuIdBuilder {
		GpuWarpIdBuilder(MLIRContext *ctx) : GpuIdBuilder() {
		mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
		GPUWarpMappingAttr::get(ctx, Warps::DimY),
		GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
		idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
		ArrayRef<int64_t> forallMappingSizes,
		ArrayRef<int64_t> availableMappingSizes) {
		OpBuilder::InsertionGuard guard(rewriter);
		rewriter.setInsertionPoint(forallOp);
		Location loc = forallOp.getLoc();
		Value warpId = rewriter.create<SubgroupIdOp>(loc);
		SmallVector<int64_t> reverseBasisSizes(
		llvm::reverse(availableMappingSizes));
		LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
		DBGS() << "--delinearization basis: ");
		llvm::dbgs() << "\n");

		SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
		LLVM_DEBUG(llvm::interleaveComma(strides,
		DBGS() << "--delinearization strides: ");
		llvm::dbgs() << "\n");

		AffineExpr d0;
		bindDims(rewriter.getContext(), d0);
		SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
		LLVM_DEBUG(llvm::interleaveComma(delinearizingExprs,
		DBGS() << "--delinearization exprs: ");
		llvm::dbgs() << "\n");

		SmallVector<Value> ids;
		for (AffineExpr e : delinearizingExprs)
		ids.push_back(makeComposedAffineApply(rewriter, loc, e, warpId));
		LLVM_DEBUG(llvm::interleaveComma(ids, DBGS() << "--ids: ");
		llvm::dbgs() << "\n");
		return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
		ids};
		};
		}
		};

		struct GpuLinearIdBuilder : public GpuIdBuilder {
		GpuLinearIdBuilder(MLIRContext *ctx) : GpuIdBuilder() {
		mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX),
		GPULinearIdMappingAttr::get(ctx, LinearId::DimY),
		GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)};
		idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
		ArrayRef<int64_t> forallMappingSizes,
		ArrayRef<int64_t> availableMappingSizes) {
		OpBuilder::InsertionGuard guard(rewriter);
		rewriter.setInsertionPoint(forallOp);
		Location loc = forallOp.getLoc();
		Value linearIdOp = rewriter.create<LinearIdOp>(loc);
		SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
		LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
		DBGS() << "--delinearization basis: ");
		llvm::dbgs() << "\n");

		SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
		LLVM_DEBUG(llvm::interleaveComma(strides,
		DBGS() << "--delinearization strides: ");
		llvm::dbgs() << "\n");

		AffineExpr d0;
		bindDims(rewriter.getContext(), d0);
		SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
		LLVM_DEBUG(llvm::interleaveComma(delinearizingExprs,
		DBGS() << "--delinearization exprs: ");
		llvm::dbgs() << "\n");

		SmallVector<Value> ids;
		for (AffineExpr e : delinearizingExprs)
		ids.push_back(makeComposedAffineApply(rewriter, loc, e, linearIdOp));
		LLVM_DEBUG(llvm::interleaveComma(ids, DBGS() << "--ids: ");
		llvm::dbgs() << "\n");

		int64_t actualMappingSize = 1;
		for (int64_t s : forallMappingSizes)
		actualMappingSize *= s;
		return IdBuilderResult{ids, SmallVector<int64_t>{actualMappingSize},
		SmallVector<Value>{linearIdOp}};
		};
		}
		bondhugulaUnsubmitted Done Reply Inline Actions Missing code comments for these. bondhugula: Missing code comments for these.
};		};

} // namespace		} // namespace

static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
failureHelper(std::optional<TransformOpInterface> transformOp,		definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp, const Twine &message) {		Operation *target, const Twine &message) {
if (transformOp.has_value())		if (transformOp.has_value())
return emitDefiniteFailure(*transformOp, message);		return transformOp->emitDefiniteFailure() << message;
return emitDefiniteFailure(forallOp, message);		return emitDefiniteFailure(target, message);
}		}

/// Check if given mapping attributes are one of the desired attributes		/// Check if given mapping attributes are one of the desired attributes
static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,		checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp) {		scf::ForallOp forallOp) {
if (!forallOp.getMapping().has_value())		if (!forallOp.getMapping().has_value())
return failureHelper(transformOp, forallOp, "mapping must be present");		return definiteFailureHelper(transformOp, forallOp,
		"mapping must be present");

bool hasBlockMapping =		bool hasBlockMapping =
llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
return attr.isa<GPUBlockMappingAttr>();		return attr.isa<GPUBlockMappingAttr>();
});		});
bool hasThreadMapping =		bool hasThreadMapping =
llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
return attr.isa<GPUThreadMappingAttr>();		return attr.isa<GPUThreadMappingAttr>();
});		});
		bool hasWarpMapping =
		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
		return attr.isa<GPUWarpMappingAttr>();
		});
		bool hasLinearMapping =
		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
		return attr.isa<GPULinearIdMappingAttr>();
		});
int64_t countMappingTypes = 0;		int64_t countMappingTypes = 0;
countMappingTypes += hasBlockMapping ? 1 : 0;		countMappingTypes += hasBlockMapping ? 1 : 0;
countMappingTypes += hasThreadMapping ? 1 : 0;		countMappingTypes += hasThreadMapping ? 1 : 0;
		countMappingTypes += hasWarpMapping ? 1 : 0;
		countMappingTypes += hasLinearMapping ? 1 : 0;
if (countMappingTypes > 1) {		if (countMappingTypes > 1) {
return failureHelper(transformOp, forallOp,		return definiteFailureHelper(
		transformOp, forallOp,
"cannot mix different mapping types, use nesting");		"cannot mix different mapping types, use nesting");
}		}

DenseSet<Attribute> seen;		DenseSet<Attribute> seen;
for (Attribute map : forallOp.getMapping()->getValue()) {		for (Attribute map : forallOp.getMapping()->getValue()) {
if (llvm::is_contained(seen, map)) {		if (llvm::is_contained(seen, map)) {
return failureHelper(transformOp, forallOp,		return definiteFailureHelper(
		transformOp, forallOp,
"duplicated attribute, cannot map different loops "		"duplicated attribute, cannot map different loops "
"to the same processor");		"to the same processor");
}		}
seen.insert(map);		seen.insert(map);
}		}

return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,		verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp) {		scf::ForallOp forallOp) {
// Check the types of the mapping attributes match.		// Check the types of the mapping attributes match.
DiagnosedSilenceableFailure typeRes =		DiagnosedSilenceableFailure typeRes =
checkMappingAttributeTypes(transformOp, forallOp);		checkMappingAttributeTypes(transformOp, forallOp);
if (!typeRes.succeeded())		if (!typeRes.succeeded())
return typeRes;		return typeRes;

// Perform other non-types verifications.		// Perform other non-types verifications.
if (!forallOp.isNormalized())		if (!forallOp.isNormalized())
return failureHelper(transformOp, forallOp,		return definiteFailureHelper(transformOp, forallOp,
"unsupported non-normalized loops");		"unsupported non-normalized loops");
if (forallOp.getNumResults() > 0)		if (forallOp.getNumResults() > 0)
return failureHelper(transformOp, forallOp,		return definiteFailureHelper(transformOp, forallOp,
"only bufferized scf.forall can be mapped");		"only bufferized scf.forall can be mapped");
if (forallOp.getRank() > 3)		if (forallOp.getRank() > 3)
return failureHelper(transformOp, forallOp,		return definiteFailureHelper(transformOp, forallOp,
"scf.forall with rank > 3 does not lower");		"scf.forall with rank > 3 does not lower");
if (llvm::any_of(forallOp.getMixedUpperBound(), [&](OpFoldResult ofr) {		if (llvm::any_of(forallOp.getMixedUpperBound(), [&](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();		return !getConstantIntValue(ofr).has_value();
})) {		})) {
return failureHelper(transformOp, forallOp,		return definiteFailureHelper(transformOp, forallOp,
"unsupported dynamic sizes in forall op");		"unsupported dynamic sizes in forall op");
}		}
return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

/// Determines if the size of the kernel configuration is supported by the GPU		/// Determines if the size of the kernel configuration is supported by the
/// architecture being used. It presently makes use of CUDA limitations, however		/// GPU architecture being used. It presently makes use of CUDA limitations,
/// that aspect may be enhanced for other GPUs.		/// however that aspect may be enhanced for other GPUs.
static DiagnosedSilenceableFailure checkGpuLimits(		static DiagnosedSilenceableFailure checkGpuLimits(
TransformOpInterface transformOp, std::optional<int64_t> gridDimX,		TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,		std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,		std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
std::optional<int64_t> blockDimZ) {		std::optional<int64_t> blockDimZ) {

static constexpr int maxTotalBlockdim = 1024;		static constexpr int maxTotalBlockdim = 1024;
static constexpr int maxBlockdimx = 1024;		static constexpr int maxBlockdimx = 1024;
Show All 10 Lines	if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
maxTotalGriddim ||		maxTotalGriddim ||
blockDimX.value_or(1) > maxBlockdimx ||		blockDimX.value_or(1) > maxBlockdimx ||
blockDimY.value_or(1) > maxBlockdimy ||		blockDimY.value_or(1) > maxBlockdimy ||
blockDimZ.value_or(1) > maxBlockdimz ||		blockDimZ.value_or(1) > maxBlockdimz ||
gridDimY.value_or(1) > maxGriddimy ||		gridDimY.value_or(1) > maxGriddimy ||
gridDimZ.value_or(1) > maxGriddimz ||		gridDimZ.value_or(1) > maxGriddimz ||
gridDimX.value_or(1) > maxGriddimx) {		gridDimX.value_or(1) > maxGriddimx) {
return transformOp.emitSilenceableError()		return transformOp.emitSilenceableError()
<< "Trying to launch a GPU kernel with gridDim = ("		<< "Trying to launch a GPU kernel with grid_dims = ("
<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "		<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
<< gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1)		<< gridDimZ.value_or(1) << ") block_dims = ("
<< ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1)		<< blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
<< "). It is larger than the limits.";		<< blockDimZ.value_or(1) << "). It is larger than the limits.";
}		}
return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and		/// Creates an empty-body gpu::LaunchOp using the provided kernel settings
/// put a terminator within.		/// and put a terminator within.
static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
createGpuLaunch(RewriterBase &rewriter, Location loc,		createGpuLaunch(RewriterBase &rewriter, Location loc,
TransformOpInterface transformOp, LaunchOp &launchOp,		TransformOpInterface transformOp, LaunchOp &launchOp,
std::optional<int64_t> gridDimX = std::nullopt,		std::optional<int64_t> gridDimX = std::nullopt,
std::optional<int64_t> gridDimY = std::nullopt,		std::optional<int64_t> gridDimY = std::nullopt,
std::optional<int64_t> gridDimZ = std::nullopt,		std::optional<int64_t> gridDimZ = std::nullopt,
std::optional<int64_t> blockDimX = std::nullopt,		std::optional<int64_t> blockDimX = std::nullopt,
std::optional<int64_t> blockDimY = std::nullopt,		std::optional<int64_t> blockDimY = std::nullopt,
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	if (blockDimY.has_value())
gpuLaunch.getBlockSizeYMutable().assign(		gpuLaunch.getBlockSizeYMutable().assign(
createConstValue(blockDimY.value()));		createConstValue(blockDimY.value()));
if (blockDimZ.has_value())		if (blockDimZ.has_value())
gpuLaunch.getBlockSizeZMutable().assign(		gpuLaunch.getBlockSizeZMutable().assign(
createConstValue(blockDimZ.value()));		createConstValue(blockDimZ.value()));
return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

//===----------------------------------------------------------------------===//		/// Struct to return the result of the rewrite of a forall operation.
// MapForallToBlocks		struct ForallRewriteResult {
//===----------------------------------------------------------------------===//		SmallVector<int64_t> mappingSizes;
		SmallVector<Value> mappingIds;
		};

static FailureOr<SmallVector<int64_t>> rewriteOneForallCommonImpl(		/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
		template <typename OpTy, typename OperationOrBlock>
		static void
		replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
		OperationOrBlock *parent, Value replacement,
		ArrayRef<int64_t> availableMappingSizes) {
		parent->walk([&](OpTy idOp) {
		if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
		rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
		});
		}

		static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,		RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp,		scf::ForallOp forallOp, ForallRewriteResult &result,
const SmallVectorImpl<int64_t> &availableMappingSizes,		ArrayRef<int64_t> availableMappingSizes, const GpuIdBuilder &gpuIdBuilder) {
const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
IdGeneratorFnType idGenerator) {
LDBG("Start rewriteOneForallCommonImpl");		LDBG("Start rewriteOneForallCommonImpl");

// Step 0. GPU-specific verifications. There is no better place to anchor		// Step 0. GPU-specific verifications. There is no better place to anchor
// those right now: the ForallOp is target-independent and the transform op		// those right now: the ForallOp is target-independent and the transform
// does not apply to individual ForallOp.		// op does not apply to individual ForallOp.
DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);		DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);
if (!diag.succeeded())		if (!diag.succeeded())
return failure();		return diag;

// Step 1. Complete the mapping to a full mapping (with 1s) if necessary.		// Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
SmallVector<int64_t> tmpMappingSizes = llvm::to_vector(		SmallVector<int64_t> tmpMappingSizes = llvm::to_vector(
llvm::map_range(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) {		llvm::map_range(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) {
auto maybeStaticValue = getConstantIntValue(ofr);		auto maybeStaticValue = getConstantIntValue(ofr);
assert(maybeStaticValue && "expected static value");		assert(maybeStaticValue && "expected static value");
return maybeStaticValue.value();		return maybeStaticValue.value();
}));		}));
SmallVector<Attribute> forallMappings =		SmallVector<Attribute> forallMappingAttrs =
llvm::to_vector(forallOp.getMapping()->getValue());		llvm::to_vector(forallOp.getMapping()->getValue());
for (auto attr : allMappingAttributes) {		for (auto attr : gpuIdBuilder.mappingAttributes) {
if (llvm::is_contained(forallMappings, attr))		if (llvm::is_contained(forallMappingAttrs, attr))
continue;		continue;
forallMappings.push_back(attr);		forallMappingAttrs.push_back(attr);
tmpMappingSizes.push_back(1);		tmpMappingSizes.push_back(1);
}		}
		LLVM_DEBUG(llvm::interleaveComma(
		tmpMappingSizes,
		DBGS() << "--tmpMappingSizes extracted from scf.forall op: ");
		llvm::dbgs() << "\n");

// Step 2. sort the values by the corresponding DeviceMappingAttrInterface.		// Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
auto comparator = [&](DeviceMappingAttrInterface a,		auto comparator = [&](DeviceMappingAttrInterface a,
DeviceMappingAttrInterface b) -> bool {		DeviceMappingAttrInterface b) -> bool {
return a.getMappingId() < b.getMappingId();		return a.getMappingId() < b.getMappingId();
};		};
SmallVector<int64_t> mappingSizes =		SmallVector<int64_t> forallMappingSizes =
getValuesSortedByKey(forallMappings, tmpMappingSizes, comparator);		getValuesSortedByKey(forallMappingAttrs, tmpMappingSizes, comparator);
LLVM_DEBUG(llvm::interleaveComma(mappingSizes, DBGS() << "mappingSizes: ");		LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,
llvm::dbgs() << "\n";		DBGS() << "--forallMappingSizes: ");
llvm::interleaveComma(forallMappings, DBGS() << "mappingAttrs: ");		llvm::dbgs() << "\n"; llvm::interleaveComma(
		forallMappingAttrs, DBGS() << "--mappingAttrs: ");
llvm::dbgs() << "\n");		llvm::dbgs() << "\n");

// Step 3. Generate the mappingIdOps using the provided generator and map the		// Step 3. Generate the mappingIdOps using the provided generator and map
// induction variables to the newly created ops. Replace ids of dimension		// the induction variables to the newly created ops.
// known to be of size 1 by zero to simplify the IR.		IdBuilderResult builderResult = gpuIdBuilder.idBuilder(
SmallVector<Value> mappingIdOps;		rewriter, forallOp, forallMappingSizes, availableMappingSizes);
Location loc = forallOp.getLoc();
idGenerator(rewriter, forallOp, mappingIdOps);
LLVM_DEBUG(llvm::interleaveComma(mappingIdOps, DBGS() << "mappingIdOps: ");
llvm::dbgs() << "\n");
assert(mappingIdOps.size() == mappingSizes.size() && "expect equal sizes");
Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
if (!availableMappingSizes.empty()) {
for (size_t i : llvm::seq(size_t(0), availableMappingSizes.size())) {
if (availableMappingSizes[i] == 1)
mappingIdOps[i] = zero;
}
}

		SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
IRMapping bvm;		IRMapping bvm;
for (auto [iv, dim] :		for (auto [iv, dim] :
llvm::zip_equal(forallOp.getInductionVars(),		llvm::zip_equal(forallOp.getInductionVars(),
ArrayRef<Attribute>{forallMappings}.take_front(		ArrayRef<Attribute>{forallMappingAttrs}.take_front(
forallOp.getInductionVars().size()))) {		forallOp.getInductionVars().size()))) {
Value peIdOp = mappingIdOps[static_cast<int64_t>(		Value peIdOp = mappingIdOps[static_cast<int64_t>(
dim.cast<DeviceMappingAttrInterface>().getMappingId())];		dim.cast<DeviceMappingAttrInterface>().getMappingId())];
bvm.map(iv, peIdOp);		bvm.map(iv, peIdOp);
}		}

// Step 4. Maybe create conditionals to predicate the region.		// Step 4. Maybe create conditionals to predicate the region.
// Skip this step when availableMappingSizes is empty.		// Skip this step when availableMappingSizes is empty.
		Location loc = forallOp.getLoc();
Value predicate;		Value predicate;
if (!availableMappingSizes.empty()) {		if (!availableMappingSizes.empty()) {
LLVM_DEBUG(llvm::interleaveComma(availableMappingSizes,		SmallVector<int64_t> predicateMappingSizes =
DBGS() << "availableMappingSizes: ");		builderResult.predicateMappingSizes;
		SmallVector<Value> predicateIdOps = builderResult.predicateIdOps;
		// clang-format off
		LLVM_DEBUG(
		llvm::interleaveComma(
		predicateMappingSizes, DBGS() << "--predicateMappingSizes: ");
		llvm::dbgs() << "\n";
		llvm::interleaveComma(
		availableMappingSizes, DBGS() << "--availableMappingSizes: ");
		llvm::dbgs() << "\n";
		llvm::interleaveComma(predicateIdOps, DBGS() << "--predicateIdOps: ");
llvm::dbgs() << "\n");		llvm::dbgs() << "\n");
for (auto [id, mappingSize, availableMappingSize] :		// clang-format on
llvm::zip_equal(mappingIdOps, mappingSizes, availableMappingSizes)) {		for (auto [id, mappingSize, availableMappingSize] : llvm::zip_equal(
		predicateIdOps, predicateMappingSizes, availableMappingSizes)) {
if (mappingSize > availableMappingSize) {		if (mappingSize > availableMappingSize) {
(void)failureHelper(		return definiteFailureHelper(
transformOp, forallOp,		transformOp, forallOp,
"Trying to map to fewer GPU threads than loop iterations but "		"Trying to map to fewer GPU threads than loop iterations but "
"overprovisioning is not yet supported. "		"overprovisioning is not yet supported. "
"Try additional tiling of the before mapping or map to more "		"Try additional tiling of the before mapping or map to more "
"threads.");		"threads.");
return failure();
}		}
if (mappingSize == availableMappingSize)		if (mappingSize == availableMappingSize)
continue;		continue;
Value idx = rewriter.create<arith::ConstantIndexOp>(loc, mappingSize);		Value idx = rewriter.create<arith::ConstantIndexOp>(loc, mappingSize);
Value tmpPredicate = rewriter.create<arith::CmpIOp>(		Value tmpPredicate = rewriter.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, id, idx);		loc, arith::CmpIPredicate::ult, id, idx);
LDBG("predicate: " << tmpPredicate);		LDBG("--predicate: " << tmpPredicate);
predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,		predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,
tmpPredicate)		tmpPredicate)
: tmpPredicate;		: tmpPredicate;
}		}
}		}

// Step 5. Move the body of forallOp.		// Step 5. Move the body of forallOp.
// Erase the terminator first, it will not be used.		// Erase the terminator first, it will not be used.
rewriter.eraseOp(forallOp.getTerminator());		rewriter.eraseOp(forallOp.getTerminator());
Block *targetBlock;		Block *targetBlock;
Block::iterator insertionPoint;		Block::iterator insertionPoint;
if (predicate) {		if (predicate) {
// Step 5.a. If predicated, move at the beginning.		// Step 5.a. If predicated, move at the beginning.
auto ifOp =		auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);		/*withElseRegion=*/false);
targetBlock = ifOp.thenBlock();		targetBlock = ifOp.thenBlock();
insertionPoint = ifOp.thenBlock()->begin();		insertionPoint = ifOp.thenBlock()->begin();
} else {		} else {
// Step 5.b. Otherwise, move inline just at the rewriter insertion point.		// Step 5.b. Otherwise, move inline just at the rewriter insertion
		// point.
targetBlock = forallOp->getBlock();		targetBlock = forallOp->getBlock();
insertionPoint = rewriter.getInsertionPoint();		insertionPoint = rewriter.getInsertionPoint();
}		}
Block &sourceBlock = forallOp.getRegion().front();		Block &sourceBlock = forallOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,		targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());		sourceBlock.getOperations());

// Step 6. RAUW thread indices to thread ops.		// Step 6. RAUW indices.
for (Value loopIndex : forallOp.getInductionVars()) {		for (Value loopIndex : forallOp.getInductionVars()) {
Value threadIdx = bvm.lookup(loopIndex);		Value threadIdx = bvm.lookup(loopIndex);
rewriter.replaceAllUsesWith(loopIndex, threadIdx);		rewriter.replaceAllUsesWith(loopIndex, threadIdx);
}		}

// Step 7. Erase old op.		// Step 7. Erase old op.
rewriter.eraseOp(forallOp);		rewriter.eraseOp(forallOp);

return mappingSizes;		result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
		return DiagnosedSilenceableFailure::success();
}		}

		//===----------------------------------------------------------------------===//
		// MapForallToBlocks
		//===----------------------------------------------------------------------===//

DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(		DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
RewriterBase &rewriter, TransformOpInterface transformOp,		RewriterBase &rewriter, TransformOpInterface transformOp,
scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,		scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,		const GpuIdBuilder &gpuIdBuilder) {
IdGeneratorFnType idGenerator) {
// Pass an empty anyAvailableMappingSizes.		// Create an early zero index value for replacements.
		Location loc = forallOp.getLoc();
		Block *parentBlock = forallOp->getBlock();
		Value zero;
		{
		// RAII block.
		ThomasRaouxUnsubmitted Done Reply Inline Actions nit: doesn't feel like a very useful comment ThomasRaoux: nit: doesn't feel like a very useful comment
		OpBuilder::InsertionGuard guard(rewriter);
		rewriter.setInsertionPointToStart(parentBlock);
		zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
		}

SmallVector<int64_t> anyAvailableMappingSizes;		SmallVector<int64_t> anyAvailableMappingSizes;
FailureOr<SmallVector<int64_t>> maybeMappingSizes =		ForallRewriteResult rewriteResult;
rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,		// Pass an empty anyAvailableMappingSizes.
anyAvailableMappingSizes, allMappingAttributes,		DiagnosedSilenceableFailure diag =
idGenerator);		rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult,
if (failed(maybeMappingSizes))		anyAvailableMappingSizes, gpuIdBuilder);
return DiagnosedSilenceableFailure::definiteFailure();
gridDims = *maybeMappingSizes;		// Return if anything goes wrong, use silenceable failure as a match failure.
		if (!diag.succeeded())
		return diag;

		// Set the gridDims that act as a return.
		gridDims = rewriteResult.mappingSizes;

		// Replace ids of dimensions known to be 1 by 0 to simplify the IR.
		// Here, the result of mapping determines the available mapping sizes.
		replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
		gridDims);

return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

DiagnosedSilenceableFailure		DiagnosedSilenceableFailure
mlir::transform::gpu::findTopLevelForallOp(Operation *target,		mlir::transform::gpu::findTopLevelForallOp(Operation *target,
scf::ForallOp &topLevelForallOp,		scf::ForallOp &topLevelForallOp,
TransformOpInterface transformOp) {		TransformOpInterface transformOp) {
auto walkResult = target->walk([&](scf::ForallOp forallOp) {		auto walkResult = target->walk([&](scf::ForallOp forallOp) {
Show All 32 Lines	transform::MapForallToBlocks::applyToOne(Operation *target,
scf::ForallOp topLevelForallOp;		scf::ForallOp topLevelForallOp;
DiagnosedSilenceableFailure diag = mlir::transform::gpu::findTopLevelForallOp(		DiagnosedSilenceableFailure diag = mlir::transform::gpu::findTopLevelForallOp(
target, topLevelForallOp, transformOp);		target, topLevelForallOp, transformOp);
if (!diag.succeeded()) {		if (!diag.succeeded()) {
diag.attachNote(target->getLoc()) << "when applied to this payload op";		diag.attachNote(target->getLoc()) << "when applied to this payload op";
return diag;		return diag;
}		}

SmallVector<int64_t> gridDims = extractFromI64ArrayAttr(getGridDim());		SmallVector<int64_t> gridDims{getGridDims()};
if (!getGenerateGpuLaunch() && gridDims.size() != 3)		if (!getGenerateGpuLaunch() && gridDims.size() != 3)
return transformOp.emitDefiniteFailure("transform require size-3 mapping");		return transformOp.emitDefiniteFailure("transform require size-3 mapping");

OpBuilder::InsertionGuard guard(rewriter);		OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(topLevelForallOp);		rewriter.setInsertionPoint(topLevelForallOp);

// Generate gpu launch here and move the forall inside		// Generate gpu launch here and move the forall inside
if (getGenerateGpuLaunch()) {		if (getGenerateGpuLaunch()) {
DiagnosedSilenceableFailure diag =		DiagnosedSilenceableFailure diag =
createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);		createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
if (!diag.succeeded()) {		if (!diag.succeeded()) {
return diag;		return diag;
}		}
rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());		rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
Operation *newForallOp = rewriter.clone(*topLevelForallOp);		Operation *newForallOp = rewriter.clone(*topLevelForallOp);
rewriter.eraseOp(topLevelForallOp);		rewriter.eraseOp(topLevelForallOp);
topLevelForallOp = cast<scf::ForallOp>(newForallOp);		topLevelForallOp = cast<scf::ForallOp>(newForallOp);
}		}

diag = verifyGpuMapping(transformOp, topLevelForallOp);		GpuBlockIdBuilder gpuBlockIdBuilder(getContext());
if (!diag.succeeded())
return diag;

MappingToGpuBlocksHelper helper(getContext());
diag = mlir::transform::gpu::mapForallToBlocksImpl(		diag = mlir::transform::gpu::mapForallToBlocksImpl(
rewriter, transformOp, topLevelForallOp, gridDims,		rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
helper.mappingAttributes, helper.idGenerator);
if (!diag.succeeded())		if (!diag.succeeded())
return diag;		return diag;

		// Set the GPU launch configuration for the grid dims late, this is subject to
		// IR inspection.
diag = alterGpuLaunch(rewriter, gpuLaunch,		diag = alterGpuLaunch(rewriter, gpuLaunch,
cast<TransformOpInterface>(getOperation()), gridDims[0],		cast<TransformOpInterface>(getOperation()), gridDims[0],
gridDims[1], gridDims[2]);		gridDims[1], gridDims[2]);

results.push_back(gpuLaunch);		results.push_back(gpuLaunch);
return diag;		return diag;
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// MapNestedForallToThreads		// MapNestedForallToThreads
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(		DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,		RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,		scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
bool syncAfterDistribute,		bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder) {
const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,		// Ignore cases with different attributes than this builder supports.
IdGeneratorFnType idGenerator) {
DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
target->walk([&](scf::ForallOp forallOp) {
// Ignore cases with different attributes.
for (Attribute map : forallOp.getMapping()->getValue()) {		for (Attribute map : forallOp.getMapping()->getValue()) {
if (!llvm::is_contained(allMappingAttributes, map)) {		if (!llvm::is_contained(gpuIdBuilder.mappingAttributes, map)) {
return WalkResult::skip();		LDBG("--skip " << map);
		LLVM_DEBUG(llvm::interleaveComma(gpuIdBuilder.mappingAttributes,
		DBGS() << "----not in: ");
		llvm::dbgs() << "\n";);
		return emitSilenceableFailure(forallOp);
}		}
}		}
diag = verifyGpuMapping(transformOp, forallOp);
if (diag.succeeded()) {
// Take the loc ahead of time
Location loc = forallOp.getLoc();		Location loc = forallOp.getLoc();
OpBuilder::InsertionGuard g(rewriter);		OpBuilder::InsertionGuard g(rewriter);
		// Insert after to allow for syncthreads after `forall` is erased.
rewriter.setInsertionPointAfter(forallOp);		rewriter.setInsertionPointAfter(forallOp);
if (failed(rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,		ForallRewriteResult rewriteResult;
kernelBlockDims,		DiagnosedSilenceableFailure diag =
allMappingAttributes, idGenerator)))		rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult,
diag = DiagnosedSilenceableFailure::definiteFailure();		availableMappingSizes, gpuIdBuilder);

		// Return if anything goes wrong, use silenceable failure as a match failure.
		if (!diag.succeeded())
		return diag;

// Add a syncthreads if needed. TODO: warpsync		// Add a syncthreads if needed. TODO: warpsync
if (syncAfterDistribute)		if (syncAfterDistribute)
rewriter.create<BarrierOp>(loc);		rewriter.create<BarrierOp>(loc);

		return DiagnosedSilenceableFailure::success();
		}

		DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
		RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
		Operation *target, ArrayRef<int64_t> blockDims, ArrayRef<int64_t> warpDims,
		bool syncAfterDistribute) {
		MLIRContext *ctx = rewriter.getContext();

		if (blockDims.size() != 3)
		return definiteFailureHelper(transformOp, target,
		"requires size-3 thread mapping");
		if (!warpDims.empty()) {
		if (warpDims.size() != 3)
		return definiteFailureHelper(transformOp, target,
		"requires empty or size-3 warp mapping");
		}

		// Create an early zero index value for replacements.
		Location loc = target->getLoc();
		Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
		SmallVector<OpFoldResult> blockDimsOfr =
		getAsIndexOpFoldResult(ctx, blockDims);

		DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
		WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
		//===--------------------------------------------------------------------===//
		// Mapping to warp ids.
		//===--------------------------------------------------------------------===//
		if (!warpDims.empty()) {
		LLVM_DEBUG(
		llvm::interleaveComma(
		warpDims, DBGS() << "mapNestedForallToThreadsImpl warpDims: ");
		llvm::dbgs() << "\n");
		GpuWarpIdBuilder gpuWarpIdBuilder(ctx);
		diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
		rewriter, transformOp, forallOp, warpDims, syncAfterDistribute,
		gpuWarpIdBuilder);
		// Use silenceable failure to encode "failure to match" and pass
		// through.
		if (diag.isDefiniteFailure())
		return WalkResult::interrupt();

		// Perform late SubgroupIdOp replacement, taking blockDims into
		// account.
		if (diag.succeeded()) {
		target->walk([&](SubgroupIdOp subgroupIdOp) {
		OpBuilder::InsertionGuard g(rewriter);
		rewriter.setInsertionPoint(subgroupIdOp);
		auto linearThreadId = getStaticLinearThreadId(
		rewriter, subgroupIdOp.getLoc(), blockDimsOfr);
		LDBG("----linearThreadId: " << linearThreadId);

		AffineExpr ltid = getAffineDimExpr(0, ctx);
		auto warpId = makeComposedFoldedAffineApply(
		rewriter, subgroupIdOp.getLoc(), ltid.floorDiv(kWarpSize),
		{linearThreadId});
		LDBG("----warpId: " << warpId);
		rewriter.replaceAllUsesWith(subgroupIdOp, warpId.get<Value>());
		});
		return WalkResult::skip();
}		}
return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt();		}

		//===--------------------------------------------------------------------===//
		// Mapping to linear ids.
		//===--------------------------------------------------------------------===//
		LDBG("mapNestedForallToThreadsImpl linearDims");
		int64_t numThreads = 1;
		for (int64_t b : blockDims)
		numThreads *= b;
		GpuLinearIdBuilder gpuLinearIdBuilder(ctx);
		diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
		rewriter, transformOp, forallOp, {numThreads}, syncAfterDistribute,
		gpuLinearIdBuilder);
		// Use silenceable failure to encode "failure to match" and pass through.
		if (diag.isDefiniteFailure())
		return WalkResult::interrupt();
		if (diag.succeeded()) {
		// Perform late replacement of LinearIdOp, taking blockDims into account.
		target->walk([&](LinearIdOp linearIdOp) {
		OpBuilder::InsertionGuard g(rewriter);
		rewriter.setInsertionPoint(linearIdOp);
		auto linearThreadId = getStaticLinearThreadId(
		rewriter, linearIdOp.getLoc(), blockDimsOfr);
		LDBG("----linearThreadId: " << linearThreadId);
		rewriter.replaceAllUsesWith(linearIdOp, linearThreadId.get<Value>());
});		});
		return WalkResult::skip();
		}

		//===--------------------------------------------------------------------===//
		// Mapping to block ids (happens last so we can replay ThreadIdOp).
		//===--------------------------------------------------------------------===//
		LLVM_DEBUG(
		llvm::interleaveComma(
		blockDims, DBGS() << "mapNestedForallToThreadsImpl blockDims: ");
		llvm::dbgs() << "\n");
		GpuThreadIdBuilder gpuThreadIdBuilder(ctx);
		diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
		rewriter, transformOp, forallOp, blockDims, syncAfterDistribute,
		gpuThreadIdBuilder);
		// Use silenceable failure to encode "failure to match" and pass through.
		if (diag.isDefiniteFailure())
		return WalkResult::interrupt();

		return WalkResult::advance();
		});
		if (walkResult.wasInterrupted())
return diag;		return diag;

		// Replace ids of dimensions known to be 1 by 0 to simplify the IR.
		// Here, the result of mapping determines the available mapping sizes.
		replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
		blockDims);

		return DiagnosedSilenceableFailure::success();
}		}

DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(		DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
Operation *target, ApplyToEachResultList &results, TransformState &state) {		Operation *target, ApplyToEachResultList &results, TransformState &state) {
LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);		LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
auto transformOp = cast<TransformOpInterface>(getOperation());		auto transformOp = cast<TransformOpInterface>(getOperation());

// Basic high-level verifications.		// Basic high-level verifications.
if (!gpuLaunch)		if (!gpuLaunch)
return emitSilenceableError() << "Given target is not a gpu.launch";		return emitSilenceableError() << "Given target is not a gpu.launch";

SmallVector<int64_t> blockDims = extractFromI64ArrayAttr(getBlockDim());		// Mapping to block ids.
if (blockDims.size() != 3)		SmallVector<int64_t> blockDims{getBlockDims()};
return transformOp.emitDefiniteFailure("transform require size-3 mapping");

DiagnosedSilenceableFailure diag =		DiagnosedSilenceableFailure diag =
checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,		checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
blockDims[0], blockDims[1], blockDims[2]);		blockDims[0], blockDims[1], blockDims[2]);
if (diag.isSilenceableFailure()) {		if (diag.isSilenceableFailure()) {
diag.attachNote(getLoc()) << getBlockDimAttrName() << " is too large";		diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
return diag;		return diag;
}		}

MLIRContext *ctx = getContext();		// Set the GPU launch configuration for the block dims early, this is not
IRRewriter rewriter(ctx);		// subject to IR inspection.
MappingToGpuThreadsHelper helper(ctx);		IRRewriter rewriter(getContext());
diag = mlir::transform::gpu::mapNestedForallToThreadsImpl(
rewriter, transformOp, target, blockDims, getSyncAfterDistribute(),
helper.mappingAttributes, helper.idGenerator);

if (!diag.succeeded())
return diag;

diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,		diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
std::nullopt, std::nullopt, blockDims[0], blockDims[1],		std::nullopt, std::nullopt, blockDims[0], blockDims[1],
blockDims[2]);		blockDims[2]);

		rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
		diag =
		mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims,
		getWarpDims(), getSyncAfterDistribute());

results.push_back(gpuLaunch.getOperation());		results.push_back(gpuLaunch.getOperation());
return diag;		return diag;
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Transform op registration		// Transform op registration
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

Show All 26 Lines

mlir/test/Dialect/GPU/transform-gpu-failing.mlir

// RUN: mlir-opt --test-transform-dialect-interpreter --split-input-file -canonicalize -cse --verify-diagnostics %s		// RUN: mlir-opt --test-transform-dialect-interpreter --split-input-file -canonicalize -cse --verify-diagnostics %s

func.func @map_nested_forall_to_threads_not_gpu_launch() -> () {		func.func @map_nested_forall_to_threads_not_gpu_launch() -> () {
%1 = tensor.empty() : tensor<4xf32>		%1 = tensor.empty() : tensor<4xf32>
return		return
}		}
transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb0(%arg0: !pdl.operation):		^bb0(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["tensor.empty"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["tensor.empty"]} in %arg0 : (!pdl.operation) -> !pdl.operation
// expected-error @below {{Given target is not a gpu.launch}}		// expected-error @below {{Given target is not a gpu.launch}}
%1 = transform.gpu.map_nested_forall_to_threads %funcop		%1 = transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1, 1, 1]
}		}

// -----		// -----

func.func @map_nested_forall_to_threads_excessive_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {		func.func @map_nested_forall_to_threads_excessive_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
%one = arith.constant 1 : index		%one = arith.constant 1 : index
%c900 = arith.constant 900 : index		%c900 = arith.constant 900 : index
%c9 = arith.constant 9 : index		%c9 = arith.constant 9 : index
Show All 22 Lines	%name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
gpu.terminator		gpu.terminator
}		}

return %y : memref<2 x 32 x f32>		return %y : memref<2 x 32 x f32>
}		}
transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
// expected-error @below {{Trying to launch a GPU kernel with gridDim = (1, 1, 1) blockDim = (1200, 9, 1). It is larger than the limits.}}		// expected-error @below {{Trying to launch a GPU kernel with grid_dims = (1, 1, 1) block_dims = (1200, 9, 1). It is larger than the limits.}}
// expected-note @below {{"blockDim" is too large}}		// expected-note @below {{"block_dims" is too large}}
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [1200, 9, 1] }		transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1200, 9, 1]
}		}

// -----		// -----

func.func @map_nested_forall_to_threads_fewer_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {		func.func @map_nested_forall_to_threads_fewer_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
%one = arith.constant 1 : index		%one = arith.constant 1 : index
%c900 = arith.constant 900 : index		%c900 = arith.constant 900 : index
%c9 = arith.constant 9 : index		%c9 = arith.constant 9 : index
Show All 24 Lines	func.func @map_nested_forall_to_threads_fewer_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {

return %y : memref<2 x 32 x f32>		return %y : memref<2 x 32 x f32>
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
// expected-error @below {{Trying to map to fewer GPU threads than loop iterations but overprovisioning is not yet supported. Try additional tiling of the before mapping or map to more threads.}}		// expected-error @below {{Trying to map to fewer GPU threads than loop iterations but overprovisioning is not yet supported. Try additional tiling of the before mapping or map to more threads.}}
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [128, 4, 1] }		transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1]
}		}

// -----		// -----

func.func @map_nested_forall_to_threads_dynamic_trip_count(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token, %c9 : index, %c7 : index) -> memref<2 x 32 x f32> {		func.func @map_nested_forall_to_threads_dynamic_trip_count(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token, %c9 : index, %c7 : index) -> memref<2 x 32 x f32> {
%one = arith.constant 1 : index		%one = arith.constant 1 : index
%c900 = arith.constant 900 : index		%c900 = arith.constant 900 : index
%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)		%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
Show All 9 Lines	func.func @map_nested_forall_to_threads_dynamic_trip_count(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token, %c9 : index, %c7 : index) -> memref<2 x 32 x f32> {
}		}
return %y : memref<2 x 32 x f32>		return %y : memref<2 x 32 x f32>
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
// expected-error @below {{unsupported dynamic sizes}}		// expected-error @below {{unsupported dynamic sizes}}
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [128, 4, 1] }		transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1]
}		}

// -----		// -----

func.func @map_nested_forall_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: tensor<32x32xf32>, %z: tensor<32x32xf32>, %stream : !gpu.async.token) {		func.func @map_nested_forall_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: tensor<32x32xf32>, %z: tensor<32x32xf32>, %stream : !gpu.async.token) {
%one = arith.constant 1 : index		%one = arith.constant 1 : index
%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)		%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)		threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
{		{
%t = linalg.matmul ins(%x, %y: tensor<32x32xf32>, tensor<32x32xf32>) outs(%z : tensor<32x32xf32>) -> tensor<32x32xf32>		%t = linalg.matmul ins(%x, %y: tensor<32x32xf32>, tensor<32x32xf32>) outs(%z : tensor<32x32xf32>) -> tensor<32x32xf32>
gpu.terminator		gpu.terminator
}		}
return		return
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!pdl.operation) -> !pdl.operation
%forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )		%forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
// expected-error @below {{only bufferized scf.forall can be mapped}}		// expected-error @below {{only bufferized scf.forall can be mapped}}
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [128, 4, 1] }		transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1]
}		}

// -----		// -----


func.func @map_forall_to_blocks_not_gpu_launch() -> () {		func.func @map_forall_to_blocks_not_gpu_launch() -> () {
// expected-note @below {{when applied to this payload op}}		// expected-note @below {{when applied to this payload op}}
%1 = tensor.empty() : tensor<4xf32>		%1 = tensor.empty() : tensor<4xf32>
▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	scf.forall (%i, %j) in (%c65535, %c65535) {
memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>		memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
} { mapping = [#gpu.block<x>, #gpu.block<y>] }		} { mapping = [#gpu.block<x>, #gpu.block<y>] }
return %y : memref<2 x 32 x f32>		return %y : memref<2 x 32 x f32>
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb0(%arg0: !pdl.operation):		^bb0(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
// expected-error @below {{Trying to launch a GPU kernel with gridDim = (65535, 65535, 1) blockDim = (1, 1, 1). It is larger than the limits.}}		// expected-error @below {{Trying to launch a GPU kernel with grid_dims = (65535, 65535, 1) block_dims = (1, 1, 1). It is larger than the limits.}}
%1 = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch }		%1 = transform.gpu.map_forall_to_blocks %funcop generate_gpu_launch
}		}

// -----		// -----

!type = memref<32x32xf32>		!type = memref<32x32xf32>
func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {		func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
%c32 = arith.constant 32 : index		%c32 = arith.constant 32 : index
%one = arith.constant 1 : index		%one = arith.constant 1 : index
Show All 10 Lines	func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
}		}
return %y : !type		return %y : !type
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
// expected-error @below {{duplicated attribute, cannot map different loops to the same processor}}		// expected-error @below {{duplicated attribute, cannot map different loops to the same processor}}
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [32, 32, 1]}		transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1]
}		}

// -----		// -----

func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) {		func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) {
%one = arith.constant 1 : index		%one = arith.constant 1 : index
%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)		%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)		threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
Show All 23 Lines

mlir/test/Dialect/GPU/transform-gpu.mlir

Show All 27 Lines	%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
gpu.terminator		gpu.terminator
}		}
return %y : !type		return %y : !type
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
transform.gpu.map_forall_to_blocks %funcop { gridDim = [12, 9, 1]}		transform.gpu.map_forall_to_blocks %funcop grid_dims = [12, 9, 1]
}		}

// -----		// -----

!type = memref<2 x 32 x f32>		!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>		!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @saxpy2d(		// CHECK-LABEL: func.func @saxpy2d(
Show All 37 Lines	%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
gpu.terminator		gpu.terminator
}		}
return %y : !type		return %y : !type
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1] }		transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1]
}		}

// -----		// -----

!type4d = memref<32x64x4x32xf32>		!type4d = memref<32x64x4x32xf32>

// CHECK-LABEL: func.func @saxpy4d(		// CHECK-LABEL: func.func @saxpy4d(
// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x64x4x32xf32>		// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x64x4x32xf32>
Show All 23 Lines	// CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
} { mapping = [#gpu.block<x>, #gpu.block<y>] }		} { mapping = [#gpu.block<x>, #gpu.block<y>] }
return %y : !type4d		return %y : !type4d
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
%gpuLaunch = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch }		%gpuLaunch = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch }
transform.gpu.map_nested_forall_to_threads %gpuLaunch { blockDim = [32, 4, 1] }		transform.gpu.map_nested_forall_to_threads %gpuLaunch block_dims = [32, 4, 1]
}		}

// -----		// -----

!type = memref<2 x 32 x f32>		!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>		!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @saxpy2d_no_barrier(		// CHECK-LABEL: func.func @saxpy2d_no_barrier(
Show All 16 Lines	%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
gpu.terminator		gpu.terminator
}		}
return %y : !type		return %y : !type
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }		transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false
}		}

// -----		// -----

!type = memref<32x32xf32>		!type = memref<32x32xf32>
// CHECK-LABEL: func.func @saxpy2d_singleloop(		// CHECK-LABEL: func.func @saxpy2d_singleloop(
// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x32xf32>		// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x32xf32>
// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<32x32xf32>		// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<32x32xf32>
Show All 15 Lines	// CHECK: memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]]
gpu.terminator		gpu.terminator
}		}
return %y : !type		return %y : !type
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [32, 1, 1]}		transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 1, 1]
}		}

// -----		// -----

!type = memref<3 x 2 x 32 x f32>		!type = memref<3 x 2 x 32 x f32>
!type1d = memref<32 x f32>		!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @saxpy3d_fold_id_z(		// CHECK-LABEL: func.func @saxpy3d_fold_id_z(
Show All 19 Lines	// CHECK: memref.store %{{.*}}, %{{.*}}[%[[C0]]
gpu.terminator		gpu.terminator
}		}
return %y : !type		return %y : !type
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }		transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false
}		}

// -----		// -----

!type = memref<2 x 32 x f32>		!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>		!type1d = memref<32 x f32>

		// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) floordiv 32) floordiv 4)>
		// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 12) floordiv 32) mod 4) floordiv 2)>

		// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 12)>
		// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) floordiv 20)>
		// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 20) floordiv 10)>

// CHECK-LABEL: func.func @map_multi_level(		// CHECK-LABEL: func.func @map_multi_level(
func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {		func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
%one = arith.constant 1 : index		%one = arith.constant 1 : index
%c12 = arith.constant 12 : index		%c10 = arith.constant 10 : index
%c9 = arith.constant 9 : index		%c9 = arith.constant 9 : index
%c7 = arith.constant 7 : index		%c7 = arith.constant 7 : index
// check that the thread level got distributed but not the warp level.		%c1 = arith.constant 1 : index
// CHECK-NOT: {mapping = #gpu.thread		%c2 = arith.constant 2 : index
// CHECK: {mapping = [#gpu.warp<x>]}
		// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
		// CHECK-DAG: %[[C11:.*]] = arith.constant 11 : index
		// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
		// CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index

		// check that both the thread level and the warp level got distributed.
		// CHECK-NOT: #gpu.thread
		// CHECK-NOT: #gpu.warp
%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)		%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)		threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
{		{
		// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x
		// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y
scf.forall (%i, %j) in (%c7, %c9) {		scf.forall (%i, %j) in (%c7, %c9) {
%4 = memref.load %x[%i, %j] : !type		%4 = memref.load %x[%i, %j] : !type
%5 = memref.load %y[%i, %j] : !type		%5 = memref.load %y[%i, %j] : !type
%6 = math.fma %alpha, %4, %5 : f32		%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : !type		memref.store %6, %y[%i, %j] : !type
} { mapping = [#gpu.thread<y>, #gpu.thread<x>]}		} { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.forall (%i) in (%c12) {
		// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]])
		// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]])
		// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[WIDX]], %[[C1]] : index
		// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[WIDY]], %[[C1]] : index
		// CHECK: %[[COND:.*]] = arith.andi %[[CMPY]], %[[CMPX]] : i1
		// CHECK: scf.if %[[COND]]
		scf.forall (%i) in (%c1) {
%7 = memref.load %t[%i] : !type1d		%7 = memref.load %t[%i] : !type1d
%8 = arith.addf %alpha, %7 : f32		%8 = arith.addf %alpha, %7 : f32
memref.store %8, %t[%i] : !type1d		memref.store %8, %t[%i] : !type1d
} {mapping = [#gpu.warp<x>] }		} {mapping = [#gpu.warp<x>] }

		// CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]])
		// CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]](%[[TIDX]], %[[TIDY]])
		// CHECK-DAG: %[[LIDZ:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]])
		// CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index
		// CHECK: scf.if %[[COND]]
		scf.forall (%i, %j) in (%c10, %c2) {
		%7 = memref.load %t[%i] : !type1d
		%8 = arith.addf %alpha, %7 : f32
		memref.store %8, %t[%j] : !type1d
		} {mapping = [#gpu.linear<x>, #gpu.linear<y>] }
gpu.terminator		gpu.terminator
}		}
return %y : !type		return %y : !type
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):		^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1] }		transform.gpu.map_nested_forall_to_threads %funcop
		block_dims = [12, 11, 1] warp_dims = [2, 2, 1]
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Transform] Add support for mapping to GPU warps and to linear ids
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 505892

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

mlir/test/Dialect/GPU/transform-gpu-failing.mlir

mlir/test/Dialect/GPU/transform-gpu.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Transform] Add support for mapping to GPU warps and to linear ids — ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 505892

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

mlir/test/Dialect/GPU/transform-gpu-failing.mlir

mlir/test/Dialect/GPU/transform-gpu.mlir

[mlir][Transform] Add support for mapping to GPU warps and to linear ids
ClosedPublic