Diff 542861

mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td

	Show All 14 Lines

	include "mlir/Dialect/GPU/IR/GPUBase.td"			include "mlir/Dialect/GPU/IR/GPUBase.td"
	include "mlir/IR/EnumAttr.td"			include "mlir/IR/EnumAttr.td"
	include "mlir/Dialect/SCF/IR/DeviceMappingInterface.td"			include "mlir/Dialect/SCF/IR/DeviceMappingInterface.td"

	def DimX : I64EnumAttrCase<"DimX", 0, "x">;			def DimX : I64EnumAttrCase<"DimX", 0, "x">;
	def DimY : I64EnumAttrCase<"DimY", 1, "y">;			def DimY : I64EnumAttrCase<"DimY", 1, "y">;
	def DimZ : I64EnumAttrCase<"DimZ", 2, "z">;			def DimZ : I64EnumAttrCase<"DimZ", 2, "z">;
				def LinearDim0 : I64EnumAttrCase<"LinearDim0", 3, "linear_dim_0">;
	def ThreadsEnum : I64EnumAttr<"Threads", "threads for loop mapping", [			def LinearDim1 : I64EnumAttrCase<"LinearDim1", 4, "linear_dim_1">;
	DimX, DimY, DimZ]> {			def LinearDim2 : I64EnumAttrCase<"LinearDim2", 5, "linear_dim_2">;
				def LinearDim3 : I64EnumAttrCase<"LinearDim3", 6, "linear_dim_3">;
				def LinearDim4 : I64EnumAttrCase<"LinearDim4", 7, "linear_dim_4">;
				def LinearDim5 : I64EnumAttrCase<"LinearDim5", 8, "linear_dim_5">;
				def LinearDim6 : I64EnumAttrCase<"LinearDim6", 9, "linear_dim_6">;
				def LinearDim7 : I64EnumAttrCase<"LinearDim7", 10, "linear_dim_7">;
				def LinearDim8 : I64EnumAttrCase<"LinearDim8", 11, "linear_dim_8">;
				def LinearDim9 : I64EnumAttrCase<"LinearDim9", 12, "linear_dim_9">;
				ftynseUnsubmitted Done Reply Inline Actions Should we just make it a proper (non-enum) attribute parameterized by an integer? ftynse: Should we just make it a proper (non-enum) attribute parameterized by an integer?
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions I am not sure how to mix an optional enum and an optional attr with a name. Let's punt on the cosmetic for now if you don't mind, I'll read up and update at a later time. nicolasvasilache: I am not sure how to mix an optional enum and an optional attr with a name. Let's punt on the…

				// TODO: This would be better represented with separate Grid and Linear Mapping
				// ids. Unfortunately it is not yet possible to have an optional EnumParameter
				// so we currently embed the 2 modes in the same enum.
				def MappingIdEnum : I64EnumAttr<"MappingId", "Mapping ids for loop mapping", [
				DimX, DimY, DimZ,
				LinearDim0, LinearDim1, LinearDim2, LinearDim3, LinearDim4,
				LinearDim5, LinearDim6, LinearDim7, LinearDim8, LinearDim9]> {
	let cppNamespace = "::mlir::gpu";			let cppNamespace = "::mlir::gpu";
	}			}

	def GPUThreadMappingAttr			def GPUBlockMappingAttr : GPU_Attr<"GPUBlockMapping", "block", [
	: GPU_Attr<"GPUThreadMapping", "thread", [
	DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {			DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
	let parameters = (ins			let parameters = (ins
	EnumParameter<ThreadsEnum>:$thread			EnumParameter<MappingIdEnum>:$block
	);			);
	let assemblyFormat = "`<` params `>`";			let assemblyFormat = "`<` params `>`";
	let description = [{			let description = [{
	An attribute that allows defining thread parallelism for GPU devices.			An attribute that allows defining thread block parallelism for GPU devices.

	Thread (aka work item) are grouped into a thread blocks where block may be			Thread blocks (aka workgroup) are grouped into a grid described by a
	described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates			3-dimensional rectangle.
	that thread parallelism is desired. It can be consumed by lowering to			This attribute indicates that thread block parallelism is desired.
	generate GPU.			It can be consumed by lowering to generate GPU code.
	}];			2 modes are supported: (1) 3D mapping mode and (2) linear mapping mode.
	}

	def WarpsEnum : I64EnumAttr<"Warps", "threads for loop mapping", [			#### 3D mapping mode
	DimX, DimY, DimZ]> {
	let cppNamespace = "::mlir::gpu";			The 3D block id is simply the 3D index of the block `(bidx, bidy, bidz)`.
				If required, predication occurs on a per-dimension basis. This allows
				specifying predication on a 3D sub-rectangle of the grid.

				#### Linear mapping mode

				The linear block id is obtained by linearizing the index of the block.
				If required, predication occurs on the linear id. This allows specifying
				predication on a 1D subset of the (linearized) grid.

				For instance, if the basis is denoted as (GX, GY, GZ) and the block id is
				denoted by (bx, by, bz), the block id is:
				`linear_id = bx + by * GX + bz * GX * GY`.
				The linear block id is fixed for the duration of a GPU kernel.

				This linear id mapping attribute indicates a different linearization relation
				is applied locally to a loop nest.

				For instance, if the new basis is denoted as (LBD0, LBD1, LBD2, LBD3) the
				block id in the new basis is:
				```(linear_id mod LBD0 ,
				(linear_id / LBD0) mod LBD1,
				(linear_id / (LBD0 * LBD1)) mod LBD2,
				(linear_id / (LBD0 * LBD1 * LBD2)) mod LBD3)```.
				This reinterpretation is only fixed for the duration of a loop nest.
				}];
	}			}

	def GPUWarpMappingAttr : GPU_Attr<"GPUWarpMapping", "warp", [			def GPUWarpgroupMappingAttr
				: GPU_Attr<"GPUWarpgroupMapping", "warpgroup", [
	DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {			DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
	let parameters = (ins			let parameters = (ins
	EnumParameter<WarpsEnum>:$warp			EnumParameter<MappingIdEnum>:$warpgroup
	);			);
	let assemblyFormat = "`<` params `>`";			let assemblyFormat = "`<` params `>`";
	let description = [{			let description = [{
	An attribute that allows defining thread block parallelism for GPU devices.			An attribute that allows defining warpgroup parallelism for GPU devices.

	Warp (aka subgroup) are grouped into a grid where grid may be			Threads of proper granularity (e.g. multiple of
	described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates			"kNumWarpsPerGroup * kWarpSize" on CUDA devices) can be grouped into
	that thread block parallelism is desired. It can be consumed by lowering to			warpgroups described by a 3-dimensional rectangle.
	generate GPU code.			This attribute indicates that warpgroup parallelism is desired.
	}];			It can be consumed by lowering to generate GPU code.
	}			2 modes are supported: (1) 3D mapping mode and (2) linear mapping mode.

	def LinearIdEnum : I64EnumAttr<"LinearId", "linear ids for loop mapping", [			#### 3D mapping mode
	DimX, DimY, DimZ]> {
	let cppNamespace = "::mlir::gpu";			The 3D warpgroup id is simply the adjusted 3D index of the thread
				`(tidx / (kNumWarpsPerGroup * kWarpSize), tidy, tidz)`.
				If required, predication occurs on a per-dimension basis. This allows
				specifying predication on a 3D sub-rectangle of the warpgroups.

				#### Linear mapping mode

				The linear warpgroup id is obtained by linearizing the index of the warpgroup.
				If required, predication occurs on the linear id. This allows specifying
				predication on a 1D "kNumWarpsPerGroup * kWarpSize"-aligned subset of the
				(linearized) block.

				For instance, if the basis is denoted as (BX, BY, BZ) and the thread id is
				denoted by (tx, ty, tz), the linear warpgroup id is:
				```linear_id = (tx + ty * BX + tz * BX * BY)
				/ (kNumWarpsPerGroup * kWarpSize)```.
				The linear warpgroup id is fixed for the duration of a GPU kernel.

				This linear id mapping attribute indicates a different linearization relation
				is applied locally to a loop nest.

				For instance, if the new basis is denoted as (LWGD0, LWGD1, LWGD2, LWGD3) the
				warpgroup id in the new basis is:
				```(linear_id mod LWGD0 ,
				(linear_id / LWGD0) mod LWGD1,
				(linear_id / (LWGD0 * LWGD1)) mod LWGD2,
				(linear_id / (LWGD0 * LWGD1 * LWGD2)) mod LWGD3)```.
				This reinterpretation is only fixed for the duration of a loop nest.
				}];
	}			}

	def GPULinearIdMapping : GPU_Attr<"GPULinearIdMapping", "linear", [			def GPUWarpMappingAttr
				: GPU_Attr<"GPUWarpMapping", "warp", [
	DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {			DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
	let parameters = (ins			let parameters = (ins
	EnumParameter<LinearIdEnum>:$linear_id			EnumParameter<MappingIdEnum>:$warp
	);			);
	let assemblyFormat = "`<` params `>`";			let assemblyFormat = "`<` params `>`";
	let description = [{			let description = [{
	An attribute to allow re-interpreting the linear mapping for threads in GPU			An attribute that allows defining warp parallelism for GPU devices.
	devices.

	Threads (aka work item) are grouped into a thread block where block may be			Threads of proper granularity (e.g. multiple of "warp size" on CUDA devices)
	described by a 1-, 2-, or 3-dimensional rectangular basis.			can be grouped into warps described by a 3-dimensional rectangle.
	The linear thread id is obtained by linearizing the 1-, 2- or 3-dimensional			This attribute indicates that warp parallelism is desired.
	index. For instance, if the basis is denoted as (BX, BY, BZ) and the thread			It can be consumed by lowering to generate GPU code.
	id is denoted by (tx, ty, tz), the linear thread id is:			2 modes are supported: (1) 3D mapping mode and (2) linear mapping mode.
	`linear_id = tx + ty * BX + tz * BX * BY)`.
	The linear thread id is fixed for the duration of a GPU kernel.			#### 3D mapping mode

				The 3D warp id is simply the adjusted 3D index of the thread
				`(tidx / kWarpSize, tidy, tidz)`.
				If required, predication occurs on a per-dimension basis. This allows
				specifying predication on a 3D sub-rectangle of the warps.

				#### Linear mapping mode

				The linear warp id is obtained by linearizing the index of the warp.
				If required, predication occurs on the linear id. This allows specifying
				predication on a 1D "kWarpSize"-aligned subset of the (linearized) block.

				For instance, if the basis is denoted as (BX, BY, BZ) and the thread id is
				denoted by (tx, ty, tz), the linear warp id is:
				`linear_id = (tx + ty * BX + tz * BX * BY) / kWarpSize`.
				The linear warp id is fixed for the duration of a GPU kernel.

	This linear id mapping attribute indicates a different linearization relation			This linear id mapping attribute indicates a different linearization relation
	is applied locally to a loop nest.			is applied locally to a loop nest.

	For instance, if the new basis is denoted as (LBX, LBY, LBZ) the thread id			For instance, if the new basis is denoted as (LWD0, LWD1, LWD2, LWD3) the
	in the new basis is:			warp id in the new basis is:
	`(linear_id mod LBX , (linear_id / LBX) mod * LBY, linear_id / (LBX * LBY))`.			```(linear_id mod LWD0 ,
	This reinterpretation is only fixe for the duration of a loop nest.			(linear_id / LWD0) mod LWD1,
				(linear_id / (LWD0 * LWD1)) mod LWD2,
	It can be consumed by lowering to generate GPU code.			(linear_id / (LWD0 * LWD1 * LWD2)) mod LWD3)```.
				This reinterpretation is only fixed for the duration of a loop nest.
	}];			}];
	}			}

	def BlocksEnum : I64EnumAttr<"Blocks", "threads for loop mapping", [			def GPUThreadMappingAttr
	DimX, DimY, DimZ]> {			: GPU_Attr<"GPUThreadMapping", "thread", [
	let cppNamespace = "::mlir::gpu";
	}

	def GPUBlockMappingAttr : GPU_Attr<"GPUBlockMapping", "block", [
	DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {			DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
	let parameters = (ins			let parameters = (ins
	EnumParameter<BlocksEnum>:$block			EnumParameter<MappingIdEnum>:$thread
	);			);
	let assemblyFormat = "`<` params `>`";			let assemblyFormat = "`<` params `>`";
	let description = [{			let description = [{
	An attribute that allows defining thread block parallelism for GPU devices.			An attribute that allows defining thread parallelism for GPU devices.

				Threads (aka work items) are grouped into thread blocks described by a
				3-dimensional rectangle.
				This attribute indicates that thread parallelism is desired.
				It can be consumed by lowering to generate GPU code.

				#### 3D mapping mode

	Thread blocks (aka work-group) are grouped into a grid where grid may be			The 3D thread id is simply the 3D index of the thread `(tidx, tidy, tidz)`.
	described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates			If required, predication occurs on a per-dimension basis. This allows
	that thread block parallelism is desired. It can be consumed by lowering to			specifying predication on a 3D sub-rectangle of the block.
	generate GPU code.
				#### Linear mapping mode

				The linear thread id is obtained by linearizing the index of the thread.
				If required, predication occurs on the linear id. This allows specifying
				predication on a 1D subset of the (linearized) block.

				For instance, if the basis is denoted as (BX, BY, BZ) and the thread id is
				denoted by (tx, ty, tz), the linear thread id is:
				`linear_id = tx + ty * BX + tz * BX * BY`.
				qedawkinsUnsubmitted Done Reply Inline Actions Is the division by `(kNumWarpsPerGroup * kWarpSize)` correct here? qedawkins: Is the division by `(kNumWarpsPerGroup * kWarpSize)` correct here?
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions ah thanks! nicolasvasilache: ah thanks!
				The linear thread id is fixed for the duration of a GPU kernel.

				This linear id mapping attribute indicates a different linearization relation
				is applied locally to a loop nest.

				For instance, if the new basis is denoted as (LTD0, LTD1, LTD2, LTD3) the
				thread id in the new basis is:
				```(linear_id mod LTD0 ,
				(linear_id / LTD0) mod LTD1,
				(linear_id / (LTD0 * LTD1)) mod LTD2,
				(linear_id / (LTD0 * LTD1 * LTD2)) mod LTD3)```.
				This reinterpretation is only fixed for the duration of a loop nest.
	}];			}];
	}			}


	def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space", [			def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space", [
	DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {			DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
	let parameters = (ins			let parameters = (ins
	EnumParameter<GPU_AddressSpaceEnum>:$address_space			EnumParameter<GPU_AddressSpaceEnum>:$address_space
	);			);
	let assemblyFormat = "`<` params `>`";			let assemblyFormat = "`<` params `>`";
	let description = [{			let description = [{
	An attribute that allows defining memory hierarchy for GPU devices.			An attribute that allows defining memory hierarchy for GPU devices.

	GPU Memory has three memory spaces: global, workgroup, and private. The global memory			GPU Memory has three memory spaces: global, workgroup, and private. The global memory
	is visible to all workitems and workgroups, the workgroup memory is only available for workitems			is visible to all workitems and workgroups, the workgroup memory is only available for workitems
	within a workgroup, and private memory is only visible to a single workitem. This attribute indicates			within a workgroup, and private memory is only visible to a single workitem. This attribute indicates
	that using memory hierarchy is desired. It can be consumed by lowering to			that using memory hierarchy is desired. It can be consumed by lowering to
	move data to a specific address space in GPU code.			move data to a specific address space in GPU code.
	}];			}];
	}			}


	#endif // GPU_DEVICE_MAPPING_ATTR			#endif // GPU_DEVICE_MAPPING_ATTR

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h

	Show All 25 Lines

	#define GET_OP_CLASSES			#define GET_OP_CLASSES
	#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc"			#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc"

	namespace mlir {			namespace mlir {
	class DialectRegistry;			class DialectRegistry;
	namespace transform {			namespace transform {
	namespace gpu {			namespace gpu {
				struct GpuIdBuilder;

	/// Helper type for functions that generate ids for the mapping of a			/// Map the top level `scf.forall` op to GPU blocks.
	/// scf.forall.
	struct IdBuilderResult {
	// Ops used to replace the forall induction variables.
	SmallVector<Value> mappingIdOps;
	// Actual mapping sizes used to predicate the forall body when they are
	// smaller than the available mapping sizes.
	SmallVector<int64_t> predicateMappingSizes;
	// Ops used to predicate the forall body when predicateMappingSizes is smaller
	// than the available mapping sizes.
	SmallVector<Value> predicateIdOps;
	};

	/// Common gpu id builder type, allows the configuration of lowering for various
	/// mapping schemes. Takes:
	/// - A rewriter with insertion point set before the forall op to rewrite.
	/// - The loc of the forall op to rewrite.
	/// - A list of positive integers carrying the mapping sizes for the current
	/// forall op to rewrite.
	using GpuIdBuilderFnType =
	std::function<IdBuilderResult(RewriterBase &, Location, ArrayRef<int64_t>)>;

	/// Helper struct for configuring the rewrite of mapped scf.forall ops to
	/// various gpu id configurations.
	struct GpuIdBuilder {
	GpuIdBuilder(ArrayRef<OpFoldResult> blockDims, ArrayRef<int64_t> mappingSizes)
	: blockDimsOfr(blockDims), availableMappingSizes(mappingSizes),
	mappingAttributes(), idBuilder() {}

	/// List of OpFoldResult carrying the multi-dimensional number of
	/// threads available in the current kernel (i.e. the current blockDims in
	/// CUDA parlance).
	ArrayRef<OpFoldResult> blockDimsOfr;

	/// A list of positive integers carrying the number of available mapping
	/// resources that can trigger predication,
	ArrayRef<int64_t> availableMappingSizes;

	/// The mapping attributes targeted by this generator.
	SmallVector<DeviceMappingAttrInterface> mappingAttributes;

	/// The constructor that builds the concrete IR for mapping ids.
	GpuIdBuilderFnType idBuilder;
	};

	/// Map the top level `scf.forall` op to GPU Thread Blocks.
	/// Mapping is one-to-one and the induction variables of `scf.forall` are			/// Mapping is one-to-one and the induction variables of `scf.forall` are
	/// rewritten to gpu.block_id according to the thread_dim_mapping attribute.			/// rewritten to gpu.block_id according to the thread_dim_mapping attribute.
	///			///
	/// Dynamic, `scf.forall` trip counts are currently not supported.			/// Dynamic, `scf.forall` trip counts are currently not supported.
	/// Dynamic block dim sizes are currently not supported.			/// Dynamic `gridDims` are currently not supported.
	DiagnosedSilenceableFailure			DiagnosedSilenceableFailure
	mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp,			mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp,
	scf::ForallOp forallOp,			scf::ForallOp forallOp,
	SmallVectorImpl<int64_t> &gridDims,			SmallVectorImpl<int64_t> &gridDims,
	const GpuIdBuilder &gpuIdBuilder);			const GpuIdBuilder &gpuIdBuilder);

	/// Search `scf.forall` ops nested under `target` and map each such op to an			/// Search `scf.forall` ops nested under `target` and map each such op to an
	/// explicit GPU implementation along `availableMappingSizes`.			/// explicit GPU implementation along `availableMappingSizes`.
	/// The mapping is one-to-one and the induction variables of `scf.forall` are			/// The mapping is one-to-one and the induction variables of `scf.forall` are
	/// rewritten to gpuIdBuilder.idBuilder according to the			/// rewritten to gpuIdBuilder.idBuilder according to the
	/// gpuIdBuilder.mappingAttributes attribute.			/// gpuIdBuilder.mappingAttributes attribute.
	///			///
	/// Dynamic, `scf.forall` trip counts are currently not supported.			/// Dynamic, `scf.forall` trip counts are currently not supported.
	/// Dynamic `availableMappingSizes` sizes are currently not supported.			/// Dynamic `availableMappingSizes` sizes are currently not supported.
	/// `availableMappingSizes` is expected to be of size 3.			/// `availableMappingSizes` is expected to be of size 3.
	DiagnosedSilenceableFailure mapOneForallToThreadsImpl(			DiagnosedSilenceableFailure mapOneForallToThreadsImpl(
	RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,			RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
	scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,			scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
	bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder);			ArrayRef<int64_t> warpDims, bool syncAfterDistribute);

	/// Search `scf.forall` ops nested under `target` and map each such op to an			/// Search `scf.forall` ops nested under `target` and map each such op to an
	/// explicit GPU implementation along blockDims and warpDims.			/// explicit GPU implementation along `availableMappingSizes` and
	/// The mapping is one-to-one and the induction variables of `scf.forall` are			/// `maybeWarpDims`. The mapping is one-to-one and the induction variables of
	/// rewritten to threads and warps ids according to the mapping attribute.			/// `scf.forall` are rewritten to appropriate ids according to the mapping
				/// attribute.
	///			///
	/// Dynamic, `scf.forall` trip counts are currently not supported.			/// Dynamic, `scf.forall` trip counts are currently not supported.
	/// Dynamic `blockDims` or `warpDims` or `linearDims` sizes are currently not			/// Dynamic `availableMappingSizes` or `newBasis` entries are currently not
	/// supported.			/// supported. `availableMappingSizes` is expected to be of size 3.
	/// `blockDims` is expected to be of size 3.
	/// `warpDims` is expected to be empty or of size 3.
	///			///
	/// The insertion point of the `rewriter` is expected to be set at the			/// The insertion point of the `rewriter` is expected to be set at the
	/// beginning of the `target` body block and dominate all other blocks.			/// beginning of the `target` body block and dominate all other blocks.
	DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(			DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
	RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,			RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
	Operation *target, ArrayRef<int64_t> blockDimsOfr,			Operation *target, ArrayRef<int64_t> availableMappingSizes,
	ArrayRef<int64_t> warpDims, bool syncAfterDistribute);			ArrayRef<int64_t> warpDims, bool syncAfterDistribute);

	/// Find the unique top level scf::ForallOp within a given target op.
	DiagnosedSilenceableFailure
	findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
	TransformOpInterface transformOp);

	} // namespace gpu			} // namespace gpu
	} // namespace transform			} // namespace transform

	namespace gpu {			namespace gpu {
	void registerTransformDialectExtension(DialectRegistry &registry);			void registerTransformDialectExtension(DialectRegistry &registry);
	} // namespace gpu			} // namespace gpu
	} // namespace mlir			} // namespace mlir

	#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H			#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H

mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h

This file was added.

				//===- Utils.h - Utils for GPU transform ops --------------------*- C++ -*-===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//

				#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H
				#define MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H

				#include "mlir/Dialect/GPU/IR/GPUDialect.h"
				#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
				#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
				#include "mlir/IR/OpImplementation.h"
				#include "mlir/IR/PatternMatch.h"

				namespace mlir {
				namespace gpu {
				class GPUOp;
				class LaunchOp;
				enum class MappingId : uint64_t;
				} // namespace gpu
				namespace scf {
				class ForallOp;
				} // namespace scf
				namespace transform {
				namespace gpu {

				/// Helper type for functions that generate ids for the mapping of a
				/// scf.forall. Bundles the three results produced by an id builder.
				struct IdBuilderResult {
				/// Values used to replace the forall induction variables.
				SmallVector<Value> mappingIdOps;
				/// Actual mapping sizes used to predicate the forall body when they are
				/// smaller than the available mapping sizes.
				SmallVector<int64_t> predicateMappingSizes;
				/// Values used to predicate the forall body when predicateMappingSizes is
				/// smaller than the available mapping sizes.
				SmallVector<Value> predicateIdOps;
				};

				/// Common gpu id builder type, allows the configuration of lowering for various
				/// mapping schemes. Takes:
				/// - A rewriter with insertion point set before the forall op to rewrite.
				/// - The loc of the forall op to rewrite.
				/// - A list of positive integers carrying the mapping sizes for the current
				/// forall op to rewrite.
				/// Returns an IdBuilderResult holding the replacement ids and the
				/// predication information.
				using GpuIdBuilderFnType =
				std::function<IdBuilderResult(RewriterBase &, Location, ArrayRef<int64_t>)>;

				/// Helper struct for configuring the rewrite of mapped scf.forall ops to
				/// various gpu id configurations.
				struct GpuIdBuilder {
				/// Callback type that produces a mapping attribute from a context and a
				/// `MappingId` enum value.
				using MappingIdBuilderFnType = std::function<DeviceMappingAttrInterface(
				MLIRContext *, mlir::gpu::MappingId)>;

				/// Defined out of line; the definition is not visible from this header.
				/// NOTE(review): presumably `builder` is used to populate
				/// `mappingAttributes` and `useLinearMapping` selects between the 3D and
				/// linear mapping ids -- confirm against the implementation.
				GpuIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> originalBasisOfr,
				ArrayRef<int64_t> mappingSizes, bool useLinearMapping,
				MappingIdBuilderFnType builder);

				/// List of OpFoldResult carrying the multi-dimensional number of
				/// threads available in the current kernel (i.e. the current `blockDim`
				/// in CUDA parlance).
				/// NOTE(review): this wording predates the rename from `blockDims`; when
				/// mapping to blocks this is presumably the grid basis instead -- confirm.
				SmallVector<OpFoldResult> originalBasisOfr;

				/// A list of positive integers carrying the number of available mapping
				/// resources that can trigger predication.
				SmallVector<int64_t> availableMappingSizes;

				/// The mapping attributes targeted by this generator.
				SmallVector<DeviceMappingAttrInterface> mappingAttributes;

				/// The callback that builds the concrete IR for mapping ids.
				GpuIdBuilderFnType idBuilder;
				};

				/// Builder for gpu::BlockIdOps used to map scf.forall to blocks.
				/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values
				/// used for indexing rewrites as well as 3D sizes for predicate generation.
				/// If `useLinearMapping` is true, the `idBuilder` method returns nD values
				/// used for indexing rewrites as well as 1D sizes for predicate generation.
				/// `useLinearMapping` defaults to false, i.e. the 3D mode.
				struct GpuBlockIdBuilder : public GpuIdBuilder {
				GpuBlockIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> originalBasisOfr,
				ArrayRef<int64_t> mappingSizes,
				bool useLinearMapping = false);
				};

				/// Builder for warpgroup ids used to map scf.forall to reindexed warpgroups.
				/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values
				/// used for indexing rewrites as well as 3D sizes for predicate generation.
				/// If `useLinearMapping` is true, the `idBuilder` method returns nD values
				/// used for indexing rewrites as well as 1D sizes for predicate generation.
				/// `useLinearMapping` defaults to false, i.e. the 3D mode.
				struct GpuWarpgroupIdBuilder : public GpuIdBuilder {
				GpuWarpgroupIdBuilder(MLIRContext *ctx,
				ArrayRef<OpFoldResult> originalBasisOfr,
				ArrayRef<int64_t> mappingSizes,
				bool useLinearMapping = false);
				/// Static specification of the number of warps per warpgroup and of the
				/// warp size (in threads).
				/// In the future this may be configured by the transformation.
				static constexpr int64_t kNumWarpsPerGroup = 4;
				static constexpr int64_t kWarpSize = 32;
				};
				qedawkinsUnsubmitted Done Reply Inline Actions Is this part of the planned changes? Previously warp_dims effectively allowed specifying the warp size, but that is removed here. qedawkins: Is this part of the planned changes? Previously warp_dims effectively allowed specifying the…
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions Good point, yes I can see how the num_warps could be made to support other warp sizes. I'll add an attribute for passing this quantity. I'll leave kNumWarpsPerGroup to 4 for now, we can generalize in the future if needed. nicolasvasilache: Good point, yes I can see how the num_warps could be made to support other warp sizes. I'll…
				qedawkinsUnsubmitted Done Reply Inline Actions Perfect, thanks! qedawkins: Perfect, thanks!

				/// Builder for warp ids used to map scf.forall to reindexed warps.
				/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values
				/// used for indexing rewrites as well as 3D sizes for predicate generation.
				/// If `useLinearMapping` is true, the `idBuilder` method returns nD values
				/// used for indexing rewrites as well as 1D sizes for predicate generation.
				/// `useLinearMapping` defaults to false, i.e. the 3D mode.
				struct GpuWarpIdBuilder : public GpuIdBuilder {
				GpuWarpIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> originalBasisOfr,
				ArrayRef<int64_t> mappingSizes,
				bool useLinearMapping = false);
				/// Static specification of the warp size (in threads).
				/// In the future this may be configured by the transformation.
				static constexpr int64_t kWarpSize = 32;
				};

				/// Builder for thread ids used to map scf.forall to reindexed threads.
				/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values
				/// used for indexing rewrites as well as 3D sizes for predicate generation.
				/// If `useLinearMapping` is true, the `idBuilder` method returns nD values
				/// used for indexing rewrites as well as 1D sizes for predicate generation.
				/// `useLinearMapping` defaults to false, i.e. the 3D mode.
				struct GpuThreadIdBuilder : public GpuIdBuilder {
				GpuThreadIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> originalBasisOfr,
				ArrayRef<int64_t> mappingSizes,
				bool useLinearMapping = false);
				};

				/// Determine if the size of the kernel configuration is supported by the
				/// GPU architecture being used.
				/// Each grid and block dimension is optional.
				/// NOTE(review): how an unset (`std::nullopt`) dimension is treated is not
				/// visible from this declaration -- confirm against the implementation.
				/// TODO this is currently hardwired to CUDA, parameterize and generalize.
				DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
				std::optional<int64_t> gridDimX,
				std::optional<int64_t> gridDimY,
				std::optional<int64_t> gridDimZ,
				std::optional<int64_t> blockDimX,
				std::optional<int64_t> blockDimY,
				std::optional<int64_t> blockDimZ);

				/// Create an empty-body gpu::LaunchOp using the provided kernel settings
				/// and put a terminator within.
				/// The created op is passed back through the `launchOp` reference
				/// parameter. All grid and block dimensions are optional and default to
				/// `std::nullopt`.
				/// NOTE(review): the value used for an unset dimension is not visible from
				/// this declaration -- confirm against the implementation.
				DiagnosedSilenceableFailure
				createGpuLaunch(RewriterBase &rewriter, Location loc,
				TransformOpInterface transformOp, mlir::gpu::LaunchOp &launchOp,
				std::optional<int64_t> gridDimX = std::nullopt,
				std::optional<int64_t> gridDimY = std::nullopt,
				std::optional<int64_t> gridDimZ = std::nullopt,
				std::optional<int64_t> blockDimX = std::nullopt,
				std::optional<int64_t> blockDimY = std::nullopt,
				std::optional<int64_t> blockDimZ = std::nullopt);

				/// Alter kernel configuration of the given kernel.
				/// All grid and block dimensions are optional and default to
				/// `std::nullopt`.
				/// NOTE(review): presumably an unset dimension leaves the corresponding
				/// launch size untouched -- confirm against the implementation.
				DiagnosedSilenceableFailure
				alterGpuLaunch(RewriterBase &rewriter, mlir::gpu::LaunchOp gpuLaunch,
				TransformOpInterface transformOp,
				std::optional<int64_t> gridDimX = std::nullopt,
				std::optional<int64_t> gridDimY = std::nullopt,
				std::optional<int64_t> gridDimZ = std::nullopt,
				std::optional<int64_t> blockDimX = std::nullopt,
				std::optional<int64_t> blockDimY = std::nullopt,
				std::optional<int64_t> blockDimZ = std::nullopt);

				/// Find the unique top level scf::ForallOp within a given target op.
				/// The op found is passed back through the `topLevelForallOp` reference
				/// parameter.
				/// NOTE(review): the failure behavior when zero or several top level
				/// foralls exist is not visible from this declaration -- confirm.
				DiagnosedSilenceableFailure
				findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
				TransformOpInterface transformOp);

				} // namespace gpu
				} // namespace transform
				} // namespace mlir

				#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H

mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td

Show All 30 Lines	let description = [{
can be connected to the given processing unit.		can be connected to the given processing unit.

Currently, `scf.forall` uses this interface to express the mapping		Currently, `scf.forall` uses this interface to express the mapping
of the loops it contains to the GPU's parallelism units such as threads and		of the loops it contains to the GPU's parallelism units such as threads and
thread blocks.		thread blocks.
}];		}];

let methods = [		let methods = [
		InterfaceMethod<
		"Return mapping as an integer from the attribute.",
		"int64_t",
		"getMappingId",
		(ins)
		springermUnsubmitted Done Reply Inline Actions We usually write `/*desc=*/` etc here. springerm: We usually write `/*desc=*/` etc here.
		>,
		InterfaceMethod<
		"Return true if the attribute specifies a linear mapping",
		"bool",
		"isLinearMapping",
		(ins)
		>,
InterfaceMethod<[{		InterfaceMethod<[{
Returns mapping as an integer from the attribute.		Return the [0..n) relative index of the attribute depending on its group.
		This can be used to index into a contiguous array.
}],		}],
"int64_t", "getMappingId", (ins)		"int64_t",
		"getRelativeIndex",
		(ins)
>		>
];		];
}		}

def DeviceMappingArrayAttr :		def DeviceMappingArrayAttr :
TypedArrayAttrBase<DeviceMappingAttrInterface,		TypedArrayAttrBase<DeviceMappingAttrInterface,
"Device Mapping array attribute"> { }		"Device Mapping array attribute"> { }

#endif // MLIR_DEVICEMAPPINGINTERFACE		#endif // MLIR_DEVICEMAPPINGINTERFACE

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

	Show All 22 Lines
	#include "mlir/IR/FunctionImplementation.h"			#include "mlir/IR/FunctionImplementation.h"
	#include "mlir/IR/Matchers.h"			#include "mlir/IR/Matchers.h"
	#include "mlir/IR/OpImplementation.h"			#include "mlir/IR/OpImplementation.h"
	#include "mlir/IR/PatternMatch.h"			#include "mlir/IR/PatternMatch.h"
	#include "mlir/IR/TypeUtilities.h"			#include "mlir/IR/TypeUtilities.h"
	#include "mlir/Interfaces/SideEffectInterfaces.h"			#include "mlir/Interfaces/SideEffectInterfaces.h"
	#include "mlir/Transforms/InliningUtils.h"			#include "mlir/Transforms/InliningUtils.h"
	#include "llvm/ADT/TypeSwitch.h"			#include "llvm/ADT/TypeSwitch.h"
				#include "llvm/Support/ErrorHandling.h"

	using namespace mlir;			using namespace mlir;
	using namespace mlir::gpu;			using namespace mlir::gpu;

	#include "mlir/Dialect/GPU/IR/GPUOpsDialect.cpp.inc"			#include "mlir/Dialect/GPU/IR/GPUOpsDialect.cpp.inc"

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// GPU Device Mapping Attributes			// GPU Device Mapping Attributes
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	int64_t GPUBlockMappingAttr::getMappingId() const {			int64_t GPUBlockMappingAttr::getMappingId() const {
	return static_cast<int64_t>(getBlock());			return static_cast<int64_t>(getBlock());
	}			}

				/// A block mapping is linear when its id is at or past
				/// MappingId::LinearDim0.
				bool GPUBlockMappingAttr::isLinearMapping() const {
				  const int64_t firstLinearId = static_cast<int64_t>(MappingId::LinearDim0);
				  return firstLinearId <= getMappingId();
				}

				/// Return the mapping id rebased within its group: linear ids are
				/// offset by MappingId::LinearDim0, all other ids are returned as is.
				int64_t GPUBlockMappingAttr::getRelativeIndex() const {
				  const int64_t id = getMappingId();
				  if (!isLinearMapping())
				    return id;
				  return id - static_cast<int64_t>(MappingId::LinearDim0);
				}

				/// Return the warpgroup enumerator held by this attribute as an int64_t.
				int64_t GPUWarpgroupMappingAttr::getMappingId() const {
				  auto warpgroupEnum = getWarpgroup();
				  return static_cast<int64_t>(warpgroupEnum);
				}

				/// A warpgroup mapping is linear when its id is at or past
				/// MappingId::LinearDim0.
				bool GPUWarpgroupMappingAttr::isLinearMapping() const {
				  const int64_t firstLinearId = static_cast<int64_t>(MappingId::LinearDim0);
				  return firstLinearId <= getMappingId();
				}

				/// Return the mapping id rebased within its group: linear ids are
				/// offset by MappingId::LinearDim0, all other ids are returned as is.
				int64_t GPUWarpgroupMappingAttr::getRelativeIndex() const {
				  const int64_t id = getMappingId();
				  if (!isLinearMapping())
				    return id;
				  return id - static_cast<int64_t>(MappingId::LinearDim0);
				}

	int64_t GPUWarpMappingAttr::getMappingId() const {			int64_t GPUWarpMappingAttr::getMappingId() const {
	return static_cast<int64_t>(getWarp());			return static_cast<int64_t>(getWarp());
	}			}

	int64_t GPULinearIdMappingAttr::getMappingId() const {			bool GPUWarpMappingAttr::isLinearMapping() const {
	return static_cast<int64_t>(getLinearId());			return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0);
				}

				int64_t GPUWarpMappingAttr::getRelativeIndex() const {
				return isLinearMapping()
				? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0)
				: getMappingId();
	}			}

	int64_t GPUThreadMappingAttr::getMappingId() const {			int64_t GPUThreadMappingAttr::getMappingId() const {
	return static_cast<int64_t>(getThread());			return static_cast<int64_t>(getThread());
	}			}

				/// A thread mapping is linear when its id is at or past
				/// MappingId::LinearDim0.
				bool GPUThreadMappingAttr::isLinearMapping() const {
				  const int64_t firstLinearId = static_cast<int64_t>(MappingId::LinearDim0);
				  return firstLinearId <= getMappingId();
				}

				/// Return the mapping id rebased within its group: linear ids are
				/// offset by MappingId::LinearDim0, all other ids are returned as is.
				int64_t GPUThreadMappingAttr::getRelativeIndex() const {
				  const int64_t id = getMappingId();
				  if (!isLinearMapping())
				    return id;
				  return id - static_cast<int64_t>(MappingId::LinearDim0);
				}

	int64_t GPUMemorySpaceMappingAttr::getMappingId() const {			int64_t GPUMemorySpaceMappingAttr::getMappingId() const {
	return static_cast<int64_t>(getAddressSpace());			return static_cast<int64_t>(getAddressSpace());
	}			}

				/// Not supported for memory-space mappings: this method never returns a
				/// value and unconditionally hits llvm_unreachable.
				/// NOTE(review): any caller that queries isLinearMapping() through the
				/// interface on a memory-space attribute will reach this -- confirm that
				/// no such call site exists.
				bool GPUMemorySpaceMappingAttr::isLinearMapping() const {
				llvm_unreachable("GPUMemorySpaceMappingAttr does not support linear mapping");
				}

				/// Not supported for memory-space mappings: this method never returns a
				/// value and unconditionally hits llvm_unreachable.
				int64_t GPUMemorySpaceMappingAttr::getRelativeIndex() const {
				llvm_unreachable("GPUMemorySpaceMappingAttr does not support relative index");
				}

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// MMAMatrixType			// MMAMatrixType
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	MMAMatrixType MMAMatrixType::get(ArrayRef<int64_t> shape, Type elementType,			MMAMatrixType MMAMatrixType::get(ArrayRef<int64_t> shape, Type elementType,
	StringRef operand) {			StringRef operand) {
	return Base::get(elementType.getContext(), shape, elementType, operand);			return Base::get(elementType.getContext(), shape, elementType, operand);
	}			}
	▲ Show 20 Lines • Show All 1,696 Lines • Show Last 20 Lines

mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt

	add_mlir_dialect_library(MLIRGPUTransformOps			add_mlir_dialect_library(MLIRGPUTransformOps
	GPUTransformOps.cpp			GPUTransformOps.cpp
				Utils.cpp

	ADDITIONAL_HEADER_DIRS			ADDITIONAL_HEADER_DIRS
	${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps			${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps
	${MLIR_MAIN_INCLUDE_DIR}/mlir/Interfaces			${MLIR_MAIN_INCLUDE_DIR}/mlir/Interfaces

	DEPENDS			DEPENDS
	MLIRGPUTransformOpsIncGen			MLIRGPUTransformOpsIncGen
	MLIRDeviceMappingInterfacesIncGen			MLIRDeviceMappingInterfacesIncGen
	Show All 11 Lines

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//		//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"		#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"		#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"		#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"		#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"		#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"		#include "mlir/Dialect/GPU/TransformOps/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"		#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"		#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"		#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"		#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"		#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"		#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"		#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"		#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"		#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"		#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"		#include "mlir/IR/IRMapping.h"
#include "mlir/IR/MLIRContext.h"		#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"		#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Visitors.h"		#include "mlir/IR/Visitors.h"
#include "mlir/Support/LLVM.h"		#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"		#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"		#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
		#include "llvm/Support/ErrorHandling.h"

using namespace mlir;		using namespace mlir;
using namespace mlir::gpu;		using namespace mlir::gpu;
using namespace mlir::transform;		using namespace mlir::transform;
using namespace mlir::transform::gpu;		using namespace mlir::transform::gpu;

#define DEBUG_TYPE "gpu-transforms"		#define DEBUG_TYPE "gpu-transforms"
#define DEBUG_TYPE_ALIAS "gpu-transforms-alias"		#define DEBUG_TYPE_ALIAS "gpu-transforms-alias"
▲ Show 20 Lines • Show All 574 Lines • ▼ Show 20 Lines
void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {		void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
patterns.insert<BarrierElimination>(getContext());		patterns.insert<BarrierElimination>(getContext());
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Block and thread mapping utilities.		// Block and thread mapping utilities.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

namespace {

/// Build the flattened thread id of the current thread within its workgroup.
///
/// `blockDimsOfr` must hold exactly three workgroup sizes (asserted); only the
/// first two are consumed, as the symbols BDX and BDY of the expression
/// `tx + ty * BDX + tz * BDX * BDY`, where tx/ty/tz are ThreadIdOp results
/// created at `loc`. The folded result is converted to a Value through
/// getValueOrCreateConstantIndexOp.
static Value buildLinearThreadId(RewriterBase &rewriter, Location loc,
                                 ArrayRef<OpFoldResult> blockDimsOfr) {
  LLVM_DEBUG(llvm::interleaveComma(
                 blockDimsOfr,
                 DBGS() << "----buildLinearThreadId with blockDimsOfr: ");
             llvm::dbgs() << "\n");
  assert(blockDimsOfr.size() == 3 && "expected 3 workgroup sizes");

  // Dims are the three thread ids, symbols are the x and y workgroup sizes.
  MLIRContext *ctx = rewriter.getContext();
  AffineExpr tx, ty, tz, BDX, BDY;
  bindDims(ctx, tx, ty, tz);
  bindSymbols(ctx, BDX, BDY);
  AffineExpr linearIdExpr = tx + ty * BDX + tz * BDX * BDY;

  // Operand order must match the binding order above: dims first, then symbols.
  IndexType idxTy = rewriter.getIndexType();
  Value tidX = rewriter.create<ThreadIdOp>(loc, idxTy, Dimension::x).getResult();
  Value tidY = rewriter.create<ThreadIdOp>(loc, idxTy, Dimension::y).getResult();
  Value tidZ = rewriter.create<ThreadIdOp>(loc, idxTy, Dimension::z).getResult();
  SmallVector<OpFoldResult> operands{tidX, tidY, tidZ};
  operands.push_back(blockDimsOfr[0]);
  operands.push_back(blockDimsOfr[1]);

  OpFoldResult linearIdOfr = affine::makeComposedFoldedAffineApply(
      rewriter, loc, linearIdExpr, operands);
  return getValueOrCreateConstantIndexOp(rewriter, loc, linearIdOfr);
}

/// Builder for gpu::BlockIdOps used in mapping scf.forall to blocks.
/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
/// as 3-D sizes for predicate generation.
struct GpuBlockIdBuilder : public GpuIdBuilder {
  /// Forwards `blockDims` and `mappingSizes` to the GpuIdBuilder base, then
  /// installs the x/y/z block mapping attributes and the id-building callback.
  GpuBlockIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
                    ArrayRef<int64_t> mappingSizes)
      : GpuIdBuilder(blockDims, mappingSizes) {
    // Terminated with ';' (the original used ',', which chained this
    // assignment to the next one through the comma operator).
    mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
                         GPUBlockMappingAttr::get(ctx, Blocks::DimY),
                         GPUBlockMappingAttr::get(ctx, Blocks::DimZ)};
    idBuilder = [](RewriterBase &rewriter, Location loc,
                   ArrayRef<int64_t> forallMappingSizes) {
      IndexType indexType = rewriter.getIndexType();
      SmallVector<Value> ids{
          rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
          rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
          rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
      // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
      // predicate generation.
      return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
                             ids};
    };
  }
};

/// Builder for gpu::ThreadIdOp used when an scf.forall is mapped directly onto
/// thread ids, i.e. without any reindexing.
/// The installed `idBuilder` yields the x/y/z thread ids both for the indexing
/// rewrite and for predicate generation, together with a copy of the forall
/// mapping sizes.
struct GpuThreadIdBuilder : public GpuIdBuilder {
  GpuThreadIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
                     ArrayRef<int64_t> mappingSizes)
      : GpuIdBuilder(blockDims, mappingSizes) {
    mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
                         GPUThreadMappingAttr::get(ctx, Threads::DimY),
                         GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
    idBuilder = [](RewriterBase &rewriter, Location loc,
                   ArrayRef<int64_t> forallMappingSizes) {
      IndexType idxTy = rewriter.getIndexType();
      // Ops are created in x, y, z order.
      Value tidX = rewriter.create<ThreadIdOp>(loc, idxTy, Dimension::x);
      Value tidY = rewriter.create<ThreadIdOp>(loc, idxTy, Dimension::y);
      Value tidZ = rewriter.create<ThreadIdOp>(loc, idxTy, Dimension::z);
      SmallVector<Value> threadIds{tidX, tidY, tidZ};
      // The same 3-D ids serve the indexing rewrite and the predicates; the
      // sizes are passed through unchanged.
      return IdBuilderResult{threadIds,
                             SmallVector<int64_t>{forallMappingSizes},
                             threadIds};
    };
  }
};

/// Builder for warp ids used in mapping scf.forall to warps.
/// This builder requires a specification of the number of warps along each
/// dimension to control mapping to warps, as well as predication, more finely
/// than by solely analyzing the IR.
/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
/// as 3-D sizes for predicate generation.
struct GpuWarpIdBuilder : public GpuIdBuilder {
  GpuWarpIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
                   ArrayRef<int64_t> mappingSizes)
      : GpuIdBuilder(blockDims, mappingSizes) {
    mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
                         GPUWarpMappingAttr::get(ctx, Warps::DimY),
                         GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
    idBuilder = [this](RewriterBase &rewriter, Location loc,
                       ArrayRef<int64_t> forallMappingSizes) {
      // Linear thread id -> linear warp id (floordiv by the warp size).
      Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
      AffineExpr d0;
      bindDims(rewriter.getContext(), d0);
      OpFoldResult warpIdOfr = affine::makeComposedFoldedAffineApply(
          rewriter, loc, d0.floorDiv(kWarpSize), {linearId});
      Value warpId = getValueOrCreateConstantIndexOp(rewriter, loc, warpIdOfr);

      // Delinearize the warp id in the basis of `availableMappingSizes`.
      // Sizes go from [x, y, z] to [z, y, x] order so that strides are
      // computed in "row-major" order.
      SmallVector<int64_t> basisZYX(llvm::reverse(this->availableMappingSizes));
      SmallVector<int64_t> basisStrides = computeStrides(basisZYX);
      SmallVector<AffineExpr> exprsZYX = delinearize(d0, basisStrides);
      // Walk the expressions backwards to get the ids in [x, y, z] order.
      SmallVector<Value> warpIds;
      for (AffineExpr expr : llvm::reverse(exprsZYX)) {
        Value id =
            affine::makeComposedAffineApply(rewriter, loc, expr, {warpId});
        warpIds.push_back(id);
      }

      // clang-format off
      LDBG("----linearId: " << linearId);
      LDBG("----warpId: " << warpId);
      LLVM_DEBUG(llvm::interleaveComma(basisZYX,
                                       DBGS() << "--delinearization basis: ");
                 llvm::dbgs() << "\n";
                 llvm::interleaveComma(basisStrides,
                                       DBGS() << "--delinearization strides: ");
                 llvm::dbgs() << "\n";
                 llvm::interleaveComma(exprsZYX,
                                       DBGS() << "--delinearization exprs: ");
                 llvm::dbgs() << "\n";
                 llvm::interleaveComma(warpIds, DBGS() << "--ids: ");
                 llvm::dbgs() << "\n";);
      // clang-format on

      // The 3-D warp ids serve both the indexing rewrite and the predicates;
      // the forall mapping sizes are passed through as the 3-D sizes.
      return IdBuilderResult{warpIds, SmallVector<int64_t>{forallMappingSizes},
                             warpIds};
    };
  }

  /// Static specification of the warp size.
  /// In the future this may be configured by the transformation.
  static constexpr int64_t kWarpSize = 32;
};

/// Builder for linear ids used in mapping scf.forall to reindexed threads.
/// The installed `idBuilder` returns 3-D values for the indexing rewrite, and
/// a single 1-D size plus the linear id for predicate generation.
struct GpuLinearIdBuilder : public GpuIdBuilder {
  GpuLinearIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
                     ArrayRef<int64_t> mappingSizes)
      : GpuIdBuilder(blockDims, mappingSizes) {
    mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX),
                         GPULinearIdMappingAttr::get(ctx, LinearId::DimY),
                         GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)};
    idBuilder = [this](RewriterBase &rewriter, Location loc,
                       ArrayRef<int64_t> forallMappingSizes) {
      // The linear thread id is delinearized in the basis of
      // `forallMappingSizes`.
      Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
      // Sizes go from [x, y, z] to [z, y, x] order so that strides are
      // computed in "row-major" order.
      SmallVector<int64_t> basisZYX(llvm::reverse(forallMappingSizes));
      SmallVector<int64_t> basisStrides = computeStrides(basisZYX);
      AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
      SmallVector<AffineExpr> exprsZYX = delinearize(d0, basisStrides);
      // Walk the expressions backwards to get the ids in [x, y, z] order.
      SmallVector<Value> threadIds;
      for (AffineExpr expr : llvm::reverse(exprsZYX)) {
        Value id =
            affine::makeComposedAffineApply(rewriter, loc, expr, {linearId});
        threadIds.push_back(id);
      }

      // clang-format off
      LLVM_DEBUG(llvm::interleaveComma(basisZYX,
                                       DBGS() << "--delinearization basis: ");
                 llvm::dbgs() << "\n";
                 llvm::interleaveComma(basisStrides,
                                       DBGS() << "--delinearization strides: ");
                 llvm::dbgs() << "\n";
                 llvm::interleaveComma(exprsZYX,
                                       DBGS() << "--delinearization exprs: ");
                 llvm::dbgs() << "\n";
                 llvm::interleaveComma(threadIds, DBGS() << "--ids: ");
                 llvm::dbgs() << "\n";);
      // clang-format on

      // The predicate compares the linear id against the total number of
      // iterations spanned by the forall, i.e. the product of its sizes.
      int64_t totalMappingSize = 1;
      for (int64_t extent : forallMappingSizes)
        totalMappingSize *= extent;

      // 3-D ids for the indexing rewrite; 1-D size and id for the predicate.
      return IdBuilderResult{threadIds,
                             SmallVector<int64_t>{totalMappingSize},
                             SmallVector<Value>{linearId}};
    };
  }
};

} // namespace

static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,		definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
Operation *target, const Twine &message) {		Operation *target, const Twine &message) {
if (transformOp.has_value())		if (transformOp.has_value())
return transformOp->emitDefiniteFailure() << message;		return transformOp->emitDefiniteFailure() << message;
return emitDefiniteFailure(target, message);		return emitDefiniteFailure(target, message);
}		}

/// Check if given mapping attributes are one of the desired attributes		/// Check if given mapping attributes are one of the desired attributes
static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,		checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp) {		scf::ForallOp forallOp) {
if (!forallOp.getMapping().has_value())		if (!forallOp.getMapping().has_value())
return definiteFailureHelper(transformOp, forallOp,		return definiteFailureHelper(transformOp, forallOp,
"mapping must be present");		"mapping must be present");

bool hasBlockMapping =		bool hasBlockMapping =
llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
return isa<GPUBlockMappingAttr>(attr);		return isa<GPUBlockMappingAttr>(attr);
});		});
bool hasThreadMapping =		bool hasWarpgroupMapping =
llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
return isa<GPUThreadMappingAttr>(attr);		return isa<GPUWarpgroupMappingAttr>(attr);
});		});
bool hasWarpMapping =		bool hasWarpMapping =
llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
return isa<GPUWarpMappingAttr>(attr);		return isa<GPUWarpMappingAttr>(attr);
});		});
bool hasLinearMapping =		bool hasThreadMapping =
llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
return isa<GPULinearIdMappingAttr>(attr);		return isa<GPUThreadMappingAttr>(attr);
});		});
int64_t countMappingTypes = 0;		int64_t countMappingTypes = 0;
countMappingTypes += hasBlockMapping ? 1 : 0;		countMappingTypes += hasBlockMapping ? 1 : 0;
countMappingTypes += hasThreadMapping ? 1 : 0;		countMappingTypes += hasWarpgroupMapping ? 1 : 0;
countMappingTypes += hasWarpMapping ? 1 : 0;		countMappingTypes += hasWarpMapping ? 1 : 0;
countMappingTypes += hasLinearMapping ? 1 : 0;		countMappingTypes += hasThreadMapping ? 1 : 0;
if (countMappingTypes > 1) {		if (countMappingTypes > 1) {
return definiteFailureHelper(		return definiteFailureHelper(
transformOp, forallOp,		transformOp, forallOp,
"cannot mix different mapping types, use nesting");		"cannot mix different mapping types, use nesting");
}		}

DenseSet<Attribute> seen;		DenseSet<Attribute> seen;
for (Attribute map : forallOp.getMapping()->getValue()) {		for (Attribute map : forallOp.getMapping()->getValue()) {
if (seen.contains(map)) {		if (seen.contains(map)) {
return definiteFailureHelper(		return definiteFailureHelper(
transformOp, forallOp,		transformOp, forallOp,
"duplicated attribute, cannot map different loops "		"duplicated attribute, cannot map different loops "
"to the same processor");		"to the same mapping id");
}		}
seen.insert(map);		seen.insert(map);
}		}

		auto isLinear = [](Attribute a) {
		return cast<DeviceMappingAttrInterface>(a).isLinearMapping();
		};
		if (llvm::any_of(forallOp.getMapping()->getValue(), isLinear) &&
		!llvm::all_of(forallOp.getMapping()->getValue(), isLinear)) {
		return definiteFailureHelper(
		transformOp, forallOp,
		"cannot mix linear and non-linear mapping modes");
		}

return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,		verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp) {		scf::ForallOp forallOp) {
// Check the types of the mapping attributes match.		// Check the types of the mapping attributes match.
DiagnosedSilenceableFailure typeRes =		DiagnosedSilenceableFailure typeRes =
checkMappingAttributeTypes(transformOp, forallOp);		checkMappingAttributeTypes(transformOp, forallOp);
if (!typeRes.succeeded())		if (!typeRes.succeeded())
return typeRes;		return typeRes;

// Perform other non-types verifications.		// Perform other non-types verifications.
if (!forallOp.isNormalized())		if (!forallOp.isNormalized())
return definiteFailureHelper(transformOp, forallOp,		return definiteFailureHelper(transformOp, forallOp,
"unsupported non-normalized loops");		"unsupported non-normalized loops");
if (forallOp.getNumResults() > 0)		if (forallOp.getNumResults() > 0)
return definiteFailureHelper(transformOp, forallOp,		return definiteFailureHelper(transformOp, forallOp,
"only bufferized scf.forall can be mapped");		"only bufferized scf.forall can be mapped");
if (forallOp.getRank() > 3)		bool useLinearMapping = cast<DeviceMappingAttrInterface>(
		forallOp.getMapping()->getValue().front())
		.isLinearMapping();
		// TODO: This would be more natural with support for Optional<EnumParameter>
		// in GPUDeviceMappingAttr.
		int64_t maxNumMappingsSupported =
		useLinearMapping ? (getMaxEnumValForMappingId() -
		static_cast<uint64_t>(MappingId::DimZ))
		: 3;
		if (forallOp.getRank() > maxNumMappingsSupported) {
return definiteFailureHelper(transformOp, forallOp,		return definiteFailureHelper(transformOp, forallOp,
"scf.forall with rank > 3 does not lower");		"scf.forall with rank > ")
		<< maxNumMappingsSupported
		<< " does not lower for the specified mapping attribute type";
		}
if (llvm::any_of(forallOp.getMixedUpperBound(), [&](OpFoldResult ofr) {		if (llvm::any_of(forallOp.getMixedUpperBound(), [&](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();		return !getConstantIntValue(ofr).has_value();
})) {		})) {
return definiteFailureHelper(transformOp, forallOp,		return definiteFailureHelper(transformOp, forallOp,
"unsupported dynamic sizes in forall op");		"unsupported dynamic sizes in forall op");
}		}
return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

/// Determines if the size of the kernel configuration is supported by the
/// GPU architecture being used. It presently makes use of CUDA limitations,
/// however that aspect may be enhanced for other GPUs.
///
/// Every dimension left as std::nullopt is treated as 1. Returns success when
/// each dimension and both the grid and block products are within the limits
/// below; otherwise emits a silenceable error on `transformOp` that lists the
/// offending configuration.
static DiagnosedSilenceableFailure checkGpuLimits(
    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {

  static constexpr int64_t maxTotalBlockdim = 1024;
  static constexpr int64_t maxBlockdimx = 1024;
  static constexpr int64_t maxBlockdimy = 1024;
  static constexpr int64_t maxBlockdimz = 64;
  static constexpr int64_t maxTotalGriddim = 2147483647;
  static constexpr int64_t maxGriddimx = 2147483647;
  static constexpr int64_t maxGriddimy = 65535;
  static constexpr int64_t maxGriddimz = 65535;

  const int64_t gx = gridDimX.value_or(1);
  const int64_t gy = gridDimY.value_or(1);
  const int64_t gz = gridDimZ.value_or(1);
  const int64_t bx = blockDimX.value_or(1);
  const int64_t by = blockDimY.value_or(1);
  const int64_t bz = blockDimZ.value_or(1);

  // Check each dimension on its own first. Thanks to short-circuiting, the
  // products below are then only formed from values already known not to
  // exceed their per-dimension maximum, so an oversized dimension cannot
  // overflow the int64_t multiplication before being rejected.
  const bool exceedsPerDimLimit = bx > maxBlockdimx || by > maxBlockdimy ||
                                  bz > maxBlockdimz || gx > maxGriddimx ||
                                  gy > maxGriddimy || gz > maxGriddimz;
  if (exceedsPerDimLimit || bx * by * bz > maxTotalBlockdim ||
      gx * gy * gz > maxTotalGriddim) {
    return transformOp.emitSilenceableError()
           << "Trying to launch a GPU kernel with grid_dims = (" << gx << ", "
           << gy << ", " << gz << ") block_dims = (" << bx << ", " << by
           << ", " << bz << "). It is larger than the limits.";
  }
  return DiagnosedSilenceableFailure::success();
}

/// Creates an empty-body gpu::LaunchOp using the provided kernel settings
/// and put a terminator within.
///
/// The configuration is first validated with checkGpuLimits; on failure that
/// diagnostic is returned and nothing is created. Otherwise each provided
/// dimension becomes an arith::ConstantIndexOp, every dimension left as
/// std::nullopt shares a single constant 1, and the new op is stored in
/// `launchOp`. The rewriter's insertion point is restored on return.
static DiagnosedSilenceableFailure
createGpuLaunch(RewriterBase &rewriter, Location loc,
                TransformOpInterface transformOp, LaunchOp &launchOp,
                std::optional<int64_t> gridDimX = std::nullopt,
                std::optional<int64_t> gridDimY = std::nullopt,
                std::optional<int64_t> gridDimZ = std::nullopt,
                std::optional<int64_t> blockDimX = std::nullopt,
                std::optional<int64_t> blockDimY = std::nullopt,
                std::optional<int64_t> blockDimZ = std::nullopt) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  // Takes int64_t (not int) so that the 64-bit dimensions are never narrowed.
  auto createConst = [&](int64_t dim) {
    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
  };
  OpBuilder::InsertionGuard guard(rewriter);
  Value one = createConst(1);
  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
                                       blkSizeX, blkSizeY, blkSizeZ);
  // Terminate the (otherwise empty) body of the freshly created launch.
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<TerminatorOp>(loc);
  return DiagnosedSilenceableFailure::success();
}

/// Alter kernel configuration of the given kernel.
///
/// The requested configuration is first validated with checkGpuLimits; on
/// failure that diagnostic is returned and `gpuLaunch` is left untouched.
/// Otherwise every dimension that has a value gets its launch operand
/// replaced by a new arith::ConstantIndexOp, created right after the
/// definition of the current x block size and carrying that value's location.
/// Dimensions left as std::nullopt keep their current operand.
static DiagnosedSilenceableFailure
alterGpuLaunch(RewriterBase &rewriter, LaunchOp gpuLaunch,
               TransformOpInterface transformOp,
               std::optional<int64_t> gridDimX = std::nullopt,
               std::optional<int64_t> gridDimY = std::nullopt,
               std::optional<int64_t> gridDimZ = std::nullopt,
               std::optional<int64_t> blockDimX = std::nullopt,
               std::optional<int64_t> blockDimY = std::nullopt,
               std::optional<int64_t> blockDimZ = std::nullopt) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
  // Takes int64_t (not int) so that the 64-bit dimensions are never narrowed.
  auto createConstValue = [&](int64_t dim) {
    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
                                                   dim);
  };

  if (gridDimX.has_value())
    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
  if (gridDimY.has_value())
    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
  if (gridDimZ.has_value())
    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
  if (blockDimX.has_value())
    gpuLaunch.getBlockSizeXMutable().assign(
        createConstValue(blockDimX.value()));
  if (blockDimY.has_value())
    gpuLaunch.getBlockSizeYMutable().assign(
        createConstValue(blockDimY.value()));
  if (blockDimZ.has_value())
    gpuLaunch.getBlockSizeZMutable().assign(
        createConstValue(blockDimZ.value()));
  return DiagnosedSilenceableFailure::success();
}

/// Struct to return the result of the rewrite of a forall operation.		/// Struct to return the result of the rewrite of a forall operation.
		springermUnsubmitted Done Reply Inline Actions double parentheses springerm: double parentheses
struct ForallRewriteResult {		struct ForallRewriteResult {
SmallVector<int64_t> mappingSizes;		SmallVector<int64_t> mappingSizes;
SmallVector<Value> mappingIds;		SmallVector<Value> mappingIds;
};		};

/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.		/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
template <typename OpTy, typename OperationOrBlock>		template <typename OpTy, typename OperationOrBlock>
static void		static void
replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,		replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
OperationOrBlock *parent, Value replacement,		OperationOrBlock *parent, Value replacement,
ArrayRef<int64_t> availableMappingSizes) {		ArrayRef<int64_t> availableMappingSizes) {
parent->walk([&](OpTy idOp) {		parent->walk([&](OpTy idOp) {
if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)		if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
rewriter.replaceAllUsesWith(idOp.getResult(), replacement);		rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
});		});
}		}

static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(		static DiagnosedSilenceableFailure
RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,		rewriteOneForallCommonImpl(RewriterBase &rewriter,
		std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp, ForallRewriteResult &result,		scf::ForallOp forallOp, ForallRewriteResult &result,
ArrayRef<int64_t> availableMappingSizes, const GpuIdBuilder &gpuIdBuilder) {		const GpuIdBuilder &gpuIdBuilder) {
LDBG("--start rewriteOneForallCommonImpl");		LDBG("--start rewriteOneForallCommonImpl");

// Step 0. GPU-specific verifications. There is no better place to anchor		// Step 0. GPU-specific verifications. There is no better place to anchor
// those right now: the ForallOp is target-independent and the transform		// those right now: the ForallOp is target-independent and the transform
// op does not apply to individual ForallOp.		// op does not apply to individual ForallOp.
DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);		DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);
if (!diag.succeeded())		if (!diag.succeeded())
return diag;		return diag;

// Step 1. Complete the mapping to a full mapping (with 1s) if necessary.		// Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
SmallVector<int64_t> tmpMappingSizes = llvm::to_vector(		SmallVector<int64_t> tmpMappingSizes = llvm::to_vector(
llvm::map_range(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) {		llvm::map_range(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) {
auto maybeStaticValue = getConstantIntValue(ofr);		auto maybeStaticValue = getConstantIntValue(ofr);
assert(maybeStaticValue && "expected static value");		assert(maybeStaticValue && "expected static value");
return maybeStaticValue.value();		return maybeStaticValue.value();
}));		}));
SmallVector<Attribute> forallMappingAttrs =		SetVector<Attribute> forallMappingAttrs;
llvm::to_vector(forallOp.getMapping()->getValue());		forallMappingAttrs.insert(forallOp.getMapping()->getValue().begin(),
		springermUnsubmitted Done Reply Inline Actions double parentheses springerm: double parentheses
		forallOp.getMapping()->getValue().end());
		auto comparator = [](Attribute a, Attribute b) -> bool {
		return cast<DeviceMappingAttrInterface>(a).getMappingId() <
		cast<DeviceMappingAttrInterface>(b).getMappingId();
		};
		// Compute the max mapping to avoid needlessly mapping all dimensions.
		DeviceMappingAttrInterface maxMapping =
		cast<DeviceMappingAttrInterface>(*std::max_element(
		forallMappingAttrs.begin(), forallMappingAttrs.end(), comparator));
		DeviceMappingAttrInterface maxLinearMapping;
		if (maxMapping.isLinearMapping())
		maxLinearMapping = maxMapping;
for (auto attr : gpuIdBuilder.mappingAttributes) {		for (auto attr : gpuIdBuilder.mappingAttributes) {
if (llvm::is_contained(forallMappingAttrs, attr))		// If attr overflows, just skip.
		if (maxLinearMapping && comparator(maxLinearMapping, attr))
		continue;
		// Try to insert. If element was already present, just continue.
		if (!forallMappingAttrs.insert(attr))
continue;		continue;
forallMappingAttrs.push_back(attr);		// Otherwise, we have a new insertion without a size -> use size 1.
tmpMappingSizes.push_back(1);		tmpMappingSizes.push_back(1);
}		}
LLVM_DEBUG(		LLVM_DEBUG(
llvm::interleaveComma(		llvm::interleaveComma(
tmpMappingSizes,		tmpMappingSizes,
DBGS() << "----tmpMappingSizes extracted from scf.forall op: ");		DBGS() << "----tmpMappingSizes extracted from scf.forall op: ");
llvm::dbgs() << "\n");		llvm::dbgs() << "\n");

// Step 2. sort the values by the corresponding DeviceMappingAttrInterface.		// Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
auto comparator = [&](Attribute a, Attribute b) -> bool {		SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey(
return cast<DeviceMappingAttrInterface>(a).getMappingId() <		forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
cast<DeviceMappingAttrInterface>(b).getMappingId();
};
SmallVector<int64_t> forallMappingSizes =
getValuesSortedByKey(forallMappingAttrs, tmpMappingSizes, comparator);
LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,		LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,
DBGS() << "----forallMappingSizes: ");		DBGS() << "----forallMappingSizes: ");
llvm::dbgs() << "\n"; llvm::interleaveComma(		llvm::dbgs() << "\n"; llvm::interleaveComma(
forallMappingAttrs, DBGS() << "----mappingAttrs: ");		forallMappingAttrs, DBGS() << "----forallMappingAttrs: ");
llvm::dbgs() << "\n");		llvm::dbgs() << "\n");

// Step 3. Generate the mappingIdOps using the provided generator.		// Step 3. Generate the mappingIdOps using the provided generator.
Location loc = forallOp.getLoc();		Location loc = forallOp.getLoc();
OpBuilder::InsertionGuard guard(rewriter);		OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(forallOp);		rewriter.setInsertionPoint(forallOp);
IdBuilderResult builderResult =		IdBuilderResult builderResult =
gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes);		gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes);

// Step 4. Map the induction variables to the mappingIdOps, this may involve a		// Step 4. Map the induction variables to the mappingIdOps, this may involve
// permutation.		// a permutation.
SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;		SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
IRMapping bvm;		IRMapping bvm;
for (auto [iv, dim] :		for (auto [iv, dim] : llvm::zip_equal(
llvm::zip_equal(forallOp.getInductionVars(),		forallOp.getInductionVars(),
ArrayRef<Attribute>{forallMappingAttrs}.take_front(		forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
forallOp.getInductionVars().size()))) {		auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
Value peIdOp = mappingIdOps[static_cast<int64_t>(		Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
cast<DeviceMappingAttrInterface>(dim).getMappingId())];
bvm.map(iv, peIdOp);		bvm.map(iv, peIdOp);
}		}

// Step 5. If the availableMappingSizes are already known, create conditionals		// Step 5. If the availableMappingSizes are already known, create
// to predicate the region. Otherwise, the current forall determines the		// conditionals to predicate the region. Otherwise, the current forall
// availableMappingSizes and no predication occurs.		// determines the availableMappingSizes and no predication occurs.
Value predicate;		Value predicate;
if (!availableMappingSizes.empty()) {		if (!gpuIdBuilder.availableMappingSizes.empty()) {
SmallVector<int64_t> predicateMappingSizes =		SmallVector<int64_t> predicateMappingSizes =
builderResult.predicateMappingSizes;		builderResult.predicateMappingSizes;
SmallVector<Value> predicateIdOps = builderResult.predicateIdOps;		SmallVector<Value> predicateIdOps = builderResult.predicateIdOps;
// clang-format off		// clang-format off
LLVM_DEBUG(		LLVM_DEBUG(
llvm::interleaveComma(		llvm::interleaveComma(
predicateMappingSizes, DBGS() << "----predicateMappingSizes: ");		predicateMappingSizes, DBGS() << "----predicateMappingSizes: ");
llvm::dbgs() << "\n";		llvm::dbgs() << "\n";
llvm::interleaveComma(		llvm::interleaveComma(
availableMappingSizes, DBGS() << "----availableMappingSizes: ");		gpuIdBuilder.availableMappingSizes, DBGS() << "----availableMappingSizes: ");
llvm::dbgs() << "\n";		llvm::dbgs() << "\n";
llvm::interleaveComma(predicateIdOps, DBGS() << "----predicateIdOps: ");		llvm::interleaveComma(predicateIdOps, DBGS() << "----predicateIdOps: ");
llvm::dbgs() << "\n");		llvm::dbgs() << "\n");
// clang-format on		// clang-format on
for (auto [id, mappingSize, availableMappingSize] : llvm::zip_equal(		for (auto [id, mappingSize, availableMappingSize] :
predicateIdOps, predicateMappingSizes, availableMappingSizes)) {		llvm::zip_equal(predicateIdOps, predicateMappingSizes,
		gpuIdBuilder.availableMappingSizes)) {
if (mappingSize > availableMappingSize) {		if (mappingSize > availableMappingSize) {
return definiteFailureHelper(		return definiteFailureHelper(
transformOp, forallOp,		transformOp, forallOp,
"Trying to map to fewer GPU threads than loop iterations but "		"Trying to map to fewer GPU threads than loop iterations but "
"overprovisioning is not yet supported. "		"overprovisioning is not yet supported. "
"Try additional tiling of the before mapping or map to more "		"Try additional tiling of the before mapping or map to more "
"threads.");		"threads.");
}		}
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
{		{
// Create an early zero index value for replacements and immediately reset		// Create an early zero index value for replacements and immediately reset
// the insertion point.		// the insertion point.
OpBuilder::InsertionGuard guard(rewriter);		OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointToStart(parentBlock);		rewriter.setInsertionPointToStart(parentBlock);
zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);		zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
}		}

SmallVector<int64_t> anyAvailableMappingSizes;
ForallRewriteResult rewriteResult;		ForallRewriteResult rewriteResult;
// Pass an empty anyAvailableMappingSizes.		DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
DiagnosedSilenceableFailure diag =		rewriter, transformOp, forallOp, rewriteResult, gpuIdBuilder);
rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult,
anyAvailableMappingSizes, gpuIdBuilder);

// Return if anything goes wrong, use silenceable failure as a match failure.		// Return if anything goes wrong, use silenceable failure as a match
		// failure.
if (!diag.succeeded())		if (!diag.succeeded())
return diag;		return diag;

// Set the gridDims that act as a return.		// Set the gridDims that act as a return.
gridDims = rewriteResult.mappingSizes;		gridDims = rewriteResult.mappingSizes;

// Replace ids of dimensions known to be 1 by 0 to simplify the IR.		// Replace ids of dimensions known to be 1 by 0 to simplify the IR.
// Here, the result of mapping determines the available mapping sizes.		// Here, the result of mapping determines the available mapping sizes.
replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,		replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
gridDims);		gridDims);
		qedawkinsUnsubmitted Done Reply Inline Actions nit: gridDims is now effectively just being used to check that there are <= 3 mappingSizes after `rewriteOneForallCommonImpl`. qedawkins: nit: gridDims is now effectively just being used to check that there are <= 3 mappingSizes…
		nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions Not sure I can relate this comment to code anymore, gridDims is either used if provided or filled if not. I can update in a followup if necessary. nicolasvasilache: Not sure I can relate this comment to code anymore, gridDims is either used if provided or…

return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

DiagnosedSilenceableFailure		DiagnosedSilenceableFailure
mlir::transform::gpu::findTopLevelForallOp(Operation *target,		mlir::transform::gpu::findTopLevelForallOp(Operation *target,
scf::ForallOp &topLevelForallOp,		scf::ForallOp &topLevelForallOp,
TransformOpInterface transformOp) {		TransformOpInterface transformOp) {
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
}		}

GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), {}, {});		GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), {}, {});
diag = mlir::transform::gpu::mapForallToBlocksImpl(		diag = mlir::transform::gpu::mapForallToBlocksImpl(
rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);		rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
if (!diag.succeeded())		if (!diag.succeeded())
return diag;		return diag;

// Set the GPU launch configuration for the grid dims late, this is subject to		// Set the GPU launch configuration for the grid dims late, this is
// IR inspection.		// subject to IR inspection.
diag = alterGpuLaunch(rewriter, gpuLaunch,		diag = alterGpuLaunch(rewriter, gpuLaunch,
cast<TransformOpInterface>(getOperation()), gridDims[0],		cast<TransformOpInterface>(getOperation()), gridDims[0],
gridDims[1], gridDims[2]);		gridDims[1], gridDims[2]);

results.push_back(gpuLaunch);		results.push_back(gpuLaunch);
return diag;		return diag;
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// MapNestedForallToThreads		// MapNestedForallToThreads
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

		static DiagnosedSilenceableFailure
		getIdBuilder(std::optional<TransformOpInterface> transformOp,
		scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
		ArrayRef<int64_t> warpDims, GpuIdBuilder *gpuIdBuilder) {
		MLIRContext *ctx = forallOp.getContext();
		SmallVector<OpFoldResult> availableMappingSizesOfr =
		getAsIndexOpFoldResult(ctx, availableMappingSizes);
		auto mappingAttr = cast<DeviceMappingAttrInterface>(
		forallOp.getMapping()->getValue().front());
		bool useLinearMapping = mappingAttr.isLinearMapping();
		if (isa<GPUBlockMappingAttr>(mappingAttr)) {
		*gpuIdBuilder = GpuBlockIdBuilder(ctx, availableMappingSizesOfr,
		availableMappingSizes, useLinearMapping);
		return DiagnosedSilenceableFailure::success();
		}
		if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
		if (warpDims.front() % GpuWarpgroupIdBuilder::kNumWarpsPerGroup != 0) {
		springermUnsubmitted Done Reply Inline Actions typo springerm: typo
		auto diag = emitDefiniteFailure(
		forallOp.getLoc(),
		springermUnsubmitted Done Reply Inline Actions typo springerm: typo
		Twine("number of warps must be a multiple of ") +
		std::to_string(GpuWarpgroupIdBuilder::kNumWarpsPerGroup));
		diag.attachNote(forallOp->getLoc()) << "when applied to this payload op";
		return diag;
		}
		SmallVector<int64_t> warpgroupDims(warpDims.begin(), warpDims.end());
		warpgroupDims.front() /= GpuWarpgroupIdBuilder::kNumWarpsPerGroup;
		*gpuIdBuilder = GpuWarpgroupIdBuilder(ctx, availableMappingSizesOfr,
		warpgroupDims, useLinearMapping);
		return DiagnosedSilenceableFailure::success();
		}
		if (isa<GPUWarpMappingAttr>(mappingAttr)) {
		*gpuIdBuilder = GpuWarpIdBuilder(ctx, availableMappingSizesOfr, warpDims,
		useLinearMapping);
		return DiagnosedSilenceableFailure::success();
		}
		if (isa<GPUThreadMappingAttr>(mappingAttr)) {
		*gpuIdBuilder = GpuThreadIdBuilder(ctx, availableMappingSizesOfr,
		availableMappingSizes, useLinearMapping);
		return DiagnosedSilenceableFailure::success();
		}
		llvm_unreachable("unknown mapping attribute");
		}

DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(		DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,		RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,		scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder) {		ArrayRef<int64_t> warpDims, bool syncAfterDistribute) {
// Ignore cases with different attributes than this builder supports.
for (Attribute map : forallOp.getMapping()->getValue()) {		GpuIdBuilder *gpuIdBuilder;
if (!llvm::is_contained(gpuIdBuilder.mappingAttributes, map)) {		{
LDBG("--skip " << map);		// Try to build the id builder, if it fails, return.
LLVM_DEBUG(llvm::interleaveComma(gpuIdBuilder.mappingAttributes,		DiagnosedSilenceableFailure diag = getIdBuilder(
DBGS() << "----not in: ");		transformOp, forallOp, availableMappingSizes, warpDims, gpuIdBuilder);
llvm::dbgs() << "\n";);		if (!diag.succeeded())
return emitSilenceableFailure(forallOp);		return diag;
}
}		}

Location loc = forallOp.getLoc();		Location loc = forallOp.getLoc();
OpBuilder::InsertionGuard g(rewriter);		OpBuilder::InsertionGuard g(rewriter);
// Insert after to allow for syncthreads after `forall` is erased.		// Insert after to allow for syncthreads after `forall` is erased.
rewriter.setInsertionPointAfter(forallOp);		rewriter.setInsertionPointAfter(forallOp);
ForallRewriteResult rewriteResult;		ForallRewriteResult rewriteResult;
DiagnosedSilenceableFailure diag =		DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult,		rewriter, transformOp, forallOp, rewriteResult, *gpuIdBuilder);
availableMappingSizes, gpuIdBuilder);

// Return if anything goes wrong, use silenceable failure as a match failure.
if (!diag.succeeded())		if (!diag.succeeded())
return diag;		return diag;

// Add a syncthreads if needed. TODO: warpsync		// Add a syncthreads if needed. TODO: warpsync
if (syncAfterDistribute)		if (syncAfterDistribute)
rewriter.create<BarrierOp>(loc);		rewriter.create<BarrierOp>(loc);

return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(		DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,		RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
Operation *target, ArrayRef<int64_t> blockDims, ArrayRef<int64_t> warpDims,		Operation *target, ArrayRef<int64_t> blockDims, ArrayRef<int64_t> warpDims,
bool syncAfterDistribute) {		bool syncAfterDistribute) {
LDBG("Start mapNestedForallToThreadsImpl");		LDBG("Start mapNestedForallToThreadsImpl");
MLIRContext *ctx = rewriter.getContext();		MLIRContext *ctx = rewriter.getContext();
SmallVector<OpFoldResult> blockDimsOfr =		SmallVector<OpFoldResult> blockDimsOfr =
getAsIndexOpFoldResult(ctx, blockDims);		getAsIndexOpFoldResult(ctx, blockDims);

if (blockDims.size() != 3)		if (blockDims.size() != 3)
return definiteFailureHelper(transformOp, target,		return definiteFailureHelper(transformOp, target,
"requires size-3 thread mapping");		"requires size-3 thread mapping");
if (!warpDims.empty()) {
if (warpDims.size() != 3)
return definiteFailureHelper(transformOp, target,
"requires empty or size-3 warp mapping");
}

// Create an early zero index value for replacements.		// Create an early zero index value for replacements.
Location loc = target->getLoc();		Location loc = target->getLoc();
Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);		Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();		DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {		WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
//===--------------------------------------------------------------------===//
// Mapping to warp ids.
//===--------------------------------------------------------------------===//
if (!warpDims.empty()) {
LLVM_DEBUG(
llvm::interleaveComma(
warpDims, DBGS() << "+mapNestedForallToThreadsImpl warpDims: ");
llvm::dbgs() << "\n");
LLVM_DEBUG(llvm::interleaveComma(
blockDimsOfr, DBGS() << "--warpDims with blockDimsOfr: ");
llvm::dbgs() << "\n");
GpuWarpIdBuilder gpuWarpIdBuilder(ctx, blockDimsOfr, warpDims);
diag = mlir::transform::gpu::mapOneForallToThreadsImpl(		diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
rewriter, transformOp, forallOp, warpDims, syncAfterDistribute,		rewriter, transformOp, forallOp, blockDims, warpDims,
gpuWarpIdBuilder);		syncAfterDistribute);
// Use silenceable failure to encode "failure to match" and pass
// through.
if (diag.isDefiniteFailure())		if (diag.isDefiniteFailure())
return WalkResult::interrupt();		return WalkResult::interrupt();
if (diag.succeeded())		if (diag.succeeded())
return WalkResult::skip();		return WalkResult::skip();
}

//===--------------------------------------------------------------------===//
// Mapping to linear ids.
//===--------------------------------------------------------------------===//
LDBG("+mapNestedForallToThreadsImpl linearDims");
LLVM_DEBUG(llvm::interleaveComma(
blockDimsOfr, DBGS() << "--linearDims with blockDimsOfr: ");
llvm::dbgs() << "\n");
int64_t numThreads = 1;
for (int64_t b : blockDims)
numThreads *= b;
GpuLinearIdBuilder gpuLinearIdBuilder(ctx, blockDimsOfr, numThreads);
diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
rewriter, transformOp, forallOp, numThreads, syncAfterDistribute,
gpuLinearIdBuilder);
// Use silenceable failure to encode "failure to match" and pass through.
if (diag.isDefiniteFailure())
return WalkResult::interrupt();
if (diag.succeeded())
return WalkResult::skip();

//===--------------------------------------------------------------------===//
// Mapping to block ids (happens last so we can replay ThreadIdOp).
//===--------------------------------------------------------------------===//
LLVM_DEBUG(
llvm::interleaveComma(
blockDimsOfr, DBGS() << "mapNestedForallToThreadsImpl blockDims: ");
llvm::dbgs() << "\n");
GpuThreadIdBuilder gpuThreadIdBuilder(ctx, blockDimsOfr, blockDims);
diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
rewriter, transformOp, forallOp, blockDims, syncAfterDistribute,
gpuThreadIdBuilder);
// Use silenceable failure to encode "failure to match" and pass through.
if (diag.isDefiniteFailure())
return WalkResult::interrupt();

return WalkResult::advance();		return WalkResult::advance();
});		});
if (walkResult.wasInterrupted())		if (walkResult.wasInterrupted())
return diag;		return diag;

// Replace ids of dimensions known to be 1 by 0 to simplify the IR.		// Replace ids of dimensions known to be 1 by 0 to simplify the IR.
// Here, the result of mapping determines the available mapping sizes.		// Here, the result of mapping determines the available mapping sizes.
replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,		replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
Show All 9 Lines	DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
auto transformOp = cast<TransformOpInterface>(getOperation());		auto transformOp = cast<TransformOpInterface>(getOperation());

// Basic high-level verifications.		// Basic high-level verifications.
if (!gpuLaunch)		if (!gpuLaunch)
return emitSilenceableError() << "Given target is not a gpu.launch";		return emitSilenceableError() << "Given target is not a gpu.launch";

// Mapping to block ids.		// Mapping to block ids.
SmallVector<int64_t> blockDims{getBlockDims()};		SmallVector<int64_t> blockDims{getBlockDims()};

DiagnosedSilenceableFailure diag =		DiagnosedSilenceableFailure diag =
checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,		checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
blockDims[0], blockDims[1], blockDims[2]);		blockDims[0], blockDims[1], blockDims[2]);
if (diag.isSilenceableFailure()) {		if (diag.isSilenceableFailure()) {
diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";		diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
return diag;		return diag;
}		}

▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

mlir/lib/Dialect/GPU/TransformOps/Utils.cpp

This file was added.

				//===- Utils.cpp - Utils for GPU transform ops ----------------------------===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//

				#include "mlir/Dialect/GPU/TransformOps/Utils.h"

				#include "mlir/Dialect/Affine/IR/AffineOps.h"
				#include "mlir/Dialect/Arith/IR/Arith.h"
				#include "mlir/Dialect/Func/IR/FuncOps.h"
				#include "mlir/Dialect/GPU/IR/GPUDialect.h"
				#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
				#include "mlir/Dialect/MemRef/IR/MemRef.h"
				#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
				#include "mlir/Dialect/SCF/IR/SCF.h"
				#include "mlir/Dialect/Transform/IR/TransformDialect.h"
				#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
				#include "mlir/Dialect/Utils/IndexingUtils.h"
				#include "mlir/Dialect/Vector/IR/VectorOps.h"
				#include "mlir/IR/AffineExpr.h"
				#include "mlir/IR/Builders.h"
				#include "mlir/IR/BuiltinAttributes.h"
				#include "mlir/IR/IRMapping.h"
				#include "mlir/IR/MLIRContext.h"
				#include "mlir/IR/OpDefinition.h"
				#include "mlir/IR/Value.h"
				#include "mlir/IR/Visitors.h"
				#include "mlir/Support/LLVM.h"
				#include "llvm/ADT/STLExtras.h"
				#include "llvm/ADT/SmallVector.h"
				#include "llvm/ADT/TypeSwitch.h"
				#include "llvm/Support/Debug.h"

				using namespace mlir;
				using namespace mlir::gpu;
				using namespace mlir::transform;
				using namespace mlir::transform::gpu;

				#define DEBUG_TYPE "gpu-transforms"

				#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
				#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
				#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")

				/// Build the flattened id `x + y * B0 + z * B0 * B1`, where x/y/z are freshly
				/// created `ThreadOrBlockIdOp`s and B0/B1 are the first two entries of
				/// `originalBasisOfr`. The result is returned as a `Value`.
				template <typename ThreadOrBlockIdOp>
				static Value buildLinearThreadId(RewriterBase &rewriter, Location loc,
				                                 ArrayRef<OpFoldResult> originalBasisOfr) {
				  LLVM_DEBUG(llvm::interleaveComma(
				                 originalBasisOfr,
				                 DBGS() << "----buildLinearThreadId with originalBasisOfr: ");
				             llvm::dbgs() << "\n");
				  assert(originalBasisOfr.size() == 3 && "expected 3 workgroup sizes");

				  // Affine expression: three dims for the ids, two symbols for the sizes.
				  MLIRContext *context = rewriter.getContext();
				  AffineExpr idX, idY, idZ, sizeX, sizeY;
				  bindDims(context, idX, idY, idZ);
				  bindSymbols(context, sizeX, sizeY);
				  AffineExpr flattened = idX + idY * sizeX + idZ * sizeX * sizeY;

				  // Operands follow the dims-then-symbols order expected by the expression:
				  // the x, y, z id ops first, then the two leading basis sizes.
				  IndexType indexTy = rewriter.getIndexType();
				  SmallVector<OpFoldResult> operands;
				  for (Dimension dim : {Dimension::x, Dimension::y, Dimension::z}) {
				    operands.push_back(
				        rewriter.create<ThreadOrBlockIdOp>(loc, indexTy, dim).getResult());
				  }
				  operands.push_back(originalBasisOfr[0]);
				  operands.push_back(originalBasisOfr[1]);

				  OpFoldResult linearized = affine::makeComposedFoldedAffineApply(
				      rewriter, loc, flattened, operands);
				  return getValueOrCreateConstantIndexOp(rewriter, loc, linearized);
				}

				/// Create a linear id builder that takes the `originalBasisOfr` and decompose
				/// it in the basis of `forallMappingSizes`. The linear id builder returns an
				/// n-D vector of ids for indexing and 1-D size + id for predicate generation.
				///
				/// The flattened id is first floor-divided by `multiplicity`, then
				/// delinearized along `forallMappingSizes`.
				///
				/// The returned callback owns a copy of `originalBasisOfr`: `ArrayRef` is a
				/// non-owning view, and the callback is stored in id builder objects that are
				/// copied/assigned around, so it must not point into the caller's storage.
				template <typename ThreadOrBlockIdOp>
				static GpuIdBuilderFnType
				commonLinearIdBuilderFn(ArrayRef<OpFoldResult> originalBasisOfr,
				                        int64_t multiplicity = 1) {
				  auto res = [basis = SmallVector<OpFoldResult>(originalBasisOfr.begin(),
				                                                originalBasisOfr.end()),
				              multiplicity](RewriterBase &rewriter, Location loc,
				                            ArrayRef<int64_t> forallMappingSizes) {
				    OpFoldResult linearId =
				        buildLinearThreadId<ThreadOrBlockIdOp>(rewriter, loc, basis);
				    // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
				    // "row-major" order.
				    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
				    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
				    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
				    // Scale the flattened id down by `multiplicity` before delinearizing.
				    linearId = affine::makeComposedFoldedAffineApply(
				        rewriter, loc, d0.floorDiv(multiplicity), {linearId});
				    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
				    SmallVector<Value> ids;
				    // Reverse back to be in [0 .. n] order.
				    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
				      ids.push_back(
				          affine::makeComposedAffineApply(rewriter, loc, e, {linearId}));
				    }

				    // clang-format off
				    LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
				                                     DBGS() << "--delinearization basis: ");
				               llvm::dbgs() << "\n";
				               llvm::interleaveComma(strides,
				                                     DBGS() << "--delinearization strides: ");
				               llvm::dbgs() << "\n";
				               llvm::interleaveComma(delinearizingExprs,
				                                     DBGS() << "--delinearization exprs: ");
				               llvm::dbgs() << "\n";
				               llvm::interleaveComma(ids, DBGS() << "--ids: ");
				               llvm::dbgs() << "\n";);
				    // clang-format on

				    // Return n-D ids for indexing and 1-D size + id for predicate generation.
				    return IdBuilderResult{
				        ids, SmallVector<int64_t>{computeProduct(forallMappingSizes)},
				        SmallVector<Value>{linearId.get<Value>()}};
				  };

				  return res;
				}

				/// Create a simple 3-D id builder. The builder creates one `ThreadOrBlockIdOp`
				/// per dimension (x, y, z), floor-divides the x id by `multiplicity`, and
				/// returns the 3 ids for indexing together with `forallMappingSizes` and the
				/// same 3 ids for predicate generation.
				template <typename ThreadOrBlockIdOp>
				static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
				  return [multiplicity](RewriterBase &rewriter, Location loc,
				                        ArrayRef<int64_t> forallMappingSizes) {
				    // One id op per dimension, created in x, y, z order.
				    IndexType indexTy = rewriter.getIndexType();
				    SmallVector<Value> ids;
				    for (Dimension dim : {Dimension::x, Dimension::y, Dimension::z})
				      ids.push_back(rewriter.create<ThreadOrBlockIdOp>(loc, indexTy, dim));

				    // Only the x id is scaled down by `multiplicity`.
				    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
				    OpFoldResult scaledX = affine::makeComposedFoldedAffineApply(
				        rewriter, loc, d0.floorDiv(multiplicity), {ids[0]});
				    ids[0] = scaledX.get<Value>();

				    SmallVector<int64_t> predicateSizes(forallMappingSizes.begin(),
				                                        forallMappingSizes.end());
				    return IdBuilderResult{ids, predicateSizes, ids};
				  };
				}

				namespace mlir {
				namespace transform {
				namespace gpu {

				/// Common constructor for all id builders.
				///
				/// With `useLinearMapping`, a single available mapping size (the product of
				/// `mappingSizes`) is recorded and the mapping attributes span
				/// `MappingId::LinearDim0` up to the largest `MappingId` value. Otherwise the
				/// `mappingSizes` are recorded as-is and the mapping attributes span
				/// `MappingId::DimX` .. `MappingId::DimZ`. `fn` turns a `MappingId` into the
				/// concrete mapping attribute. `idBuilder` is left for subclasses to set.
				GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx,
				                           ArrayRef<OpFoldResult> originalBasisOfr,
				                           ArrayRef<int64_t> mappingSizes,
				                           bool useLinearMapping, MappingIdBuilderFnType fn)
				    : originalBasisOfr(originalBasisOfr), availableMappingSizes(),
				      mappingAttributes(), idBuilder() {
				  // Select the sizes to record and the inclusive range of mapping ids.
				  uint64_t firstId = 0;
				  uint64_t lastId = 0;
				  if (useLinearMapping) {
				    availableMappingSizes.push_back(computeProduct(mappingSizes));
				    firstId = static_cast<uint64_t>(MappingId::LinearDim0);
				    lastId = getMaxEnumValForMappingId();
				  } else {
				    llvm::append_range(availableMappingSizes, mappingSizes);
				    firstId = static_cast<uint64_t>(MappingId::DimX);
				    lastId = static_cast<uint64_t>(MappingId::DimZ);
				  }

				  // Materialize one mapping attribute per id in [firstId, lastId].
				  for (uint64_t id = firstId; id <= lastId; ++id)
				    mappingAttributes.push_back(fn(ctx, symbolizeMappingId(id).value()));
				}

				/// Id builder producing `GPUBlockMappingAttr` attributes and `BlockIdOp` ids,
				/// either linearized or as plain 3-D ids depending on `useLinearMapping`.
				GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx,
				                                     ArrayRef<OpFoldResult> originalBasisOfr,
				                                     ArrayRef<int64_t> mappingSizes,
				                                     bool useLinearMapping)
				    : GpuIdBuilder(ctx, originalBasisOfr, mappingSizes, useLinearMapping,
				                   [](MLIRContext *context, MappingId mappingId) {
				                     return GPUBlockMappingAttr::get(context, mappingId);
				                   }) {
				  // `this->` selects the member set by the base class, not the parameter.
				  if (useLinearMapping)
				    idBuilder = commonLinearIdBuilderFn<BlockIdOp>(this->originalBasisOfr);
				  else
				    idBuilder = common3DIdBuilderFn<BlockIdOp>();
				}

				/// Id builder producing `GPUWarpgroupMappingAttr` attributes. Ids are derived
				/// from `ThreadIdOp`s scaled down by `kNumWarpsPerGroup * kWarpSize`, either
				/// linearized or as plain 3-D ids depending on `useLinearMapping`.
				GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(
				    MLIRContext *ctx, ArrayRef<OpFoldResult> originalBasisOfr,
				    ArrayRef<int64_t> mappingSizes, bool useLinearMapping)
				    : GpuIdBuilder(ctx, originalBasisOfr, mappingSizes, useLinearMapping,
				                   [](MLIRContext *ctx, MappingId id) {
				                     return GPUWarpgroupMappingAttr::get(ctx, id);
				                   }) {
				  // `this->` selects the member set by the base class, not the parameter.
				  idBuilder = useLinearMapping
				                  ? commonLinearIdBuilderFn<ThreadIdOp>(
				                        this->originalBasisOfr,
				                        /*multiplicity=*/kNumWarpsPerGroup * kWarpSize)
				                  : common3DIdBuilderFn<ThreadIdOp>(
				                        /*multiplicity=*/kNumWarpsPerGroup * kWarpSize);
				}

				/// Id builder producing `GPUWarpMappingAttr` attributes. Ids are derived from
				/// `ThreadIdOp`s scaled down by `kWarpSize`, either linearized or as plain 3-D
				/// ids depending on `useLinearMapping`.
				GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx,
				                                   ArrayRef<OpFoldResult> originalBasisOfr,
				                                   ArrayRef<int64_t> mappingSizes,
				                                   bool useLinearMapping)
				    : GpuIdBuilder(ctx, originalBasisOfr, mappingSizes, useLinearMapping,
				                   [](MLIRContext *ctx, MappingId id) {
				                     return GPUWarpMappingAttr::get(ctx, id);
				                   }) {
				  // `this->` selects the member set by the base class, not the parameter.
				  idBuilder =
				      useLinearMapping
				          ? commonLinearIdBuilderFn<ThreadIdOp>(this->originalBasisOfr,
				                                                /*multiplicity=*/kWarpSize)
				          : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/kWarpSize);
				}

				/// Id builder producing `GPUThreadMappingAttr` attributes and `ThreadIdOp`
				/// ids, either linearized or as plain 3-D ids depending on `useLinearMapping`.
				GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx,
				                                       ArrayRef<OpFoldResult> originalBasisOfr,
				                                       ArrayRef<int64_t> mappingSizes,
				                                       bool useLinearMapping)
				    : GpuIdBuilder(ctx, originalBasisOfr, mappingSizes, useLinearMapping,
				                   [](MLIRContext *context, MappingId mappingId) {
				                     return GPUThreadMappingAttr::get(context, mappingId);
				                   }) {
				  // Default to the 3-D builder; switch to the linear one when requested.
				  // `this->` selects the member set by the base class, not the parameter.
				  if (!useLinearMapping) {
				    idBuilder = common3DIdBuilderFn<ThreadIdOp>();
				    return;
				  }
				  idBuilder = commonLinearIdBuilderFn<ThreadIdOp>(this->originalBasisOfr);
				}

				/// Checks a requested launch configuration against hard-coded limits.
				///
				/// Every dimension that is not provided is treated as 1. Returns a
				/// silenceable error emitted on `transformOp` when any single grid/block
				/// dimension, or the product of the three grid (resp. block) dimensions,
				/// exceeds its limit; returns success otherwise.
				DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
				                                           std::optional<int64_t> gridDimX,
				                                           std::optional<int64_t> gridDimY,
				                                           std::optional<int64_t> gridDimZ,
				                                           std::optional<int64_t> blockDimX,
				                                           std::optional<int64_t> blockDimY,
				                                           std::optional<int64_t> blockDimZ) {

				  // TODO: pass a configuration object to set the limits properly.
				  static constexpr int maxTotalBlockdim = 1024;
				  static constexpr int maxBlockdimx = 1024;
				  static constexpr int maxBlockdimy = 1024;
				  static constexpr int maxBlockdimz = 64;
				  static constexpr int maxTotalGriddim = 2147483647;
				  static constexpr int maxGriddimx = 2147483647;
				  static constexpr int maxGriddimy = 65535;
				  static constexpr int maxGriddimz = 65535;

				  // Missing dimensions default to 1.
				  const int64_t gridX = gridDimX.value_or(1);
				  const int64_t gridY = gridDimY.value_or(1);
				  const int64_t gridZ = gridDimZ.value_or(1);
				  const int64_t blockX = blockDimX.value_or(1);
				  const int64_t blockY = blockDimY.value_or(1);
				  const int64_t blockZ = blockDimZ.value_or(1);

				  // Per-dimension bounds are tested first: thanks to short-circuiting, the
				  // products below are only evaluated once every factor is known to be
				  // within its limit, so overly large inputs cannot overflow int64_t.
				  const bool exceedsLimits =
				      blockX > maxBlockdimx || blockY > maxBlockdimy ||
				      blockZ > maxBlockdimz || gridX > maxGriddimx || gridY > maxGriddimy ||
				      gridZ > maxGriddimz ||
				      (blockX * blockY * blockZ) > maxTotalBlockdim ||
				      (gridX * gridY * gridZ) > maxTotalGriddim;
				  if (exceedsLimits) {
				    return transformOp.emitSilenceableError()
				           << "Trying to launch a GPU kernel with grid_dims = (" << gridX
				           << ", " << gridY << ", " << gridZ << ") block_dims = (" << blockX
				           << ", " << blockY << ", " << blockZ
				           << "). It is larger than the limits.";
				  }
				  return DiagnosedSilenceableFailure::success();
				}

				/// Creates an empty gpu.launch op at the rewriter's current insertion point.
				///
				/// The configuration is first validated with checkGpuLimits; on failure its
				/// diagnostic is returned and nothing is created. Otherwise one
				/// arith.constant index op is created per provided dimension (a shared
				/// constant 1 stands in for missing ones), the launch op is created and
				/// stored in `launchOp`, and a terminator is appended to its body. The
				/// rewriter's insertion point is restored on return.
				DiagnosedSilenceableFailure createGpuLaunch(
				    RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
				    LaunchOp &launchOp, std::optional<int64_t> gridDimX,
				    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
				    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
				    std::optional<int64_t> blockDimZ) {
				  DiagnosedSilenceableFailure diag =
				      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
				                     blockDimY, blockDimZ);
				  if (!diag.succeeded())
				    return diag;

				  // Takes int64_t so the optional<int64_t> dimensions are not implicitly
				  // narrowed on the way to the constant op.
				  auto createConst = [&](int64_t dim) {
				    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
				  };
				  OpBuilder::InsertionGuard guard(rewriter);
				  Value one = createConst(1);
				  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
				  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
				  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
				  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
				  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
				  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
				  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
				                                       blkSizeX, blkSizeY, blkSizeZ);
				  // Terminate the freshly created launch body.
				  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
				  rewriter.create<TerminatorOp>(loc);
				  return DiagnosedSilenceableFailure::success();
				}

				/// Alter kernel configuration of the given kernel.
				/// Overwrites the grid/block size operands of `gpuLaunch`.
				///
				/// The configuration is first validated with checkGpuLimits; on failure its
				/// diagnostic is returned and the op is left untouched. For every dimension
				/// that is provided, a new arith.constant index op is created right after
				/// the definition of the current block-size-x operand (reusing that value's
				/// location) and assigned to the corresponding operand. Dimensions that are
				/// not provided keep their current operand. The rewriter's insertion point
				/// is restored on return.
				DiagnosedSilenceableFailure alterGpuLaunch(
				    RewriterBase &rewriter, LaunchOp gpuLaunch,
				    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
				    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
				    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
				    std::optional<int64_t> blockDimZ) {
				  DiagnosedSilenceableFailure diag =
				      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
				                     blockDimY, blockDimZ);
				  if (!diag.succeeded())
				    return diag;

				  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
				  OpBuilder::InsertionGuard guard(rewriter);
				  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
				  // Takes int64_t so the optional<int64_t> dimensions are not implicitly
				  // narrowed on the way to the constant op.
				  auto createConstValue = [&](int64_t dim) {
				    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
				                                                   dim);
				  };

				  if (gridDimX.has_value())
				    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
				  if (gridDimY.has_value())
				    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
				  if (gridDimZ.has_value())
				    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
				  if (blockDimX.has_value())
				    gpuLaunch.getBlockSizeXMutable().assign(
				        createConstValue(blockDimX.value()));
				  if (blockDimY.has_value())
				    gpuLaunch.getBlockSizeYMutable().assign(
				        createConstValue(blockDimY.value()));
				  if (blockDimZ.has_value())
				    gpuLaunch.getBlockSizeZMutable().assign(
				        createConstValue(blockDimZ.value()));
				  return DiagnosedSilenceableFailure::success();
				}

				} // namespace gpu
				} // namespace transform
				} // namespace mlir

mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp

Show All 18 Lines
#include <numeric>		#include <numeric>

using namespace mlir;		using namespace mlir;

#define DEBUG_TYPE "linalg-transforms"		#define DEBUG_TYPE "linalg-transforms"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")		#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")		#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

static Attribute linearIdX(MLIRContext *ctx) {		static Attribute linearId0(MLIRContext *ctx) {
return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimX);		return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0);
}		}
static Attribute linearIdY(MLIRContext *ctx) {		static Attribute linearId1(MLIRContext *ctx) {
return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimY);		return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1);
}		}
static Attribute linearIdZ(MLIRContext *ctx) {		static Attribute linearId2(MLIRContext *ctx) {
return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimZ);		return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2);
}		}

transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,		transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
int totalNumThreads,		int totalNumThreads,
int64_t desiredBitAlignment,		int64_t desiredBitAlignment,
ArrayRef<int64_t> copySizes,		ArrayRef<int64_t> copySizes,
bool favorPredication,		bool favorPredication,
int64_t elementalBitwidth) {		int64_t elementalBitwidth) {
Show All 30 Lines	transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,

// Compute the smallest bounding box.		// Compute the smallest bounding box.
this->smallestBoundingTileSizes = llvm::to_vector(		this->smallestBoundingTileSizes = llvm::to_vector(
llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) {		llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) {
int64_t size, numThreads;		int64_t size, numThreads;
std::tie(size, numThreads) = pair;		std::tie(size, numThreads) = pair;
return mlir::ceilDiv(size, numThreads);		return mlir::ceilDiv(size, numThreads);
}));		}));
SmallVector<Attribute> allThreadMappings{linearIdZ(ctx), linearIdY(ctx),		SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),
linearIdX(ctx)};		linearId0(ctx)};

// Set the thread mapping.		// Set the thread mapping.
this->threadMapping =		this->threadMapping =
llvm::to_vector(ArrayRef(allThreadMappings)		llvm::to_vector(ArrayRef(allThreadMappings)
.take_back(this->smallestBoundingTileSizes.size()));		.take_back(this->smallestBoundingTileSizes.size()));
LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");		LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");
}		}

▲ Show 20 Lines • Show All 177 Lines • Show Last 20 Lines

mlir/test/Dialect/GPU/transform-gpu.mlir

Show First 20 Lines • Show All 245 Lines • ▼ Show 20 Lines
// CHECK-LABEL: func.func @map_multi_level(		// CHECK-LABEL: func.func @map_multi_level(
func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {		func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
%one = arith.constant 1 : index		%one = arith.constant 1 : index
%c10 = arith.constant 10 : index		%c10 = arith.constant 10 : index
%c9 = arith.constant 9 : index		%c9 = arith.constant 9 : index
%c7 = arith.constant 7 : index		%c7 = arith.constant 7 : index
%c1 = arith.constant 1 : index		%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index		%c2 = arith.constant 2 : index
		%c3 = arith.constant 3 : index

// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index		// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C11:.*]] = arith.constant 11 : index		// CHECK-DAG: %[[C11:.*]] = arith.constant 11 : index
// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index		// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
// CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index		// CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index

// check that both the thread level and the warp level got distributed.		// check that both the thread level and the warp level got distributed.
// CHECK-NOT: #gpu.thread		// CHECK-NOT: #gpu.thread
Show All 11 Lines	%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
} { mapping = [#gpu.thread<y>, #gpu.thread<x>]}		} { mapping = [#gpu.thread<y>, #gpu.thread<x>]}

// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]])		// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]])
// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]])		// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]])
// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[WIDX]], %[[C1]] : index		// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[WIDX]], %[[C1]] : index
// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[WIDY]], %[[C1]] : index		// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[WIDY]], %[[C1]] : index
// CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1		// CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
// CHECK: scf.if %[[COND]]		// CHECK: scf.if %[[COND]]
scf.forall (%i) in (%c1) {		scf.forall (%i, %j) in (%c2, %c1) {
%7 = memref.load %t[%i] : !type1d		%7 = memref.load %x[%i, %j] : !type
%8 = arith.addf %alpha, %7 : f32		%8 = arith.addf %alpha, %7 : f32
memref.store %8, %t[%i] : !type1d		memref.store %8, %y[%i, %j] : !type
} {mapping = [#gpu.warp<x>] }		} {mapping = [#gpu.warp<x>, #gpu.warp<y>] }

// CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]])		// CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]])
// CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]])		// CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]])
// CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]](%[[TIDX]], %[[TIDY]])		// CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]](%[[TIDX]], %[[TIDY]])
// CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index		// CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index
// CHECK: scf.if %[[COND]]		// CHECK: scf.if %[[COND]]
// CHECK: memref.load %{{.*}}[%[[LIDX]]] : memref<32xf32>		// CHECK: memref.load %{{.*}}[%[[LIDX]]] : memref<32xf32>
// CHECK: memref.store %{{.*}}[%[[LIDY]]] : memref<32xf32>		// CHECK: memref.store %{{.*}}[%[[LIDY]]] : memref<32xf32>
scf.forall (%i, %j) in (%c10, %c2) {		scf.forall (%i, %j) in (%c10, %c2) {
%7 = memref.load %t[%i] : !type1d		%7 = memref.load %t[%i] : !type1d
%8 = arith.addf %alpha, %7 : f32		%8 = arith.addf %alpha, %7 : f32
memref.store %8, %t[%j] : !type1d		memref.store %8, %t[%j] : !type1d
} {mapping = [#gpu.linear<x>, #gpu.linear<y>] }		} {mapping = [#gpu.thread<linear_dim_0>, #gpu.thread<linear_dim_1>] }
gpu.terminator		gpu.terminator
}		}
return %y : !type		return %y : !type
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb1(%arg0: !transform.any_op):		^bb1(%arg0: !transform.any_op):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op		%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
transform.gpu.map_nested_forall_to_threads %funcop		transform.gpu.map_nested_forall_to_threads %funcop
block_dims = [12, 11, 1] warp_dims = [3, 2, 1] : (!transform.any_op) -> !transform.any_op		block_dims = [12, 11, 1] warp_dims = [3, 2, 1] : (!transform.any_op) -> !transform.any_op
}		}

// -----

// CHECK-LABEL: func.func @tiling_buffer_semantic_op(
// CHECK: gpu.launch {{.*}} {
// CHECK: scf.forall {{.*}} {
// CHECK: memref.subview
// CHECK: memref.subview
// CHECK: linalg.generic
// CHECK: }
// CHECK: }
// Payload for the tiling test: an async gpu.launch whose grid and block sizes
// are all %one, wrapping a single linalg.generic on memref operands.
func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) {
%one = arith.constant 1 : index
%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
{
// Identity indexing maps on both operands and a body that yields %in: an
// element-wise copy of %x into %y over two parallel dimensions.
linalg.generic
{indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0, d1)>],
iterator_types = ["parallel", "parallel"]}
ins(%x : memref<32x32xf32>)
outs(%y : memref<32x32xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
gpu.terminator
}
return
}

// Transform script for the test above: matches the linalg.generic ops in the
// payload and tiles them to an scf.forall with num_threads [10, 20, 30],
// attaching the gpu.thread mapping y, x, z to the resulting loop.
// NOTE(review): three num_threads / mapping entries are supplied while the
// matched generic declares two parallel iterators -- confirm this is intended.
transform.sequence failures(propagate) {
^bb1(%arg0: !transform.any_op):
%matmul = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
%forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][transforms] Revamp the implementation of mapping loops to GPUs
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 542861

mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h

mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h

mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

mlir/lib/Dialect/GPU/TransformOps/Utils.cpp

mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp

mlir/test/Dialect/GPU/transform-gpu.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][transforms] Revamp the implementation of mapping loops to GPUs
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 542861

mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h

mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h

mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

mlir/lib/Dialect/GPU/TransformOps/Utils.cpp

mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp

mlir/test/Dialect/GPU/transform-gpu.mlir

[mlir][transforms] Revamp the implementation of mapping loops to GPUs
ClosedPublic