diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
@@ -20,107 +20,214 @@
 def DimX : I64EnumAttrCase<"DimX", 0, "x">;
 def DimY : I64EnumAttrCase<"DimY", 1, "y">;
 def DimZ : I64EnumAttrCase<"DimZ", 2, "z">;
-
-def ThreadsEnum : I64EnumAttr<"Threads", "threads for loop mapping", [
-    DimX, DimY, DimZ]> {
+def LinearDim0 : I64EnumAttrCase<"LinearDim0", 3, "linear_dim_0">;
+def LinearDim1 : I64EnumAttrCase<"LinearDim1", 4, "linear_dim_1">;
+def LinearDim2 : I64EnumAttrCase<"LinearDim2", 5, "linear_dim_2">;
+def LinearDim3 : I64EnumAttrCase<"LinearDim3", 6, "linear_dim_3">;
+def LinearDim4 : I64EnumAttrCase<"LinearDim4", 7, "linear_dim_4">;
+def LinearDim5 : I64EnumAttrCase<"LinearDim5", 8, "linear_dim_5">;
+def LinearDim6 : I64EnumAttrCase<"LinearDim6", 9, "linear_dim_6">;
+def LinearDim7 : I64EnumAttrCase<"LinearDim7", 10, "linear_dim_7">;
+def LinearDim8 : I64EnumAttrCase<"LinearDim8", 11, "linear_dim_8">;
+def LinearDim9 : I64EnumAttrCase<"LinearDim9", 12, "linear_dim_9">;
+
+// TODO: This would be better represented with separate Grid and Linear Mapping
+// ids. Unfortunately it is not yet possible to have an optional EnumParameter
+// so we currently embed the 2 modes in the same enum.
+def MappingIdEnum : I64EnumAttr<"MappingId", "Mapping ids for loop mapping", [
+    DimX, DimY, DimZ,
+    LinearDim0, LinearDim1, LinearDim2, LinearDim3, LinearDim4,
+    LinearDim5, LinearDim6, LinearDim7, LinearDim8, LinearDim9]> {
   let cppNamespace = "::mlir::gpu";
 }

-def GPUThreadMappingAttr
-    : GPU_Attr<"GPUThreadMapping", "thread", [
-      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
+def GPUBlockMappingAttr : GPU_Attr<"GPUBlockMapping", "block", [
+    DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
   let parameters = (ins
-    EnumParameter<ThreadsEnum>:$thread
+    EnumParameter<MappingIdEnum>:$block
   );
   let assemblyFormat = "`<` params `>`";
   let description = [{
-    An attribute that allows defining thread parallelism for GPU devices.
+    An attribute that allows defining thread block parallelism for GPU devices.

-    Thread (aka work item) are grouped into a thread blocks where block may be
-    described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates
-    that thread parallelism is desired. It can be consumed by lowering to
-    generate GPU.
-  }];
-}
+    Thread blocks (aka workgroups) are grouped into a grid described by a
+    3-dimensional rectangle.
+    This attribute indicates that thread block parallelism is desired.
+    It can be consumed by lowering to generate GPU code.
+    2 modes are supported: (1) 3D mapping mode and (2) linear mapping mode.

-def WarpsEnum : I64EnumAttr<"Warps", "threads for loop mapping", [
-    DimX, DimY, DimZ]> {
-  let cppNamespace = "::mlir::gpu";
+    #### 3D mapping mode
+
+    The 3D block id is simply the 3D index of the block `(bidx, bidy, bidz)`.
+    If required, predication occurs on a per-dimension basis. This allows
+    specifying predication on a 3D sub-rectangle of the grid.
+
+    #### Linear mapping mode
+
+    The linear block id is obtained by linearizing the index of the block.
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D subset of the (linearized) grid.
+
+    For instance, if the basis is denoted as (GX, GY, GZ) and the block id is
+    denoted by (bx, by, bz), the linear block id is:
+      `linear_id = bx + by * GX + bz * GX * GY`.
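The linearization above is easy to sanity check in a few lines of standalone C++ (the helper name and the sample grid shape below are illustrative, not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>

// Linearize a 3D block id (bx, by, bz) over a grid basis (GX, GY, GZ),
// x fastest-varying, as in the formula above. GZ only bounds bz and does
// not appear in the product.
int64_t linearBlockId(int64_t bx, int64_t by, int64_t bz, int64_t GX, int64_t GY) {
  return bx + by * GX + bz * GX * GY;
}

int main() {
  // Grid of 4x3x2 blocks: block (1, 2, 1) has linear id 1 + 2*4 + 1*4*3 = 21.
  std::printf("%lld\n", static_cast<long long>(linearBlockId(1, 2, 1, 4, 3)));
  return 0;
}
```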
+    The linear block id is fixed for the duration of a GPU kernel.
+
+    This linear id mapping attribute indicates a different linearization relation
+    is applied locally to a loop nest.
+
+    For instance, if the new basis is denoted as (LBD0, LBD1, LBD2, LBD3) the
+    block id in the new basis is:
+      ```(linear_id mod LBD0,
+         (linear_id / LBD0) mod LBD1,
+         (linear_id / (LBD0 * LBD1)) mod LBD2,
+         (linear_id / (LBD0 * LBD1 * LBD2)) mod LBD3)```.
+    This reinterpretation is only fixed for the duration of a loop nest.
+  }];
 }

-def GPUWarpMappingAttr : GPU_Attr<"GPUWarpMapping", "warp", [
-  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
+def GPUWarpgroupMappingAttr
+    : GPU_Attr<"GPUWarpgroupMapping", "warpgroup", [
+      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
   let parameters = (ins
-    EnumParameter<WarpsEnum>:$warp
+    EnumParameter<MappingIdEnum>:$warpgroup
   );
   let assemblyFormat = "`<` params `>`";
   let description = [{
-    An attribute that allows defining thread block parallelism for GPU devices.
+    An attribute that allows defining warpgroup parallelism for GPU devices.

-    Warp (aka subgroup) are grouped into a grid where grid may be
-    described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates
-    that thread block parallelism is desired. It can be consumed by lowering to
-    generate GPU code.
-  }];
-}
+    Threads of proper granularity (e.g. multiple of
+    "kNumWarpsPerGroup * kWarpSize" on CUDA devices) can be grouped into
+    warpgroups described by a 3-dimensional rectangle.
+    This attribute indicates that warpgroup parallelism is desired.
+    It can be consumed by lowering to generate GPU code.
+    2 modes are supported: (1) 3D mapping mode and (2) linear mapping mode.

-def LinearIdEnum : I64EnumAttr<"LinearId", "linear ids for loop mapping", [
-    DimX, DimY, DimZ]> {
-  let cppNamespace = "::mlir::gpu";
+    #### 3D mapping mode
+
+    The 3D warpgroup id is simply the adjusted 3D index of the thread
+    `(tidx / (kNumWarpsPerGroup * kWarpSize), tidy, tidz)`.
+    If required, predication occurs on a per-dimension basis. This allows
+    specifying predication on a 3D sub-rectangle of the warpgroups.
+
+    #### Linear mapping mode
+
+    The linear warpgroup id is obtained by linearizing the index of the warpgroup.
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D "kNumWarpsPerGroup * kWarpSize"-aligned subset of the
+    (linearized) block.
+
+    For instance, if the basis is denoted as (BX, BY, BZ) and the thread id
+    is denoted by (tx, ty, tz), the linear warpgroup id is:
+      ```linear_id = (tx + ty * BX + tz * BX * BY)
+                   / (kNumWarpsPerGroup * kWarpSize)```.
+    The linear warpgroup id is fixed for the duration of a GPU kernel.
+
+    This linear id mapping attribute indicates a different linearization relation
+    is applied locally to a loop nest.
+
+    For instance, if the new basis is denoted as (LWGD0, LWGD1, LWGD2, LWGD3) the
+    warpgroup id in the new basis is:
+      ```(linear_id mod LWGD0,
+         (linear_id / LWGD0) mod LWGD1,
+         (linear_id / (LWGD0 * LWGD1)) mod LWGD2,
+         (linear_id / (LWGD0 * LWGD1 * LWGD2)) mod LWGD3)```.
+    This reinterpretation is only fixed for the duration of a loop nest.
+  }];
 }

-def GPULinearIdMapping : GPU_Attr<"GPULinearIdMapping", "linear", [
-  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
+def GPUWarpMappingAttr
+    : GPU_Attr<"GPUWarpMapping", "warp", [
+      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
   let parameters = (ins
-    EnumParameter<LinearIdEnum>:$linear_id
+    EnumParameter<MappingIdEnum>:$warp
   );
   let assemblyFormat = "`<` params `>`";
   let description = [{
-    An attribute to allow re-interpreting the linear mapping for threads in GPU
-    devices.
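The new-basis formulas above (for `LBD*` and `LWGD*` alike) are a standard delinearization with the first basis entry fastest-varying; a standalone sketch (the function name and sample basis are illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Re-interpret a linear id in a basis (d0, d1, ..., dn-1), d0 fastest:
//   id_k = (linear_id / (d0 * ... * d_{k-1})) mod d_k
std::vector<int64_t> delinearize(int64_t linearId, const std::vector<int64_t> &basis) {
  std::vector<int64_t> ids;
  int64_t stride = 1;
  for (int64_t d : basis) {
    ids.push_back((linearId / stride) % d);
    stride *= d;
  }
  return ids;
}

int main() {
  // linear_id = 23 in basis (2, 3, 4, 5) -> (1, 2, 3, 0).
  for (int64_t id : delinearize(23, {2, 3, 4, 5}))
    std::printf("%lld ", static_cast<long long>(id));
  std::printf("\n");
  return 0;
}
```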
+    An attribute that allows defining warp parallelism for GPU devices.

-    Threads (aka work item) are grouped into a thread block where block may be
-    described by a 1-, 2-, or 3-dimensional rectangular basis.
-    The linear thread id is obtained by linearizing the 1-, 2- or 3-dimensional
-    index. For instance, if the basis is denoted as (BX, BY, BZ) and the thread
-    id is denoted by (tx, ty, tz), the linear thread id is:
-      `linear_id = tx + ty * BX + tz * BX * BY)`.
-    The linear thread id is fixed for the duration of a GPU kernel.
+    Threads of proper granularity (e.g. multiple of "warp size" on CUDA devices)
+    can be grouped into warps described by a 3-dimensional rectangle.
+    This attribute indicates that warp parallelism is desired.
+    It can be consumed by lowering to generate GPU code.
+    2 modes are supported: (1) 3D mapping mode and (2) linear mapping mode.
+
+    #### 3D mapping mode
+
+    The 3D warp id is simply the adjusted 3D index of the thread
+    `(tidx / kWarpSize, tidy, tidz)`.
+    If required, predication occurs on a per-dimension basis. This allows
+    specifying predication on a 3D sub-rectangle of the warps.
+
+    #### Linear mapping mode
+
+    The linear warp id is obtained by linearizing the index of the warp.
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D "kWarpSize"-aligned subset of the (linearized) block.
+
+    For instance, if the basis is denoted as (BX, BY, BZ) and the thread id
+    is denoted by (tx, ty, tz), the linear warp id is:
+      `linear_id = (tx + ty * BX + tz * BX * BY) / kWarpSize`.
+    The linear warp id is fixed for the duration of a GPU kernel.

     This linear id mapping attribute indicates a different linearization relation
     is applied locally to a loop nest.

-    For instance, if the new basis is denoted as (LBX, LBY, LBZ) the thread id
-    in the new basis is:
-      `(linear_id mod LBX , (linear_id / LBX) mod * LBY, linear_id / (LBX * LBY))`.
-    This reinterpretation is only fixe for the duration of a loop nest.
-
-    It can be consumed by lowering to generate GPU code.
+    For instance, if the new basis is denoted as (LWD0, LWD1, LWD2, LWD3) the
+    warp id in the new basis is:
+      ```(linear_id mod LWD0,
+         (linear_id / LWD0) mod LWD1,
+         (linear_id / (LWD0 * LWD1)) mod LWD2,
+         (linear_id / (LWD0 * LWD1 * LWD2)) mod LWD3)```.
+    This reinterpretation is only fixed for the duration of a loop nest.
   }];
 }

-def BlocksEnum : I64EnumAttr<"Blocks", "threads for loop mapping", [
-    DimX, DimY, DimZ]> {
-  let cppNamespace = "::mlir::gpu";
-}
-
-def GPUBlockMappingAttr : GPU_Attr<"GPUBlockMapping", "block", [
-  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
+def GPUThreadMappingAttr
+    : GPU_Attr<"GPUThreadMapping", "thread", [
+      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
   let parameters = (ins
-    EnumParameter<BlocksEnum>:$block
+    EnumParameter<MappingIdEnum>:$thread
   );
   let assemblyFormat = "`<` params `>`";
   let description = [{
-    An attribute that allows defining thread block parallelism for GPU devices.
+    An attribute that allows defining thread parallelism for GPU devices.
+
+    Threads (aka work items) are grouped into thread blocks described by a
+    3-dimensional rectangle.
+    This attribute indicates that thread parallelism is desired.
+    It can be consumed by lowering to generate GPU code.
+
+    #### 3D mapping mode
+
+    The 3D thread id is simply the 3D index of the thread `(tidx, tidy, tidz)`.
+    If required, predication occurs on a per-dimension basis. This allows
+    specifying predication on a 3D sub-rectangle of the block.
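Warp and warpgroup ids are the linear thread id divided by a granularity factor; a small standalone check (the constants mirror `kWarpSize` and `kNumWarpsPerGroup` from the descriptions, the block shape is made up):

```cpp
#include <cstdint>
#include <cstdio>

constexpr int64_t kWarpSize = 32;
constexpr int64_t kNumWarpsPerGroup = 4;

// Linear thread id with x fastest-varying, as in the descriptions above.
int64_t linearThreadId(int64_t tx, int64_t ty, int64_t tz, int64_t BX, int64_t BY) {
  return tx + ty * BX + tz * BX * BY;
}

int main() {
  // Block of 64x2x1 threads: thread (32, 1, 0) has linear id 32 + 1*64 = 96,
  // so it falls in warp 96 / 32 = 3 and warpgroup 96 / (4 * 32) = 0.
  int64_t lid = linearThreadId(32, 1, 0, 64, 2);
  std::printf("warp=%lld warpgroup=%lld\n",
              static_cast<long long>(lid / kWarpSize),
              static_cast<long long>(lid / (kNumWarpsPerGroup * kWarpSize)));
  return 0;
}
```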
+
+    #### Linear mapping mode
-    Thread blocks (aka work-group) are grouped into a grid where grid may be
-    described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates
-    that thread block parallelism is desired. It can be consumed by lowering to
-    generate GPU code.
+    The linear thread id is obtained by linearizing the index of the thread.
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D subset of the (linearized) block.
+
+    For instance, if the basis is denoted as (BX, BY, BZ) and the thread id
+    is denoted by (tx, ty, tz), the linear thread id is:
+      ```linear_id = (tx + ty * BX + tz * BX * BY)```.
+    The linear thread id is fixed for the duration of a GPU kernel.
+
+    This linear id mapping attribute indicates a different linearization relation
+    is applied locally to a loop nest.
+
+    For instance, if the new basis is denoted as (LTD0, LTD1, LTD2, LTD3) the
+    thread id in the new basis is:
+      ```(linear_id mod LTD0,
+         (linear_id / LTD0) mod LTD1,
+         (linear_id / (LTD0 * LTD1)) mod LTD2,
+         (linear_id / (LTD0 * LTD1 * LTD2)) mod LTD3)```.
+    This reinterpretation is only fixed for the duration of a loop nest.
   }];
 }
-
 def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space", [
   DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
   let parameters = (ins
@@ -138,5 +245,4 @@
   }];
 }
-
 #endif // GPU_DEVICE_MAPPING_ATTR
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -31,58 +31,14 @@
 class DialectRegistry;
 namespace transform {
 namespace gpu {
+struct GpuIdBuilder;

-/// Helper type for functions that generate ids for the mapping of a
-/// scf.forall.
-struct IdBuilderResult {
-  // Ops used to replace the forall induction variables.
-  SmallVector<Value> mappingIdOps;
-  // Actual mapping sizes used to predicate the forall body when they are
-  // smaller than the available mapping sizes.
-  SmallVector<int64_t> predicateMappingSizes;
-  // Ops used to predicate the forall body when predicateMappingSizes is smaller
-  // than the available mapping sizes.
-  SmallVector<Value> predicateIdOps;
-};
-
-/// Common gpu id builder type, allows the configuration of lowering for various
-/// mapping schemes. Takes:
-///   - A rewriter with insertion point set before the forall op to rewrite.
-///   - The loc of the forall op to rewrite.
-///   - A list of positive integers carrying the mapping sizes for the current
-///     forall op to rewrite.
-using GpuIdBuilderFnType =
-    std::function<IdBuilderResult(RewriterBase &, Location, ArrayRef<int64_t>)>;
-
-/// Helper struct for configuring the rewrite of mapped scf.forall ops to
-/// various gpu id configurations.
-struct GpuIdBuilder {
-  GpuIdBuilder(ArrayRef<OpFoldResult> blockDims, ArrayRef<int64_t> mappingSizes)
-      : blockDimsOfr(blockDims), availableMappingSizes(mappingSizes),
-        mappingAttributes(), idBuilder() {}
-
-  /// List of OpFoldResult carrying the multi-dimensional number of
-  /// threads available in the current kernel (i.e. the current blockDims in
-  /// CUDA parlance).
-  ArrayRef<OpFoldResult> blockDimsOfr;
-
-  /// A list of positive integers carrying the number of available mapping
-  /// resources that can trigger predication,
-  ArrayRef<int64_t> availableMappingSizes;
-
-  /// The mapping attributes targeted by this generator.
-  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
-
-  /// The constructor that builds the concrete IR for mapping ids.
- GpuIdBuilderFnType idBuilder; -}; - -/// Map the top level `scf.forall` op to GPU Thread Blocks. +/// Map the top level `scf.forall` op to GPU blocks. /// Mapping is one-to-one and the induction variables of `scf.forall` are /// rewritten to gpu.block_id according to the thread_dim_mapping attribute. /// /// Dynamic, `scf.forall` trip counts are currently not supported. -/// Dynamic block dim sizes are currently not supported. +/// Dynamic `gridDims` are currently not supported. DiagnosedSilenceableFailure mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp, scf::ForallOp forallOp, @@ -90,41 +46,36 @@ const GpuIdBuilder &gpuIdBuilder); /// Search `scf.forall` ops nested under `target` and map each such op to an -/// explicit GPU implementation along `availableMappingSizes`. +/// explicit GPU implementation along `blockDims`. /// The mapping is one-to-one and the induction variables of `scf.forall` are /// rewritten to gpuIdBuilder.idBuilder according to the /// gpuIdBuilder.mappingAttributes attribute. /// /// Dynamic, `scf.forall` trip counts are currently not supported. -/// Dynamic `availableMappingSizes` sizes are currently not supported. -/// `availableMappingSizes` is expected to be of size 3. -DiagnosedSilenceableFailure mapOneForallToThreadsImpl( - RewriterBase &rewriter, std::optional transformOp, - scf::ForallOp forallOp, ArrayRef availableMappingSizes, - bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder); +/// Dynamic `blockDims` sizes are currently not supported. +/// `blockDims` is expected to be of size 3. +DiagnosedSilenceableFailure +mapOneForallToThreadsImpl(RewriterBase &rewriter, + std::optional transformOp, + scf::ForallOp forallOp, ArrayRef blockDims, + int64_t warpSize, bool syncAfterDistribute); /// Search `scf.forall` ops nested under `target` and map each such op to an -/// explicit GPU implementation along blockDims and warpDims. +/// explicit GPU implementation along `blockDims`. /// The mapping is one-to-one and the induction variables of `scf.forall` are -/// rewritten to threads and warps ids according to the mapping attribute. +/// rewritten to appropriate ids according to the mapping attribute. /// /// Dynamic, `scf.forall` trip counts are currently not supported. -/// Dynamic `blockDims` or `warpDims` or `linearDims` sizes are currently not -/// supported. -/// `blockDims` is expected to be of size 3. -/// `warpDims` is expected to be empty or of size 3. +/// Dynamic `blockDims` or `newBasis` entries are currently not +/// supported. `blockDims` is expected to be of size 3. /// /// The insertion point of the `rewriter` is expected to be set at the /// beginning of the `target` body block and dominate all other blocks. -DiagnosedSilenceableFailure mapNestedForallToThreadsImpl( - RewriterBase &rewriter, std::optional transformOp, - Operation *target, ArrayRef blockDimsOfr, - ArrayRef warpDims, bool syncAfterDistribute); - -/// Find the unique top level scf::ForallOp within a given target op. 
DiagnosedSilenceableFailure -findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp, - TransformOpInterface transformOp); +mapNestedForallToThreadsImpl(RewriterBase &rewriter, + std::optional transformOp, + Operation *target, ArrayRef blockDims, + int64_t warpSize, bool syncAfterDistribute); } // namespace gpu } // namespace transform diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td --- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td @@ -144,15 +144,15 @@ let arguments = (ins TransformHandleTypeInterface:$target, DefaultValuedAttr:$block_dims, - DefaultValuedOptionalAttr:$warp_dims, - DefaultValuedAttr:$sync_after_distribute); + DefaultValuedAttr:$sync_after_distribute, + DefaultValuedAttr:$warp_size); let results = (outs TransformHandleTypeInterface:$result); let assemblyFormat = [{ $target `block_dims` `=` $block_dims - (`warp_dims` `=` $warp_dims^)? (`sync_after_distribute` `=` $sync_after_distribute^)? + (`warp_size` `=` $warp_size^)? attr-dict `:` functional-type($target, $result) }]; diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h @@ -0,0 +1,157 @@ +//===- Utils.h - Utils for GPU transform ops --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H +#define MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +namespace gpu { +class GPUOp; +class LaunchOp; +enum class MappingId : uint64_t; +} // namespace gpu +namespace scf { +class ForallOp; +} // namespace scf +namespace transform { +namespace gpu { + +/// Helper type for functions that generate ids for the mapping of a +/// scf.forall. +struct IdBuilderResult { + // Ops used to replace the forall induction variables. + SmallVector mappingIdOps; + // Actual mapping sizes used to predicate the forall body when they are + // smaller than the available mapping sizes. + SmallVector predicateMappingSizes; + // Available mapping sizes used to predicate the forall body when they are + // larger than the predicate mapping sizes. + SmallVector availableMappingSizes; + // Ops used to predicate the forall body when predicateMappingSizes is smaller + // than the available mapping sizes. + SmallVector predicateIdOps; +}; + +/// Common gpu id builder type, allows the configuration of lowering for various +/// mapping schemes. Takes: +/// - A rewriter with insertion point set before the forall op to rewrite. +/// - The loc of the forall op to rewrite. +/// - A list of positive integers carrying the mapping sizes for the current +/// forall op to rewrite. +using GpuIdBuilderFnType = std::function, ArrayRef)>; + +/// Helper struct for configuring the rewrite of mapped scf.forall ops to +/// various gpu id configurations. 
+struct GpuIdBuilder { + using MappingIdBuilderFnType = std::function; + + GpuIdBuilder() = default; + GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping, + MappingIdBuilderFnType builder); + + /// The mapping attributes targeted by this generator. + SmallVector mappingAttributes; + + /// The constructor that builds the concrete IR for mapping ids. + GpuIdBuilderFnType idBuilder; +}; + +/// Builder for gpu::BlockIdOps used to map scf.forall to blocks. +/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values +/// used for indexing rewrites as well as 3D sizes for predicate generation. +/// If `useLinearMapping` is true, the `idBuilder` method returns nD values +/// used for indexing rewrites as well as 1D sizes for predicate generation. +struct GpuBlockIdBuilder : public GpuIdBuilder { + GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping = false); +}; + +/// Builder for warpgroup ids used to map scf.forall to reindexed warpgroups. +/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values +/// used for indexing rewrites as well as 3D sizes for predicate generation. +/// If `useLinearMapping` is true, the `idBuilder` method returns nD values +/// used for indexing rewrites as well as 1D sizes for predicate generation. +struct GpuWarpgroupIdBuilder : public GpuIdBuilder { + GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize, + bool useLinearMapping = false); + int64_t warpSize = 32; + /// In the future this may be configured by the transformation. + static constexpr int64_t kNumWarpsPerGroup = 4; +}; + +/// Builder for warp ids used to map scf.forall to reindexed warps. +/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values +/// used for indexing rewrites as well as 3D sizes for predicate generation. +/// If `useLinearMapping` is true, the `idBuilder` method returns nD values +/// used for indexing rewrites as well as 1D sizes for predicate generation. +struct GpuWarpIdBuilder : public GpuIdBuilder { + GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize, + bool useLinearMapping = false); + int64_t warpSize = 32; +}; + +/// Builder for warp ids used to map scf.forall to reindexed threads. +/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values +/// used for indexing rewrites as well as 3D sizes for predicate generation. +/// If `useLinearMapping` is true, the `idBuilder` method returns nD values +/// used for indexing rewrites as well as 1D sizes for predicate generation. +struct GpuThreadIdBuilder : public GpuIdBuilder { + GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping = false); +}; + +/// Determine if the size of the kernel configuration is supported by the +/// GPU architecture being used. +/// TODO this is currently hardwired to CUDA, parameterize and generalize. +DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp, + std::optional gridDimX, + std::optional gridDimY, + std::optional gridDimZ, + std::optional blockDimX, + std::optional blockDimY, + std::optional blockDimZ); + +/// Create an empty-body gpu::LaunchOp using the provided kernel settings +/// and put a terminator within. 
+DiagnosedSilenceableFailure +createGpuLaunch(RewriterBase &rewriter, Location loc, + TransformOpInterface transformOp, mlir::gpu::LaunchOp &launchOp, + std::optional gridDimX = std::nullopt, + std::optional gridDimY = std::nullopt, + std::optional gridDimZ = std::nullopt, + std::optional blockDimX = std::nullopt, + std::optional blockDimY = std::nullopt, + std::optional blockDimZ = std::nullopt); + +/// Alter kernel configuration of the given kernel. +DiagnosedSilenceableFailure +alterGpuLaunch(RewriterBase &rewriter, mlir::gpu::LaunchOp gpuLaunch, + TransformOpInterface transformOp, + std::optional gridDimX = std::nullopt, + std::optional gridDimY = std::nullopt, + std::optional gridDimZ = std::nullopt, + std::optional blockDimX = std::nullopt, + std::optional blockDimY = std::nullopt, + std::optional blockDimZ = std::nullopt); + +/// Find the unique top level scf::ForallOp within a given target op. +DiagnosedSilenceableFailure +findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp, + TransformOpInterface transformOp); + +} // namespace gpu +} // namespace transform +} // namespace mlir + +#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H diff --git a/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td b/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td --- a/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td +++ b/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td @@ -36,10 +36,26 @@ }]; let methods = [ - InterfaceMethod<[{ - Returns mapping as an integer from the attribute. + InterfaceMethod< + /*desc=*/"Return mapping as an integer from the attribute.", + /*retTy=*/"int64_t", + /*methodName=*/"getMappingId", + /*args=*/(ins) + >, + InterfaceMethod< + /*desc=*/"Return true if the attribute specifies a linear mapping", + /*retTy=*/"bool", + /*methodName=*/"isLinearMapping", + /*args=*/(ins) + >, + InterfaceMethod< + /*desc=*/[{ + Return the [0..n) relative index of the attribute depending on its group. + This can be used to index into a contiguous array. }], - "int64_t", "getMappingId", (ins) + /*retTy=*/"int64_t", + /*methodName=*/"getRelativeIndex", + /*args=*/(ins) > ]; } diff --git a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h --- a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h +++ b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h @@ -19,6 +19,7 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallVectorExtras.h" namespace mlir { @@ -84,6 +85,9 @@ /// If ofr is a constant integer or an IntegerAttr, return the integer. std::optional getConstantIntValue(OpFoldResult ofr); +/// If all ifs are constant integers or IntegerAttrs, return the integers. +std::optional> +getConstantIntValues(ArrayRef ofrs); /// Return true if `ofr` is constant integer equal to `value`. 
bool isConstantIntValue(OpFoldResult ofr, int64_t value); diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -28,6 +28,7 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/ErrorHandling.h" using namespace mlir; using namespace mlir::gpu; @@ -42,22 +43,70 @@ return static_cast(getBlock()); } +bool GPUBlockMappingAttr::isLinearMapping() const { + return getMappingId() >= static_cast(MappingId::LinearDim0); +} + +int64_t GPUBlockMappingAttr::getRelativeIndex() const { + return isLinearMapping() + ? getMappingId() - static_cast(MappingId::LinearDim0) + : getMappingId(); +} + +int64_t GPUWarpgroupMappingAttr::getMappingId() const { + return static_cast(getWarpgroup()); +} + +bool GPUWarpgroupMappingAttr::isLinearMapping() const { + return getMappingId() >= static_cast(MappingId::LinearDim0); +} + +int64_t GPUWarpgroupMappingAttr::getRelativeIndex() const { + return isLinearMapping() + ? getMappingId() - static_cast(MappingId::LinearDim0) + : getMappingId(); +} + int64_t GPUWarpMappingAttr::getMappingId() const { return static_cast(getWarp()); } -int64_t GPULinearIdMappingAttr::getMappingId() const { - return static_cast(getLinearId()); +bool GPUWarpMappingAttr::isLinearMapping() const { + return getMappingId() >= static_cast(MappingId::LinearDim0); +} + +int64_t GPUWarpMappingAttr::getRelativeIndex() const { + return isLinearMapping() + ? getMappingId() - static_cast(MappingId::LinearDim0) + : getMappingId(); } int64_t GPUThreadMappingAttr::getMappingId() const { return static_cast(getThread()); } +bool GPUThreadMappingAttr::isLinearMapping() const { + return getMappingId() >= static_cast(MappingId::LinearDim0); +} + +int64_t GPUThreadMappingAttr::getRelativeIndex() const { + return isLinearMapping() + ? 
getMappingId() - static_cast(MappingId::LinearDim0) + : getMappingId(); +} + int64_t GPUMemorySpaceMappingAttr::getMappingId() const { return static_cast(getAddressSpace()); } +bool GPUMemorySpaceMappingAttr::isLinearMapping() const { + llvm_unreachable("GPUMemorySpaceMappingAttr does not support linear mapping"); +} + +int64_t GPUMemorySpaceMappingAttr::getRelativeIndex() const { + llvm_unreachable("GPUMemorySpaceMappingAttr does not support relative index"); +} + //===----------------------------------------------------------------------===// // MMAMatrixType //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIRGPUTransformOps GPUTransformOps.cpp + Utils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp --- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp +++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp @@ -12,7 +12,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" +#include "mlir/Dialect/GPU/TransformOps/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -32,6 +32,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" using namespace mlir; using namespace mlir::gpu; @@ -622,204 +623,6 @@ // Block and thread mapping utilities. //===----------------------------------------------------------------------===// -namespace { - -/// Return a flattened thread id for the workgroup with given sizes. -static Value buildLinearThreadId(RewriterBase &rewriter, Location loc, - ArrayRef blockDimsOfr) { - LLVM_DEBUG(llvm::interleaveComma( - blockDimsOfr, - DBGS() << "----buildLinearThreadId with blockDimsOfr: "); - llvm::dbgs() << "\n"); - assert(blockDimsOfr.size() == 3 && "expected 3 workgroup sizes"); - AffineExpr tx, ty, tz, BDX, BDY; - bindDims(rewriter.getContext(), tx, ty, tz); - bindSymbols(rewriter.getContext(), BDX, BDY); - IndexType indexType = rewriter.getIndexType(); - SmallVector threadsAndWorkGroups{ - rewriter.create(loc, indexType, Dimension::x).getResult(), - rewriter.create(loc, indexType, Dimension::y).getResult(), - rewriter.create(loc, indexType, Dimension::z).getResult()}; - threadsAndWorkGroups.push_back(blockDimsOfr[0]); - threadsAndWorkGroups.push_back(blockDimsOfr[1]); - OpFoldResult ofr = affine::makeComposedFoldedAffineApply( - rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups); - return getValueOrCreateConstantIndexOp(rewriter, loc, ofr); -} - -/// Builder for gpu::BlockIdOps used in mapping scf.forall to blocks. -/// The `idBuilder` method returns 3-D values used for indexing rewrites as well -/// as 3-D sizes for predicate generation. 
-struct GpuBlockIdBuilder : public GpuIdBuilder { - - GpuBlockIdBuilder(MLIRContext *ctx, ArrayRef blockDims, - ArrayRef mappingSizes) - : GpuIdBuilder(blockDims, mappingSizes) { - mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX), - GPUBlockMappingAttr::get(ctx, Blocks::DimY), - GPUBlockMappingAttr::get(ctx, Blocks::DimZ)}, - idBuilder = [](RewriterBase &rewriter, Location loc, - ArrayRef forallMappingSizes) { - IndexType indexType = rewriter.getIndexType(); - SmallVector ids{ - rewriter.create(loc, indexType, Dimension::x), - rewriter.create(loc, indexType, Dimension::y), - rewriter.create(loc, indexType, Dimension::z)}; - // Return 3-D ids for indexing rewrite and 3-D sizes and ids for - // predicate generation. - return IdBuilderResult{ids, SmallVector{forallMappingSizes}, - ids}; - }; - } -}; - -/// Builder for gpu::ThreadIdOp used in mapping scf.forall to thread ids without -/// any reindexing. -/// The `idBuilder` method returns 3-D values used for indexing rewrites as well -/// as 3-D sizes for predicate generation. -struct GpuThreadIdBuilder : public GpuIdBuilder { - GpuThreadIdBuilder(MLIRContext *ctx, ArrayRef blockDims, - ArrayRef mappingSizes) - : GpuIdBuilder(blockDims, mappingSizes) { - mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX), - GPUThreadMappingAttr::get(ctx, Threads::DimY), - GPUThreadMappingAttr::get(ctx, Threads::DimZ)}; - idBuilder = [](RewriterBase &rewriter, Location loc, - ArrayRef forallMappingSizes) { - IndexType indexType = rewriter.getIndexType(); - SmallVector ids{ - rewriter.create(loc, indexType, Dimension::x), - rewriter.create(loc, indexType, Dimension::y), - rewriter.create(loc, indexType, Dimension::z)}; - // Return 3-D ids for indexing rewrite and 3-D sizes and ids for - // predicate generation. - return IdBuilderResult{ids, SmallVector{forallMappingSizes}, - ids}; - }; - } -}; - -/// Builder for warp ids used in mapping scf.forall to warps. -/// This builder requires a specification of the number of warps along each -/// dimension to more finely control mapping to warps as well a predication than -/// by solely analyzing the IR. -/// The `idBuilder` method returns 3-D values used for indexing rewrites as well -/// as 3-D sizes for predicate generation. -struct GpuWarpIdBuilder : public GpuIdBuilder { - GpuWarpIdBuilder(MLIRContext *ctx, ArrayRef blockDims, - ArrayRef mappingSizes) - : GpuIdBuilder(blockDims, mappingSizes) { - mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX), - GPUWarpMappingAttr::get(ctx, Warps::DimY), - GPUWarpMappingAttr::get(ctx, Warps::DimZ)}; - idBuilder = [this](RewriterBase &rewriter, Location loc, - ArrayRef forallMappingSizes) { - // Build the linear warp id and decompose it in the basis of - // `forallMappingSizes`. - Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr); - AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext()); - OpFoldResult warpIdOfr = affine::makeComposedFoldedAffineApply( - rewriter, loc, d0.floorDiv(kWarpSize), {linearId}); - Value warpId = getValueOrCreateConstantIndexOp(rewriter, loc, warpIdOfr); - // Sizes in [x, y, z] -> [z, y x] order to properly compute strides in - // "row-major" order. - SmallVector reverseBasisSizes( - llvm::reverse(this->availableMappingSizes)); - SmallVector strides = computeStrides(reverseBasisSizes); - SmallVector delinearizingExprs = delinearize(d0, strides); - SmallVector ids; - // Reverse back to be in [x, y, z] order. 
- for (AffineExpr e : llvm::reverse(delinearizingExprs)) - ids.push_back( - affine::makeComposedAffineApply(rewriter, loc, e, {warpId})); - - // clang-format off - LDBG("----linearId: " << linearId); - LDBG("----warpId: " << warpId); - LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes, - DBGS() << "--delinearization basis: "); - llvm::dbgs() << "\n"; - llvm::interleaveComma(strides, - DBGS() << "--delinearization strides: "); - llvm::dbgs() << "\n"; - llvm::interleaveComma(delinearizingExprs, - DBGS() << "--delinearization exprs: "); - llvm::dbgs() << "\n"; - llvm::interleaveComma(ids, DBGS() << "--ids: "); - llvm::dbgs() << "\n";); - // clang-format on - - // Return 3-D ids for indexing rewrite and 3-D sizes and ids for - // predicate generation. - return IdBuilderResult{ids, SmallVector{forallMappingSizes}, - ids}; - }; - } - - /// Static specification of the warp size. - /// In the future this may be configured by the transformation. - static constexpr int64_t kWarpSize = 32; -}; - -/// Builder for linear ids used in mapping scf.forall to reindexed threads. -/// The `idBuilder` method returns 3-D values used for indexing rewrites as well -/// as 1-D sizes for predicate generation. -struct GpuLinearIdBuilder : public GpuIdBuilder { - GpuLinearIdBuilder(MLIRContext *ctx, ArrayRef blockDims, - ArrayRef mappingSizes) - : GpuIdBuilder(blockDims, mappingSizes) { - mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX), - GPULinearIdMappingAttr::get(ctx, LinearId::DimY), - GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)}; - idBuilder = [this](RewriterBase &rewriter, Location loc, - ArrayRef forallMappingSizes) { - // Build the linear thread id and decompose it in the basis of - // `forallMappingSizes`. - Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr); - // Sizes in [x, y, z] -> [z, y x] order to properly compute strides in - // "row-major" order. - SmallVector reverseBasisSizes(llvm::reverse(forallMappingSizes)); - SmallVector strides = computeStrides(reverseBasisSizes); - AffineExpr d0; - bindDims(rewriter.getContext(), d0); - SmallVector delinearizingExprs = delinearize(d0, strides); - SmallVector ids; - // Reverse back to be in [x, y, z] order. - for (AffineExpr e : llvm::reverse(delinearizingExprs)) - ids.push_back( - affine::makeComposedAffineApply(rewriter, loc, e, {linearId})); - - // clang-format off - LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes, - DBGS() << "--delinearization basis: "); - llvm::dbgs() << "\n"; - llvm::interleaveComma(strides, - DBGS() << "--delinearization strides: "); - llvm::dbgs() << "\n"; - llvm::interleaveComma(delinearizingExprs, - DBGS() << "--delinearization exprs: "); - llvm::dbgs() << "\n"; - llvm::interleaveComma(ids, DBGS() << "--ids: "); - llvm::dbgs() << "\n";); - // clang-format on - - // Compute and return the 1-D actual mapping size spanned by the linearId, - // it will be used to predicate against the linearized total number of - // threads. - int64_t actualMappingSize = 1; - for (int64_t s : forallMappingSizes) - actualMappingSize *= s; - - // Return 3-D ids for indexing rewrite and 1-D size and id for - // predicate generation. 
- return IdBuilderResult{ids, SmallVector{actualMappingSize}, - SmallVector{linearId}}; - }; - } -}; - -} // namespace - static DiagnosedSilenceableFailure definiteFailureHelper(std::optional transformOp, Operation *target, const Twine &message) { @@ -840,23 +643,23 @@ llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) { return isa(attr); }); - bool hasThreadMapping = + bool hasWarpgroupMapping = llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) { - return isa(attr); + return isa(attr); }); bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) { return isa(attr); }); - bool hasLinearMapping = + bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) { - return isa(attr); + return isa(attr); }); int64_t countMappingTypes = 0; countMappingTypes += hasBlockMapping ? 1 : 0; - countMappingTypes += hasThreadMapping ? 1 : 0; + countMappingTypes += hasWarpgroupMapping ? 1 : 0; countMappingTypes += hasWarpMapping ? 1 : 0; - countMappingTypes += hasLinearMapping ? 1 : 0; + countMappingTypes += hasThreadMapping ? 1 : 0; if (countMappingTypes > 1) { return definiteFailureHelper( transformOp, forallOp, @@ -868,12 +671,22 @@ if (seen.contains(map)) { return definiteFailureHelper( transformOp, forallOp, - "duplicated attribute, cannot map different loops " - "to the same processor"); + "duplicate attribute, cannot map different loops " + "to the same mapping id"); } seen.insert(map); } + auto isLinear = [](Attribute a) { + return cast(a).isLinearMapping(); + }; + if (llvm::any_of(forallOp.getMapping()->getValue(), isLinear) && + !llvm::all_of(forallOp.getMapping()->getValue(), isLinear)) { + return definiteFailureHelper( + transformOp, forallOp, + "cannot mix linear and non-linear mapping modes"); + } + return DiagnosedSilenceableFailure::success(); } @@ -893,133 +706,31 @@ if (forallOp.getNumResults() > 0) return definiteFailureHelper(transformOp, forallOp, "only bufferized scf.forall can be mapped"); - if (forallOp.getRank() > 3) + bool useLinearMapping = cast( + forallOp.getMapping()->getValue().front()) + .isLinearMapping(); + // TODO: This would be more natural with support for Optional + // in GPUDeviceMappingAttr. + int64_t maxNumMappingsSupported = + useLinearMapping ? (getMaxEnumValForMappingId() - + static_cast(MappingId::DimZ)) + : 3; + if (forallOp.getRank() > maxNumMappingsSupported) { return definiteFailureHelper(transformOp, forallOp, - "scf.forall with rank > 3 does not lower"); - if (llvm::any_of(forallOp.getMixedUpperBound(), [&](OpFoldResult ofr) { - return !getConstantIntValue(ofr).has_value(); - })) { - return definiteFailureHelper(transformOp, forallOp, - "unsupported dynamic sizes in forall op"); + "scf.forall with rank > ") + << maxNumMappingsSupported + << " does not lower for the specified mapping attribute type"; } - return DiagnosedSilenceableFailure::success(); -} - -/// Determines if the size of the kernel configuration is supported by the -/// GPU architecture being used. It presently makes use of CUDA limitations, -/// however that aspect may be enhanced for other GPUs. 
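The relative-index and rank-limit computations used above follow directly from the `MappingId` layout (3 grid dimensions followed by 10 linear dimensions); a plain C++ sketch with simplified names, for illustration only:

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the MappingId enum from GPUDeviceMappingAttr.td.
enum class MappingId : uint64_t { DimX = 0, DimY = 1, DimZ = 2, LinearDim0 = 3, LinearDim9 = 12 };

constexpr int64_t kMaxEnumValForMappingId = 12; // LinearDim9

// getRelativeIndex(): linear ids are counted from LinearDim0, 3D ids keep their value.
int64_t getRelativeIndex(int64_t mappingId) {
  bool isLinear = mappingId >= static_cast<int64_t>(MappingId::LinearDim0);
  return isLinear ? mappingId - static_cast<int64_t>(MappingId::LinearDim0) : mappingId;
}

int main() {
  // DimY -> 1, LinearDim2 (mapping id 5) -> 2.
  std::printf("%lld %lld\n", static_cast<long long>(getRelativeIndex(1)),
              static_cast<long long>(getRelativeIndex(5)));
  // Maximum forall rank: 10 in linear mode (LinearDim0..9), 3 in 3D mode.
  int64_t maxLinearRank = kMaxEnumValForMappingId - static_cast<int64_t>(MappingId::DimZ);
  std::printf("max rank: linear=%lld, 3d=3\n", static_cast<long long>(maxLinearRank));
  return 0;
}
```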
-static DiagnosedSilenceableFailure checkGpuLimits( - TransformOpInterface transformOp, std::optional gridDimX, - std::optional gridDimY, std::optional gridDimZ, - std::optional blockDimX, std::optional blockDimY, - std::optional blockDimZ) { - - static constexpr int maxTotalBlockdim = 1024; - static constexpr int maxBlockdimx = 1024; - static constexpr int maxBlockdimy = 1024; - static constexpr int maxBlockdimz = 64; - static constexpr int maxTotalGriddim = 2147483647; - static constexpr int maxGriddimx = 2147483647; - static constexpr int maxGriddimy = 65535; - static constexpr int maxGriddimz = 65535; - - if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > - maxTotalBlockdim || - (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) > - maxTotalGriddim || - blockDimX.value_or(1) > maxBlockdimx || - blockDimY.value_or(1) > maxBlockdimy || - blockDimZ.value_or(1) > maxBlockdimz || - gridDimY.value_or(1) > maxGriddimy || - gridDimZ.value_or(1) > maxGriddimz || - gridDimX.value_or(1) > maxGriddimx) { - return transformOp.emitSilenceableError() - << "Trying to launch a GPU kernel with grid_dims = (" - << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", " - << gridDimZ.value_or(1) << ") block_dims = (" - << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", " - << blockDimZ.value_or(1) << "). It is larger than the limits."; + auto numParallelIterations = + getConstantIntValues((forallOp.getMixedUpperBound())); + if (!forallOp.isNormalized() || !numParallelIterations.has_value()) { + return definiteFailureHelper( + transformOp, forallOp, + "requires statically sized, normalized forall op"); } return DiagnosedSilenceableFailure::success(); } -/// Creates an empty-body gpu::LaunchOp using the provided kernel settings -/// and put a terminator within. -static DiagnosedSilenceableFailure -createGpuLaunch(RewriterBase &rewriter, Location loc, - TransformOpInterface transformOp, LaunchOp &launchOp, - std::optional gridDimX = std::nullopt, - std::optional gridDimY = std::nullopt, - std::optional gridDimZ = std::nullopt, - std::optional blockDimX = std::nullopt, - std::optional blockDimY = std::nullopt, - std::optional blockDimZ = std::nullopt) { - DiagnosedSilenceableFailure diag = - checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, - blockDimY, blockDimZ); - if (!diag.succeeded()) - return diag; - - auto createConst = [&](int dim) { - return rewriter.create(loc, dim); - }; - OpBuilder::InsertionGuard guard(rewriter); - Value one = createConst(1); - Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one; - Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one; - Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one; - Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one; - Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one; - Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one; - launchOp = rewriter.create(loc, gridSizeX, gridSizeY, gridSizeZ, - blkSizeX, blkSizeY, blkSizeZ); - rewriter.setInsertionPointToEnd(&launchOp.getBody().front()); - rewriter.create(loc); - return DiagnosedSilenceableFailure::success(); -} - -/// Alter kernel configuration of the given kernel. 
-static DiagnosedSilenceableFailure -alterGpuLaunch(RewriterBase &rewriter, LaunchOp gpuLaunch, - TransformOpInterface transformOp, - std::optional gridDimX = std::nullopt, - std::optional gridDimY = std::nullopt, - std::optional gridDimZ = std::nullopt, - std::optional blockDimX = std::nullopt, - std::optional blockDimY = std::nullopt, - std::optional blockDimZ = std::nullopt) { - DiagnosedSilenceableFailure diag = - checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, - blockDimY, blockDimZ); - if (!diag.succeeded()) - return diag; - - KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues(); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointAfterValue(currentBlockdim.x); - auto createConstValue = [&](int dim) { - return rewriter.create(currentBlockdim.x.getLoc(), - dim); - }; - - if (gridDimX.has_value()) - gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value())); - if (gridDimY.has_value()) - gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value())); - if (gridDimZ.has_value()) - gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value())); - if (blockDimX.has_value()) - gpuLaunch.getBlockSizeXMutable().assign( - createConstValue(blockDimX.value())); - if (blockDimY.has_value()) - gpuLaunch.getBlockSizeYMutable().assign( - createConstValue(blockDimY.value())); - if (blockDimZ.has_value()) - gpuLaunch.getBlockSizeZMutable().assign( - createConstValue(blockDimZ.value())); - return DiagnosedSilenceableFailure::success(); -} - /// Struct to return the result of the rewrite of a forall operation. struct ForallRewriteResult { SmallVector mappingSizes; @@ -1040,8 +751,8 @@ static DiagnosedSilenceableFailure rewriteOneForallCommonImpl( RewriterBase &rewriter, std::optional transformOp, - scf::ForallOp forallOp, ForallRewriteResult &result, - ArrayRef availableMappingSizes, const GpuIdBuilder &gpuIdBuilder) { + scf::ForallOp forallOp, ArrayRef availableMappingSizes, + ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) { LDBG("--start rewriteOneForallCommonImpl"); // Step 0. GPU-specific verifications. There is no better place to anchor @@ -1052,18 +763,36 @@ return diag; // Step 1. Complete the mapping to a full mapping (with 1s) if necessary. - SmallVector tmpMappingSizes = llvm::to_vector( - llvm::map_range(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) { - auto maybeStaticValue = getConstantIntValue(ofr); - assert(maybeStaticValue && "expected static value"); - return maybeStaticValue.value(); - })); - SmallVector forallMappingAttrs = - llvm::to_vector(forallOp.getMapping()->getValue()); + auto numParallelIterations = + getConstantIntValues((forallOp.getMixedUpperBound())); + assert(forallOp.isNormalized() && numParallelIterations.has_value() && + "requires statically sized, normalized forall op"); + SmallVector tmpMappingSizes = numParallelIterations.value(); + SetVector forallMappingAttrs; + forallMappingAttrs.insert(forallOp.getMapping()->getValue().begin(), + forallOp.getMapping()->getValue().end()); + auto comparator = [](Attribute a, Attribute b) -> bool { + return cast(a).getMappingId() < + cast(b).getMappingId(); + }; + + // Step 1.b. In the linear case, compute the max mapping to avoid needlessly + // mapping all dimensions. In the 3-D mapping case we need to map all + // dimensions. 
+ DeviceMappingAttrInterface maxMapping = + cast(*std::max_element( + forallMappingAttrs.begin(), forallMappingAttrs.end(), comparator)); + DeviceMappingAttrInterface maxLinearMapping; + if (maxMapping.isLinearMapping()) + maxLinearMapping = maxMapping; for (auto attr : gpuIdBuilder.mappingAttributes) { - if (llvm::is_contained(forallMappingAttrs, attr)) + // If attr overflows, just skip. + if (maxLinearMapping && comparator(maxLinearMapping, attr)) continue; - forallMappingAttrs.push_back(attr); + // Try to insert. If element was already present, just continue. + if (!forallMappingAttrs.insert(attr)) + continue; + // Otherwise, we have a new insertion without a size -> use size 1. tmpMappingSizes.push_back(1); } LLVM_DEBUG( @@ -1073,45 +802,50 @@ llvm::dbgs() << "\n"); // Step 2. sort the values by the corresponding DeviceMappingAttrInterface. - auto comparator = [&](Attribute a, Attribute b) -> bool { - return cast(a).getMappingId() < - cast(b).getMappingId(); - }; - SmallVector forallMappingSizes = - getValuesSortedByKey(forallMappingAttrs, tmpMappingSizes, comparator); + SmallVector forallMappingSizes = getValuesSortedByKey( + forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator); LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes, DBGS() << "----forallMappingSizes: "); llvm::dbgs() << "\n"; llvm::interleaveComma( - forallMappingAttrs, DBGS() << "----mappingAttrs: "); + forallMappingAttrs, DBGS() << "----forallMappingAttrs: "); llvm::dbgs() << "\n"); // Step 3. Generate the mappingIdOps using the provided generator. Location loc = forallOp.getLoc(); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(forallOp); + SmallVector originalBasis(availableMappingSizes); + bool originalBasisWasProvided = !originalBasis.empty(); + if (!originalBasisWasProvided) { + originalBasis = forallMappingSizes; + while (originalBasis.size() < 3) + originalBasis.push_back(1); + } + IdBuilderResult builderResult = - gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes); + gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis); - // Step 4. Map the induction variables to the mappingIdOps, this may involve a - // permutation. + // Step 4. Map the induction variables to the mappingIdOps, this may involve + // a permutation. SmallVector mappingIdOps = builderResult.mappingIdOps; IRMapping bvm; - for (auto [iv, dim] : - llvm::zip_equal(forallOp.getInductionVars(), - ArrayRef{forallMappingAttrs}.take_front( - forallOp.getInductionVars().size()))) { - Value peIdOp = mappingIdOps[static_cast( - cast(dim).getMappingId())]; + for (auto [iv, dim] : llvm::zip_equal( + forallOp.getInductionVars(), + forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) { + auto mappingAttr = cast(dim); + Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()]; bvm.map(iv, peIdOp); } - // Step 5. If the availableMappingSizes are already known, create conditionals - // to predicate the region. Otherwise, the current forall determines the - // availableMappingSizes and no predication occurs. + // Step 5. If the originalBasis is already known, create conditionals to + // predicate the region. Otherwise, the current forall determines the + // originalBasis and no predication occurs. 
Value predicate; - if (!availableMappingSizes.empty()) { + if (originalBasisWasProvided) { SmallVector predicateMappingSizes = builderResult.predicateMappingSizes; + SmallVector availableMappingSizes = + builderResult.availableMappingSizes; SmallVector predicateIdOps = builderResult.predicateIdOps; // clang-format off LLVM_DEBUG( @@ -1176,6 +910,12 @@ // Step 8. Erase old op. rewriter.eraseOp(forallOp); + LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes, + DBGS() << "----result forallMappingSizes: "); + llvm::dbgs() << "\n"; llvm::interleaveComma( + mappingIdOps, DBGS() << "----result mappingIdOps: "); + llvm::dbgs() << "\n"); + result = ForallRewriteResult{forallMappingSizes, mappingIdOps}; return DiagnosedSilenceableFailure::success(); } @@ -1201,24 +941,28 @@ zero = rewriter.create(loc, 0); } - SmallVector anyAvailableMappingSizes; ForallRewriteResult rewriteResult; - // Pass an empty anyAvailableMappingSizes. - DiagnosedSilenceableFailure diag = - rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult, - anyAvailableMappingSizes, gpuIdBuilder); + DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl( + rewriter, transformOp, forallOp, + /*availableMappingSizes=*/gridDims, rewriteResult, gpuIdBuilder); - // Return if anything goes wrong, use silenceable failure as a match failure. + // Return if anything goes wrong, use silenceable failure as a match + // failure. if (!diag.succeeded()) return diag; - // Set the gridDims that act as a return. - gridDims = rewriteResult.mappingSizes; + // If gridDims was not provided already, set it from the return. + if (gridDims.empty()) { + gridDims = rewriteResult.mappingSizes; + while (gridDims.size() < 3) + gridDims.push_back(1); + } + assert(gridDims.size() == 3 && "Need 3-D gridDims"); // Replace ids of dimensions known to be 1 by 0 to simplify the IR. // Here, the result of mapping determines the available mapping sizes. replaceUnitMappingIdsHelper(rewriter, loc, parentBlock, zero, - gridDims); + rewriteResult.mappingSizes); return DiagnosedSilenceableFailure::success(); } @@ -1277,23 +1021,28 @@ if (getGenerateGpuLaunch()) { DiagnosedSilenceableFailure diag = createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch); - if (!diag.succeeded()) { + if (!diag.succeeded()) return diag; - } + rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); Operation *newForallOp = rewriter.clone(*topLevelForallOp); rewriter.eraseOp(topLevelForallOp); topLevelForallOp = cast(newForallOp); } - GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), {}, {}); + // The BlockIdBuilder adapts to whatever is thrown at it. + auto mappingAttr = cast( + topLevelForallOp.getMapping()->getValue().front()); + bool useLinearMapping = mappingAttr.isLinearMapping(); + GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping); + diag = mlir::transform::gpu::mapForallToBlocksImpl( rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder); if (!diag.succeeded()) return diag; - // Set the GPU launch configuration for the grid dims late, this is subject to - // IR inspection. + // Set the GPU launch configuration for the grid dims late, this is + // subject to IR inspection. 
diag = alterGpuLaunch(rewriter, gpuLaunch, cast(getOperation()), gridDims[0], gridDims[1], gridDims[2]); @@ -1306,19 +1055,90 @@ // MapNestedForallToThreads //===----------------------------------------------------------------------===// +static DiagnosedSilenceableFailure checkMappingSpec( + std::optional transformOp, scf::ForallOp forallOp, + ArrayRef numParallelIterations, ArrayRef blockOrGridSizes, + int factor, bool useLinearMapping = false) { + if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) { + auto diag = definiteFailureHelper( + transformOp, forallOp, + Twine("3-D mapping: size of threadIdx.x must be a multiple of ") + + std::to_string(factor)); + return diag; + } + if (computeProduct(numParallelIterations) * factor > + computeProduct(blockOrGridSizes)) { + auto diag = definiteFailureHelper( + transformOp, forallOp, + Twine( + "the number of required parallel ressources (blocks or threads) ") + + std::to_string(computeProduct(numParallelIterations) * factor) + + std::string(" overflows the number of available ressources ") + + std::to_string(computeProduct(blockOrGridSizes))); + return diag; + } + return DiagnosedSilenceableFailure::success(); +} + +static DiagnosedSilenceableFailure +getThreadIdBuilder(std::optional transformOp, + scf::ForallOp forallOp, ArrayRef blockSizes, + int64_t warpSize, GpuIdBuilder &gpuIdBuilder) { + auto mappingAttr = cast( + forallOp.getMapping()->getValue().front()); + bool useLinearMapping = mappingAttr.isLinearMapping(); + + // Sanity checks that may result in runtime verification errors. + auto numParallelIterations = + getConstantIntValues((forallOp.getMixedUpperBound())); + if (!forallOp.isNormalized() || !numParallelIterations.has_value()) { + return definiteFailureHelper( + transformOp, forallOp, + "requires statically sized, normalized forall op"); + } + int64_t factor = 1; + if (isa(mappingAttr)) { + factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize; + } else if (isa(mappingAttr)) { + factor = warpSize; + } + DiagnosedSilenceableFailure diag = + checkMappingSpec(transformOp, forallOp, numParallelIterations.value(), + blockSizes, factor, useLinearMapping); + if (!diag.succeeded()) + return diag; + + // Start mapping. + MLIRContext *ctx = forallOp.getContext(); + gpuIdBuilder = + TypeSwitch(mappingAttr) + .Case([&](GPUWarpgroupMappingAttr) { + return GpuWarpgroupIdBuilder(ctx, warpSize, useLinearMapping); + }) + .Case([&](GPUWarpMappingAttr) { + return GpuWarpIdBuilder(ctx, warpSize, useLinearMapping); + }) + .Case([&](GPUThreadMappingAttr) { + return GpuThreadIdBuilder(ctx, useLinearMapping); + }) + .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder { + llvm_unreachable("unknown mapping attribute"); + }); + return DiagnosedSilenceableFailure::success(); +} + DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl( RewriterBase &rewriter, std::optional transformOp, - scf::ForallOp forallOp, ArrayRef availableMappingSizes, - bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder) { - // Ignore cases with different attributes than this builder supports. 
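The check in `checkMappingSpec` reduces to two products; a standalone sketch with made-up sizes (a 2x8 forall mapped to warps, warp size 32, inside a 128x4x1 block):

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int64_t product(const std::vector<int64_t> &v) {
  return std::accumulate(v.begin(), v.end(), int64_t{1}, std::multiplies<int64_t>());
}

int main() {
  std::vector<int64_t> numParallelIterations{2, 8}; // forall trip counts
  std::vector<int64_t> blockDims{128, 4, 1};        // available threads per block
  int64_t factor = 32;                              // threads per warp
  // Required threads must not overflow the available threads.
  int64_t required = product(numParallelIterations) * factor; // 2 * 8 * 32 = 512
  int64_t available = product(blockDims);                     // 128 * 4 * 1 = 512
  std::printf("required=%lld available=%lld ok=%d\n", static_cast<long long>(required),
              static_cast<long long>(available), required <= available);
  // In the non-linear (3D) case, threadIdx.x must additionally be a multiple of factor.
  std::printf("x-aligned=%d\n", blockDims.front() % factor == 0 ? 1 : 0);
  return 0;
}
```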
- for (Attribute map : forallOp.getMapping()->getValue()) { - if (!llvm::is_contained(gpuIdBuilder.mappingAttributes, map)) { - LDBG("--skip " << map); - LLVM_DEBUG(llvm::interleaveComma(gpuIdBuilder.mappingAttributes, - DBGS() << "----not in: "); - llvm::dbgs() << "\n";); - return emitSilenceableFailure(forallOp); - } + scf::ForallOp forallOp, ArrayRef blockSizes, int64_t warpSize, + bool syncAfterDistribute) { + + GpuIdBuilder gpuIdBuilder; + { + // Try to construct the id builder, if it fails, return. + DiagnosedSilenceableFailure diag = getThreadIdBuilder( + transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder); + if (!diag.succeeded()) + return diag; } Location loc = forallOp.getLoc(); @@ -1326,14 +1146,10 @@ // Insert after to allow for syncthreads after `forall` is erased. rewriter.setInsertionPointAfter(forallOp); ForallRewriteResult rewriteResult; - DiagnosedSilenceableFailure diag = - rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult, - availableMappingSizes, gpuIdBuilder); - - // Return if anything goes wrong, use silenceable failure as a match failure. + DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl( + rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder); if (!diag.succeeded()) return diag; - // Add a syncthreads if needed. TODO: warpsync if (syncAfterDistribute) rewriter.create(loc); @@ -1343,20 +1159,12 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl( RewriterBase &rewriter, std::optional transformOp, - Operation *target, ArrayRef blockDims, ArrayRef warpDims, + Operation *target, ArrayRef blockDims, int64_t warpSize, bool syncAfterDistribute) { LDBG("Start mapNestedForallToThreadsImpl"); - MLIRContext *ctx = rewriter.getContext(); - SmallVector blockDimsOfr = - getAsIndexOpFoldResult(ctx, blockDims); - - if (blockDims.size() != 3) + if (blockDims.size() != 3) { return definiteFailureHelper(transformOp, target, "requires size-3 thread mapping"); - if (!warpDims.empty()) { - if (warpDims.size() != 3) - return definiteFailureHelper(transformOp, target, - "requires empty or size-3 warp mapping"); } // Create an early zero index value for replacements. @@ -1364,64 +1172,13 @@ Value zero = rewriter.create(loc, 0); DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success(); WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) { - //===--------------------------------------------------------------------===// - // Mapping to warp ids. - //===--------------------------------------------------------------------===// - if (!warpDims.empty()) { - LLVM_DEBUG( - llvm::interleaveComma( - warpDims, DBGS() << "+mapNestedForallToThreadsImpl warpDims: "); - llvm::dbgs() << "\n"); - LLVM_DEBUG(llvm::interleaveComma( - blockDimsOfr, DBGS() << "--warpDims with blockDimsOfr: "); - llvm::dbgs() << "\n"); - GpuWarpIdBuilder gpuWarpIdBuilder(ctx, blockDimsOfr, warpDims); - diag = mlir::transform::gpu::mapOneForallToThreadsImpl( - rewriter, transformOp, forallOp, warpDims, syncAfterDistribute, - gpuWarpIdBuilder); - // Use silenceable failure to encode "failure to match" and pass - // through. - if (diag.isDefiniteFailure()) - return WalkResult::interrupt(); - if (diag.succeeded()) - return WalkResult::skip(); - } - - //===--------------------------------------------------------------------===// - // Mapping to linear ids. 
- //===--------------------------------------------------------------------===// - LDBG("+mapNestedForallToThreadsImpl linearDims"); - LLVM_DEBUG(llvm::interleaveComma( - blockDimsOfr, DBGS() << "--linearDims with blockDimsOfr: "); - llvm::dbgs() << "\n"); - int64_t numThreads = 1; - for (int64_t b : blockDims) - numThreads *= b; - GpuLinearIdBuilder gpuLinearIdBuilder(ctx, blockDimsOfr, numThreads); diag = mlir::transform::gpu::mapOneForallToThreadsImpl( - rewriter, transformOp, forallOp, numThreads, syncAfterDistribute, - gpuLinearIdBuilder); - // Use silenceable failure to encode "failure to match" and pass through. + rewriter, transformOp, forallOp, blockDims, warpSize, + syncAfterDistribute); if (diag.isDefiniteFailure()) return WalkResult::interrupt(); if (diag.succeeded()) return WalkResult::skip(); - - //===--------------------------------------------------------------------===// - // Mapping to block ids (happens last so we can replay ThreadIdOp). - //===--------------------------------------------------------------------===// - LLVM_DEBUG( - llvm::interleaveComma( - blockDimsOfr, DBGS() << "mapNestedForallToThreadsImpl blockDims: "); - llvm::dbgs() << "\n"); - GpuThreadIdBuilder gpuThreadIdBuilder(ctx, blockDimsOfr, blockDims); - diag = mlir::transform::gpu::mapOneForallToThreadsImpl( - rewriter, transformOp, forallOp, blockDims, syncAfterDistribute, - gpuThreadIdBuilder); - // Use silenceable failure to encode "failure to match" and pass through. - if (diag.isDefiniteFailure()) - return WalkResult::interrupt(); - return WalkResult::advance(); }); if (walkResult.wasInterrupted()) @@ -1447,7 +1204,6 @@ // Mapping to block ids. SmallVector blockDims{getBlockDims()}; - DiagnosedSilenceableFailure diag = checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt, blockDims[0], blockDims[1], blockDims[2]); @@ -1465,7 +1221,7 @@ rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); diag = mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims, - getWarpDims(), getSyncAfterDistribute()); + getWarpSize(), getSyncAfterDistribute()); results.push_back(gpuLaunch.getOperation()); return diag; diff --git a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp @@ -0,0 +1,322 @@ +//===- Utils.cpp - Utils for GPU transform ops ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/TransformOps/Utils.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Value.h" +#include "mlir/IR/Visitors.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Debug.h" + +using namespace mlir; +using namespace mlir::gpu; +using namespace mlir::transform; +using namespace mlir::transform::gpu; + +#define DEBUG_TYPE "gpu-transforms" + +#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") +#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ") + +/// Return a flattened thread id for the workgroup with given sizes. +template +static Value buildLinearId(RewriterBase &rewriter, Location loc, + ArrayRef originalBasisOfr) { + LLVM_DEBUG(llvm::interleaveComma( + originalBasisOfr, + DBGS() << "----buildLinearId with originalBasisOfr: "); + llvm::dbgs() << "\n"); + assert(originalBasisOfr.size() == 3 && "expected 3 sizes"); + IndexType indexType = rewriter.getIndexType(); + AffineExpr tx, ty, tz, BDX, BDY; + bindDims(rewriter.getContext(), tx, ty, tz); + bindSymbols(rewriter.getContext(), BDX, BDY); + SmallVector vals{ + rewriter.create(loc, indexType, Dimension::x) + .getResult(), + rewriter.create(loc, indexType, Dimension::y) + .getResult(), + rewriter.create(loc, indexType, Dimension::z) + .getResult(), + originalBasisOfr[0], originalBasisOfr[1]}; + OpFoldResult ofr = affine::makeComposedFoldedAffineApply( + rewriter, loc, tx + ty * BDX + tz * BDX * BDY, vals); + return getValueOrCreateConstantIndexOp(rewriter, loc, ofr); +} + +/// Create a linear id builder that takes the `originalBasisOfr` and decompose +/// it in the basis of `forallMappingSizes`. The linear id builder returns an +/// n-D vector of ids for indexing and 1-D size + id for predicate generation. +template +static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity = 1) { + auto res = [multiplicity](RewriterBase &rewriter, Location loc, + ArrayRef forallMappingSizes, + ArrayRef originalBasis) { + SmallVector originalBasisOfr = + getAsIndexOpFoldResult(rewriter.getContext(), originalBasis); + OpFoldResult linearId = + buildLinearId(rewriter, loc, originalBasisOfr); + // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in + // "row-major" order. 
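+  // Illustrative example (values taken from the transform-gpu.mlir tests
+  // below, multiplicity = 1): with forallMappingSizes = {10, 2}, the
+  // reversed sizes are {2, 10} and the strides are {10, 1}, so the returned
+  // ids decompose the linear id as
+  //   (linear_id mod 10, linear_id floordiv 10),
+  // matching the affine maps checked for the linear thread mapping case.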
+ SmallVector reverseBasisSizes(llvm::reverse(forallMappingSizes)); + SmallVector strides = computeStrides(reverseBasisSizes); + AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext()); + linearId = affine::makeComposedFoldedAffineApply( + rewriter, loc, d0.floorDiv(multiplicity), {linearId}); + SmallVector delinearizingExprs = delinearize(d0, strides); + SmallVector ids; + // Reverse back to be in [0 .. n] order. + for (AffineExpr e : llvm::reverse(delinearizingExprs)) { + ids.push_back( + affine::makeComposedAffineApply(rewriter, loc, e, {linearId})); + } + + // clang-format off + LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes, + DBGS() << "--delinearization basis: "); + llvm::dbgs() << "\n"; + llvm::interleaveComma(strides, + DBGS() << "--delinearization strides: "); + llvm::dbgs() << "\n"; + llvm::interleaveComma(delinearizingExprs, + DBGS() << "--delinearization exprs: "); + llvm::dbgs() << "\n"; + llvm::interleaveComma(ids, DBGS() << "--ids: "); + llvm::dbgs() << "\n";); + // clang-format on + + // Return n-D ids for indexing and 1-D size + id for predicate generation. + return IdBuilderResult{ + ids, SmallVector{computeProduct(forallMappingSizes)}, + SmallVector{computeProduct(originalBasis)}, + SmallVector{linearId.get()}}; + }; + + return res; +} + +/// Create a simple 3-D id builder that takes the `originalBasisOfr` +/// The 3-D id builder returns a 3-D vector of ids for indexing and 3-D sizes +/// + ids for predicate generation. +template +static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) { + auto res = [multiplicity](RewriterBase &rewriter, Location loc, + ArrayRef forallMappingSizes, + ArrayRef originalBasis) { + IndexType indexType = rewriter.getIndexType(); + SmallVector ids{ + rewriter.create(loc, indexType, Dimension::x), + rewriter.create(loc, indexType, Dimension::y), + rewriter.create(loc, indexType, Dimension::z)}; + AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext()); + ids[0] = affine::makeComposedFoldedAffineApply( + rewriter, loc, d0.floorDiv(multiplicity), {ids[0]}) + .get(); + return IdBuilderResult{ids, SmallVector{forallMappingSizes}, + SmallVector{originalBasis}, ids}; + }; + return res; +} + +namespace mlir { +namespace transform { +namespace gpu { + +GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping, + MappingIdBuilderFnType fn) + : mappingAttributes(), idBuilder() { + if (useLinearMapping) { + for (uint64_t d = static_cast(MappingId::LinearDim0), + e = getMaxEnumValForMappingId(); + d <= e; ++d) + mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value())); + } else { + for (uint64_t d = static_cast(MappingId::DimX), + e = static_cast(MappingId::DimZ); + d <= e; ++d) + mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value())); + } +} + +GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping) + : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) { + return GPUBlockMappingAttr::get(ctx, id); + }) { + idBuilder = useLinearMapping + ? commonLinearIdBuilderFn(/*multiplicity=*/1) + : common3DIdBuilderFn(/*multiplicity=*/1); +} + +GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize, + bool useLinearMapping) + : GpuIdBuilder(ctx, useLinearMapping, + [](MLIRContext *ctx, MappingId id) { + return GPUWarpgroupMappingAttr::get(ctx, id); + }), + warpSize(warpSize) { + idBuilder = useLinearMapping + ? 
commonLinearIdBuilderFn( + /*multiplicity=*/kNumWarpsPerGroup * warpSize) + : common3DIdBuilderFn( + /*multiplicity=*/kNumWarpsPerGroup * warpSize); +} + +GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize, + bool useLinearMapping) + : GpuIdBuilder(ctx, useLinearMapping, + [](MLIRContext *ctx, MappingId id) { + return GPUWarpMappingAttr::get(ctx, id); + }), + warpSize(warpSize) { + idBuilder = + useLinearMapping + ? commonLinearIdBuilderFn(/*multiplicity=*/warpSize) + : common3DIdBuilderFn(/*multiplicity=*/warpSize); +} + +GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping) + : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) { + return GPUThreadMappingAttr::get(ctx, id); + }) { + idBuilder = useLinearMapping + ? commonLinearIdBuilderFn(/*multiplicity=*/1) + : common3DIdBuilderFn(/*multiplicity=*/1); +} + +DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp, + std::optional gridDimX, + std::optional gridDimY, + std::optional gridDimZ, + std::optional blockDimX, + std::optional blockDimY, + std::optional blockDimZ) { + + // TODO: pass a configuration object to set the limits properly. + static constexpr int maxTotalBlockdim = 1024; + static constexpr int maxBlockdimx = 1024; + static constexpr int maxBlockdimy = 1024; + static constexpr int maxBlockdimz = 64; + static constexpr int maxTotalGriddim = 2147483647; + static constexpr int maxGriddimx = 2147483647; + static constexpr int maxGriddimy = 65535; + static constexpr int maxGriddimz = 65535; + + if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > + maxTotalBlockdim || + (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) > + maxTotalGriddim || + blockDimX.value_or(1) > maxBlockdimx || + blockDimY.value_or(1) > maxBlockdimy || + blockDimZ.value_or(1) > maxBlockdimz || + gridDimY.value_or(1) > maxGriddimy || + gridDimZ.value_or(1) > maxGriddimz || + gridDimX.value_or(1) > maxGriddimx) { + return transformOp.emitSilenceableError() + << "Trying to launch a GPU kernel with grid_dims = (" + << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", " + << gridDimZ.value_or(1) << ") block_dims = (" + << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", " + << blockDimZ.value_or(1) << "). It is larger than the limits."; + } + return DiagnosedSilenceableFailure::success(); +} + +DiagnosedSilenceableFailure createGpuLaunch( + RewriterBase &rewriter, Location loc, TransformOpInterface transformOp, + LaunchOp &launchOp, std::optional gridDimX, + std::optional gridDimY, std::optional gridDimZ, + std::optional blockDimX, std::optional blockDimY, + std::optional blockDimZ) { + DiagnosedSilenceableFailure diag = + checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, + blockDimY, blockDimZ); + if (!diag.succeeded()) + return diag; + + auto createConst = [&](int dim) { + return rewriter.create(loc, dim); + }; + OpBuilder::InsertionGuard guard(rewriter); + Value one = createConst(1); + Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one; + Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one; + Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one; + Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one; + Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one; + Value blkSizeZ = blockDimZ.has_value() ? 
createConst(blockDimZ.value()) : one; + launchOp = rewriter.create(loc, gridSizeX, gridSizeY, gridSizeZ, + blkSizeX, blkSizeY, blkSizeZ); + rewriter.setInsertionPointToEnd(&launchOp.getBody().front()); + rewriter.create(loc); + return DiagnosedSilenceableFailure::success(); +} + +/// Alter kernel configuration of the given kernel. +DiagnosedSilenceableFailure alterGpuLaunch( + RewriterBase &rewriter, LaunchOp gpuLaunch, + TransformOpInterface transformOp, std::optional gridDimX, + std::optional gridDimY, std::optional gridDimZ, + std::optional blockDimX, std::optional blockDimY, + std::optional blockDimZ) { + DiagnosedSilenceableFailure diag = + checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, + blockDimY, blockDimZ); + if (!diag.succeeded()) + return diag; + + KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues(); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointAfterValue(currentBlockdim.x); + auto createConstValue = [&](int dim) { + return rewriter.create(currentBlockdim.x.getLoc(), + dim); + }; + + if (gridDimX.has_value()) + gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value())); + if (gridDimY.has_value()) + gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value())); + if (gridDimZ.has_value()) + gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value())); + if (blockDimX.has_value()) + gpuLaunch.getBlockSizeXMutable().assign( + createConstValue(blockDimX.value())); + if (blockDimY.has_value()) + gpuLaunch.getBlockSizeYMutable().assign( + createConstValue(blockDimY.value())); + if (blockDimZ.has_value()) + gpuLaunch.getBlockSizeZMutable().assign( + createConstValue(blockDimZ.value())); + return DiagnosedSilenceableFailure::success(); +} + +} // namespace gpu +} // namespace transform +} // namespace mlir diff --git a/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp b/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp --- a/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp @@ -24,14 +24,14 @@ #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") -static Attribute linearIdX(MLIRContext *ctx) { - return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimX); +static Attribute linearId0(MLIRContext *ctx) { + return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0); } -static Attribute linearIdY(MLIRContext *ctx) { - return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimY); +static Attribute linearId1(MLIRContext *ctx) { + return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1); } -static Attribute linearIdZ(MLIRContext *ctx) { - return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimZ); +static Attribute linearId2(MLIRContext *ctx) { + return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2); } transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx, @@ -78,8 +78,8 @@ std::tie(size, numThreads) = pair; return mlir::ceilDiv(size, numThreads); })); - SmallVector allThreadMappings{linearIdZ(ctx), linearIdY(ctx), - linearIdX(ctx)}; + SmallVector allThreadMappings{linearId2(ctx), linearId1(ctx), + linearId0(ctx)}; // Set the thread mapping. 
this->threadMapping = diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp --- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp +++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp @@ -121,6 +121,20 @@ return std::nullopt; } +std::optional> +getConstantIntValues(ArrayRef ofrs) { + bool failed = false; + SmallVector res = llvm::map_to_vector(ofrs, [&](OpFoldResult ofr) { + auto cv = getConstantIntValue(ofr); + if (!cv.has_value()) + failed = true; + return cv.has_value() ? cv.value() : 0; + }); + if (failed) + return std::nullopt; + return res; +} + /// Return true if `ofr` is constant integer equal to `value`. bool isConstantIntValue(OpFoldResult ofr, int64_t value) { auto val = getConstantIntValue(ofr); diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir --- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir @@ -89,7 +89,7 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !transform.any_op): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{Trying to map to fewer GPU threads than loop iterations but overprovisioning is not yet supported. Try additional tiling of the before mapping or map to more threads.}} + // expected-error @below {{the number of required parallel ressources (blocks or threads) 6300 overflows the number of available ressources 512}} transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] : (!transform.any_op) -> !transform.any_op } @@ -115,7 +115,7 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !transform.any_op): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{unsupported dynamic sizes}} + // expected-error @below {{requires statically sized, normalized forall op}} transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] : (!transform.any_op) -> !transform.any_op } @@ -135,11 +135,11 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !transform.any_op): %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) + %forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [2, 3, 1] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op // expected-error @below {{only bufferized scf.forall can be mapped}} - transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] : (!transform.any_op) -> !transform.any_op + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [96, 4, 1] : (!transform.any_op) -> !transform.any_op } // ----- @@ -250,6 +250,33 @@ // ----- +!type = memref<32x32xf32> +func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type { + %c32 = arith.constant 32 : index + %one = arith.constant 1 : index + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) + threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + { + scf.forall (%i, %j) in (%c32, %c32) { 
+ %4 = memref.load %x[%i, %j] : !type + %5 = memref.load %y[%i, %j] : !type + %6 = arith.mulf %4, %5 : f32 + memref.store %6, %y[%i, %j] : !type + } { mapping = [#gpu.thread, #gpu.warp] } + gpu.terminator + } + return %y : !type +} + +transform.sequence failures(propagate) { +^bb1(%arg0: !transform.any_op): + %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{cannot mix different mapping types, use nesting}} + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op +} + +// ----- + !type = memref<32x32xf32> func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type { %c32 = arith.constant 32 : index @@ -271,6 +298,33 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !transform.any_op): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{duplicated attribute, cannot map different loops to the same processor}} + // expected-error @below {{duplicate attribute, cannot map different loops to the same mapping id}} + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op +} + +// ----- + +!type = memref<32x32xf32> +func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type { + %c32 = arith.constant 32 : index + %one = arith.constant 1 : index + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) + threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + { + scf.forall (%i, %j) in (%c32, %c32) { + %4 = memref.load %x[%i, %j] : !type + %5 = memref.load %y[%i, %j] : !type + %6 = arith.mulf %4, %5 : f32 + memref.store %6, %y[%i, %j] : !type + } { mapping = [#gpu.thread, #gpu.thread] } + gpu.terminator + } + return %y : !type +} + +transform.sequence failures(propagate) { +^bb1(%arg0: !transform.any_op): + %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{cannot mix linear and non-linear mapping modes}} transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir --- a/mlir/test/Dialect/GPU/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -3,11 +3,11 @@ !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// CHECK-LABEL: func.func @saxpy2dblock( +// CHECK-LABEL: func.func @blocks_3d( // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> // CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> // CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32> -func.func @saxpy2dblock(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { +func.func @blocks_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index %one = arith.constant 1 : index @@ -41,11 +41,112 @@ !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// CHECK-LABEL: func.func @saxpy2d( +// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0) -> (d0 floordiv 128)> + +// CHECK-LABEL: func.func @warpgroup_3d( +// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: 
%[[ARGT:[0-9a-z]+]]: memref<32xf32> +func.func @warpgroup_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %one = arith.constant 1 : index + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index + // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index + // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index + // CHECK-DAG: %[[C512:.*]] = arith.constant 512 : index + +// CHECK: gpu.launch +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDY:.*]] = gpu.thread_id y +// CHECK-DAG: %[[WG:.*]] = affine.apply #[[$MAP]](%[[TIDX]]) +// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[WG]], %[[C3]] : index +// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index +// CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1 +// CHECK: scf.if %[[COND]] +// CHECK: memref.load %[[ARGX]][%[[WG]], %[[TIDY]]] +// CHECK: memref.load %[[ARGY]][%[[WG]], %[[TIDY]]] + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) + threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + { + scf.forall (%i, %j) in (%c3, %c1) { + %4 = memref.load %x[%i, %j] : !type + %5 = memref.load %y[%i, %j] : !type + %6 = math.fma %alpha, %4, %5 : f32 + memref.store %6, %y[%i, %j] : !type + } { mapping = [#gpu.warpgroup, #gpu.warpgroup]} + gpu.terminator + } + return %y : !type +} + +transform.sequence failures(propagate) { +^bb1(%arg0: !transform.any_op): + %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [512, 2, 1] : (!transform.any_op) -> !transform.any_op +} + +// ----- + +!type = memref<2 x 32 x f32> +!type1d = memref<32 x f32> + +// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> + +// CHECK-LABEL: func.func @warp_3d( +// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32> +func.func @warp_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %one = arith.constant 1 : index + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index + // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index + // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index + // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index + // CHECK-DAG: %[[c64:.*]] = arith.constant 64 : index + +// CHECK: gpu.launch +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDY:.*]] = gpu.thread_id y +// CHECK-DAG: %[[W:.*]] = affine.apply #[[$MAP]](%[[TIDX]]) +// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[W]], %[[C2]] : index +// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C3]] : index +// CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1 +// CHECK: scf.if %[[COND]] +// CHECK: memref.load %[[ARGX]][%[[W]], %[[TIDY]]] +// CHECK: memref.load %[[ARGY]][%[[W]], %[[TIDY]]] + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) + threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + { + scf.forall (%i, %j, %k) in (%c2, %c3, %c3) { + %4 = memref.load %x[%i, %j] : !type + %5 = memref.load %y[%i, %j] : !type + %6 = math.fma %alpha, %4, %5 : f32 + memref.store %6, %y[%i, %j] : !type + } { mapping = [#gpu.warp, #gpu.warp, #gpu.warp]} + 
gpu.terminator + } + return %y : !type +} + +transform.sequence failures(propagate) { +^bb1(%arg0: !transform.any_op): + %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [64, 4, 3] warp_size = 16: (!transform.any_op) -> !transform.any_op +} + +// ----- + +!type = memref<2 x 32 x f32> +!type1d = memref<32 x f32> + +// CHECK-LABEL: func.func @threads_3d( // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> // CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> // CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32> -func.func @saxpy2d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { +func.func @threads_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { %one = arith.constant 1 : index %c12 = arith.constant 12 : index %c9 = arith.constant 9 : index @@ -231,30 +332,142 @@ transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false : (!transform.any_op) -> !transform.any_op } + // ----- !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) floordiv 32) mod 3)> -// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 12) floordiv 32) mod 6) floordiv 3)> +// CHECK-DAG: #[[$MAPWGLIN:.*]] = affine_map<(d0, d1, d2) -> (d2 * 2 + (d0 + d1 * 32) floordiv 128)> +// CHECK-DAG: #[[$MAPWGX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 32) floordiv 128) mod 2)> +// CHECK-DAG: #[[$MAPWGY:.*]] = affine_map<(d0, d1, d2) -> (d2 + ((d0 + d1 * 32) floordiv 128) floordiv 2)> -// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 12)> -// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) mod 10)> -// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 20) floordiv 10)> +// CHECK-LABEL: func.func @warpgroup_linear( +// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32> +func.func @warpgroup_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %one = arith.constant 1 : index -// CHECK-LABEL: func.func @map_multi_level( -func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index + +// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x +// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y +// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z +// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWGLIN]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) +// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWGX]](%[[TIDX]], %[[TIDY]]) +// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWGY]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) +// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C6]] : index +// CHECK: scf.if %[[CMPLIN]] +// CHECK: memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]] +// CHECK: memref.load %[[ARGY]][%[[WIDX]], %[[WIDY]]] + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) + threads(%arg6, %arg7, %arg8) 
in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + { + scf.forall (%i, %j) in (%c2, %c3) { + %4 = memref.load %x[%i, %j] : !type + %5 = memref.load %y[%i, %j] : !type + %6 = math.fma %alpha, %4, %5 : f32 + memref.store %6, %y[%i, %j] : !type + } { mapping = [#gpu.warpgroup, #gpu.warpgroup]} + gpu.terminator + } + return %y : !type +} + +transform.sequence failures(propagate) { +^bb1(%arg0: !transform.any_op): + %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 8, 4] : (!transform.any_op) -> !transform.any_op +} + +// ----- + +!type = memref<2 x 32 x f32> +!type1d = memref<32 x f32> + +// CHECK-DAG: #[[$MAPWLIN:.*]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 8 + d0 floordiv 32)> +// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1, d2) -> ((d1 + d2 * 8 + d0 floordiv 32) mod 2)> +// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1, d2) -> ((d1 + d2 * 8 + d0 floordiv 32) floordiv 2)> + +// CHECK-LABEL: func.func @warp_linear( +// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32> +func.func @warp_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %one = arith.constant 1 : index + +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index + +// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x +// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y +// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z +// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWLIN]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) +// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) +// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) +// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C6]] : index +// CHECK: scf.if %[[CMPLIN]] +// CHECK: memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]] +// CHECK: memref.load %[[ARGY]][%[[WIDX]], %[[WIDY]]] + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) + threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + { + scf.forall (%i, %j) in (%c2, %c3) { + %4 = memref.load %x[%i, %j] : !type + %5 = memref.load %y[%i, %j] : !type + %6 = math.fma %alpha, %4, %5 : f32 + memref.store %6, %y[%i, %j] : !type + } { mapping = [#gpu.warp, #gpu.warp]} + gpu.terminator + } + return %y : !type +} + +transform.sequence failures(propagate) { +^bb1(%arg0: !transform.any_op): + %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 8, 4] : (!transform.any_op) -> !transform.any_op +} + +// ----- + +!type = memref<2 x 32 x f32> +!type1d = memref<32 x f32> + +// CHECK-DAG: #[[$MAPWLIN:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 18) floordiv 32)> +// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 18) floordiv 32) mod 3)> +// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 18) floordiv 32) mod 6) floordiv 3)> + +// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 18)> +// CHECK-DAG: 
#[[$MAPLX:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 18) mod 10)> +// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 18) floordiv 10)> + +// CHECK-LABEL: func.func @map_multi_level_linear( +func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { %one = arith.constant 1 : index %c10 = arith.constant 10 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index + // CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index // CHECK-DAG: %[[C11:.*]] = arith.constant 11 : index - // CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index + // CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index // CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index // check that both the thread level and the warp level got distributed. @@ -272,17 +485,16 @@ memref.store %6, %y[%i, %j] : !type } { mapping = [#gpu.thread, #gpu.thread]} + // CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWLIN]](%[[TIDX]], %[[TIDY]]) // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]]) // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]]) - // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[WIDX]], %[[C1]] : index - // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[WIDY]], %[[C1]] : index - // CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1 - // CHECK: scf.if %[[COND]] - scf.forall (%i) in (%c1) { - %7 = memref.load %t[%i] : !type1d + // CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C6]] : index + // CHECK: scf.if %[[CMPLIN]] + scf.forall (%i, %j, %k) in (%c3, %c2, %c1) { + %7 = memref.load %x[%i, %j] : !type %8 = arith.addf %alpha, %7 : f32 - memref.store %8, %t[%i] : !type1d - } {mapping = [#gpu.warp] } + memref.store %8, %y[%i, %j] : !type + } {mapping = [#gpu.warp, #gpu.warp, #gpu.warp] } // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]]) // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]]) @@ -295,7 +507,7 @@ %7 = memref.load %t[%i] : !type1d %8 = arith.addf %alpha, %7 : f32 memref.store %8, %t[%j] : !type1d - } {mapping = [#gpu.linear, #gpu.linear] } + } {mapping = [#gpu.thread, #gpu.thread] } gpu.terminator } return %y : !type @@ -305,41 +517,103 @@ ^bb1(%arg0: !transform.any_op): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.gpu.map_nested_forall_to_threads %funcop - block_dims = [12, 11, 1] warp_dims = [3, 2, 1] : (!transform.any_op) -> !transform.any_op + block_dims = [18, 11, 1] : (!transform.any_op) -> !transform.any_op } // ----- -// CHECK-LABEL: func.func @tiling_buffer_semantic_op( -// CHECK: gpu.launch {{.*}} { -// CHECK: scf.forall {{.*}} { -// CHECK: memref.subview -// CHECK: memref.subview -// CHECK: linalg.generic -// CHECK: } -// CHECK: } -func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) { +!type = memref<2 x 32 x f32> +!type1d = memref<32 x f32> + +// CHECK-DAG: #[[$MAPBLIN:.*]] = affine_map<(d0, d1, d2) -> (d0 + d1 * 12 + d2 * 108)> +// CHECK-DAG: #[[$MAPBX:.*]] = affine_map<(d0, d1, d2) -> ((d0 + d1 * 12 + d2 * 108) mod 7)> +// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<(d0, d1, d2) -> ((d0 + d1 * 12 + d2 * 108) floordiv 7)> + +// CHECK-LABEL: func.func @block_linear_existing_launch( +// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> +// 
CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32> +func.func @block_linear_existing_launch( + %x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %c9 = arith.constant 9 : index + %c7 = arith.constant 7 : index %one = arith.constant 1 : index + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index + // CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index + // CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index + // CHECK-DAG: %[[C63:.*]] = arith.constant 63 : index +// CHECK: gpu.launch async [{{.*}}] blocks({{.*}}) in (%{{.*}} = %[[C12]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]]) threads +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z +// CHECK-DAG: %[[BIDLIN:.*]] = affine.apply #[[$MAPBLIN]](%[[BIDX]], %[[BIDY]], %[[BIDZ]]) +// CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]](%[[BIDX]], %[[BIDY]], %[[BIDZ]]) +// CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]](%[[BIDX]], %[[BIDY]], %[[BIDZ]]) +// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[BIDLIN]], %[[C63]] : index +// CHECK: scf.if %[[CMPLIN]] +// CHECK: memref.load %[[ARGX]][%[[BLX]], %[[BLY]]] +// CHECK: memref.load %[[ARGY]][%[[BLX]], %[[BLY]]] %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - linalg.generic - {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%x : memref<32x32xf32>) - outs(%y : memref<32x32xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } + scf.forall (%i, %j) in (%c7, %c9) { + %4 = memref.load %x[%i, %j] : !type + %5 = memref.load %y[%i, %j] : !type + %6 = math.fma %alpha, %4, %5 : f32 + memref.store %6, %y[%i, %j] : !type + } { mapping = [#gpu.block, #gpu.block]} gpu.terminator } - return + return %y : !type +} + +transform.sequence failures(propagate) { +^bb1(%arg0: !transform.any_op): + %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.gpu.map_forall_to_blocks %funcop grid_dims = [12, 9, 1] : (!transform.any_op) -> !transform.any_op +} + +// ----- + +!type = memref<2 x 32 x f32> +!type1d = memref<32 x f32> + +// CHECK-DAG: #[[$MAPBX:.*]] = affine_map<(d0) -> (d0 mod 7)> +// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 9 + d0 floordiv 7)> + +// CHECK-LABEL: func.func @block_linear_generate_launch( +// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> +// CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32> +func.func @block_linear_generate_launch( + %x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %c9 = arith.constant 9 : index + %c7 = arith.constant 7 : index + %one = arith.constant 1 : index + + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index + // CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index + // CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index +// CHECK: gpu.launch blocks({{.*}}) in (%{{.*}} = %[[C7]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]]) threads +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z +// CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]](%[[BIDX]]) +// CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]](%[[BIDX]], 
%[[BIDY]], %[[BIDZ]]) +// CHECK: memref.load %[[ARGX]][%[[BLX]], %[[BLY]]] +// CHECK: memref.load %[[ARGY]][%[[BLX]], %[[BLY]]] + scf.forall (%i, %j) in (%c7, %c9) { + %4 = memref.load %x[%i, %j] : !type + %5 = memref.load %y[%i, %j] : !type + %6 = math.fma %alpha, %4, %5 : f32 + memref.store %6, %y[%i, %j] : !type + } { mapping = [#gpu.block, #gpu.block]} + + return %y : !type } transform.sequence failures(propagate) { ^bb1(%arg0: !transform.any_op): - %matmul = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.gpu.map_forall_to_blocks %funcop generate_gpu_launch : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Linalg/transform-op-gpu-map-copy-to-threads.mlir b/mlir/test/Dialect/Linalg/transform-op-gpu-map-copy-to-threads.mlir --- a/mlir/test/Dialect/Linalg/transform-op-gpu-map-copy-to-threads.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-gpu-map-copy-to-threads.mlir @@ -9,7 +9,7 @@ /// minor transfer size -> 1 thread. // CHECK: scf.forall {{.*}} in (1) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<8xf16> - // CHECK: {mapping = [#gpu.linear]} + // CHECK: {mapping = [#gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -63,7 +63,7 @@ /// minor transfer size -> 2 threads. // CHECK: scf.forall {{.*}} in (2) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<8xf16> - // CHECK: {mapping = [#gpu.linear]} + // CHECK: {mapping = [#gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -87,7 +87,7 @@ /// minor transfer size -> 5 threads. // CHECK: scf.forall {{.*}} in (5) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<4xf16> - // CHECK: {mapping = [#gpu.linear]} + // CHECK: {mapping = [#gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -112,7 +112,7 @@ /// minor transfer size -> 5 threads. // CHECK: scf.forall {{.*}} in (5) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<4xf16> - // CHECK: {mapping = [#gpu.linear]} + // CHECK: {mapping = [#gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -136,7 +136,7 @@ /// the transfer size to 4xf16. // CHECK: scf.forall {{.*}} in (32) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<4xf16> - // CHECK: {mapping = [#gpu.linear]} + // CHECK: {mapping = [#gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -159,7 +159,7 @@ /// Enough data for all threads and no need for predication. 
// CHECK: scf.forall {{.*}} in (32) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<8xf16> - // CHECK: {mapping = [#gpu.linear]} + // CHECK: {mapping = [#gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -181,7 +181,7 @@ func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt { // CHECK: scf.forall {{.*}} in (1, 8, 4) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<16x4x16xi8> - // CHECK: {mapping = [#gpu.linear, #gpu.linear, #gpu.linear]} + // CHECK: {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -203,7 +203,7 @@ func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt { // CHECK: scf.forall {{.*}} in (1, 4, 8) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<16x8x8xi8> - // CHECK: {mapping = [#gpu.linear, #gpu.linear, #gpu.linear]} + // CHECK: {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -225,7 +225,7 @@ func.func @copy_3d_4x8x16xi8(%t0: !tt, %out: !tt) -> !tt { // CHECK: scf.forall {{.*}} in (4, 8, 1) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<1x1x16xi8> - // CHECK: {mapping = [#gpu.linear, #gpu.linear, #gpu.linear]} + // CHECK: {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -247,7 +247,7 @@ func.func @copy_3d_4x8x16xi8(%t0: !tt, %out: !tt) -> !tt { // CHECK: scf.forall {{.*}} in (1, 2, 16) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<4x4x1xi8> - // CHECK: {mapping = [#gpu.linear, #gpu.linear, #gpu.linear]} + // CHECK: {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -273,7 +273,7 @@ // take 3. // CHECK: scf.forall {{.*}} in (3, 1, 7) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<1x5x1xi8> - // CHECK: {mapping = [#gpu.linear, #gpu.linear, #gpu.linear]} + // CHECK: {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -296,7 +296,7 @@ // DP mapping: 5 mandated most minor, then 3 to allow 8 on the outermost. // CHECK: scf.forall {{.*}} in (8, 3, 5) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<2x5x1xi8> - // CHECK: {mapping = [#gpu.linear, #gpu.linear, #gpu.linear]} + // CHECK: {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } @@ -319,7 +319,7 @@ // DP mapping: 5 mandated most minor, then 3 to allow 8 on the outermost. // CHECK: scf.forall {{.*}} in (8, 3, 5) {{.*}} // CHECK: linalg.copy {{.*}} -> tensor<2x5x8xi8> - // CHECK: {mapping = [#gpu.linear, #gpu.linear, #gpu.linear]} + // CHECK: {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt return %0 : !tt } diff --git a/mlir/test/lib/Dialect/GPU/CMakeLists.txt b/mlir/test/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/test/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/test/lib/Dialect/GPU/CMakeLists.txt @@ -8,7 +8,6 @@ MLIRAnalysis MLIRArithDialect MLIRBuiltinToLLVMIRTranslation - MLIRExecutionEngine MLIRFuncDialect MLIRGPUDialect MLIRGPUToGPURuntimeTransforms @@ -35,6 +34,22 @@ MLIRVectorToLLVM ) +# This is how it is defined in mlir/lib/Dialect/GPU/CMakeLists.txt +# We probably want something better project-wide +if(MLIR_ENABLE_CUDA_RUNNER) + # Enable gpu-to-cubin pass. 
+ target_compile_definitions(MLIRGPUTestPasses + PRIVATE + MLIR_GPU_TO_CUBIN_PASS_ENABLE=1 + ) + + set(LIBS + ${LIBS} + + MLIRExecutionEngine + ) +endif() + add_mlir_library(MLIRGPUTestPasses TestConvertGPUKernelToCubin.cpp TestConvertGPUKernelToHsaco.cpp @@ -47,13 +62,3 @@ LINK_LIBS PUBLIC ${LIBS} ) - -# This is how it is defined in mlir/lib/Dialect/GPU/CMakeLists.txt -# We probably want something better project-wide -if(MLIR_ENABLE_CUDA_RUNNER) - # Enable gpu-to-cubin pass. - target_compile_definitions(MLIRGPUTestPasses - PRIVATE - MLIR_GPU_TO_CUBIN_PASS_ENABLE=1 - ) -endif()