diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt --- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IR) add_subdirectory(Transforms) +add_subdirectory(TransformOps) diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/TransformOps/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/CMakeLists.txt @@ -0,0 +1,6 @@ +set(LLVM_TARGET_DEFINITIONS GPUTransformOps.td) +mlir_tablegen(GPUTransformOps.h.inc -gen-op-decls) +mlir_tablegen(GPUTransformOps.cpp.inc -gen-op-defs) +add_public_tablegen_target(MLIRGPUTransformOpsIncGen) + +add_mlir_doc(GPUTransformOps GPUTransformOps Dialects/ -gen-op-doc) diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h @@ -0,0 +1,75 @@ +//===- GPUTransformOps.h - GPU transform ops --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H +#define MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H + +#include "mlir/Dialect/PDL/IR/PDLTypes.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +namespace gpu { +class GpuOp; +} // namespace gpu +} // namespace mlir + +//===----------------------------------------------------------------------===// +// GPU Transform Operations +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc" + +namespace mlir { +class DialectRegistry; +namespace transform { +namespace gpu { + +/// Searches `scf.foreach_thread` ops nested under `target` and maps each such +/// op to GPU threads. Mapping is one-to-one and the induction variables of +/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the +/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in +/// which case, the union of the number of threads is computed and may result in +/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not +/// supported. Dynamic block dim sizes are currently not supported. +DiagnosedSilenceableFailure +mapNestedForeachToThreadsImp(RewriterBase &rewriter, Operation *target, + const SmallVectorImpl &blockDim, + bool syncAfterDistribute, + llvm::Optional transformOp); + +/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is +/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten +/// to gpu.block_id according to the thread_dim_mapping attribute. Dynamic, +/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block +/// dim sizes are currently not supported.
+DiagnosedSilenceableFailure mapForeachToBlocksImp( + RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, + function_ref &)> + blockIdGenerator, + SmallVectorImpl &gridDims, TransformOpInterface transformOp); + +/// Finds the top level scf::ForeachThreadOp of given target. +DiagnosedSilenceableFailure +findTopLevelForeachThreadOp(Operation *target, + scf::ForeachThreadOp &topLevelForeachThreadOp, + TransformOpInterface transformOp); + +} // namespace gpu +} // namespace transform + +namespace gpu { +void registerTransformDialectExtension(DialectRegistry ®istry); +} // namespace gpu +} // namespace mlir + +#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td @@ -0,0 +1,175 @@ +//===- GPUTransformOps.td - GPU transform ops --------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef GPU_TRANSFORM_OPS +#define GPU_TRANSFORM_OPS + +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/IR/TransformEffects.td" +include "mlir/Dialect/Transform/IR/TransformInterfaces.td" +include "mlir/Dialect/PDL/IR/PDLTypes.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/OpBase.td" + +def MapNestedForeachToThreads : + Op { + let description = [{ + Target the `gpu.launch op` and rewrite all `scf.foreach_thread` + nested in it to distributed `gpu.thread_id` attribute. + + The operation searches for `scf.foreach_thread` ops nested under `target` + and maps each such op to GPU threads. Mapping is one-to-one and the + induction variables of `scf.foreach_thread` are rewritten to + `gpu.thread_id` according to the `thread_dim_mapping` attribute. + + Sibling `scf.foreach_thread` are supported in which case, the union of + the number of threads is computed and may result in predication. + + Multiple scf.foreach_thread are supported per `gpu.launch` in which case, + the max of all the threads is computed and taken for the global + `gpu.thread_id`. If necessary, `scf.foreach_thread` that do not use the + whole thread range result in predicated computations. + + Dynamic `scf.foreach_thread` trip counts are currently not supported. + Dynamic block dim sizes are currently not supported. + + Only **bufferized** `scf.foreach_thread` are currently supported. + Only `scf.foreach_thread` distributed to **at most 3 dimensions** are + currently supported. + + Barriers are inserted after each scf.foreach_thread op for now. + + The operation alters the block size of the given gpu_launch using + blockDim argument. + + #### Return modes: + + This operation ignores non-gpu_launch ops and drops them in the return. + + If any scf.foreach_thread with tensors is found, the transform definitely + fails. + + If all the scf.foreach_thread operations contained within the LaunchOp + referred to by the `target` PDLOperation lower to GPU properly, the + transform succeeds. Otherwise the transform definitely fails. 
+ + The returned handle points to the same LaunchOp operand, consuming it and + producing a new SSA value to satisfy chaining and linearity of the IR + properties. + + #### Example: + + ``` + gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) + threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) { + scf.foreach_thread (%i, %j) in (7, 9) { + ... // body 1 + } {thread_dim_mapping = [1, 0, 2]} + scf.foreach_thread (%i) in (12) { + ... // body 2 + } + gpu.terminator + } + ``` + is translated to: + + ``` + %bdimX = arith.constant 12 : index + %bdimY = arith.constant 9 : index + gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) + threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) { + if (threadIdx.x < 9 && threadIdx.y < 7) { + ... // body 1 + } + gpu.barrier + if (threadIdx.y < 1) { + ... // body 2 + } + gpu.barrier + gpu.terminator + } + ``` + }]; + + let arguments = (ins PDL_Operation:$target, + DefaultValuedAttr:$blockDim, + DefaultValuedAttr:$syncAfterDistribute); + let results = (outs PDL_Operation:$result); + + let assemblyFormat = "$target attr-dict"; + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::Operation *target, + ::llvm::SmallVectorImpl<::mlir::Operation *> &results, + ::mlir::transform::TransformState &state); + }]; +} + + +def MapForeachToBlocks : + Op { + let description = [{ + Target the gpu_launch op and rewrite the top level `scf.foreach_thread` + to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute + is set, then first generates `gpu_launch` and moves the top level + `scf.foreach_thread` inside. + + The operation searches top level `scf.foreach_thread` ops under + `gpu_launch` and maps each such op to GPU blocks. Mapping is + one-to-one and the induction variables of `scf.foreach_thread` are + rewritten to gpu.block_id according to the `thread_dim_mapping` attribute. + + Dynamic, `scf.foreach_thread` trip counts are currently not supported. + Dynamic block dim sizes are currently not supported. + + Only **bufferized** scf.foreach_thread are currently supported. + Only scf.foreach_thread distributed to **at most 3 dimensions** are + currently supported. + + The operation alters the grid size of the given gpu_launch using the + gridDim argument. + + #### Return modes: + + This operation ignores non-gpu_launch ops and drops them in the return. + + If any scf.foreach_thread with tensors is found, the transform definitely + fails. + + If all the scf.foreach_thread operations contained within the LaunchOp + referred to by the `target` PDLOperation lower to GPU properly, the + transform succeeds. Otherwise the transform definitely fails. + + The returned handle points to the same LaunchOp operand, consuming it and + producing a new SSA value to satisfy chaining and linearity of the IR + properties.
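+ + #### Example: + + A usage sketch adapted from the transform-gpu.mlir test updated in this patch; the `%funcop` handle and the enclosing transform sequence are illustrative context, and the second op only demonstrates chaining on the returned handle: + + ``` + %funcop = transform.structured.match ops{["func.func"]} in %arg0 + %gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch } + transform.gpu.map_nested_foreach_to_threads %gpuLaunch { blockDim = [32, 4, 1] } + ```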
+ }]; + + let arguments = (ins PDL_Operation:$target, + DefaultValuedAttr:$gridDim, + UnitAttr:$generate_gpu_launch); + let results = (outs PDL_Operation:$result); + + let assemblyFormat = "$target attr-dict"; + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::Operation *target, + ::llvm::SmallVectorImpl<::mlir::Operation *> &results, + ::mlir::transform::TransformState &state); + }]; +} + +#endif // GPU_TRANSFORM_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -751,161 +751,6 @@ }]; } -def MapNestedForeachThreadToGpuThreads : - Op { - let description = [{ - Target the gpu_launch op and rewrite all scf.foreach_thread - to distributed gpu.thread_id attribute. - - The operation searches `scf.foreach_thread` ops nested under `target` - and maps each such op to GPU threads. Mapping is one-to-one and the - induction variables of `scf.foreach_thread` are rewritten to - gpu.thread_id according to the thread_dim_apping attribute. - - Sibling `scf.foreach_thread` are supported in which case, the union of - the number of threads is computed and may result in predication. - - Multiple scf.foreach_thread are supported per function in which case, the - max of all the threads is computed and taken for the global gpu.thread_id. - If necessary, scf.foreach_thread that do not use the whole thread range - result in predicated computations. - - Dynamic, `scf.foreach_thread` trip counts are currently not supported. - Dynamic block dim sizes are currently not supported. - - Only **bufferized** scf.foreach_thread are currently supported. - Only scf.foreach_thread distributed to **at most 3 dimensions** are - currently supported. - - Barriers are inserted after each scf.foreach_thread op for now. - - The operation alters the block size of the given gpu_launch using - blockDim argument. - - #### Return modes: - - This operation ignores non-gpu_launch ops and drops them in the return. - - If any scf.foreach_thread with tensors is found, the transform definitely - fails. - - If all the scf.foreach_thread operations contained within the LaunchOp - referred to by the `target` PDLOperation lower to GPU properly, the - transform succeeds. Otherwise the transform definitely fails. - - The returned handle points to the same LaunchOp operand, consuming it and - producing a new SSA value to satisfy chaining and linearity of the IR - properties. - - #### Example: - - ``` - gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) - threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) { - scf.foreach_thread (%i, %j) in (7, 9) { - ... // body 1 - } {thread_dim_mapping = [1, 0, 2]} - scf.foreach_thread (%i) in (12) { - ... // body 2 - } - gpu.terminator - } - ``` - is translated to: - - ``` - %bdimX = arith.constant 12 : index - %bdimY = arith.constant 9 : index - gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) - threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) { - if (threadIdx.x < 9 && threadIdx.y < 7) { - ... // body 1 - } - gpu.barrier - if (threadIdx.y < 1) { - ... 
// body 2 - } - gpu.barrier - gpu.terminator - } - ``` - }]; - - let arguments = (ins PDL_Operation:$target, - DefaultValuedAttr:$blockDim, - DefaultValuedAttr:$syncAfterDistribute); - let results = (outs PDL_Operation:$result); - - let assemblyFormat = "$target attr-dict"; - let extraClassDeclaration = [{ - ::mlir::DiagnosedSilenceableFailure applyToOne( - ::mlir::Operation *target, - ::llvm::SmallVectorImpl<::mlir::Operation *> &results, - ::mlir::transform::TransformState &state); - }]; -} - -def MapNestedForeachThreadToGpuBlocks : Op { - let description = [{ - Target the gpu_launch op and rewrite the top level `scf.foreach_thread` - to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute - is set, then first generates `gpu_launch` and moves the top level - `scf.foreach_thread` inside. - - The operation searches top level `scf.foreach_thread` ops under - `gpu_launch` and maps each such op to GPU blocks. Mapping is - one-to-one and the induction variables of `scf.foreach_thread` are - rewritten to gpu.block_id according to the `thread_dim_apping` attribute. - - Dynamic, `scf.foreach_thread` trip counts are currently not supported. - Dynamic block dim sizes are currently not supported. - - Only **bufferized** scf.foreach_thread are currently supported. - Only scf.foreach_thread distributed to **at most 3 dimensions** are - currently supported. - - The operation alters the block size of the given gpu_launch using - gridDim argument. - - #### Return modes: - - This operation ignores non-gpu_launch ops and drops them in the return. - - If any scf.foreach_thread with tensors is found, the transform definitely - fails. - - If all the scf.foreach_thread operations contained within the LaunchOp - referred to by the `target` PDLOperation lower to GPU properly, the - transform succeeds. Otherwise the transform definitely fails. - - The returned handle points to the same LaunchOp operand, consuming it and - producing a new SSA value to satisfy chaining and linearity of the IR - properties. - }]; - - let arguments = (ins PDL_Operation:$target, - DefaultValuedAttr:$gridDim, - UnitAttr:$generate_gpu_launch); - let results = (outs PDL_Operation:$result); - - let assemblyFormat = "$target attr-dict"; - let extraClassDeclaration = [{ - ::mlir::DiagnosedSilenceableFailure applyToOne( - ::mlir::Operation *target, - ::llvm::SmallVectorImpl<::mlir::Operation *> &results, - ::mlir::transform::TransformState &state); - }]; -} - def VectorizeOp : Op { diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -125,32 +125,6 @@ FailureOr fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand); -/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is -/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten -/// to gpu.block_id according to the thread_dim_apping attribute. Dynamic, -/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block -/// dim sizes are currently not supported. -LogicalResult rewriteTopLevelForeachThreadToGpuBlocks( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - function_ref &)> - blockIdGenerator, - SmallVector &gridDims); - -/// Finds the top level scf::ForeachThreadOp of given target. 
-FailureOr findTopLevelForeachThreadOp(Operation *target); - -/// Searches `scf.foreach_thread` ops nested under `target` and maps each such -/// op to GPU threads. Mapping is one-to-one and the induction variables of -/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the -/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in -/// which case, the union of the number of threads is computed and may result in -/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not -/// supported. Dynamic block dim sizes are currently not supported. -mlir::WalkResult rewriteMapNestedForeachThreadToGpuThreads( - RewriterBase &rewriter, Operation *target, - const SmallVector &blockDim, bool syncAfterDistribute); - /// Split the given `op` into two parts along the given iteration space /// `dimension` at the specified `splitPoint`, and return the two parts. /// diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -31,6 +31,7 @@ #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" @@ -115,6 +116,7 @@ linalg::registerTransformDialectExtension(registry); memref::registerTransformDialectExtension(registry); scf::registerTransformDialectExtension(registry); + gpu::registerTransformDialectExtension(registry); // Register all external models. arith::registerBufferizableOpInterfaceExternalModels(registry); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -82,6 +82,8 @@ MLIRTransformUtils ) +add_subdirectory(TransformOps) + if(MLIR_ENABLE_CUDA_RUNNER) if(NOT MLIR_ENABLE_CUDA_CONVERSIONS) message(SEND_ERROR diff --git a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt @@ -0,0 +1,18 @@ +add_mlir_dialect_library(MLIRGPUTransformOps + GPUTransformOps.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps + + DEPENDS + MLIRGPUTransformOpsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRGPUTransforms + MLIRParser + MLIRPDLDialect + MLIRSideEffectInterfaces + MLIRTransformDialect + MLIRGPUOps + ) diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp @@ -0,0 +1,507 @@ +//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" +#include "mlir/Dialect/PDL/IR/PDL.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" + +using namespace mlir; +using namespace mlir::gpu; +using namespace mlir::transform; + +namespace { +/// A simple pattern rewriter that implements no special logic. +class SimpleRewriter : public PatternRewriter { +public: + SimpleRewriter(MLIRContext *context) : PatternRewriter(context) {} +}; +} // namespace + +/// Determines if the size of the kernel configuration is supported by the GPU +/// architecture being used. It presently makes use of CUDA limitations, however +/// that aspect may be enhanced for other GPUs. +static DiagnosedSilenceableFailure +checkGpuLimits(TransformOpInterface transformOp, Optional gridDimX, + Optional gridDimY, Optional gridDimZ, + Optional blockDimX, Optional blockDimY, + Optional blockDimZ) { + + static constexpr int max_total_blockdim = 1024; + static constexpr int max_blockdimx = 1024; + static constexpr int max_blockdimy = 1024; + static constexpr int max_blockdimz = 64; + static constexpr int max_total_griddim = 2147483647; + static constexpr int max_griddimx = 2147483647; + static constexpr int max_griddimy = 65535; + static constexpr int max_griddimz = 65535; + + if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > + max_total_blockdim || + (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) > + max_total_griddim || + blockDimX.value_or(1) > max_blockdimx || + blockDimY.value_or(1) > max_blockdimy || + blockDimZ.value_or(1) > max_blockdimz || + gridDimY.value_or(1) > max_griddimy || + gridDimZ.value_or(1) > max_griddimz || + gridDimX.value_or(1) > max_griddimx) { + return transformOp.emitSilenceableError() + << "Trying to launch a GPU kernel with gridDim = (" + << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", " + << gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1) + << ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1) + << "). It is larger than the limits."; + } + return DiagnosedSilenceableFailure::success(); +} + +/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and +/// put a terminator within. +static DiagnosedSilenceableFailure +createGpuLaunch(RewriterBase &rewriter, Location loc, + TransformOpInterface transformOp, LaunchOp &launchOp, + Optional gridDimX = llvm::None, + Optional gridDimY = llvm::None, + Optional gridDimZ = llvm::None, + Optional blockDimX = llvm::None, + Optional blockDimY = llvm::None, + Optional blockDimZ = llvm::None) { + DiagnosedSilenceableFailure diag = + checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, + blockDimY, blockDimZ); + if (!diag.succeeded()) + return diag; + + auto createConst = [&](int dim) { + return rewriter.create(loc, dim); + }; + OpBuilder::InsertionGuard guard(rewriter); + Value one = createConst(1); + Value gridSizeX = gridDimX.has_value() ? 
createConst(gridDimX.value()) : one; + Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one; + Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one; + Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one; + Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one; + Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one; + launchOp = rewriter.create(loc, gridSizeX, gridSizeY, gridSizeZ, + blkSizeX, blkSizeY, blkSizeZ); + rewriter.setInsertionPointToEnd(&launchOp.getBody().front()); + rewriter.create(loc); + return DiagnosedSilenceableFailure(success()); +} + +/// Alter kernel configuration of the given kernel. +static DiagnosedSilenceableFailure +alterGpuLaunch(SimpleRewriter &rewriter, LaunchOp gpuLaunch, + TransformOpInterface transformOp, + Optional gridDimX = llvm::None, + Optional gridDimY = llvm::None, + Optional gridDimZ = llvm::None, + Optional blockDimX = llvm::None, + Optional blockDimY = llvm::None, + Optional blockDimZ = llvm::None) { + DiagnosedSilenceableFailure diag = + checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, + blockDimY, blockDimZ); + if (!diag.succeeded()) + return diag; + + KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues(); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointAfterValue(currentBlockdim.x); + auto createConstValue = [&](int dim) { + return rewriter.create(currentBlockdim.x.getLoc(), + dim); + }; + + if (gridDimX.has_value()) + gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value())); + if (gridDimY.has_value()) + gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value())); + if (gridDimZ.has_value()) + gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value())); + if (blockDimX.has_value()) + gpuLaunch.getBlockSizeXMutable().assign( + createConstValue(blockDimX.value())); + if (blockDimY.has_value()) + gpuLaunch.getBlockSizeYMutable().assign( + createConstValue(blockDimY.value())); + if (blockDimZ.has_value()) + gpuLaunch.getBlockSizeZMutable().assign( + createConstValue(blockDimZ.value())); + return DiagnosedSilenceableFailure::success(); +} + +//===----------------------------------------------------------------------===// +// MapForeachToBlocks +//===----------------------------------------------------------------------===// + +DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImp( + RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, + function_ref &)> + blockIdGenerator, + SmallVectorImpl &gridDims, TransformOpInterface transformOp) { + if (foreachThreadOp.getNumResults() > 0) + return transformOp.emitSilenceableError() + << "only bufferized scf.foreach_thread lowers to gpu.block_id"; + if (foreachThreadOp.getNumThreads().size() > 3) + return transformOp.emitSilenceableError() + << "scf.foreach_thread with rank > 3 does not lower to gpu.block_id"; + + // Step 0. Outline the compute workload region and set up the workload + // operands. 
+ FailureOr> potentialGridDim = + foreachThreadOp.getPermutedNumThreads(rewriter); + + if (failed(potentialGridDim) || + llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) { + return !getConstantIntValue(ofr).has_value(); + })) { + return transformOp.emitSilenceableError() << "unsupported dynamic gridDim"; + } + + for (OpFoldResult ofr : *potentialGridDim) + gridDims.push_back(getConstantIntValue(ofr).value()); + + SmallVector blockOps; + blockIdGenerator(rewriter, foreachThreadOp, blockOps); + + // Step 1. Move the body of foreachThreadOp. + // Erase the terminator first, it will not be used since we are on buffers. + rewriter.eraseOp(foreachThreadOp.getTerminator()); + Block *targetBlock = foreachThreadOp->getBlock(); + Block::iterator insertionPoint = Block::iterator(foreachThreadOp); + Block &sourceBlock = foreachThreadOp.getRegion().front(); + targetBlock->getOperations().splice(insertionPoint, + sourceBlock.getOperations()); + + // Step 2. RAUW thread indices to thread ops. + SmallVector threadIndices = + *foreachThreadOp.getPermutedThreadIndices(); + assert(blockOps.size() == 3 && "3 block id ops are required"); + for (auto [blockIdx, blockOp] : llvm::zip(threadIndices, blockOps)) { + Value val = blockIdx; + Value blkOp = blockOp; + if (!val) + continue; + for (Operation *user : llvm::make_early_inc_range(val.getUsers())) + user->replaceUsesOfWith(val, blkOp); + } + + // Step 3. Erase old op. + rewriter.eraseOp(foreachThreadOp); + + return DiagnosedSilenceableFailure::success(); +} + +DiagnosedSilenceableFailure mlir::transform::gpu::findTopLevelForeachThreadOp( + Operation *target, scf::ForeachThreadOp &topLevelForeachThreadOp, + TransformOpInterface transformOp) { + auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { + if (foreachThreadOp->getParentOfType()) + return WalkResult::advance(); + if (topLevelForeachThreadOp) + // TODO: Handle multiple foreach if there are no dependences between them + return WalkResult::interrupt(); + topLevelForeachThreadOp = foreachThreadOp; + return WalkResult::advance(); + }); + + if (walkResult.wasInterrupted()) + return transformOp.emitSilenceableError() + << "could not find a unique topLevel scf.foreach_thread"; + return DiagnosedSilenceableFailure::success(); +} + +/// This is a helper that is only used by MapForeachToBlocks::applyToOne. It +/// generates the GPU dialect's block_id ops. +static void generateGpuBlockIds(RewriterBase &rewriter, + scf::ForeachThreadOp foreachOp, + SmallVectorImpl &blockOps) { + Location loc = foreachOp->getLoc(); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(foreachOp); + IndexType indexType = rewriter.getIndexType(); + SmallVector gpuDims{Dimension::x, Dimension::y, Dimension::z}; + for (int64_t idx : llvm::seq(0, gpuDims.size())) { + blockOps.push_back( + rewriter.create(loc, indexType, gpuDims[idx])); + } +} + +DiagnosedSilenceableFailure +transform::MapForeachToBlocks::applyToOne(Operation *target, + SmallVectorImpl &results, + transform::TransformState &state) { + LaunchOp gpuLaunch = dyn_cast(target); + SimpleRewriter rewriter(getContext()); + auto transformOp = cast(getOperation()); + + if (!getGenerateGpuLaunch() && !gpuLaunch) { + results.assign({target}); + DiagnosedSilenceableFailure diag = + emitSilenceableError() + << "Given target is not gpu.launch, set `generate_gpu_launch` " + "attribute"; + diag.attachNote(target->getLoc()) << "when applied to this payload op"; + return diag; + } + + scf::ForeachThreadOp topLevelForeachThreadOp; + DiagnosedSilenceableFailure diag = + mlir::transform::gpu::findTopLevelForeachThreadOp( + target, topLevelForeachThreadOp, transformOp); + if (!diag.succeeded()) { + results.assign({target}); + diag.attachNote(target->getLoc()) << "when applied to this payload op"; + return diag; + } + + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(topLevelForeachThreadOp); + + // Generate gpu launch here and move the foreach_thread inside + if (getGenerateGpuLaunch()) { + DiagnosedSilenceableFailure diag = + createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch); + if (!diag.succeeded()) { + results.assign({target}); + return diag; + } + rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); + Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp); + rewriter.eraseOp(topLevelForeachThreadOp); + topLevelForeachThreadOp = cast(newForeachThreadOp); + } + + SmallVector gridDim = extractFromI64ArrayAttr(getGridDim()); + diag = mlir::transform::gpu::mapForeachToBlocksImp( + rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim, + transformOp); + if (diag.succeeded()) { + diag = alterGpuLaunch(rewriter, gpuLaunch, + cast(getOperation()), + gridDim[0], gridDim[1], gridDim[2]); + } + + results.assign({gpuLaunch}); + return diag; +} + +//===----------------------------------------------------------------------===// +// MapNestedForeachToThreads +//===----------------------------------------------------------------------===// + +/// Searches `scf.foreach_thread` ops nested under `target` and maps each such +/// op to GPU threads. Mapping is one-to-one and the induction variables of +/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the +/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in +/// which case, the union of the number of threads is computed and may result +/// in predication. Dynamic, `scf.foreach_thread` trip counts are currently +/// not supported. Dynamic block dim sizes are currently not supported.
+static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads( + RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, + const SmallVectorImpl &globalBlockDims, bool syncAfterDistribute, + llvm::Optional transformOp) { + auto failureHelper = + [&](const Twine &message) -> DiagnosedSilenceableFailure { + if (transformOp.has_value()) { + return transformOp->emitSilenceableError() << message; + } + foreachThreadOp->emitError() << message; + return DiagnosedSilenceableFailure::definiteFailure(); + }; + + if (foreachThreadOp.getNumResults() > 0) + return failureHelper( + "only bufferized scf.foreach_thread lowers to gpu.thread_id"); + + if (foreachThreadOp.getNumThreads().size() > 3) + return failureHelper( + "scf.foreach_thread with rank > 3 does not lower to gpu.thread_id"); + + auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter); + if (failed(potentialBlockDim) || + llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) { + return !getConstantIntValue(ofr).has_value(); + })) { + return failureHelper("unsupported dynamic blockdim size"); + } + + SmallVector blockDim = + llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) { + return getConstantIntValue(ofr).value(); + })); + + // Step 1. Create the gpu.thread ops + Location loc = foreachThreadOp.getLoc(); + IndexType indexType = rewriter.getIndexType(); + + SmallVector gpuDims{Dimension::x, Dimension::y, Dimension::z}; + SmallVector threadOps; + for (int64_t idx : llvm::seq(0, blockDim.size())) { + threadOps.push_back( + rewriter.create(loc, indexType, gpuDims[idx])); + } + // Step 2. Maybe create conditionals to predicate the region. + Value predicate; + for (auto [threadId, blockDim, globalBlockDim] : + llvm::zip(threadOps, blockDim, globalBlockDims)) { + if (blockDim > globalBlockDim) { + return failureHelper( + "The GPU threads are fewer than the loop trip counts. " + "Try to tile scf.foreach_thread before mapping."); + } + if (blockDim == globalBlockDim) + continue; + Value blockIdx = rewriter.create(loc, blockDim); + Value tmpPredicate = rewriter.create( + loc, arith::CmpIPredicate::ult, threadId, blockIdx); + predicate = + predicate ? rewriter.create(loc, predicate, tmpPredicate) + : tmpPredicate; + } + + // Step 3. Move the body of foreachThreadOp. + // Erase the terminator first, it will not be used. + rewriter.eraseOp(foreachThreadOp.getTerminator()); + Block *targetBlock; + Block::iterator insertionPoint; + if (predicate) { + // Step 3.a. If predicated, move at the beginning. + auto ifOp = + rewriter.create(loc, predicate, /*withElseRegion=*/false); + targetBlock = ifOp.thenBlock(); + insertionPoint = ifOp.thenBlock()->begin(); + } else { + // Step 3.a. Otherwise, move inline just before foreachThreadOp. + targetBlock = foreachThreadOp->getBlock(); + insertionPoint = Block::iterator(foreachThreadOp); + } + Block &sourceBlock = foreachThreadOp.getRegion().front(); + targetBlock->getOperations().splice(insertionPoint, + sourceBlock.getOperations()); + + // Step 4. RAUW thread indices to thread ops. + SmallVector threadIndices = + *foreachThreadOp.getPermutedThreadIndices(); + for (auto [threadIdx, threadOp] : llvm::zip(threadIndices, threadOps)) { + Value val = threadIdx; + Value op = threadOp; + if (!val) + continue; + for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { + user->replaceUsesOfWith(val, op); + } + } + + // Step 5. syncthreads. + // TODO: Need warpsync + if (syncAfterDistribute) + rewriter.create(loc); + + // Step 6. Erase old op. 
+ rewriter.eraseOp(foreachThreadOp); + + return DiagnosedSilenceableFailure::success(); +} + +DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForeachToThreadsImp( + RewriterBase &rewriter, Operation *target, + const SmallVectorImpl &blockDim, bool syncAfterDistribute, + llvm::Optional transformOp) { + DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success(); + target->walk([&](scf::ForeachThreadOp foreachThreadOp) { + rewriter.setInsertionPoint(foreachThreadOp); + diag = rewriteOneForeachThreadToGpuThreads( + rewriter, foreachThreadOp, blockDim, syncAfterDistribute, transformOp); + return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt(); + }); + return diag; +} + +DiagnosedSilenceableFailure transform::MapNestedForeachToThreads::applyToOne( + ::mlir::Operation *target, + ::llvm::SmallVectorImpl<::mlir::Operation *> &results, + ::mlir::transform::TransformState &state) { + LaunchOp gpuLaunch = dyn_cast(target); + auto transformOp = cast(getOperation()); + + if (!gpuLaunch) { + results.assign({target}); + return emitSilenceableError() << "Given target is not gpu.launch"; + } + + SmallVector blockDim = extractFromI64ArrayAttr(getBlockDim()); + blockDim.resize(/*size=*/3, /*value=*/1); + + DiagnosedSilenceableFailure diag = + checkGpuLimits(transformOp, llvm::None, llvm::None, llvm::None, + blockDim[0], blockDim[1], blockDim[2]); + if (diag.isSilenceableFailure()) { + results.assign({target}); + diag.attachNote(getLoc()) << getBlockDimAttrName() << " is very large"; + return diag; + } + + SimpleRewriter rewriter(getContext()); + rewriter.setInsertionPoint(target); + + diag = mlir::transform::gpu::mapNestedForeachToThreadsImp( + rewriter, target, blockDim, getSyncAfterDistribute(), llvm::None); + if (diag.succeeded()) { + diag = + alterGpuLaunch(rewriter, gpuLaunch, transformOp, llvm::None, llvm::None, + llvm::None, blockDim[0], blockDim[1], blockDim[2]); + } + + results.assign({gpuLaunch}); + return diag; +} + +//===----------------------------------------------------------------------===// +// Transform op registration +//===----------------------------------------------------------------------===// + +namespace { +/// Registers new ops and declares PDL as dependent dialect since the +/// additional ops are using PDL types for operands and results. 
+class GPUTransformDialectExtension + : public transform::TransformDialectExtension< + GPUTransformDialectExtension> { +public: + GPUTransformDialectExtension() { + declareDependentDialect(); + declareGeneratedDialect(); + declareGeneratedDialect(); + declareGeneratedDialect(); + registerTransformOps< +#define GET_OP_LIST +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc" + >(); + } +}; +} // namespace + +#define GET_OP_CLASSES +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc" + +void mlir::gpu::registerTransformDialectExtension(DialectRegistry ®istry) { + registry.addExtensions(); +} diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -1167,392 +1167,6 @@ modifiesPayload(effects); } -//===----------------------------------------------------------------------===// -// MapNestedForeachThreadToGpuThreads -//===----------------------------------------------------------------------===// - -/// Searches `scf.foreach_thread` ops nested under `target` and maps each such -/// op to GPU threads. Mapping is one-to-one and the induction variables of -/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the -/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in -/// which case, the union of the number of threads is computed and may result in -/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not -/// supported. Dynamic block dim sizes are currently not supported. -static FailureOr> rewriteOneForeachThreadToGpuThreads( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - const SmallVector &globalBlockDims, bool syncAfterDistribute) { - if (foreachThreadOp.getNumResults() > 0) - return foreachThreadOp->emitError( - "only bufferized scf.foreach_thread lowers to gpu.thread"); - if (foreachThreadOp.getNumThreads().size() > 3) - return foreachThreadOp->emitError( - "scf.foreach_thread with rank > 3 does not lower to gpu.thread"); - - auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter); - if (failed(potentialBlockDim) || - llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) { - return !getConstantIntValue(ofr).has_value(); - })) - return foreachThreadOp->emitError("unsupported dynamic blockdim size"); - - SmallVector blockDim = - llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) { - return getConstantIntValue(ofr).value(); - })); - - // Step 1. Create the gpu.thread ops - Location loc = foreachThreadOp.getLoc(); - IndexType indexType = rewriter.getIndexType(); - - SmallVector gpuDims{gpu::Dimension::x, gpu::Dimension::y, - gpu::Dimension::z}; - SmallVector threadOps; - for (int64_t idx : llvm::seq(0, blockDim.size())) { - threadOps.push_back( - rewriter.create(loc, indexType, gpuDims[idx])); - } - // Step 2. Maybe create conditionals to predicate the region. - Value predicate; - for (auto [threadId, blockDim, globalBlockDim] : - llvm::zip(threadOps, blockDim, globalBlockDims)) { - if (blockDim > globalBlockDim) { - return foreachThreadOp.emitOpError("blockDim size overflow: ") - << blockDim << " > " << globalBlockDim; - } - if (blockDim == globalBlockDim) - continue; - Value tmpPredicate = rewriter.create( - loc, arith::CmpIPredicate::ult, threadId, - rewriter.create(loc, blockDim)); - predicate = - predicate ? 
rewriter.create(loc, predicate, tmpPredicate) - : tmpPredicate; - } - - // Step 3. Move the body of foreachThreadOp. - // Erase the terminator first, it will not be used. - rewriter.eraseOp(foreachThreadOp.getTerminator()); - Block *targetBlock; - Block::iterator insertionPoint; - if (predicate) { - // Step 3.a. If predicated, move at the beginning. - auto ifOp = - rewriter.create(loc, predicate, /*withElseRegion=*/false); - targetBlock = ifOp.thenBlock(); - insertionPoint = ifOp.thenBlock()->begin(); - } else { - // Step 3.a. Otherwise, move inline just before foreachThreadOp. - targetBlock = foreachThreadOp->getBlock(); - insertionPoint = Block::iterator(foreachThreadOp); - } - Block &sourceBlock = foreachThreadOp.getRegion().front(); - targetBlock->getOperations().splice(insertionPoint, - sourceBlock.getOperations()); - - // Step 4. RAUW thread indices to thread ops. - SmallVector threadIndices = - *foreachThreadOp.getPermutedThreadIndices(); - for (auto it : llvm::zip(threadIndices, threadOps)) { - Value val = std::get<0>(it); - if (!val) - continue; - for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { - rewriter.updateRootInPlace( - user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); }); - } - } - - // Step 5. syncthreads. - // TODO: Need warpsync - if (syncAfterDistribute) - rewriter.create(loc); - - // Step 6. Erase old op. - rewriter.eraseOp(foreachThreadOp); - - return *potentialBlockDim; -} - -mlir::WalkResult mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads( - RewriterBase &rewriter, Operation *target, - const SmallVector &blockDim, bool syncAfterDistribute) { - auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { - rewriter.setInsertionPoint(foreachThreadOp); - if (failed(rewriteOneForeachThreadToGpuThreads( - rewriter, foreachThreadOp, blockDim, syncAfterDistribute))) - return WalkResult::interrupt(); - return WalkResult::advance(); - }); - return walkResult; -} - -static LogicalResult -checkGpuLimits(Optional gridDimX, Optional gridDimY, - Optional gridDimZ, Optional blockDimX, - Optional blockDimY, Optional blockDimZ) { - // TODO The limits should live in the gpu dialect, but it's not like that - // right now. 
Read them in the common gpu dialect - if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > - 1024 || - gridDimY.value_or(1) > 65535 || gridDimZ.value_or(1) > 65535 || - gridDimX.value_or(1) > 2147483647) - return failure(); - return success(); -} - -/// Alter grid or block dimensions of the given kernel -static LogicalResult alterGpuLaunch(SimpleRewriter &rewriter, - gpu::LaunchOp gpuLaunch, - Optional gridDimX = llvm::None, - Optional gridDimY = llvm::None, - Optional gridDimZ = llvm::None, - Optional blockDimX = llvm::None, - Optional blockDimY = llvm::None, - Optional blockDimZ = llvm::None) { - if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, - blockDimZ))) { - gpuLaunch->emitError( - "Requested kernel thread configuration is larger than the limits"); - return failure(); - } - - gpu::KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues(); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointAfterValue(currentBlockdim.x); - auto createConstValue = [&](int dim) { - return rewriter.create(currentBlockdim.x.getLoc(), - dim); - }; - - if (gridDimX.has_value()) - gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value())); - if (gridDimY.has_value()) - gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value())); - if (gridDimZ.has_value()) - gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value())); - if (blockDimX.has_value()) - gpuLaunch.getBlockSizeXMutable().assign( - createConstValue(blockDimX.value())); - if (blockDimY.has_value()) - gpuLaunch.getBlockSizeYMutable().assign( - createConstValue(blockDimY.value())); - if (blockDimZ.has_value()) - gpuLaunch.getBlockSizeZMutable().assign( - createConstValue(blockDimZ.value())); - return success(); -} - -DiagnosedSilenceableFailure -transform::MapNestedForeachThreadToGpuThreads::applyToOne( - Operation *target, SmallVectorImpl &results, - transform::TransformState &state) { - - gpu::LaunchOp gpuLaunch = dyn_cast(target); - if (!gpuLaunch) { - target->emitError("Given target is not gpu.launch"); - return DiagnosedSilenceableFailure::definiteFailure(); - } - - SmallVector blockDim = extractFromI64ArrayAttr(getBlockDim()); - blockDim.resize(/*size=*/3, /*value=*/1); - SimpleRewriter rewriter(getContext()); - rewriter.setInsertionPoint(target); - auto walkResult = mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads( - rewriter, target, blockDim, getSyncAfterDistribute()); - if (walkResult.wasInterrupted()) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - - LogicalResult result = - alterGpuLaunch(rewriter, gpuLaunch, llvm::None, llvm::None, llvm::None, - blockDim[0], blockDim[1], blockDim[2]); - if (failed(result)) - return DiagnosedSilenceableFailure::definiteFailure(); - - results.assign({target}); - return DiagnosedSilenceableFailure(success()); -} - -//===----------------------------------------------------------------------===// -// MapNestedForeachThreadToGpuBlocks -//===----------------------------------------------------------------------===// - -LogicalResult mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - function_ref &)> - blockIdGenerator, - SmallVector &gridDims) { - if (foreachThreadOp.getNumResults() > 0) - return foreachThreadOp->emitError( - "only bufferized scf.foreach_thread lowers to gpu.block_id"); - if (foreachThreadOp.getNumThreads().size() > 3) - return foreachThreadOp->emitError( - 
"scf.foreach_thread with rank > 3 does not lower to gpu.block_id"); - - // Step 0. Outline the compute workload region and set up the workload - // operands. - auto potentialGridDim = foreachThreadOp.getPermutedNumThreads(rewriter); - if (failed(potentialGridDim) || - llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) { - return !getConstantIntValue(ofr).has_value(); - })) - return foreachThreadOp->emitError("unsupported dynamic gridDim"); - - for (OpFoldResult ofr : *potentialGridDim) - gridDims.push_back(getConstantIntValue(ofr).value()); - - SmallVector blockOps; - blockIdGenerator(rewriter, foreachThreadOp, blockOps); - - // Step 1. Move the body of foreachThreadOp. - // Erase the terminator first, it will not be used since we are on buffers. - rewriter.eraseOp(foreachThreadOp.getTerminator()); - Block *targetBlock = foreachThreadOp->getBlock(); - Block::iterator insertionPoint = Block::iterator(foreachThreadOp); - Block &sourceBlock = foreachThreadOp.getRegion().front(); - targetBlock->getOperations().splice(insertionPoint, - sourceBlock.getOperations()); - - // Step 2. RAUW thread indices to thread ops. - SmallVector threadIndices = - *foreachThreadOp.getPermutedThreadIndices(); - assert(blockOps.size() == 3 && "3 block id ops are required"); - for (auto it : llvm::zip(threadIndices, blockOps)) { - Value val = std::get<0>(it); - if (!val) - continue; - for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { - rewriter.updateRootInPlace( - user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); }); - } - } - - // Step 3. Erase old op. - rewriter.eraseOp(foreachThreadOp); - - return success(); -} - -FailureOr -mlir::linalg::findTopLevelForeachThreadOp(Operation *target) { - scf::ForeachThreadOp topLevelForeachThreadOp; - auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { - if (foreachThreadOp->getParentOfType()) - return WalkResult::advance(); - if (topLevelForeachThreadOp) - // TODO Handle multiple foreach if there is no dependences between them - return WalkResult::interrupt(); - topLevelForeachThreadOp = foreachThreadOp; - return WalkResult::advance(); - }); - - if (walkResult.wasInterrupted()) - return target->emitError( - "could not find a unique topLevel scf.foreach_thread"); - - return topLevelForeachThreadOp; -} - -/// Create gpuLauncOp with given kernel configurations -static FailureOr -createGpuLaunch(RewriterBase &rewriter, Location loc, - Optional gridDimX = llvm::None, - Optional gridDimY = llvm::None, - Optional gridDimZ = llvm::None, - Optional blockDimX = llvm::None, - Optional blockDimY = llvm::None, - Optional blockDimZ = llvm::None) { - if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, - blockDimZ))) - return failure(); - auto createConstant = [&](int dim) { - return rewriter.create(loc, dim); - }; - Value one = createConstant(1); - Value gridSizeX = - gridDimX.has_value() ? createConstant(gridDimX.value()) : one; - Value gridSizeY = - gridDimY.has_value() ? createConstant(gridDimY.value()) : one; - Value gridSizeZ = - gridDimZ.has_value() ? createConstant(gridDimZ.value()) : one; - Value blockSizeX = - blockDimX.has_value() ? createConstant(blockDimX.value()) : one; - Value blockSizeY = - blockDimY.has_value() ? createConstant(blockDimY.value()) : one; - Value blockSizeZ = - blockDimZ.has_value() ? 
createConstant(blockDimZ.value()) : one; - auto launchOp = rewriter.create( - loc, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ); - rewriter.setInsertionPointToEnd(&launchOp.getBody().front()); - rewriter.create(loc); - return launchOp; -} - -/// This is an helper that is only used in -/// rewriteTopLevelForeachThreadToGpuBlocks. It generates GPU dialects block_id -static void generateGpuBlockIds(RewriterBase &rewriter, - scf::ForeachThreadOp foreachOp, - SmallVector &blockOps) { - Location loc = foreachOp->getLoc(); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(foreachOp); - IndexType indexType = rewriter.getIndexType(); - SmallVector gpuDims{gpu::Dimension::x, gpu::Dimension::y, - gpu::Dimension::z}; - for (int64_t idx : llvm::seq(0, gpuDims.size())) { - blockOps.push_back( - rewriter.create(loc, indexType, gpuDims[idx])); - } -} - -DiagnosedSilenceableFailure -transform::MapNestedForeachThreadToGpuBlocks::applyToOne( - Operation *target, SmallVectorImpl &results, - transform::TransformState &state) { - gpu::LaunchOp gpuLaunch = dyn_cast(target); - SimpleRewriter rewriter(getContext()); - - if (!getGenerateGpuLaunch() && !gpuLaunch) { - target->emitError("Given target is not gpu.launch, set " - "`generate_gpu_launch` attribute"); - return DiagnosedSilenceableFailure::definiteFailure(); - } - - auto res = mlir::linalg::findTopLevelForeachThreadOp(target); - if (failed(res)) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - - scf::ForeachThreadOp topLevelForeachThreadOp = *res; - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(topLevelForeachThreadOp); - - // Generate gpu launch here and move the foreach_thread inside - if (getGenerateGpuLaunch()) { - FailureOr maybeGpuLaunch = - createGpuLaunch(rewriter, target->getLoc()); - if (failed(maybeGpuLaunch)) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - gpuLaunch = *maybeGpuLaunch; - rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); - Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp); - rewriter.eraseOp(topLevelForeachThreadOp); - topLevelForeachThreadOp = - dyn_cast(newForeachThreadOp); - } - - SmallVector gridDim = extractFromI64ArrayAttr(getGridDim()); - if (failed(mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks( - rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim))) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - - if (failed(alterGpuLaunch(rewriter, gpuLaunch, gridDim[0], gridDim[1], - gridDim[2]))) - return DiagnosedSilenceableFailure::definiteFailure(); - - results.assign({gpuLaunch}); - return DiagnosedSilenceableFailure(success()); -} - //===----------------------------------------------------------------------===// // TileToForeachThreadOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Linalg/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir rename from mlir/test/Dialect/Linalg/transform-gpu.mlir rename to mlir/test/Dialect/GPU/transform-gpu.mlir --- a/mlir/test/Dialect/Linalg/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -35,7 +35,7 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 - transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { blockDim = [12, 9, 1]} + transform.gpu.map_foreach_to_blocks %funcop { 
blockDim = [12, 9, 1]} } } @@ -92,7 +92,7 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 - transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1] } + transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1] } } } @@ -134,8 +134,8 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["func.func"]} in %arg0 - %gpuLaunch = transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { generate_gpu_launch } - transform.structured.map_nested_foreach_thread_to_gpu_threads %gpuLaunch { blockDim = [32, 4, 1] } + %gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch } + transform.gpu.map_nested_foreach_to_threads %gpuLaunch { blockDim = [32, 4, 1] } } } @@ -171,6 +171,6 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 - transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false } + transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false } } } diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3761,6 +3761,65 @@ ]), ) +td_library( + name = "GPUTransformOpsTdFiles", + srcs = [ + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td", + ], + includes = ["include"], + deps = [ + ":PDLDialectTdFiles", + ":TransformDialectTdFiles", + ], +) + +gentbl_cc_library( + name = "GPUTransformOpsIncGen", + strip_include_prefix = "include", + tbl_outs = [ + ( + ["-gen-op-decls"], + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc", + ), + ( + ["-gen-op-defs"], + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td", + deps = [ + ":GPUTransformOpsTdFiles", + ], +) + +cc_library( + name = "GPUTransformOps", + srcs = [ + "lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp", + ], + hdrs = [ + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h", + ], + includes = ["include"], + deps = [ + ":ArithDialect", + ":AsmParser", + ":ControlFlowDialect", + ":GPUDialect", + ":GPUTransformOpsIncGen", + ":GPUTransforms", + ":IR", + ":PDLDialect", + ":Parser", + ":SCFDialect", + ":SideEffectInterfaces", + ":TransformDialect", + ":TransformUtils", + "//llvm:Support", + ], +) + td_library( name = "LLVMOpsTdFiles", srcs = [ @@ -6401,6 +6460,7 @@ ":GPUToROCDLTransforms", ":GPUToSPIRV", ":GPUToVulkanTransforms", + ":GPUTransformOps", ":GPUTransforms", ":IR", ":LLVMDialect",