diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt --- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IR) add_subdirectory(Transforms) +add_subdirectory(TransformOps) diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/TransformOps/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/CMakeLists.txt @@ -0,0 +1,6 @@ +set(LLVM_TARGET_DEFINITIONS GPUTransformOps.td) +mlir_tablegen(GPUTransformOps.h.inc -gen-op-decls) +mlir_tablegen(GPUTransformOps.cpp.inc -gen-op-defs) +add_public_tablegen_target(MLIRGPUTransformOpsIncGen) + +add_mlir_doc(GPUTransformOps GPUTransformOps Dialects/ -gen-op-doc) diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h @@ -0,0 +1,70 @@ +//===- GPUTransformOps.h - GPU transform ops --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H +#define MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H + +#include "mlir/Dialect/PDL/IR/PDLTypes.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +namespace gpu { +class GpuOp; +} // namespace gpu +} // namespace mlir + +//===----------------------------------------------------------------------===// +// GPU Transform Operations +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc" + +namespace mlir { +class DialectRegistry; +namespace transform { +namespace gpu { + +/// Searches `scf.foreach_thread` ops nested under `target` and maps each such +/// op to GPU threads. Mapping is one-to-one and the induction variables of +/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the +/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in +/// which case, the union of the number of threads is computed and may result in +/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not +/// supported. Dynamic block dim sizes are currently not supported. +LogicalResult rewriteMapNestedForeachThreadToGpuThreads( + RewriterBase &rewriter, Operation *target, + const SmallVector &blockDim, bool syncAfterDistribute); + +/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is +/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten +/// to gpu.block_id according to the thread_dim_apping attribute. Dynamic, +/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block +/// dim sizes are currently not supported. +LogicalResult rewriteTopLevelForeachThreadToGpuBlocks( + RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, + function_ref &)> + blockIdGenerator, + SmallVector &gridDims); + +/// Finds the top level scf::ForeachThreadOp of given target. +FailureOr findTopLevelForeachThreadOp(Operation *target); + +} // namespace gpu +} // namespace transform + +namespace gpu { +void registerTransformDialectExtension(DialectRegistry ®istry); +} // namespace gpu +} // namespace mlir + +#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td @@ -0,0 +1,176 @@ +//===- GPUTransformOps.td - GPU transform ops --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef GPU_TRANSFORM_OPS +#define GPU_TRANSFORM_OPS + +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/IR/TransformEffects.td" +include "mlir/Dialect/Transform/IR/TransformInterfaces.td" +include "mlir/Dialect/PDL/IR/PDLTypes.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/OpBase.td" + +def MapNestedForeachToThreads : + Op { + let description = [{ + Target the `gpu_launch op` and rewrite all `scf.foreach_thread` + to distributed gpu.thread_id attribute. + + The operation searches `scf.foreach_thread` ops nested under `target` + and maps each such op to GPU threads. Mapping is one-to-one and the + induction variables of `scf.foreach_thread` are rewritten to + gpu.thread_id according to the thread_dim_apping attribute. + + Sibling `scf.foreach_thread` are supported in which case, the union of + the number of threads is computed and may result in predication. + + Multiple scf.foreach_thread are supported per function in which case, the + max of all the threads is computed and taken for the global gpu.thread_id. + If necessary, scf.foreach_thread that do not use the whole thread range + result in predicated computations. + + Dynamic, `scf.foreach_thread` trip counts are currently not supported. + Dynamic block dim sizes are currently not supported. + + Only **bufferized** scf.foreach_thread are currently supported. + Only scf.foreach_thread distributed to **at most 3 dimensions** are + currently supported. + + Barriers are inserted after each scf.foreach_thread op for now. + + The operation alters the block size of the given gpu_launch using + blockDim argument. + + Return modes: + ============= + This operation ignores non-gpu_launch ops and drops them in the return. + + If any scf.foreach_thread with tensors is found, the transform definitely + fails. + + If all the scf.foreach_thread operations contained within the LaunchOp + referred to by the `target` PDLOperation lower to GPU properly, the + transform succeeds. Otherwise the transform definitely fails. + + The returned handle points to the same LaunchOp operand, consuming it and + producing a new SSA value to satisfy chaining and linearity of the IR + properties. + + Example: + ======== + + ``` + gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) + threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) { + scf.foreach_thread (%i, %j) in (7, 9) { + ... // body 1 + } {thread_dim_mapping = [1, 0, 2]} + scf.foreach_thread (%i) in (12) { + ... // body 2 + } + gpu.terminator + } + ``` + is translated to: + + ``` + %bdimX = arith.constant 12 : index + %bdimY = arith.constant 9 : index + gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) + threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) { + if (threadIdx.x < 9 && threadIdx.y < 7) { + ... // body 1 + } + gpu.barrier + if (threadIdx.y < 1) { + ... // body 2 + } + gpu.barrier + gpu.terminator + } + ``` + }]; + + let arguments = (ins PDL_Operation:$target, + DefaultValuedAttr:$blockDim, + DefaultValuedAttr:$syncAfterDistribute); + let results = (outs PDL_Operation:$result); + + let assemblyFormat = "$target attr-dict"; + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::Operation *target, + ::llvm::SmallVectorImpl<::mlir::Operation *> &results, + ::mlir::transform::TransformState &state); + }]; +} + + +def MapForeachToBlocks : + Op { + let description = [{ + Target the gpu_launch op and rewrite the top level `scf.foreach_thread` + to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute + is set, then first generates `gpu_launch` and moves the top level + `scf.foreach_thread` inside. + + The operation searches top level `scf.foreach_thread` ops under + `gpu_launch` and maps each such op to GPU blocks. Mapping is + one-to-one and the induction variables of `scf.foreach_thread` are + rewritten to gpu.block_id according to the `thread_dim_apping` attribute. + + Dynamic, `scf.foreach_thread` trip counts are currently not supported. + Dynamic block dim sizes are currently not supported. + + Only **bufferized** scf.foreach_thread are currently supported. + Only scf.foreach_thread distributed to **at most 3 dimensions** are + currently supported. + + The operation alters the block size of the given gpu_launch using + gridDim argument. + + #### Return modes: + + This operation ignores non-gpu_launch ops and drops them in the return. + + If any scf.foreach_thread with tensors is found, the transform definitely + fails. + + If all the scf.foreach_thread operations contained within the LaunchOp + referred to by the `target` PDLOperation lower to GPU properly, the + transform succeeds. Otherwise the transform definitely fails. + + The returned handle points to the same LaunchOp operand, consuming it and + producing a new SSA value to satisfy chaining and linearity of the IR + properties. + }]; + + let arguments = (ins PDL_Operation:$target, + DefaultValuedAttr:$gridDim, + UnitAttr:$generate_gpu_launch); + let results = (outs PDL_Operation:$result); + + let assemblyFormat = "$target attr-dict"; + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::Operation *target, + ::llvm::SmallVectorImpl<::mlir::Operation *> &results, + ::mlir::transform::TransformState &state); + }]; +} + +#endif // GPU_TRANSFORM_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -748,162 +748,6 @@ }]; } -def MapNestedForeachThreadToGpuThreads : - Op { - let description = [{ - Target the gpu_launch op and rewrite all scf.foreach_thread - to distributed gpu.thread_id attribute. - - The operation searches `scf.foreach_thread` ops nested under `target` - and maps each such op to GPU threads. Mapping is one-to-one and the - induction variables of `scf.foreach_thread` are rewritten to - gpu.thread_id according to the thread_dim_apping attribute. - - Sibling `scf.foreach_thread` are supported in which case, the union of - the number of threads is computed and may result in predication. - - Multiple scf.foreach_thread are supported per function in which case, the - max of all the threads is computed and taken for the global gpu.thread_id. - If necessary, scf.foreach_thread that do not use the whole thread range - result in predicated computations. - - Dynamic, `scf.foreach_thread` trip counts are currently not supported. - Dynamic block dim sizes are currently not supported. - - Only **bufferized** scf.foreach_thread are currently supported. - Only scf.foreach_thread distributed to **at most 3 dimensions** are - currently supported. - - Barriers are inserted after each scf.foreach_thread op for now. - - The operation alters the block size of the given gpu_launch using - blockDim argument. - - Return modes: - ============= - This operation ignores non-gpu_launch ops and drops them in the return. - - If any scf.foreach_thread with tensors is found, the transform definitely - fails. - - If all the scf.foreach_thread operations contained within the LaunchOp - referred to by the `target` PDLOperation lower to GPU properly, the - transform succeeds. Otherwise the transform definitely fails. - - The returned handle points to the same LaunchOp operand, consuming it and - producing a new SSA value to satisfy chaining and linearity of the IR - properties. - - Example: - ======== - - ``` - gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) - threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) { - scf.foreach_thread (%i, %j) in (7, 9) { - ... // body 1 - } {thread_dim_mapping = [1, 0, 2]} - scf.foreach_thread (%i) in (12) { - ... // body 2 - } - gpu.terminator - } - ``` - is translated to: - - ``` - %bdimX = arith.constant 12 : index - %bdimY = arith.constant 9 : index - gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) - threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) { - if (threadIdx.x < 9 && threadIdx.y < 7) { - ... // body 1 - } - gpu.barrier - if (threadIdx.y < 1) { - ... // body 2 - } - gpu.barrier - gpu.terminator - } - ``` - }]; - - let arguments = (ins PDL_Operation:$target, - DefaultValuedAttr:$blockDim, - DefaultValuedAttr:$syncAfterDistribute); - let results = (outs PDL_Operation:$result); - - let assemblyFormat = "$target attr-dict"; - let extraClassDeclaration = [{ - ::mlir::DiagnosedSilenceableFailure applyToOne( - ::mlir::Operation *target, - ::llvm::SmallVectorImpl<::mlir::Operation *> &results, - ::mlir::transform::TransformState &state); - }]; -} - -def MapNestedForeachThreadToGpuBlocks : Op { - let description = [{ - Target the gpu_launch op and rewrite the top level `scf.foreach_thread` - to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute - is set, then first generates `gpu_launch` and moves the top level - `scf.foreach_thread` inside. - - The operation searches top level `scf.foreach_thread` ops under - `gpu_launch` and maps each such op to GPU blocks. Mapping is - one-to-one and the induction variables of `scf.foreach_thread` are - rewritten to gpu.block_id according to the `thread_dim_apping` attribute. - - Dynamic, `scf.foreach_thread` trip counts are currently not supported. - Dynamic block dim sizes are currently not supported. - - Only **bufferized** scf.foreach_thread are currently supported. - Only scf.foreach_thread distributed to **at most 3 dimensions** are - currently supported. - - The operation alters the block size of the given gpu_launch using - gridDim argument. - - #### Return modes: - - This operation ignores non-gpu_launch ops and drops them in the return. - - If any scf.foreach_thread with tensors is found, the transform definitely - fails. - - If all the scf.foreach_thread operations contained within the LaunchOp - referred to by the `target` PDLOperation lower to GPU properly, the - transform succeeds. Otherwise the transform definitely fails. - - The returned handle points to the same LaunchOp operand, consuming it and - producing a new SSA value to satisfy chaining and linearity of the IR - properties. - }]; - - let arguments = (ins PDL_Operation:$target, - DefaultValuedAttr:$gridDim, - UnitAttr:$generate_gpu_launch); - let results = (outs PDL_Operation:$result); - - let assemblyFormat = "$target attr-dict"; - let extraClassDeclaration = [{ - ::mlir::DiagnosedSilenceableFailure applyToOne( - ::mlir::Operation *target, - ::llvm::SmallVectorImpl<::mlir::Operation *> &results, - ::mlir::transform::TransformState &state); - }]; -} - def VectorizeOp : Op { diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -125,32 +125,6 @@ FailureOr fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand); -/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is -/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten -/// to gpu.block_id according to the thread_dim_apping attribute. Dynamic, -/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block -/// dim sizes are currently not supported. -LogicalResult rewriteTopLevelForeachThreadToGpuBlocks( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - function_ref &)> - blockIdGenerator, - SmallVector &gridDims); - -/// Finds the top level scf::ForeachThreadOp of given target. -FailureOr findTopLevelForeachThreadOp(Operation *target); - -/// Searches `scf.foreach_thread` ops nested under `target` and maps each such -/// op to GPU threads. Mapping is one-to-one and the induction variables of -/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the -/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in -/// which case, the union of the number of threads is computed and may result in -/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not -/// supported. Dynamic block dim sizes are currently not supported. -mlir::WalkResult rewriteMapNestedForeachThreadToGpuThreads( - RewriterBase &rewriter, Operation *target, - const SmallVector &blockDim, bool syncAfterDistribute); - /// Split the given `op` into two parts along the given iteration space /// `dimension` at the specified `splitPoint`, and return the two parts. /// diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -31,6 +31,7 @@ #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" @@ -113,6 +114,7 @@ bufferization::registerTransformDialectExtension(registry); linalg::registerTransformDialectExtension(registry); scf::registerTransformDialectExtension(registry); + gpu::registerTransformDialectExtension(registry); // Register all external models. arith::registerBufferizableOpInterfaceExternalModels(registry); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -82,6 +82,25 @@ MLIRTransformUtils ) +add_mlir_dialect_library(MLIRGPUTransformOps + TransformOps/GPUTransformOps.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps + + DEPENDS + MLIRGPUTransformOpsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRGPUTransforms + MLIRParser + MLIRPDLDialect + MLIRSideEffectInterfaces + MLIRTransformDialect + MLIRGPUOps + ) + if(MLIR_ENABLE_CUDA_RUNNER) if(NOT MLIR_ENABLE_CUDA_CONVERSIONS) message(SEND_ERROR diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp @@ -0,0 +1,447 @@ +//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" + +#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" +#include "mlir/Dialect/PDL/IR/PDL.h" +#include "mlir/Dialect/PDL/IR/PDLTypes.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Parser/Parser.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +using namespace mlir; +using namespace mlir::gpu; +using namespace mlir::transform; + +namespace { +/// A simple pattern rewriter that implements no special logic. +class SimpleRewriter : public PatternRewriter { +public: + SimpleRewriter(MLIRContext *context) : PatternRewriter(context) {} +}; +} // namespace + +static LogicalResult +checkGpuLimits(Optional gridDimX, Optional gridDimY, + Optional gridDimZ, Optional blockDimX, + Optional blockDimY, Optional blockDimZ) { + + static constexpr int max_total_blockdim = 1024; + static constexpr int max_blockdimx = 1024; + static constexpr int max_blockdimy = 1024; + static constexpr int max_blockdimz = 64; + static constexpr int max_total_griddim = 2147483647; + static constexpr int max_griddimx = 2147483647; + static constexpr int max_griddimy = 65535; + static constexpr int max_griddimz = 65535; + + if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > + max_total_blockdim || + (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) > + max_total_griddim || + blockDimX.value_or(1) > max_blockdimx || + blockDimY.value_or(1) > max_blockdimy || + blockDimZ.value_or(1) > max_blockdimz || + gridDimY.value_or(1) > max_griddimy || + gridDimZ.value_or(1) > max_griddimz || + gridDimX.value_or(1) > max_griddimx) + return failure(); + return success(); +} + +/// Create gpuLauncOp with given kernel configurations +static FailureOr +createGpuLaunch(RewriterBase &rewriter, Location loc, + Optional gridDimX = llvm::None, + Optional gridDimY = llvm::None, + Optional gridDimZ = llvm::None, + Optional blockDimX = llvm::None, + Optional blockDimY = llvm::None, + Optional blockDimZ = llvm::None) { + if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, + blockDimZ))) + return failure(); + auto createConst = [&](int dim) { + return rewriter.create(loc, dim); + }; + Value one = createConst(1); + Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one; + Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one; + Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one; + Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one; + Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one; + Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one; + auto launchOp = rewriter.create( + loc, gridSizeX, gridSizeY, gridSizeZ, blkSizeX, blkSizeY, blkSizeZ); + rewriter.setInsertionPointToEnd(&launchOp.body().front()); + rewriter.create(loc); + return launchOp; +} + +/// Alter grid or block dimensions of the given kernel +static LogicalResult alterGpuLaunch(SimpleRewriter &rewriter, + LaunchOp gpuLaunch, + Optional gridDimX = llvm::None, + Optional gridDimY = llvm::None, + Optional gridDimZ = llvm::None, + Optional blockDimX = llvm::None, + Optional blockDimY = llvm::None, + Optional blockDimZ = llvm::None) { + if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, + blockDimZ))) { + gpuLaunch->emitError( + "Requested kernel thread configuration is larger than the limits"); + return failure(); + } + + KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues(); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointAfterValue(currentBlockdim.x); + auto createConstValue = [&](int dim) { + return rewriter.create(currentBlockdim.x.getLoc(), + dim); + }; + + if (gridDimX.has_value()) + gpuLaunch.gridSizeXMutable().assign(createConstValue(gridDimX.value())); + if (gridDimY.has_value()) + gpuLaunch.gridSizeYMutable().assign(createConstValue(gridDimY.value())); + if (gridDimZ.has_value()) + gpuLaunch.gridSizeZMutable().assign(createConstValue(gridDimZ.value())); + if (blockDimX.has_value()) + gpuLaunch.blockSizeXMutable().assign(createConstValue(blockDimX.value())); + if (blockDimY.has_value()) + gpuLaunch.blockSizeYMutable().assign(createConstValue(blockDimY.value())); + if (blockDimZ.has_value()) + gpuLaunch.blockSizeZMutable().assign(createConstValue(blockDimZ.value())); + return success(); +} + +//===----------------------------------------------------------------------===// +// MapForeachToBlocks +//===----------------------------------------------------------------------===// + +LogicalResult mlir::transform::gpu::rewriteTopLevelForeachThreadToGpuBlocks( + RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, + function_ref &)> + blockIdGenerator, + SmallVector &gridDims) { + if (foreachThreadOp.getNumResults() > 0) + return foreachThreadOp->emitError( + "only bufferized scf.foreach_thread lowers to gpu.block_id"); + if (foreachThreadOp.getNumThreads().size() > 3) + return foreachThreadOp->emitError( + "scf.foreach_thread with rank > 3 does not lower to gpu.block_id"); + + // Step 0. Outline the compute workload region and set up the workload + // operands. + auto potentialGridDim = foreachThreadOp.getPermutedNumThreads(rewriter); + if (failed(potentialGridDim) || + llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) { + return !getConstantIntValue(ofr).has_value(); + })) + return foreachThreadOp->emitError("unsupported dynamic gridDim"); + + for (OpFoldResult ofr : *potentialGridDim) + gridDims.push_back(getConstantIntValue(ofr).value()); + + SmallVector blockOps; + blockIdGenerator(rewriter, foreachThreadOp, blockOps); + + // Step 1. Move the body of foreachThreadOp. + // Erase the terminator first, it will not be used since we are on buffers. + rewriter.eraseOp(foreachThreadOp.getTerminator()); + Block *targetBlock = foreachThreadOp->getBlock(); + Block::iterator insertionPoint = Block::iterator(foreachThreadOp); + Block &sourceBlock = foreachThreadOp.getRegion().front(); + targetBlock->getOperations().splice(insertionPoint, + sourceBlock.getOperations()); + + // Step 2. RAUW thread indices to thread ops. + SmallVector threadIndices = + *foreachThreadOp.getPermutedThreadIndices(); + assert(blockOps.size() == 3 && "3 block id ops are required"); + for (auto it : llvm::zip(threadIndices, blockOps)) { + Value val = std::get<0>(it); + if (!val) + continue; + for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { + rewriter.updateRootInPlace( + user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); }); + } + } + + // Step 3. Erase old op. + rewriter.eraseOp(foreachThreadOp); + + return success(); +} + +FailureOr +mlir::transform::gpu::findTopLevelForeachThreadOp(Operation *target) { + scf::ForeachThreadOp topLevelForeachThreadOp; + auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { + if (foreachThreadOp->getParentOfType()) + return WalkResult::advance(); + if (topLevelForeachThreadOp) + // TODO Handle multiple foreach if there is no dependences between them + return WalkResult::interrupt(); + topLevelForeachThreadOp = foreachThreadOp; + return WalkResult::advance(); + }); + + if (walkResult.wasInterrupted()) + return target->emitError( + "could not find a unique topLevel scf.foreach_thread"); + + return topLevelForeachThreadOp; +} + +/// This is an helper that is only used in +/// rewriteTopLevelForeachThreadToGpuBlocks. It generates GPU dialects block_id +static void generateGpuBlockIds(RewriterBase &rewriter, + scf::ForeachThreadOp foreachOp, + SmallVector &blockOps) { + Location loc = foreachOp->getLoc(); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(foreachOp); + IndexType indexType = rewriter.getIndexType(); + SmallVector gpuDims{Dimension::x, Dimension::y, Dimension::z}; + for (int64_t idx : llvm::seq(0, gpuDims.size())) { + blockOps.push_back( + rewriter.create(loc, indexType, gpuDims[idx])); + } +} + +DiagnosedSilenceableFailure +transform::MapForeachToBlocks::applyToOne(Operation *target, + SmallVectorImpl &results, + transform::TransformState &state) { + LaunchOp gpuLaunch = dyn_cast(target); + SimpleRewriter rewriter(getContext()); + + if (!getGenerateGpuLaunch() && !gpuLaunch) { + target->emitError("Given target is not gpu.launch, set " + "`generate_gpu_launch` attribute"); + return DiagnosedSilenceableFailure::definiteFailure(); + } + + auto res = mlir::transform::gpu::findTopLevelForeachThreadOp(target); + if (failed(res)) + return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); + + scf::ForeachThreadOp topLevelForeachThreadOp = *res; + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(topLevelForeachThreadOp); + + // Generate gpu launch here and move the foreach_thread inside + if (getGenerateGpuLaunch()) { + FailureOr maybeGpuLaunch = + createGpuLaunch(rewriter, target->getLoc()); + if (failed(maybeGpuLaunch)) + return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); + gpuLaunch = *maybeGpuLaunch; + rewriter.setInsertionPointToStart(&gpuLaunch.body().front()); + Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp); + rewriter.eraseOp(topLevelForeachThreadOp); + topLevelForeachThreadOp = + dyn_cast(newForeachThreadOp); + } + + SmallVector gridDim = extractFromI64ArrayAttr(getGridDim()); + if (failed(mlir::transform::gpu::rewriteTopLevelForeachThreadToGpuBlocks( + rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim))) + return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); + + if (failed(alterGpuLaunch(rewriter, gpuLaunch, gridDim[0], gridDim[1], + gridDim[2]))) + return DiagnosedSilenceableFailure::definiteFailure(); + + results.assign({gpuLaunch}); + return DiagnosedSilenceableFailure(success()); +} + +//===----------------------------------------------------------------------===// +// MapNestedForeachToThreads +//===----------------------------------------------------------------------===// + +/// Searches `scf.foreach_thread` ops nested under `target` and maps each such +/// op to GPU threads. Mapping is one-to-one and the induction variables of +/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the +/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in +/// which case, the union of the number of threads is computed and may result in +/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not +/// supported. Dynamic block dim sizes are currently not supported. +static FailureOr> rewriteOneForeachThreadToGpuThreads( + RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, + const SmallVector &globalBlockDims, bool syncAfterDistribute) { + if (foreachThreadOp.getNumResults() > 0) + return foreachThreadOp->emitError( + "only bufferized scf.foreach_thread lowers to gpu.thread"); + if (foreachThreadOp.getNumThreads().size() > 3) + return foreachThreadOp->emitError( + "scf.foreach_thread with rank > 3 does not lower to gpu.thread"); + + auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter); + if (failed(potentialBlockDim) || + llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) { + return !getConstantIntValue(ofr).has_value(); + })) + return foreachThreadOp->emitError("unsupported dynamic blockdim size"); + + SmallVector blockDim = + llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) { + return getConstantIntValue(ofr).value(); + })); + + // Step 1. Create the gpu.thread ops + Location loc = foreachThreadOp.getLoc(); + IndexType indexType = rewriter.getIndexType(); + + SmallVector gpuDims{Dimension::x, Dimension::y, Dimension::z}; + SmallVector threadOps; + for (int64_t idx : llvm::seq(0, blockDim.size())) { + threadOps.push_back( + rewriter.create(loc, indexType, gpuDims[idx])); + } + // Step 2. Maybe create conditionals to predicate the region. + Value predicate; + for (auto [threadId, blockDim, globalBlockDim] : + llvm::zip(threadOps, blockDim, globalBlockDims)) { + if (blockDim > globalBlockDim) { + return foreachThreadOp.emitOpError("blockDim size overflow: ") + << blockDim << " > " << globalBlockDim; + } + if (blockDim == globalBlockDim) + continue; + Value tmpPredicate = rewriter.create( + loc, arith::CmpIPredicate::ult, threadId, + rewriter.create(loc, blockDim)); + predicate = + predicate ? rewriter.create(loc, predicate, tmpPredicate) + : tmpPredicate; + } + + // Step 3. Move the body of foreachThreadOp. + // Erase the terminator first, it will not be used. + rewriter.eraseOp(foreachThreadOp.getTerminator()); + Block *targetBlock; + Block::iterator insertionPoint; + if (predicate) { + // Step 3.a. If predicated, move at the beginning. + auto ifOp = + rewriter.create(loc, predicate, /*withElseRegion=*/false); + targetBlock = ifOp.thenBlock(); + insertionPoint = ifOp.thenBlock()->begin(); + } else { + // Step 3.a. Otherwise, move inline just before foreachThreadOp. + targetBlock = foreachThreadOp->getBlock(); + insertionPoint = Block::iterator(foreachThreadOp); + } + Block &sourceBlock = foreachThreadOp.getRegion().front(); + targetBlock->getOperations().splice(insertionPoint, + sourceBlock.getOperations()); + + // Step 4. RAUW thread indices to thread ops. + SmallVector threadIndices = + *foreachThreadOp.getPermutedThreadIndices(); + for (auto it : llvm::zip(threadIndices, threadOps)) { + Value val = std::get<0>(it); + if (!val) + continue; + for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { + rewriter.updateRootInPlace( + user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); }); + } + } + + // Step 5. syncthreads. + // TODO: Need warpsync + if (syncAfterDistribute) + rewriter.create(loc); + + // Step 6. Erase old op. + rewriter.eraseOp(foreachThreadOp); + + return *potentialBlockDim; +} + +LogicalResult mlir::transform::gpu::rewriteMapNestedForeachThreadToGpuThreads( + RewriterBase &rewriter, Operation *target, + const SmallVector &blockDim, bool syncAfterDistribute) { + auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { + rewriter.setInsertionPoint(foreachThreadOp); + if (failed(rewriteOneForeachThreadToGpuThreads( + rewriter, foreachThreadOp, blockDim, syncAfterDistribute))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + return walkResult.wasInterrupted() ? failure() : success(); +} + +DiagnosedSilenceableFailure transform::MapNestedForeachToThreads::applyToOne( + ::mlir::Operation *target, + ::llvm::SmallVectorImpl<::mlir::Operation *> &results, + ::mlir::transform::TransformState &state) { + LaunchOp gpuLaunch = dyn_cast(target); + if (!gpuLaunch) { + target->emitError("Given target is not gpu.launch"); + return DiagnosedSilenceableFailure::definiteFailure(); + } + + SmallVector blockDim = extractFromI64ArrayAttr(getBlockDim()); + blockDim.resize(/*size=*/3, /*value=*/1); + SimpleRewriter rewriter(getContext()); + rewriter.setInsertionPoint(target); + if (failed(mlir::transform::gpu::rewriteMapNestedForeachThreadToGpuThreads( + rewriter, target, blockDim, getSyncAfterDistribute()))) + return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); + + if (failed(alterGpuLaunch(rewriter, gpuLaunch, llvm::None, llvm::None, + llvm::None, blockDim[0], blockDim[1], blockDim[2]))) + return DiagnosedSilenceableFailure::definiteFailure(); + + results.assign({target}); + return DiagnosedSilenceableFailure(success()); +} + +//===----------------------------------------------------------------------===// +// Transform op registration +//===----------------------------------------------------------------------===// + +namespace { +/// Registers new ops and declares PDL as dependent dialect since the additional +/// ops are using PDL types for operands and results. +class GPUTransformDialectExtension + : public transform::TransformDialectExtension< + GPUTransformDialectExtension> { +public: + GPUTransformDialectExtension() { + declareDependentDialect(); + declareGeneratedDialect(); + declareGeneratedDialect(); + registerTransformOps< +#define GET_OP_LIST +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc" + >(); + } +}; +} // namespace + +#define GET_OP_CLASSES +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc" + +void mlir::gpu::registerTransformDialectExtension(DialectRegistry ®istry) { + registry.addExtensions(); +} \ No newline at end of file diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -1166,389 +1166,6 @@ modifiesPayload(effects); } -//===----------------------------------------------------------------------===// -// MapNestedForeachThreadToGpuThreads -//===----------------------------------------------------------------------===// - -/// Searches `scf.foreach_thread` ops nested under `target` and maps each such -/// op to GPU threads. Mapping is one-to-one and the induction variables of -/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the -/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in -/// which case, the union of the number of threads is computed and may result in -/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not -/// supported. Dynamic block dim sizes are currently not supported. -static FailureOr> rewriteOneForeachThreadToGpuThreads( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - const SmallVector &globalBlockDims, bool syncAfterDistribute) { - if (foreachThreadOp.getNumResults() > 0) - return foreachThreadOp->emitError( - "only bufferized scf.foreach_thread lowers to gpu.thread"); - if (foreachThreadOp.getNumThreads().size() > 3) - return foreachThreadOp->emitError( - "scf.foreach_thread with rank > 3 does not lower to gpu.thread"); - - auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter); - if (failed(potentialBlockDim) || - llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) { - return !getConstantIntValue(ofr).has_value(); - })) - return foreachThreadOp->emitError("unsupported dynamic blockdim size"); - - SmallVector blockDim = - llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) { - return getConstantIntValue(ofr).value(); - })); - - // Step 1. Create the gpu.thread ops - Location loc = foreachThreadOp.getLoc(); - IndexType indexType = rewriter.getIndexType(); - - SmallVector gpuDims{gpu::Dimension::x, gpu::Dimension::y, - gpu::Dimension::z}; - SmallVector threadOps; - for (int64_t idx : llvm::seq(0, blockDim.size())) { - threadOps.push_back( - rewriter.create(loc, indexType, gpuDims[idx])); - } - // Step 2. Maybe create conditionals to predicate the region. - Value predicate; - for (auto [threadId, blockDim, globalBlockDim] : - llvm::zip(threadOps, blockDim, globalBlockDims)) { - if (blockDim > globalBlockDim) { - return foreachThreadOp.emitOpError("blockDim size overflow: ") - << blockDim << " > " << globalBlockDim; - } - if (blockDim == globalBlockDim) - continue; - Value tmpPredicate = rewriter.create( - loc, arith::CmpIPredicate::ult, threadId, - rewriter.create(loc, blockDim)); - predicate = - predicate ? rewriter.create(loc, predicate, tmpPredicate) - : tmpPredicate; - } - - // Step 3. Move the body of foreachThreadOp. - // Erase the terminator first, it will not be used. - rewriter.eraseOp(foreachThreadOp.getTerminator()); - Block *targetBlock; - Block::iterator insertionPoint; - if (predicate) { - // Step 3.a. If predicated, move at the beginning. - auto ifOp = - rewriter.create(loc, predicate, /*withElseRegion=*/false); - targetBlock = ifOp.thenBlock(); - insertionPoint = ifOp.thenBlock()->begin(); - } else { - // Step 3.a. Otherwise, move inline just before foreachThreadOp. - targetBlock = foreachThreadOp->getBlock(); - insertionPoint = Block::iterator(foreachThreadOp); - } - Block &sourceBlock = foreachThreadOp.getRegion().front(); - targetBlock->getOperations().splice(insertionPoint, - sourceBlock.getOperations()); - - // Step 4. RAUW thread indices to thread ops. - SmallVector threadIndices = - *foreachThreadOp.getPermutedThreadIndices(); - for (auto it : llvm::zip(threadIndices, threadOps)) { - Value val = std::get<0>(it); - if (!val) - continue; - for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { - rewriter.updateRootInPlace( - user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); }); - } - } - - // Step 5. syncthreads. - // TODO: Need warpsync - if (syncAfterDistribute) - rewriter.create(loc); - - // Step 6. Erase old op. - rewriter.eraseOp(foreachThreadOp); - - return *potentialBlockDim; -} - -mlir::WalkResult mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads( - RewriterBase &rewriter, Operation *target, - const SmallVector &blockDim, bool syncAfterDistribute) { - auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { - rewriter.setInsertionPoint(foreachThreadOp); - if (failed(rewriteOneForeachThreadToGpuThreads( - rewriter, foreachThreadOp, blockDim, syncAfterDistribute))) - return WalkResult::interrupt(); - return WalkResult::advance(); - }); - return walkResult; -} - -static LogicalResult -checkGpuLimits(Optional gridDimX, Optional gridDimY, - Optional gridDimZ, Optional blockDimX, - Optional blockDimY, Optional blockDimZ) { - // TODO The limits should live in the gpu dialect, but it's not like that - // right now. Read them in the common gpu dialect - if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > - 1024 || - gridDimY.value_or(1) > 65535 || gridDimZ.value_or(1) > 65535 || - gridDimX.value_or(1) > 2147483647) - return failure(); - return success(); -} - -/// Alter grid or block dimensions of the given kernel -static LogicalResult alterGpuLaunch(SimpleRewriter &rewriter, - gpu::LaunchOp gpuLaunch, - Optional gridDimX = llvm::None, - Optional gridDimY = llvm::None, - Optional gridDimZ = llvm::None, - Optional blockDimX = llvm::None, - Optional blockDimY = llvm::None, - Optional blockDimZ = llvm::None) { - if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, - blockDimZ))) { - gpuLaunch->emitError( - "Requested kernel thread configuration is larger than the limits"); - return failure(); - } - - gpu::KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues(); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointAfterValue(currentBlockdim.x); - auto createConstValue = [&](int dim) { - return rewriter.create(currentBlockdim.x.getLoc(), - dim); - }; - - if (gridDimX.has_value()) - gpuLaunch.gridSizeXMutable().assign(createConstValue(gridDimX.value())); - if (gridDimY.has_value()) - gpuLaunch.gridSizeYMutable().assign(createConstValue(gridDimY.value())); - if (gridDimZ.has_value()) - gpuLaunch.gridSizeZMutable().assign(createConstValue(gridDimZ.value())); - if (blockDimX.has_value()) - gpuLaunch.blockSizeXMutable().assign(createConstValue(blockDimX.value())); - if (blockDimY.has_value()) - gpuLaunch.blockSizeYMutable().assign(createConstValue(blockDimY.value())); - if (blockDimZ.has_value()) - gpuLaunch.blockSizeZMutable().assign(createConstValue(blockDimZ.value())); - return success(); -} - -DiagnosedSilenceableFailure -transform::MapNestedForeachThreadToGpuThreads::applyToOne( - Operation *target, SmallVectorImpl &results, - transform::TransformState &state) { - - gpu::LaunchOp gpuLaunch = dyn_cast(target); - if (!gpuLaunch) { - target->emitError("Given target is not gpu.launch"); - return DiagnosedSilenceableFailure::definiteFailure(); - } - - SmallVector blockDim = extractFromI64ArrayAttr(getBlockDim()); - blockDim.resize(/*size=*/3, /*value=*/1); - SimpleRewriter rewriter(getContext()); - rewriter.setInsertionPoint(target); - auto walkResult = mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads( - rewriter, target, blockDim, getSyncAfterDistribute()); - if (walkResult.wasInterrupted()) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - - LogicalResult result = - alterGpuLaunch(rewriter, gpuLaunch, llvm::None, llvm::None, llvm::None, - blockDim[0], blockDim[1], blockDim[2]); - if (failed(result)) - return DiagnosedSilenceableFailure::definiteFailure(); - - results.assign({target}); - return DiagnosedSilenceableFailure(success()); -} - -//===----------------------------------------------------------------------===// -// MapNestedForeachThreadToGpuBlocks -//===----------------------------------------------------------------------===// - -LogicalResult mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - function_ref &)> - blockIdGenerator, - SmallVector &gridDims) { - if (foreachThreadOp.getNumResults() > 0) - return foreachThreadOp->emitError( - "only bufferized scf.foreach_thread lowers to gpu.block_id"); - if (foreachThreadOp.getNumThreads().size() > 3) - return foreachThreadOp->emitError( - "scf.foreach_thread with rank > 3 does not lower to gpu.block_id"); - - // Step 0. Outline the compute workload region and set up the workload - // operands. - auto potentialGridDim = foreachThreadOp.getPermutedNumThreads(rewriter); - if (failed(potentialGridDim) || - llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) { - return !getConstantIntValue(ofr).has_value(); - })) - return foreachThreadOp->emitError("unsupported dynamic gridDim"); - - for (OpFoldResult ofr : *potentialGridDim) - gridDims.push_back(getConstantIntValue(ofr).value()); - - SmallVector blockOps; - blockIdGenerator(rewriter, foreachThreadOp, blockOps); - - // Step 1. Move the body of foreachThreadOp. - // Erase the terminator first, it will not be used since we are on buffers. - rewriter.eraseOp(foreachThreadOp.getTerminator()); - Block *targetBlock = foreachThreadOp->getBlock(); - Block::iterator insertionPoint = Block::iterator(foreachThreadOp); - Block &sourceBlock = foreachThreadOp.getRegion().front(); - targetBlock->getOperations().splice(insertionPoint, - sourceBlock.getOperations()); - - // Step 2. RAUW thread indices to thread ops. - SmallVector threadIndices = - *foreachThreadOp.getPermutedThreadIndices(); - assert(blockOps.size() == 3 && "3 block id ops are required"); - for (auto it : llvm::zip(threadIndices, blockOps)) { - Value val = std::get<0>(it); - if (!val) - continue; - for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { - rewriter.updateRootInPlace( - user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); }); - } - } - - // Step 3. Erase old op. - rewriter.eraseOp(foreachThreadOp); - - return success(); -} - -FailureOr -mlir::linalg::findTopLevelForeachThreadOp(Operation *target) { - scf::ForeachThreadOp topLevelForeachThreadOp; - auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { - if (foreachThreadOp->getParentOfType()) - return WalkResult::advance(); - if (topLevelForeachThreadOp) - // TODO Handle multiple foreach if there is no dependences between them - return WalkResult::interrupt(); - topLevelForeachThreadOp = foreachThreadOp; - return WalkResult::advance(); - }); - - if (walkResult.wasInterrupted()) - return target->emitError( - "could not find a unique topLevel scf.foreach_thread"); - - return topLevelForeachThreadOp; -} - -/// Create gpuLauncOp with given kernel configurations -static FailureOr -createGpuLaunch(RewriterBase &rewriter, Location loc, - Optional gridDimX = llvm::None, - Optional gridDimY = llvm::None, - Optional gridDimZ = llvm::None, - Optional blockDimX = llvm::None, - Optional blockDimY = llvm::None, - Optional blockDimZ = llvm::None) { - if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, - blockDimZ))) - return failure(); - auto createConstant = [&](int dim) { - return rewriter.create(loc, dim); - }; - Value one = createConstant(1); - Value gridSizeX = - gridDimX.has_value() ? createConstant(gridDimX.value()) : one; - Value gridSizeY = - gridDimY.has_value() ? createConstant(gridDimY.value()) : one; - Value gridSizeZ = - gridDimZ.has_value() ? createConstant(gridDimZ.value()) : one; - Value blockSizeX = - blockDimX.has_value() ? createConstant(blockDimX.value()) : one; - Value blockSizeY = - blockDimY.has_value() ? createConstant(blockDimY.value()) : one; - Value blockSizeZ = - blockDimZ.has_value() ? createConstant(blockDimZ.value()) : one; - auto launchOp = rewriter.create( - loc, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ); - rewriter.setInsertionPointToEnd(&launchOp.body().front()); - rewriter.create(loc); - return launchOp; -} - -/// This is an helper that is only used in -/// rewriteTopLevelForeachThreadToGpuBlocks. It generates GPU dialects block_id -static void generateGpuBlockIds(RewriterBase &rewriter, - scf::ForeachThreadOp foreachOp, - SmallVector &blockOps) { - Location loc = foreachOp->getLoc(); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(foreachOp); - IndexType indexType = rewriter.getIndexType(); - SmallVector gpuDims{gpu::Dimension::x, gpu::Dimension::y, - gpu::Dimension::z}; - for (int64_t idx : llvm::seq(0, gpuDims.size())) { - blockOps.push_back( - rewriter.create(loc, indexType, gpuDims[idx])); - } -} - -DiagnosedSilenceableFailure -transform::MapNestedForeachThreadToGpuBlocks::applyToOne( - Operation *target, SmallVectorImpl &results, - transform::TransformState &state) { - gpu::LaunchOp gpuLaunch = dyn_cast(target); - SimpleRewriter rewriter(getContext()); - - if (!getGenerateGpuLaunch() && !gpuLaunch) { - target->emitError("Given target is not gpu.launch, set " - "`generate_gpu_launch` attribute"); - return DiagnosedSilenceableFailure::definiteFailure(); - } - - auto res = mlir::linalg::findTopLevelForeachThreadOp(target); - if (failed(res)) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - - scf::ForeachThreadOp topLevelForeachThreadOp = *res; - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(topLevelForeachThreadOp); - - // Generate gpu launch here and move the foreach_thread inside - if (getGenerateGpuLaunch()) { - FailureOr maybeGpuLaunch = - createGpuLaunch(rewriter, target->getLoc()); - if (failed(maybeGpuLaunch)) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - gpuLaunch = *maybeGpuLaunch; - rewriter.setInsertionPointToStart(&gpuLaunch.body().front()); - Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp); - rewriter.eraseOp(topLevelForeachThreadOp); - topLevelForeachThreadOp = - dyn_cast(newForeachThreadOp); - } - - SmallVector gridDim = extractFromI64ArrayAttr(getGridDim()); - if (failed(mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks( - rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim))) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - - if (failed(alterGpuLaunch(rewriter, gpuLaunch, gridDim[0], gridDim[1], - gridDim[2]))) - return DiagnosedSilenceableFailure::definiteFailure(); - - results.assign({gpuLaunch}); - return DiagnosedSilenceableFailure(success()); -} - //===----------------------------------------------------------------------===// // TileToForeachThreadOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Linalg/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir rename from mlir/test/Dialect/Linalg/transform-gpu.mlir rename to mlir/test/Dialect/GPU/transform-gpu.mlir --- a/mlir/test/Dialect/Linalg/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -35,7 +35,7 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 - transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { blockDim = [12, 9, 1]} + transform.gpu.map_foreach_to_blocks %funcop { blockDim = [12, 9, 1]} } } @@ -92,7 +92,7 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 - transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1] } + transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1] } } } @@ -134,8 +134,8 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["func.func"]} in %arg0 - %gpuLaunch = transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { generate_gpu_launch } - transform.structured.map_nested_foreach_thread_to_gpu_threads %gpuLaunch { blockDim = [32, 4, 1] } + %gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch } + transform.gpu.map_nested_foreach_to_threads %gpuLaunch { blockDim = [32, 4, 1] } } } @@ -171,6 +171,6 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 - transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false } + transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false } } }