diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -16,8 +16,13 @@
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
+/// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
+/// a separate kernel function.
 std::unique_ptr<OperationPass<ModuleOp>> createGpuKernelOutliningPass();
 
+/// Rewrites a function region so that GPU ops execute asynchronously.
+std::unique_ptr<OperationPass<FuncOp>> createGpuAsyncRegionPass();
+
 /// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
 void populateGpuAllReducePatterns(MLIRContext *context,
                                   OwningRewritePatternList &patterns);
diff --git a/mlir/include/mlir/Dialect/GPU/Passes.td b/mlir/include/mlir/Dialect/GPU/Passes.td
--- a/mlir/include/mlir/Dialect/GPU/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Passes.td
@@ -16,4 +16,9 @@
   let constructor = "mlir::createGpuKernelOutliningPass()";
 }
 
+def GpuAsyncRegionPass : FunctionPass<"gpu-async-region"> {
+  let summary = "Make GPU ops async";
+  let constructor = "mlir::createGpuAsyncRegionPass()";
+}
+
 #endif // MLIR_DIALECT_GPU_PASSES
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIRGPU
   IR/GPUDialect.cpp
   Transforms/AllReduceLowering.cpp
+  Transforms/AsyncRegionRewriter.cpp
   Transforms/KernelOutlining.cpp
   Transforms/MemoryPromotion.cpp
   Transforms/ParallelLoopMapper.cpp
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -0,0 +1,138 @@
+//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GPU dialect pattern rewriters that make GPU op
+// within a region execute asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/GPU/Utils.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+using namespace mlir;
+namespace {
+// Pattern that matches any op with at least one region and rewrites the GPU
+// ops inside those regions to their asynchronous form.
+class GpuAsyncRegionRewriter : public RewritePattern {
+  struct Callback;
+
+public:
+  explicit GpuAsyncRegionRewriter()
+      : RewritePattern(1, RewritePattern::MatchAnyOpTypeTag{}) {}
+
+private:
+  // Replaces synchronous GPU ops in the op's region with asynchronous ones and
+  // inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
+  // execution semantics and that no GPU ops are asynchronous yet.
+  LogicalResult matchAndRewrite(Operation *op,
+                                PatternRewriter &rewriter) const override;
+};
+
+// Function pass driving the rewrite; base class is generated from Passes.td.
+class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
+  void runOnFunction() override;
+};
+} // namespace
+
+struct GpuAsyncRegionRewriter::Callback {
+  // Region walk callback which turns all AsyncOpInterface ops into actual
+  // async and inserts a gpu.wait when it encounters a side-effecting or a
+  // terminator op.
+  WalkResult operator()(Operation *op) {
+    // gpu.launch regions would need outlining first; bail out on them.
+    if (isa<gpu::LaunchOp>(op)) {
+      rewriter.notifyMatchFailure(op, "replace with gpu.launch_func first");
+      return WalkResult::interrupt();
+    }
+    // NOTE(review): skip gpu.wait ops — presumably the ones this callback
+    // itself inserts during the walk; confirm against the walker semantics.
+    if (isa<gpu::WaitOp>(op))
+      return WalkResult::advance();
+    rewriter.setInsertionPoint(op);
+    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op)) {
+      // Replace GPU op with async version.
+      if (failed(rewriteAsyncOp(asyncOp)))
+        return WalkResult::interrupt();
+      matchResult = success();
+      return WalkResult::advance();
+    }
+    // Nothing pending to synchronize on.
+    if (!token)
+      return WalkResult::advance();
+    // Pure, non-terminator ops cannot observe async results; no sync needed.
+    if (!op->hasTrait<OpTrait::IsTerminator>() &&
+        MemoryEffectOpInterface::hasNoEffect(op))
+      return WalkResult::advance();
+    // Insert host synchronization before return or op with side effects.
+    // The synchronous gpu.wait has no token result, so this resets `token`.
+    token = createWaitOp(op->getLoc(), Type(), {token});
+    return WalkResult::advance();
+  }
+
+  // Rewrites `asyncOp` to its async form: chains it on the current `token`
+  // (creating a `gpu.wait async` if none exists yet) and recreates the op
+  // with an extra leading !gpu.async.token result.
+  LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
+    auto *op = asyncOp.getOperation();
+    if (asyncOp.getAsyncToken())
+      // TODO: Support ops that are already async.
+      return rewriter.notifyMatchFailure(op, "is already async");
+    if (op->getNumRegions() > 0)
+      return rewriter.notifyMatchFailure(op, "regions are not supported");
+
+    if (!token)
+      token = createWaitOp(op->getLoc(), tokenType, {});
+    asyncOp.addAsyncDependency(token);
+
+    // New result list: the async token first, then the original results.
+    SmallVector<Type, 1> resultTypes = {tokenType};
+    resultTypes.reserve(1 + op->getNumResults());
+    copy(op->getResultTypes(), std::back_inserter(resultTypes));
+
+    // Clone the asyncOp to add the token result.
+    auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
+                                    op->getOperands(), op->getMutableAttrDict(),
+                                    op->getSuccessors());
+    auto results = newOp->getResults();
+    token = results.front();
+    rewriter.insert(newOp);
+    rewriter.replaceOp(op, results.drop_front());
+    return success();
+  }
+
+  // Creates a gpu.wait op; returns its async token (null when `resultType`
+  // is null, i.e. for the host-synchronizing form).
+  Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
+    return rewriter.create<gpu::WaitOp>(loc, resultType, operands).asyncToken();
+  }
+
+  PatternRewriter &rewriter;
+  LogicalResult &matchResult; // Set to success() if any op has been rewritten.
+  const Type tokenType = rewriter.getType<gpu::AsyncTokenType>();
+  Value token; // Token of the most recent async op; null after host sync.
+};
+
+LogicalResult
+GpuAsyncRegionRewriter::matchAndRewrite(Operation *op,
+                                        PatternRewriter &rewriter) const {
+  if (op->getNumRegions() == 0)
+    return failure();
+
+  PatternRewriter::InsertionGuard guard(rewriter);
+  // Start from failure(); the callback flips it to success() only when it
+  // actually rewrote an op. Returning success() without modifying the IR
+  // would violate the rewrite pattern contract.
+  auto result = failure();
+  for (auto &region : op->getRegions()) {
+    if (region.walk(Callback{rewriter, result}).wasInterrupted())
+      return failure(); // Note: this is a persistent failure, but there is not
+                        // error reporting for rewrite patterns.
+  }
+  return result;
+}
+
+void GpuAsyncRegionPass::runOnFunction() {
+  OwningRewritePatternList patterns;
+  patterns.insert<GpuAsyncRegionRewriter>();
+  applyOpPatternsAndFold(getFunction(), patterns);
+}
+
+std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
+  return std::make_unique<GpuAsyncRegionPass>();
+}
diff --git a/mlir/test/Dialect/GPU/async-region.mlir b/mlir/test/Dialect/GPU/async-region.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/GPU/async-region.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt -gpu-async-region %s | FileCheck %s
+
+// CHECK: module attributes {gpu.container_module}
+module attributes {gpu.container_module} {
+
+  gpu.module @kernels {
+    gpu.func @kernel() kernel { gpu.return }
+  }
+
+  func @foo() -> ()
+
+  // CHECK-LABEL:func @async(%{{.*}}: index)
+  func @async(%sz : index) {
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    // CHECK: %[[t1:.*]] = gpu.launch_func async [%[[t0]]]
+    gpu.launch_func @kernels::@kernel
+        blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+    // CHECK: %[[t2:.*]] = gpu.launch_func async [%[[t1]]]
+    gpu.launch_func @kernels::@kernel
+        blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+    // CHECK: gpu.wait [%[[t2]]]
+    // CHECK: call @foo
+    call @foo() : () -> ()
+    return
+  }
+
+}