Diff 309168

mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//		//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements the GPU dialect pattern rewriters that make GPU op		// This file implements the GPU dialect pattern rewriters that make GPU op
// within a region execute asynchronously.		// within a region execute asynchronously.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "PassDetail.h"		#include "PassDetail.h"
		#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"		#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"		#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/GPU/Utils.h"		#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"		#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"		#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"		#include "mlir/IR/Builders.h"
#include "mlir/IR/PatternMatch.h"		#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/SymbolTable.h"		#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"		#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"		#include "mlir/Transforms/RegionUtils.h"
		#include "llvm/ADT/TypeSwitch.h"

using namespace mlir;		using namespace mlir;
namespace {		namespace {
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {		class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
struct Callback;		struct ThreadTokenCallback;
		struct DeferWaitCallback;
void runOnFunction() override;		void runOnFunction() override;
};		};
} // namespace		} // namespace

		static bool isTerminator(Operation *op) { return !op->isKnownNonTerminator(); }
		herhutUnsubmitted Done Reply Inline Actions Use `op.isKnownTerminator()`? But you have to be careful because unknown operations, even if they are terminators, are reported as not being a terminator. So sometimes `!op.isKnownNonTerminator` is the better choice. herhut: Use `op.isKnownTerminator()`? But you have to be careful because unknown operations, even if…
		static bool hasSideEffects(Operation *op) {
		return !MemoryEffectOpInterface::hasNoEffect(op);
		}

// Region walk callback which makes GPU ops implementing the AsyncOpInterface		// Region walk callback which makes GPU ops implementing the AsyncOpInterface
// execute asynchronously.		// execute asynchronously.
struct GpuAsyncRegionPass::Callback {		struct GpuAsyncRegionPass::ThreadTokenCallback {
		ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

// If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to		// If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
// create a current token (unless it already exists), and 'thread' that token		// create a current token (unless it already exists), and 'thread' that token
// through the `op` so that it executes asynchronously.		// through the `op` so that it executes asynchronously.
//		//
// If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to		// If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
		herhutUnsubmitted Done Reply Inline Actions Yeah, I think using the `!isKnownNonTerminator` is the right approach here. herhut: Yeah, I think using the `!isKnownNonTerminator` is the right approach here.
// host-synchronize execution.		// host-synchronize execution. A `!gpu.async.token` will therefore only be
		// used inside of its block and GPU execution will always synchronize with
		// the host at block boundaries.
WalkResult operator()(Operation *op) {		WalkResult operator()(Operation *op) {
if (isa<gpu::LaunchOp>(op))		if (isa<gpu::LaunchOp>(op))
return op->emitOpError("replace with gpu.launch_func first");		return op->emitOpError("replace with gpu.launch_func first");
if (isa<gpu::WaitOp>(op))		if (isa<gpu::WaitOp>(op))
return op->emitOpError("unexpected pre-existing gpu.wait");		return op->emitOpError("unexpected pre-existing gpu.wait");
builder.setInsertionPoint(op);		builder.setInsertionPoint(op);
if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))		if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.		return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
if (!currentToken)		if (!currentToken)
return success();		return success();
if (!op->hasTrait<OpTrait::IsTerminator>() &&
MemoryEffectOpInterface::hasNoEffect(op))
return success();
// Insert host synchronization before terminator or op with side effects.		// Insert host synchronization before terminator or op with side effects.
		if (isTerminator(op) || hasSideEffects(op))
currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});		currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
return success();		return success();
}		}

		private:
// Replaces asyncOp with a clone that returns a token.		// Replaces asyncOp with a clone that returns a token.
LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {		LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
auto *op = asyncOp.getOperation();		auto *op = asyncOp.getOperation();
if (asyncOp.getAsyncToken())		if (asyncOp.getAsyncToken())
// TODO: Support ops that are already async.		// TODO: Support ops that are already async.
return op->emitOpError("is already async");		return op->emitOpError("is already async");
if (op->getNumRegions() > 0)		if (op->getNumRegions() > 0)
return op->emitOpError("regions are not supported");		return op->emitOpError("regions are not supported");
Show All 30 Lines	private:
const Type tokenType = builder.getType<gpu::AsyncTokenType>();		const Type tokenType = builder.getType<gpu::AsyncTokenType>();
// The token that represents the current asynchronous dependency. Its valid		// The token that represents the current asynchronous dependency. Its valid
// range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.		// range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.
// In between, each gpu::AsyncOpInterface depends on the current token and		// In between, each gpu::AsyncOpInterface depends on the current token and
// produces the new one.		// produces the new one.
Value currentToken = {};		Value currentToken = {};
};		};

		// Callback for `async.execute` ops which tries to push the contained
		// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
		struct GpuAsyncRegionPass::DeferWaitCallback {
		// If the `executeOp`s token is used only in `async.execute` or `async.await`
		// ops, add the region's last `gpu.wait` op to the worklist if it is
		// synchronous and is the last op with side effects.
		void operator()(async::ExecuteOp executeOp) {
		if (!areAllUsersExecuteOrAwait(executeOp.token()))
		return;
		// async.execute's region is currently restricted to one block.
		herhutUnsubmitted Done Reply Inline Actions This explicitly walks the single block without the terminator. So this would already break if it gets remodeled. That is why I was confused below. herhut: This explicitly walks the single block without the terminator. So this would already break if…
		for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
		if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
		if (!waitOp.asyncToken())
		worklist.push_back(waitOp);
		return;
		}
		herhutUnsubmitted Not Done Reply Inline Actions How could it be a terminator? Or is this just for reuse? herhut: How could it be a terminator? Or is this just for reuse?
		csiggAuthorUnsubmitted Done Reply Inline Actions I added a comment and split it in two separate functions now. csigg: I added a comment and split it in two separate functions now.
		if (hasSideEffects(&op))
		return;
		}
		}

		// The destructor performs the actual rewrite work.
		~DeferWaitCallback() {
		for (size_t i = 0; i < worklist.size(); ++i) {
		auto waitOp = worklist[i];
		auto executeOp = waitOp.getParentOfType<async::ExecuteOp>();
		auto numDependencies = waitOp.asyncDependencies().size();

		// Erase `gpu.wait` and return async dependencies from region instead.
		auto &yieldOp = executeOp.getBody()->getOperations().back();
		yieldOp.insertOperands(yieldOp.getNumOperands(),
		waitOp.asyncDependencies());
		waitOp.erase();
		auto asyncTokens = addAsyncTokenResults(executeOp, numDependencies);

		// Add the async dependency to each user of the `async.execute` token.
		for (Operation *user : executeOp.token().getUsers())
		addAsyncDependencyAfter(asyncTokens, user);
		}
		}

		private:
		// Append `count` `!async.value<!gpu.async.token>` results to `executeOp`.
		static ValueRange addAsyncTokenResults(async::ExecuteOp &executeOp,
		unsigned count) {
		auto numResults = executeOp.getNumResults() + count;

		// Construct new result type list with `count` additional types.
		SmallVector<Type, 2> resultTypes;
		resultTypes.reserve(numResults);
		copy(executeOp.getResultTypes(), std::back_inserter(resultTypes));
		OpBuilder builder(executeOp);
		auto tokenType = builder.getType<gpu::AsyncTokenType>();
		herhutUnsubmitted Done Reply Inline Actions Is the getOperation needed here? herhut: Is the getOperation needed here?
		resultTypes.resize(numResults, tokenType);
		herhutUnsubmitted Not Done Reply Inline Actions Does the execute op return an `async.token` or an `async.value<async.token>`? I assumed the latter, because then the body of the execute can unwrap it into an `async.token` or the `async.wait` can do the unwrapping. The token is an async value because it is only created during the execution of the parent async region. It could be a stream that gets created in there, no? herhut: Does the execute op return an `async.token` or an `async.value<async.token>`? I assumed the…
		csiggAuthorUnsubmitted Done Reply Inline Actions The `create` semantics of ExecuteOp changed recently and now automatically wraps the result types in `async.value<>`s. So yes, the execute op returns `async.value<gpu.async.token>`s. csigg: The `create` semantics of ExecuteOp changed recently and now automatically wraps the result…

		// Clone executeOp with the extra `!gpu.async.token` results.
		auto newOp = builder.create<async::ExecuteOp>(
		executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
		executeOp.dependencies(), executeOp.operands());
		BlockAndValueMapping mapper;
		newOp.getRegion().getBlocks().clear();
		herhutUnsubmitted Not Done Reply Inline Actions If you do not map anything here, why the mapper? herhut: If you do not map anything here, why the mapper?
		csiggAuthorUnsubmitted Not Done Reply Inline Actions The interface needs a mapper. Do I need to map anything? csigg: The interface needs a mapper. Do I need to map anything?
		herhutUnsubmitted Not Done Reply Inline Actions Ah, I see. I assumed it had a default of `{}` for the mapper, which some other variants of this functionality have. herhut: Ah, I see. I assumed it had a default of `{}` for the mapper, which some other variants of this…
		executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);

		// Replace executeOp with cloned one.
		herhutUnsubmitted Not Done Reply Inline Actions Why drop? herhut: Why drop?
		csiggAuthorUnsubmitted Done Reply Inline Actions This should have been `drop_back(count)` to exclude the newly added gpu.async.tokens. csigg: This should have been `drop_back(count)` to exclude the newly added gpu.async.tokens.
		executeOp.getOperation()->replaceAllUsesWith(
		newOp.getResults().drop_back(count));
		executeOp.erase();
		executeOp = newOp;

		// Return the new result values.
		return executeOp.getResults().take_back(count);
		}

		// Returns whether all token users are either 'async.execute' or 'async.await'
		// ops. This is used as a requirement for pushing 'gpu.wait' ops from a
		// 'async.execute' body to its users. Specifically, we do not allow
		// terminator users, because it could mean that the `async.execute` is inside
		herhutUnsubmitted Done Reply Inline Actions You can also do `isa<async::ExecuteOp, async::AwaitOp>(user)` herhut: You can also do `isa<async::ExecuteOp, async::AwaitOp>(user)`
		// control flow code.
		static bool areAllUsersExecuteOrAwait(Value token) {
		return llvm::all_of(token.getUsers(), [](Operation *user) {
		return isa<async::ExecuteOp, async::AwaitOp>(user);
		});
		}

		// Add the `asyncToken` as dependency as needed after `op`.
		void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
		OpBuilder builder(op->getContext());
		auto loc = op->getLoc();

		herhutUnsubmitted Not Done Reply Inline Actions I am not sure why this is done. If the user of the token is a wait herhut: I am not sure why this is done. If the user of the token is a wait
		csiggAuthorUnsubmitted Done Reply Inline Actions Say you have something like this: %token, %async_gpu_token = async.execute() ... async.await %token This adds `%gpu_token = async.await %async_gpu_token`, and then further down we add `gpu.await %gpu_token`. I added a comment. csigg: Say you have something like this: ``` %token, %async_gpu_token = async.execute() ... async.
		Block::iterator it;
		SmallVector<Value, 1> tokens;
		tokens.reserve(asyncTokens.size());
		TypeSwitch<Operation *>(op)
		herhutUnsubmitted Done Reply Inline Actions Use `.result()`? herhut: Use `.result()`?
		.Case<async::AwaitOp>([&](auto awaitOp) {
		herhutUnsubmitted Not Done Reply Inline Actions This is nit picking, but should they not be inserted before? Because originally they also happened before. herhut: This is nit picking, but should they not be inserted before? Because originally they also…
		csiggAuthorUnsubmitted Done Reply Inline Actions If we insert them before, we won't advance below (because the async.await on the token has side-effects) and will never find a gpu async op to pair it up with. csigg: If we insert them before, we won't advance below (because the async.await on the token has side…
		// Add async.await ops to wait for the !gpu.async.tokens.
		builder.setInsertionPointAfter(op);
		for (auto asyncToken : asyncTokens)
		tokens.push_back(
		builder.create<async::AwaitOp>(loc, asyncToken).result());
		// Set `it` after the inserted async.await ops.
		it = builder.getInsertionPoint();
		})
		.Case<async::ExecuteOp>([&](auto executeOp) {
		// Set `it` to the beginning of the region and add asyncTokens to the
		// async.execute operands.
		it = executeOp.getBody()->begin();
		executeOp.operandsMutable().append(asyncTokens);
		herhutUnsubmitted Not Done Reply Inline Actions Should this not be the first operation after op, async or not? herhut: Should this not be the first operation after op, async or not?
		csiggAuthorUnsubmitted Done Reply Inline Actions I think the confusion came from that I reused `op`. I introduced `it` now. This function adds `gpu.wait` between `it` and the first async/terminator following `it`. Before `find_if`, `it` either points to the beginning of an `async.execute` body or just after the block of `async.await` ops (the first one synchronizing on the token, the others waiting for the !gpu.async.token). csigg: I think the confusion came from that I reused `op`. I introduced `it` now. This function adds…
		SmallVector<Type, 1> tokenTypes(
		asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
		copy(executeOp.getBody()->addArguments(tokenTypes),
		std::back_inserter(tokens));
		});

		// Advance `it` to terminator or op with side-effects.
		it = std::find_if(it, Block::iterator(), [](Operation &op) {
		return isTerminator(&op) || hasSideEffects(&op);
		});

		// If `op` implements the AsyncOpInterface, add `token` to the list of async
		// dependencies.
		if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
		for (auto token : tokens)
		asyncOp.addAsyncDependency(token);
		return;
		}

		// Otherwise, insert a gpu.wait before 'it'.
		builder.setInsertionPoint(it->getBlock(), it);
		auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);

		// If the new waitOp is at the end of an async.execute region, add it to the
		// worklist. 'operator()(executeOp)' would do the same, but this is faster.
		auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
		if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
		!it->getNextNode())
		worklist.push_back(waitOp);
		}

		SmallVector<gpu::WaitOp, 8> worklist;
		};

// Replaces synchronous GPU ops in the op's region with asynchronous ones and		// Replaces synchronous GPU ops in the op's region with asynchronous ones and
// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential		// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
// execution semantics and that no GPU ops are asynchronous yet.		// execution semantics and that no GPU ops are asynchronous yet.
void GpuAsyncRegionPass::runOnFunction() {		void GpuAsyncRegionPass::runOnFunction() {
Callback callback{OpBuilder(&getContext())};		if (getFunction()
if (getFunction().getRegion().walk(callback).wasInterrupted())		.getRegion()
		.walk(ThreadTokenCallback(getContext()))
		.wasInterrupted())
return signalPassFailure();		return signalPassFailure();

		// Collect gpu.wait ops that we can move out of async.execute regions.
		getFunction().getRegion().walk(DeferWaitCallback());
}		}

std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {		std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
return std::make_unique<GpuAsyncRegionPass>();		return std::make_unique<GpuAsyncRegionPass>();
}		}

mlir/test/Dialect/GPU/async-region.mlir

Show All 18 Lines	func @async(%sz : index) {
gpu.launch_func @kernels::@kernel		gpu.launch_func @kernels::@kernel
blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)		blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
// CHECK: gpu.wait [%[[t2]]]		// CHECK: gpu.wait [%[[t2]]]
// CHECK: call @foo		// CHECK: call @foo
call @foo() : () -> ()		call @foo() : () -> ()
return		return
}		}

		// CHECK-LABEL:func @defer_wait(%{{.*}}: index)
		func @defer_wait(%sz : index) {
		// CHECK: %[[a0:.*]], %[[f0:.*]] = async.execute
		%a0 = async.execute {
		// CHECK: %[[t:.*]] = gpu.launch_func async
		gpu.launch_func @kernels::@kernel
		blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
		// CHECK-NOT: gpu.wait
		// CHECK: async.yield %[[t]]
		async.yield
		}

		// CHECK: %[[a1:.*]], %[[f1:.*]] = async.execute
		// CHECK-SAME: %[[f0]]
		%a1 = async.execute [%a0] {
		// CHECK: %[[t:.*]] = gpu.launch_func async
		gpu.launch_func @kernels::@kernel
		blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
		// CHECK-NOT: gpu.wait
		// CHECK: async.yield %[[t]]
		async.yield
		}

		// CHECK: async.await %[[a1]]
		// CHECK: %[[t:.*]] = async.await %[[f1]]
		// CHECK: gpu.wait [%[[t]]]
		async.await %a1 : !async.token
		return
		}

		// CHECK-LABEL:func @defer_wait_blocked_by_side_effect(%{{.*}}: index)
		func @defer_wait_blocked_by_side_effect(%sz : index) {
		// CHECK: %[[a:.*]] = async.execute
		%a = async.execute {
		// CHECK: %[[t:.*]] = gpu.launch_func async
		gpu.launch_func @kernels::@kernel
		blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
		// CHECK: gpu.wait [%[[t]]]
		call @foo() : () -> ()
		async.yield
		}

		// CHECK: async.await %[[a]]
		// CHECK-NOT: gpu.wait
		async.await %a : !async.token
		return
		}

		// CHECK-LABEL:func @defer_wait_pass_through(%{{.*}}: index)
		func @defer_wait_pass_through(%sz : index) {
		// CHECK: %[[a0:.*]], %[[f0:.*]] = async.execute
		%a0 = async.execute {
		// CHECK: %[[t:.*]] = gpu.launch_func async
		gpu.launch_func @kernels::@kernel
		blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
		// CHECK-NOT: gpu.wait
		// CHECK: async.yield %[[t]]
		async.yield
		}

		// CHECK: %[[a1:.*]], %[[f1:.*]] = async.execute
		// CHECK-SAME: %[[f0]]
		%a1 = async.execute [%a0] {
		// CHECK-NOT: gpu.wait
		// CHECK: async.yield %{{.*}}
		async.yield
		}

		// CHECK: async.await %[[a1]]
		// CHECK: %[[t:.*]] = async.await %[[f1]]
		// CHECK: gpu.wait [%[[t]]]
		async.await %a1 : !async.token
		return
		}
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][gpu] Move gpu.wait ops from async.execute regions to its dependencies.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 309168

mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

mlir/test/Dialect/GPU/async-region.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][gpu] Move gpu.wait ops from async.execute regions to its dependencies.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 309168

mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

mlir/test/Dialect/GPU/async-region.mlir

[mlir][gpu] Move gpu.wait ops from async.execute regions to its dependencies.
ClosedPublic