Diff 351075

mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

Show First 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	static bool hasSideEffects(Operation *op) {
return !MemoryEffectOpInterface::hasNoEffect(op);		return !MemoryEffectOpInterface::hasNoEffect(op);
}		}

// Region walk callback which makes GPU ops implementing the AsyncOpInterface		// Region walk callback which makes GPU ops implementing the AsyncOpInterface
// execute asynchronously.		// execute asynchronously.
struct GpuAsyncRegionPass::ThreadTokenCallback {		struct GpuAsyncRegionPass::ThreadTokenCallback {
ThreadTokenCallback(MLIRContext &context) : builder(&context) {}		ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

		WalkResult operator()(Block *block) {
		for (Operation &op : make_early_inc_range(*block)) {
		if (failed(visit(&op)))
		return WalkResult::interrupt();
		}
		return WalkResult::advance();
		}

		private:
// If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to		// If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
// create a current token (unless it already exists), and 'thread' that token		// create a current token (unless it already exists), and 'thread' that token
// through the `op` so that it executes asynchronously.		// through the `op` so that it executes asynchronously.
//		//
// If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to		// If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
// host-synchronize execution. A `!gpu.async.token` will therefore only be		// host-synchronize execution. A `!gpu.async.token` will therefore only be
// used inside of its block and GPU execution will always synchronize with		// used inside of its block and GPU execution will always synchronize with
// the host at block boundaries.		// the host at block boundaries.
WalkResult operator()(Operation *op) {		LogicalResult visit(Operation *op) {
if (isa<gpu::LaunchOp>(op))		if (isa<gpu::LaunchOp>(op))
return op->emitOpError("replace with gpu.launch_func first");		return op->emitOpError("replace with gpu.launch_func first");
if (isa<gpu::WaitOp>(op))		if (auto waitOp = llvm::dyn_cast<gpu::WaitOp>(op)) {
return op->emitOpError("unexpected pre-existing gpu.wait");		if (currentToken)
		waitOp.addAsyncDependency(currentToken);
		currentToken = waitOp.asyncToken();
		return success();
		}
builder.setInsertionPoint(op);		builder.setInsertionPoint(op);
if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))		if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.		return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
if (!currentToken)		if (!currentToken)
return success();		return success();
// Insert host synchronization before terminator or op with side effects.		// Insert host synchronization before terminator or op with side effects.
if (isTerminator(op) || hasSideEffects(op))		if (isTerminator(op) || hasSideEffects(op))
currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});		currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
return success();		return success();
}		}

private:
// Replaces asyncOp with a clone that returns a token.		// Replaces asyncOp with a clone that returns a token.
LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {		LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
auto *op = asyncOp.getOperation();		auto *op = asyncOp.getOperation();
if (asyncOp.getAsyncToken())
// TODO: Support ops that are already async.
return op->emitOpError("is already async");

auto tokenType = builder.getType<gpu::AsyncTokenType>();		auto tokenType = builder.getType<gpu::AsyncTokenType>();

// If there is no current token, insert a `gpu.wait async` without		// If there is no current token, insert a `gpu.wait async` without
// dependencies to create one.		// dependencies to create one.
if (!currentToken)		if (!currentToken)
currentToken = createWaitOp(op->getLoc(), tokenType, {});		currentToken = createWaitOp(op->getLoc(), tokenType, {});
asyncOp.addAsyncDependency(currentToken);		asyncOp.addAsyncDependency(currentToken);

		// Return early if op returns a token already.
		currentToken = asyncOp.getAsyncToken();
		if (currentToken)
		return success();

// Clone the op to return a token in addition to the other results.		// Clone the op to return a token in addition to the other results.
SmallVector<Type, 1> resultTypes;		SmallVector<Type, 1> resultTypes;
resultTypes.reserve(1 + op->getNumResults());		resultTypes.reserve(1 + op->getNumResults());
copy(op->getResultTypes(), std::back_inserter(resultTypes));		copy(op->getResultTypes(), std::back_inserter(resultTypes));
resultTypes.push_back(tokenType);		resultTypes.push_back(tokenType);
auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,		auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
op->getOperands(), op->getAttrDictionary(),		op->getOperands(), op->getAttrDictionary(),
op->getSuccessors(), op->getNumRegions());		op->getSuccessors(), op->getNumRegions());
▲ Show 20 Lines • Show All 212 Lines • ▼ Show 20 Lines	void operator()(async::ExecuteOp executeOp) {
}		}
}		}
};		};

// Replaces synchronous GPU ops in the op's region with asynchronous ones and		// Replaces synchronous GPU ops in the op's region with asynchronous ones and
// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential		// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
// execution semantics and that no GPU ops are asynchronous yet.		// execution semantics and that no GPU ops are asynchronous yet.
void GpuAsyncRegionPass::runOnFunction() {		void GpuAsyncRegionPass::runOnFunction() {
if (getFunction()		if (getFunction()->walk(ThreadTokenCallback(getContext())).wasInterrupted())
		herhut (inline comment; unsubmitted, not done): Why is this needed? It should be possible to run on multiple functions independently, no?
		csigg (author; inline reply; unsubmitted, done): Removed, this was just debugging leftover. Thanks for catching it.
.getRegion()
.walk(ThreadTokenCallback(getContext()))
.wasInterrupted())
return signalPassFailure();		return signalPassFailure();

// Collect gpu.wait ops that we can move out of async.execute regions.		// Collect gpu.wait ops that we can move out of async.execute regions.
getFunction().getRegion().walk(DeferWaitCallback());		getFunction().getRegion().walk(DeferWaitCallback());
// Makes each !gpu.async.token returned from async.execute op have single use.		// Makes each !gpu.async.token returned from async.execute op have single use.
getFunction().getRegion().walk(SingleTokenUseCallback());		getFunction().getRegion().walk(SingleTokenUseCallback());
}		}

std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {		std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
return std::make_unique<GpuAsyncRegionPass>();		return std::make_unique<GpuAsyncRegionPass>();
}		}

mlir/test/Dialect/GPU/async-region.mlir

Show First 20 Lines • Show All 163 Lines • ▼ Show 20 Lines	%a2 = async.execute [%a0] {
// CHECK: %[[t:.*]] = gpu.launch_func async		// CHECK: %[[t:.*]] = gpu.launch_func async
gpu.launch_func @kernels::@kernel		gpu.launch_func @kernels::@kernel
blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)		blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
// CHECK: gpu.wait [%[[t]]]		// CHECK: gpu.wait [%[[t]]]
async.yield		async.yield
}		}
return		return
}		}

		// CHECK-LABEL:func @existing_tokens()
		func @existing_tokens() {
		// CHECK: %[[t0:.*]] = gpu.wait async
		// CHECK-NOT: [{{.*}}]
		%t0 = gpu.wait async
		// CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]], %[[t0]]]
		%t1 = gpu.wait async [%t0]
		// CHECK: %[[m:.*]], %[[t2:.*]] = gpu.alloc async [%[[t1]], %[[t0]]] ()
		%0 = gpu.alloc [%t0] () : memref<7xf32>
		// CHECK: %[[t3:.*]] = gpu.dealloc async [%[[t2]]] %[[m]]
		%t2 = gpu.dealloc async %0 : memref<7xf32>
		// CHECK: gpu.wait [%[[t3]]]
		gpu.wait
		// CHECK: gpu.wait
		// CHECK-NOT: async
		// CHECK-NOT: [{{.*}}]
		gpu.wait
		return
		}
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Support pre-existing tokens in 'gpu-async-region'
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 351075

mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

mlir/test/Dialect/GPU/async-region.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Support pre-existing tokens in 'gpu-async-region'
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 351075

mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

mlir/test/Dialect/GPU/async-region.mlir

[mlir] Support pre-existing tokens in 'gpu-async-region'
ClosedPublic