diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -18,6 +18,7 @@
 #include "../PassDetail.h"
 #include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
+#include "mlir/Dialect/Async/IR/Async.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
@@ -51,8 +52,7 @@
   void runOnOperation() override;
 };
 
-class FunctionCallBuilder {
-public:
+struct FunctionCallBuilder {
   FunctionCallBuilder(StringRef functionName, Type returnType,
                       ArrayRef<Type> argumentTypes)
       : functionName(functionName),
@@ -60,7 +60,6 @@
   LLVM::CallOp create(Location loc, OpBuilder &builder,
                       ArrayRef<Value> arguments) const;
 
-private:
   StringRef functionName;
   LLVM::LLVMFunctionType functionType;
 };
@@ -202,6 +201,18 @@
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertAsyncYieldToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> {
+public:
+  ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(async::YieldOp yieldOp, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 /// A rewrite pattern to convert gpu.wait operations into a GPU runtime
 /// call. Currently it supports CUDA and ROCm (HIP).
 class ConvertWaitOpToGpuRuntimeCallPattern
@@ -429,11 +440,53 @@
   return success();
 }
 
-// Converts `gpu.wait` to runtime calls. The operands are all CUDA or ROCm
-// streams (i.e. void*). The converted op synchronizes the host with every
-// stream and then destroys it. That is, it assumes that the stream is not used
-// afterwards. In case this isn't correct, we will get a runtime error.
-// Eventually, we will have a pass that guarantees this property.
+static bool isGpuAsyncTokenType(Value value) {
+  return value.getType().isa<gpu::AsyncTokenType>();
+}
+
+// Converts !gpu.async.token operands of `async.yield` to runtime calls. The
+// !gpu.async.token operands are lowered to streams within the async.execute
+// region, but are passed between regions as events. For each !gpu.async.token
+// operand, we create an event and record it on the stream.
+LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
+    async::YieldOp yieldOp, ArrayRef<Value> operands,
+    ConversionPatternRewriter &rewriter) const {
+  if (llvm::none_of(yieldOp.operands(), isGpuAsyncTokenType))
+    return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand");
+
+  Location loc = yieldOp.getLoc();
+  SmallVector<Value, 4> newOperands(operands.begin(), operands.end());
+  llvm::SmallDenseSet<Value> streams;
+  for (auto &operand : yieldOp->getOpOperands()) {
+    if (!isGpuAsyncTokenType(operand.get()))
+      continue;
+    auto idx = operand.getOperandNumber();
+    auto stream = operands[idx];
+    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
+    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
+    newOperands[idx] = event;
+    streams.insert(stream);
+  }
+  for (auto stream : streams)
+    streamDestroyCallBuilder.create(loc, rewriter, {stream});
+
+  rewriter.updateRootInPlace(yieldOp,
+                             [&] { yieldOp->setOperands(newOperands); });
+  return success();
+}
+
+// Returns whether `value` is the result of an LLVM::CallOp to `functionName`.
+static bool isDefinedByCallTo(Value value, StringRef functionName) {
+  assert(value.getType().isa<LLVM::LLVMPointerType>());
+  if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
+    return defOp.callee()->equals(functionName);
+  return false;
+}
+
+// Converts `gpu.wait` to runtime calls. The converted op synchronizes the host
+// with the stream/event operands. The operands are destroyed. That is, it
+// assumes that they are not used afterwards or elsewhere. Otherwise we will
+// get a runtime error. Eventually, we should guarantee this property.
 LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::WaitOp waitOp, ArrayRef<Value> operands,
     ConversionPatternRewriter &rewriter) const {
@@ -442,21 +495,28 @@
 
   Location loc = waitOp.getLoc();
 
-  for (auto asyncDependency : operands)
-    streamSynchronizeCallBuilder.create(loc, rewriter, {asyncDependency});
-  for (auto asyncDependency : operands)
-    streamDestroyCallBuilder.create(loc, rewriter, {asyncDependency});
+  for (auto operand : operands) {
+    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
+      // The converted operand's definition created a stream.
+      streamSynchronizeCallBuilder.create(loc, rewriter, {operand});
+      streamDestroyCallBuilder.create(loc, rewriter, {operand});
+    } else {
+      // Otherwise the converted operand is an event. This assumes that we use
+      // events in control flow code as well.
+      eventSynchronizeCallBuilder.create(loc, rewriter, {operand});
+      eventDestroyCallBuilder.create(loc, rewriter, {operand});
+    }
+  }
 
   rewriter.eraseOp(waitOp);
   return success();
 }
 
-// Converts `gpu.wait async` to runtime calls. The result is a new stream that
-// is synchronized with all operands, which are CUDA or ROCm streams (i.e.
-// void*). We create and record an event after the definition of the stream
-// and make the new stream wait on that event before destroying it again. This
-// assumes that there is no other use between the definition and this op, and
-// the plan is to have a pass that guarantees this property.
+// Converts `gpu.wait async` to runtime calls. The converted op creates a new
+// stream that is synchronized with the stream/event operands. The operands are
+// destroyed. That is, it assumes that they are not used afterwards or
+// elsewhere. Otherwise we will get a runtime error. Eventually, we should
+// guarantee this property.
 LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::WaitOp waitOp, ArrayRef<Value> operands,
     ConversionPatternRewriter &rewriter) const {
@@ -468,18 +528,21 @@
   auto insertionPoint = rewriter.saveInsertionPoint();
   SmallVector<Value, 4> events;
   for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
-    auto token = std::get<0>(pair);
-    if (auto *defOp = token.getDefiningOp()) {
+    auto operand = std::get<1>(pair);
+    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
+      // The converted operand's definition created a stream. Insert an event
+      // into the stream just after the last use of the original token operand.
+      auto *defOp = std::get<0>(pair).getDefiningOp();
       rewriter.setInsertionPointAfter(defOp);
+      auto event =
+          eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
+      eventRecordCallBuilder.create(loc, rewriter, {event, operand});
+      events.push_back(event);
     } else {
-      // If we can't find the defining op, we record the event at block start,
-      // which is late and therefore misses parallelism, but still valid.
-      rewriter.setInsertionPointToStart(waitOp->getBlock());
+      // Otherwise the converted operand is an event. This assumes that we use
+      // events in control flow code as well.
+      events.push_back(operand);
     }
-    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
-    auto stream = std::get<1>(pair);
-    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
-    events.push_back(event);
   }
   rewriter.restoreInsertionPoint(insertionPoint);
   auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
@@ -729,7 +792,8 @@
                   ConvertHostRegisterOpToGpuRuntimeCallPattern,
                   ConvertMemcpyOpToGpuRuntimeCallPattern,
                   ConvertWaitAsyncOpToGpuRuntimeCallPattern,
-                  ConvertWaitOpToGpuRuntimeCallPattern>(converter);
+                  ConvertWaitOpToGpuRuntimeCallPattern,
+                  ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);
   patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
       converter, gpuBinaryAnnotation);
   patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -30,6 +30,7 @@
 class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
   struct ThreadTokenCallback;
   struct DeferWaitCallback;
+  struct SingleTokenUseCallback;
   void runOnFunction() override;
 };
 } // namespace
@@ -120,6 +121,44 @@
   Value currentToken = {};
 };
 
+/// Erases `executeOp` and returns a clone with additional `results`.
+async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp,
+                                   ValueRange results) {
+  // Add values to async.yield op.
+  Operation *yieldOp = executeOp.getBody()->getTerminator();
+  yieldOp->insertOperands(yieldOp->getNumOperands(), results);
+
+  // Construct new result type list with additional types.
+  SmallVector<Type, 4> resultTypes;
+  resultTypes.reserve(executeOp.getNumResults() + results.size());
+  transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
+            [](Type type) {
+              // Extract value type from !async.value.
+              if (auto valueType = type.dyn_cast<async::ValueType>())
+                return valueType.getValueType();
+              assert(type.isa<async::TokenType>() && "expected token type");
+              return type;
+            });
+  transform(results, std::back_inserter(resultTypes),
+            [](Value value) { return value.getType(); });
+
+  // Clone executeOp with the extra results.
+  OpBuilder builder(executeOp);
+  auto newOp = builder.create<async::ExecuteOp>(
+      executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
+      executeOp.dependencies(), executeOp.operands());
+  BlockAndValueMapping mapper;
+  newOp.getRegion().getBlocks().clear();
+  executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
+
+  // Replace executeOp with cloned one.
+  executeOp.getOperation()->replaceAllUsesWith(
+      newOp.getResults().drop_back(results.size()));
+  executeOp.erase();
+
+  return newOp;
+}
+
 // Callback for `async.execute` ops which tries to push the contained
 // synchronous `gpu.wait` op to the dependencies of the `async.execute`.
 struct GpuAsyncRegionPass::DeferWaitCallback {
@@ -146,69 +185,30 @@
     for (size_t i = 0; i < worklist.size(); ++i) {
       auto waitOp = worklist[i];
      auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();
-      auto numDependencies = waitOp.asyncDependencies().size();
-
-      // Erase `gpu.wait` and return async dependencies from region instead.
-      auto &yieldOp = executeOp.getBody()->getOperations().back();
-      yieldOp.insertOperands(yieldOp.getNumOperands(),
-                             waitOp.asyncDependencies());
+      // Erase `gpu.wait` and return async dependencies from execute op instead.
+      SmallVector<Value, 4> dependencies =
+          llvm::to_vector<4>(waitOp.asyncDependencies());
       waitOp.erase();
-      auto asyncTokens = addAsyncTokenResults(executeOp, numDependencies);
+      executeOp = addExecuteResults(executeOp, dependencies);
 
       // Add the async dependency to each user of the `async.execute` token.
+      auto asyncTokens = executeOp.getResults().take_back(dependencies.size());
       for (Operation *user : executeOp.token().getUsers())
         addAsyncDependencyAfter(asyncTokens, user);
     }
   }
 
 private:
-  // Append `count` `!async.value<!gpu.async.token>` results to `executeOp`.
-  static ValueRange addAsyncTokenResults(async::ExecuteOp &executeOp,
-                                         unsigned count) {
-    auto numResults = executeOp.getNumResults() + count;
-
-    // Construct new result type list with `count` additional types.
-    SmallVector<Type, 4> resultTypes;
-    resultTypes.reserve(numResults);
-    transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
-              [](Type type) {
-                // Extract value type from !async.value.
-                if (auto valueType = type.dyn_cast<async::ValueType>())
-                  return valueType.getValueType();
-                assert(type.isa<async::TokenType>() && "expected token type");
-                return type;
-              });
-    OpBuilder builder(executeOp);
-    auto tokenType = builder.getType<gpu::AsyncTokenType>();
-    resultTypes.resize(numResults, tokenType);
-
-    // Clone executeOp with the extra `!gpu.async.token` results.
-    auto newOp = builder.create<async::ExecuteOp>(
-        executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
-        executeOp.dependencies(), executeOp.operands());
-    BlockAndValueMapping mapper;
-    newOp.getRegion().getBlocks().clear();
-    executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
-
-    // Replace executeOp with cloned one.
-    executeOp.getOperation()->replaceAllUsesWith(
-        newOp.getResults().drop_back(count));
-    executeOp.erase();
-    executeOp = newOp;
-
-    // Return the new result values.
-    return executeOp.getResults().take_back(count);
-  }
-
   // Returns whether all token users are either 'async.execute' or 'async.await'
   // ops. This is used as a requirement for pushing 'gpu.wait' ops from a
   // 'async.execute' body to it's users. Specifically, we do not allow
   // terminator users, because it could mean that the `async.execute` is inside
   // control flow code.
   static bool areAllUsersExecuteOrAwait(Value token) {
-    return llvm::all_of(token.getUsers(), [](Operation *user) {
-      return isa<async::ExecuteOp, async::AwaitOp>(user);
-    });
+    return !token.use_empty() &&
+           llvm::all_of(token.getUsers(), [](Operation *user) {
+             return isa<async::ExecuteOp, async::AwaitOp>(user);
+           });
   }
 
   // Add the `asyncToken` as dependency as needed after `op`.
@@ -268,6 +268,46 @@
   SmallVector<gpu::WaitOp, 8> worklist;
 };
 
+// Callback for `async.execute` ops which repeats !gpu.async.token results
+// so that each of them is only used once.
+struct GpuAsyncRegionPass::SingleTokenUseCallback {
+  void operator()(async::ExecuteOp executeOp) {
+    // Extract !gpu.async.token results which have multiple uses.
+    auto multiUseResults =
+        llvm::make_filter_range(executeOp.results(), [](OpResult result) {
+          if (result.use_empty() || result.hasOneUse())
+            return false;
+          auto valueType = result.getType().dyn_cast<async::ValueType>();
+          return valueType &&
+                 valueType.getValueType().isa<gpu::AsyncTokenType>();
+        });
+    if (multiUseResults.empty())
+      return;
+
+    // Indices within async.execute results (i.e. without the async.token).
+    SmallVector<int, 4> indices;
+    transform(multiUseResults, std::back_inserter(indices),
+              [](OpResult result) {
+                return result.getResultNumber() - 1; // Index without token.
+              });
+
+    for (auto index : indices) {
+      assert(!executeOp.results()[index].getUses().empty());
+      // Repeat async.yield token result, one for each use after the first one.
+      auto uses = llvm::drop_begin(executeOp.results()[index].getUses());
+      auto count = std::distance(uses.begin(), uses.end());
+      auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator());
+      SmallVector<Value, 4> operands(count, yieldOp.getOperand(index));
+      executeOp = addExecuteResults(executeOp, operands);
+      // Update 'uses' to refer to the new executeOp.
+      uses = llvm::drop_begin(executeOp.results()[index].getUses());
+      auto results = executeOp.results().take_back(count);
+      for (auto pair : llvm::zip(uses, results))
+        std::get<0>(pair).set(std::get<1>(pair));
+    }
+  }
+};
+
 // Replaces synchronous GPU ops in the op's region with asynchronous ones and
 // inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
 // execution semantics and that no GPU ops are asynchronous yet.
@@ -280,6 +320,8 @@
 
   // Collect gpu.wait ops that we can move out of async.execute regions.
   getFunction().getRegion().walk(DeferWaitCallback());
+  // Make each !gpu.async.token returned from async.execute op have a single use.
+  getFunction().getRegion().walk(SingleTokenUseCallback());
 }
 
 std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
diff --git a/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
@@ -12,8 +12,8 @@
     // CHECK: llvm.call @mgpuEventDestroy(%[[e0]])
     %t1 = gpu.wait async [%t0]
     // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
-    // CHECK: llvm.call @mgpuStreamSynchronize(%[[t1]])
    // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
+    // CHECK: llvm.call @mgpuStreamSynchronize(%[[t1]])
    // CHECK: llvm.call @mgpuStreamDestroy(%[[t1]])
    gpu.wait [%t0, %t1]
    return
diff --git a/mlir/test/Dialect/GPU/async-region.mlir b/mlir/test/Dialect/GPU/async-region.mlir
--- a/mlir/test/Dialect/GPU/async-region.mlir
+++ b/mlir/test/Dialect/GPU/async-region.mlir
@@ -125,4 +125,48 @@
     // CHECK: return %[[x]] : index
    return %x : index
  }
+
+  // CHECK-LABEL:func @async_execute_no_use(%{{.*}}: index)
+  func @async_execute_no_use(%sz : index) {
+    // CHECK: async.execute {
+    %a0 = async.execute {
+      // CHECK: %[[t:.*]] = gpu.launch_func async
+      gpu.launch_func @kernels::@kernel
+          blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+      // CHECK: gpu.wait [%[[t]]]
+      async.yield
+    }
+    return
+  }
+
+  // CHECK-LABEL:func @async_execute_fork(%{{.*}}: index)
+  func @async_execute_fork(%sz : index) {
+    // CHECK: %[[a0:.*]], %[[f0:.*]]:2 = async.execute
+    // CHECK-SAME: -> (!async.value<!gpu.async.token>, !async.value<!gpu.async.token>)
+    %a0 = async.execute {
+      // CHECK: %[[t:.*]] = gpu.launch_func async
+      gpu.launch_func @kernels::@kernel
+          blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+      // CHECK-NOT: gpu.wait
+      // CHECK: async.yield %[[t]], %[[t]] : !gpu.async.token, !gpu.async.token
+      async.yield
+    }
+    // CHECK: async.execute [%[[a0]]] (%[[f0]]#0 as {{.*}}: !async.value<!gpu.async.token>)
+    %a1 = async.execute [%a0] {
+      // CHECK: %[[t:.*]] = gpu.launch_func async
+      gpu.launch_func @kernels::@kernel
+          blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+      // CHECK: gpu.wait [%[[t]]]
+      async.yield
+    }
+    // CHECK: async.execute [%[[a0]]] (%[[f0]]#1 as {{.*}}: !async.value<!gpu.async.token>)
+    %a2 = async.execute [%a0] {
+      // CHECK: %[[t:.*]] = gpu.launch_func async
+      gpu.launch_func @kernels::@kernel
+          blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+      // CHECK: gpu.wait [%[[t]]]
+      async.yield
+    }
+    return
+  }
 }