diff --git a/mlir/include/mlir/Dialect/GPU/Utils.h b/mlir/include/mlir/Dialect/GPU/Utils.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Utils.h
@@ -0,0 +1,44 @@
+//===- Utils.h - GPU dialect utilities --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines utility functions exposed by the GPU dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_UTILS_H_
+#define MLIR_DIALECT_GPU_UTILS_H_
+
+#include "mlir/Support/LLVM.h"
+
+namespace mlir {
+struct LogicalResult;
+class Value;
+
+namespace gpu {
+class GPUFuncOp;
+class LaunchOp;
+} // namespace gpu
+
+/// Get a gpu.func created from outlining the region of a gpu.launch op with
+/// the given `kernelFnName`. The region of the `launchOp` can use values
+/// defined above it; such values are captured and passed as arguments to the
+/// generated gpu.func. The generated function has arguments
+/// - corresponding to the values passed in as `operands`, in that order.
+/// - any additional values that are used within the region of the `launchOp`
+///   and defined above it. These captured values are appended to the
+///   `operands` list.
+gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName,
+                                 SmallVectorImpl<Value> &operands);
+
+/// Sink operations into the `launchOp` to reduce the number of values that
+/// are used within the region of the operation but defined outside of the
+/// region.
+LogicalResult sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp);
+
+} // namespace mlir
+#endif // MLIR_DIALECT_GPU_UTILS_H_
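
For illustration, a minimal sketch of how a client could compose the two
utilities declared above; the wrapper `outlineOneLaunch` and its kernel-name
argument are assumptions for this sketch, not part of the patch:

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Sink cheap defining ops into the launch region, then outline the region
// into a gpu.func; `operands` receives the kernel arguments, including any
// values captured from above the launch.
static LogicalResult outlineOneLaunch(gpu::LaunchOp launchOp,
                                      StringRef kernelFnName) {
  if (failed(sinkOperationsIntoLaunchOp(launchOp)))
    return failure();
  SmallVector<Value, 4> operands;
  gpu::GPUFuncOp kernelFunc =
      outlineKernelFunc(launchOp, kernelFnName, operands);
  // The caller is still responsible for inserting `kernelFunc` into a symbol
  // table and for replacing the launch, e.g. with a gpu.launch_func.
  (void)kernelFunc;
  return success();
}
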
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/GPU/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
@@ -32,12 +33,15 @@
 }
 
 // Add operations generating block/thread ids and grid/block dimensions at the
-// beginning of the `body` region and replace uses of the respective function
-// arguments.
-static void injectGpuIndexOperations(Location loc, Region &body) {
+// beginning of the `launchFuncOpBody` region. Add a mapping from the arguments
+// of the entry block of `launchOpBody` to the corresponding result values of
+// the added operations.
+static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
+                                     Region &launchOpBody,
+                                     BlockAndValueMapping &map) {
   OpBuilder builder(loc->getContext());
-  Block &firstBlock = body.front();
-  builder.setInsertionPointToStart(&firstBlock);
+  Block &firstBlock = launchOpBody.front();
+  builder.setInsertionPointToStart(&launchFuncOpBody.front());
   SmallVector<Value, 12> indexOps;
   createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
@@ -45,73 +49,89 @@
   createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
-  // Replace the leading 12 function args with the respective thread/block index
-  // operations. Iterate backwards since args are erased and indices change.
-  for (int i = 11; i >= 0; --i) {
-    firstBlock.getArgument(i).replaceAllUsesWith(indexOps[i]);
-    firstBlock.eraseArgument(i);
-  }
+  // Map the leading 12 entry block arguments to the results of the respective
+  // thread/block index operations created above.
+  for (auto indexOp : enumerate(indexOps))
+    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
 }
 
-static bool isInliningBeneficiary(Operation *op) {
+static bool isSinkingBeneficiary(Operation *op) {
   return isa<ConstantOp>(op) || isa<DimOp>(op);
 }
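
The predicate above is deliberately narrow: constants and dims are free to
recompute inside the kernel. Purely as a hypothetical illustration (assuming
the same includes as this file), a target that also treats view-like ops as
free could widen it; the name and the `MemRefCastOp` choice are assumptions,
not part of the patch:

// Hypothetical variant of the predicate: additionally sinks memref casts,
// which carry no data and are cheap to rematerialize inside the kernel.
static bool isSinkingBeneficiaryWithCasts(Operation *op) {
  return isa<ConstantOp>(op) || isa<DimOp>(op) || isa<MemRefCastOp>(op);
}
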
 
-// Move arguments of the given kernel function into the function if this reduces
-// the number of kernel arguments.
-static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc,
-                                              gpu::LaunchFuncOp launch) {
-  OpBuilder kernelBuilder(kernelFunc.getBody());
-  auto &firstBlock = kernelFunc.getBody().front();
-  SmallVector<Value, 8> newLaunchArgs;
-  BlockAndValueMapping map;
-  for (int i = 0, e = launch.getNumKernelOperands(); i < e; ++i) {
-    map.map(launch.getKernelOperand(i), kernelFunc.getArgument(i));
-  }
-  for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) {
-    auto operandOp = launch.getKernelOperand(i).getDefiningOp();
-    if (!operandOp || !isInliningBeneficiary(operandOp)) {
-      newLaunchArgs.push_back(launch.getKernelOperand(i));
+LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
+  Region &launchOpBody = launchOp.body();
+
+  // Identify uses from values defined outside of the scope of the launch
+  // operation.
+  llvm::SetVector<Value> sinkCandidates;
+  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);
+
+  llvm::SetVector<Value> sunkValues;
+  llvm::SetVector<Operation *> sunkOperations;
+  for (Value operand : sinkCandidates) {
+    Operation *operandOp = operand.getDefiningOp();
+    if (!operandOp || !isSinkingBeneficiary(operandOp))
       continue;
-    }
-    // Only inline operations that do not create new arguments.
-    if (!llvm::all_of(operandOp->getOperands(),
-                      [map](Value value) { return map.contains(value); })) {
+    // Only sink operations that do not create new sinkCandidates.
+    if (!llvm::all_of(operandOp->getOperands(), [&sinkCandidates](Value value) {
+          return sinkCandidates.count(value);
+        }))
       continue;
-    }
-    auto clone = kernelBuilder.clone(*operandOp, map);
-    firstBlock.getArgument(i).replaceAllUsesWith(clone->getResult(0));
-    firstBlock.eraseArgument(i);
+    sunkValues.insert(operand);
+    sunkOperations.insert(operandOp);
   }
-  if (newLaunchArgs.size() == launch.getNumKernelOperands())
-    return launch;
-
-  std::reverse(newLaunchArgs.begin(), newLaunchArgs.end());
-  OpBuilder LaunchBuilder(launch);
-  SmallVector<Type, 8> newArgumentTypes;
-  newArgumentTypes.reserve(firstBlock.getNumArguments());
-  for (auto value : firstBlock.getArguments()) {
-    newArgumentTypes.push_back(value.getType());
+
+  // Insert operations so that the defs get cloned before uses.
+  BlockAndValueMapping map;
+  OpBuilder builder(launchOpBody);
+  DenseSet<Operation *> processed;
+  SmallVector<Operation *, 2> clonedOps;
+  while (processed.size() != sunkOperations.size()) {
+    auto startSize = processed.size();
+    for (Operation *sunkOperation : sunkOperations) {
+      if (processed.count(sunkOperation))
+        continue;
+
+      // An operation can't be cloned yet if any of its operands is also being
+      // sunk, but isn't cloned yet.
+      if (llvm::any_of(
+              sunkOperation->getOperands(), [&sunkValues, &map](Value value) {
+                return sunkValues.count(value) && !map.lookupOrNull(value);
+              }))
+        continue;
+
+      Operation *clonedOp = builder.clone(*sunkOperation, map);
+      // Only replace uses within the launch op.
+      for (auto result : llvm::enumerate(sunkOperation->getResults())) {
+        auto replacement = clonedOp->getResult(result.index());
+        for (auto &use : llvm::make_early_inc_range(result.value().getUses()))
+          if (use.getOwner()->getParentOfType<gpu::LaunchOp>() == launchOp)
+            use.set(replacement);
+      }
+      processed.insert(sunkOperation);
+    }
+    if (startSize == processed.size())
+      return launchOp.emitError(
+          "found illegal cyclic dependency between operations while sinking");
   }
-  kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {}));
-  auto newLaunch = LaunchBuilder.create<gpu::LaunchFuncOp>(
-      launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(),
-      launch.getBlockSizeOperandValues(), newLaunchArgs);
-  launch.erase();
-  return newLaunch;
+  return success();
 }
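
A minimal sketch of invoking the sinking utility on its own over a whole
module; the helper name `sinkAllLaunchOps` is an assumption for illustration,
not part of the patch:

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/IR/Module.h"
#include "mlir/Support/LogicalResult.h"

using namespace mlir;

// Sink beneficiary ops into every gpu.launch in `module`; the only failure
// mode is the cyclic-dependency diagnostic emitted above.
static LogicalResult sinkAllLaunchOps(ModuleOp module) {
  auto walkResult = module.walk([](gpu::LaunchOp launch) {
    return failed(sinkOperationsIntoLaunchOp(launch))
               ? WalkResult::interrupt()
               : WalkResult::advance();
  });
  return failure(walkResult.wasInterrupted());
}
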
 
 // Outline the `gpu.launch` operation body into a kernel function. Replace
 // `gpu.terminator` operations by `gpu.return` in the generated function.
-static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp,
-                                        llvm::SetVector<Value> &operands) {
+static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
+                                            StringRef kernelFnName,
+                                            llvm::SetVector<Value> &operands) {
   Location loc = launchOp.getLoc();
   // Create a builder with no insertion point, insertion will happen separately
   // due to symbol table manipulation.
   OpBuilder builder(launchOp.getContext());
+  Region &launchOpBody = launchOp.body();
 
   // Identify uses from values defined outside of the scope of the launch
   // operation.
-  getUsedValuesDefinedAbove(launchOp.body(), operands);
+  getUsedValuesDefinedAbove(launchOpBody, operands);
 
+  // Create the gpu.func operation.
   SmallVector<Type, 4> kernelOperandTypes;
   kernelOperandTypes.reserve(operands.size());
   for (Value operand : operands) {
@@ -119,38 +139,68 @@
   }
   FunctionType type =
       FunctionType::get(kernelOperandTypes, {}, launchOp.getContext());
-  std::string kernelFuncName =
-      Twine(launchOp.getParentOfType<FuncOp>().getName(), "_kernel").str();
-  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFuncName, type);
+  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
   outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());
-  outlinedFunc.body().takeBody(launchOp.body());
-  injectGpuIndexOperations(loc, outlinedFunc.body());
-  Block &entryBlock = outlinedFunc.body().front();
-  for (Value operand : operands) {
-    BlockArgument newArg = entryBlock.addArgument(operand.getType());
-    replaceAllUsesInRegionWith(operand, newArg, outlinedFunc.body());
-  }
+  BlockAndValueMapping map;
+
+  // Map the arguments corresponding to the launch parameters like blockIdx,
+  // threadIdx, etc.
+  Region &outlinedFuncBody = outlinedFunc.body();
+  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+
+  // Map arguments from the gpu.launch region to the arguments of the gpu.func
+  // operation.
+  Block &entryBlock = outlinedFuncBody.front();
+  for (auto operand : enumerate(operands))
+    map.map(operand.value(), entryBlock.getArgument(operand.index()));
+
+  // Clone the region of the gpu.launch operation into the gpu.func operation.
+  // TODO(ravishankarm): If cloneInto can be modified such that if a mapping for
+  // a block exists, that block will be used to clone operations into (at the
+  // end of the block), instead of creating a new block, this would be much
+  // cleaner.
+  launchOpBody.cloneInto(&outlinedFuncBody, map);
+
+  // Branch from the entry of the gpu.func operation to the block that is
+  // cloned from the entry block of the gpu.launch operation.
+  Block &launchOpEntry = launchOpBody.front();
+  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
+  builder.setInsertionPointToEnd(&entryBlock);
+  builder.create<BranchOp>(loc, clonedLaunchOpEntry);
+
   outlinedFunc.walk([](gpu::TerminatorOp op) {
     OpBuilder replacer(op);
     replacer.create<gpu::ReturnOp>(op.getLoc());
     op.erase();
   });
-
   return outlinedFunc;
 }
 
+gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
+                                       StringRef kernelFnName,
+                                       llvm::SmallVectorImpl<Value> &operands) {
+  DenseSet<Value> inputOperandSet;
+  inputOperandSet.insert(operands.begin(), operands.end());
+  llvm::SetVector<Value> operandSet(operands.begin(), operands.end());
+  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
+  for (auto operand : operandSet) {
+    if (!inputOperandSet.count(operand))
+      operands.push_back(operand);
+  }
+  return funcOp;
+}
+
 // Replace `gpu.launch` operations with a `gpu.launch_func` operation launching
 // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
 // constant region arguments inlined.
-static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp,
+static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                   gpu::GPUFuncOp kernelFunc,
                                   ValueRange operands) {
   OpBuilder builder(launchOp);
-  auto launchFuncOp = builder.create<gpu::LaunchFuncOp>(
+  builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(), operands);
-  inlineBeneficiaryOps(kernelFunc, launchFuncOp);
   launchOp.erase();
 }
 
@@ -173,9 +223,16 @@
     for (auto func : getModule().getOps<FuncOp>()) {
       // Insert just after the function.
       Block::iterator insertPt(func.getOperation()->getNextNode());
-      func.walk([&](gpu::LaunchOp op) {
+      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        llvm::SetVector<Value> operands;
-        gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op, operands);
+        std::string kernelFnName =
+            Twine(op.getParentOfType<FuncOp>().getName(), "_kernel").str();
+
+        // Pull in operations that can be sunk.
+        if (failed(sinkOperationsIntoLaunchOp(op)))
+          return WalkResult::interrupt();
+        gpu::GPUFuncOp outlinedFunc =
+            outlineKernelFuncImpl(op, kernelFnName, operands);
 
         // Create nested module and insert outlinedFunc. The module will
         // originally get the same name as the function, but may be renamed on
@@ -186,7 +243,10 @@
         // Potentially changes signature, pulling in constants.
         convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
         modified = true;
+        return WalkResult::advance();
       });
+      if (funcWalkResult.wasInterrupted())
+        return signalPassFailure();
     }
 
     // If any new module was inserted in this module, annotate this module as
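
The tests below exercise this pass through `mlir-opt -gpu-kernel-outlining`.
A sketch of scheduling it programmatically instead; the harness around it
(context and module setup) is assumed:

#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Pass/PassManager.h"

using namespace mlir;

// Run the kernel-outlining module pass, which now performs sinking followed
// by outlining, over `module`.
static LogicalResult runKernelOutlining(ModuleOp module, MLIRContext *context) {
  PassManager pm(context);
  pm.addPass(createGpuKernelOutliningPass());
  return pm.run(module);
}
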
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -51,6 +51,8 @@
 // CHECK-NEXT: %[[BDIM:.*]] = "gpu.block_dim"() {dimension = "x"} : () -> index
 // CHECK-NEXT: = "gpu.block_dim"() {dimension = "y"} : () -> index
 // CHECK-NEXT: = "gpu.block_dim"() {dimension = "z"} : () -> index
+// CHECK-NEXT: br ^[[BLOCK:.*]]
+// CHECK-NEXT: ^[[BLOCK]]:
 // CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
 // CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
 // CHECK-NEXT: = load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
@@ -108,6 +110,28 @@
 
 // -----
 
+func @multiple_uses(%arg0 : memref<?xf32>) {
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  // CHECK: gpu.func {{.*}} {
+  // CHECK:   %[[C2:.*]] = constant 2 : index
+  // CHECK:   "use1"(%[[C2]], %[[C2]])
+  // CHECK:   "use2"(%[[C2]])
+  // CHECK:   gpu.return
+  // CHECK: }
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
+                                       %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
+                                        %block_z = %c1) {
+    "use1"(%c2, %c2) : (index, index) -> ()
+    "use2"(%c2) : (index) -> ()
+    gpu.terminator
+  }
+  return
+}
+
+// -----
+
 llvm.mlir.global internal @global(42 : i64) : !llvm.i64
 
 func @function_call(%arg0 : memref<?xf32>) {