diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h --- a/mlir/include/mlir/Dialect/GPU/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Passes.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_GPU_PASSES_H_ #define MLIR_DIALECT_GPU_PASSES_H_ +#include "mlir/Support/LLVM.h" #include namespace mlir { @@ -21,6 +22,26 @@ class ModuleOp; template class OpPassBase; class OwningRewritePatternList; +class Value; + +namespace gpu { +class GPUFuncOp; +class LaunchOp; +} // namespace gpu + +/// Get a gpu.func created from outlining the region of a gpu.launch op with the +/// given `kernelFnName`. The region of the `launchOp` can use values from +/// above. These need to be captured and passed as arguments to the generated +/// gpu.func. The generated function has arguments +/// - corresponding to the values passed in as `operands`, in that order. +/// - any additional values that might be used within the region of the +/// `launchOp` and defined above it. These captured values are appended to the +/// `operands` list. +/// A best effort is done to "sink" as many of the operations from "above" the +/// `launchOp` as possible to reduce the number of values that are appended to +/// `operands`. +gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName, + SmallVectorImpl &operands); std::unique_ptr> createGpuKernelOutliningPass(); diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -32,12 +32,15 @@ } // Add operations generating block/thread ids and grid/block dimensions at the -// beginning of the `body` region and replace uses of the respective function -// arguments. -static void injectGpuIndexOperations(Location loc, Region &body) { +// beginning of the `launchFuncOpBody` region. 
Add mapping from argument in +// entry block of `launchOpBody`, to the corresponding result value of the added +// operations. +static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, + Region &launchOpBody, + BlockAndValueMapping &map) { OpBuilder builder(loc->getContext()); - Block &firstBlock = body.front(); - builder.setInsertionPointToStart(&firstBlock); + Block &firstBlock = launchOpBody.front(); + builder.setInsertionPointToStart(&launchFuncOpBody.front()); SmallVector indexOps; createForAllDimensions(builder, loc, indexOps); createForAllDimensions(builder, loc, indexOps); @@ -45,73 +48,48 @@ createForAllDimensions(builder, loc, indexOps); // Replace the leading 12 function args with the respective thread/block index // operations. Iterate backwards since args are erased and indices change. - for (int i = 11; i >= 0; --i) { - firstBlock.getArgument(i).replaceAllUsesWith(indexOps[i]); - firstBlock.eraseArgument(i); - } + for (auto indexOp : enumerate(indexOps)) + map.map(firstBlock.getArgument(indexOp.index()), indexOp.value()); } -static bool isInliningBeneficiary(Operation *op) { +static bool isSinkingBeneficiary(Operation *op) { return isa(op) || isa(op); } -// Move arguments of the given kernel function into the function if this reduces -// the number of kernel arguments. 
-static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc, - gpu::LaunchFuncOp launch) { - OpBuilder kernelBuilder(kernelFunc.getBody()); - auto &firstBlock = kernelFunc.getBody().front(); - SmallVector newLaunchArgs; - BlockAndValueMapping map; - for (int i = 0, e = launch.getNumKernelOperands(); i < e; ++i) { - map.map(launch.getKernelOperand(i), kernelFunc.getArgument(i)); - } - for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) { - auto operandOp = launch.getKernelOperand(i).getDefiningOp(); - if (!operandOp || !isInliningBeneficiary(operandOp)) { - newLaunchArgs.push_back(launch.getKernelOperand(i)); - continue; - } - // Only inline operations that do not create new arguments. - if (!llvm::all_of(operandOp->getOperands(), - [map](Value value) { return map.contains(value); })) { - continue; - } - auto clone = kernelBuilder.clone(*operandOp, map); - firstBlock.getArgument(i).replaceAllUsesWith(clone->getResult(0)); - firstBlock.eraseArgument(i); - } - if (newLaunchArgs.size() == launch.getNumKernelOperands()) - return launch; - - std::reverse(newLaunchArgs.begin(), newLaunchArgs.end()); - OpBuilder LaunchBuilder(launch); - SmallVector newArgumentTypes; - newArgumentTypes.reserve(firstBlock.getNumArguments()); - for (auto value : firstBlock.getArguments()) { - newArgumentTypes.push_back(value.getType()); - } - kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {})); - auto newLaunch = LaunchBuilder.create( - launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(), - launch.getBlockSizeOperandValues(), newLaunchArgs); - launch.erase(); - return newLaunch; -} - // Outline the `gpu.launch` operation body into a kernel function. Replace // `gpu.terminator` operations by `gpu.return` in the generated function. 
-static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, - llvm::SetVector &operands) { +static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, + StringRef kernelFnName, + llvm::SetVector &operands) { Location loc = launchOp.getLoc(); // Create a builder with no insertion point, insertion will happen separately // due to symbol table manipulation. OpBuilder builder(launchOp.getContext()); + Region &launchOpBody = launchOp.body(); // Identify uses from values defined outside of the scope of the launch // operation. - getUsedValuesDefinedAbove(launchOp.body(), operands); + getUsedValuesDefinedAbove(launchOpBody, operands); + + // Prune those that will be sunk into the gpu.func operation. + llvm::SetVector elidedOperands; + llvm::SetVector sunkOperations; + for (Value operand : operands) { + Operation *operandOp = operand.getDefiningOp(); + if (!operandOp || !isSinkingBeneficiary(operandOp)) + continue; + // Only sink operations that do not create new arguments. + if (!llvm::all_of(operandOp->getOperands(), [&operands, + &elidedOperands](Value value) { + return operands.count(value) && !elidedOperands.count(value); + })) + continue; + elidedOperands.insert(operand); + sunkOperations.insert(operandOp); + } + operands.set_subtract(elidedOperands); + // Create the gpu.func operation. 
SmallVector kernelOperandTypes; kernelOperandTypes.reserve(operands.size()); for (Value operand : operands) { @@ -119,27 +97,61 @@ } FunctionType type = FunctionType::get(kernelOperandTypes, {}, launchOp.getContext()); - std::string kernelFuncName = - Twine(launchOp.getParentOfType().getName(), "_kernel").str(); - auto outlinedFunc = builder.create(loc, kernelFuncName, type); + auto outlinedFunc = builder.create(loc, kernelFnName, type); outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(), builder.getUnitAttr()); - outlinedFunc.body().takeBody(launchOp.body()); - injectGpuIndexOperations(loc, outlinedFunc.body()); - Block &entryBlock = outlinedFunc.body().front(); - for (Value operand : operands) { - BlockArgument newArg = entryBlock.addArgument(operand.getType()); - replaceAllUsesInRegionWith(operand, newArg, outlinedFunc.body()); - } + BlockAndValueMapping map; + + // Map the arguments corresponding to the launch parameters like blockIdx, + // threadIdx, etc. + Region &outlinedFuncBody = outlinedFunc.body(); + injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map); + + // Map arguments from gpu.launch region to the arguments of the gpu.func + // operation. + Block &entryBlock = outlinedFuncBody.front(); + for (auto operand : enumerate(operands)) + map.map(operand.value(), entryBlock.getArgument(operand.index())); + + // Clone the sunk operations into the gpu.func operation. + for (auto *sunkOp : sunkOperations) + entryBlock.push_back(sunkOp->clone(map)); + + // Clone the region of the gpu.launch operation into the gpu.func operation. + // TODO(ravishankarm): If cloneInto can be modified such that if a mapping for + // a block exists, that block will be used to clone operations into (at the + // end of the block), instead of creating a new block, this would be much + // cleaner. 
+ launchOpBody.cloneInto(&outlinedFuncBody, map); + + // Branch from entry of the gpu.func operation to the block that is cloned from + // the entry block of the gpu.launch operation. + Block &launchOpEntry = launchOpBody.front(); + Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry); + builder.setInsertionPointToEnd(&entryBlock); + builder.create(loc, clonedLaunchOpEntry); + outlinedFunc.walk([](gpu::TerminatorOp op) { OpBuilder replacer(op); replacer.create(op.getLoc()); op.erase(); }); - return outlinedFunc; } +gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp, + StringRef kernelFnName, + llvm::SmallVectorImpl &operands) { + llvm::SetVector inputOperandSet(operands.begin(), operands.end()); + llvm::SetVector operandSet(operands.begin(), operands.end()); + auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet); + for (auto operand : operandSet) { + if (!inputOperandSet.count(operand)) + operands.push_back(operand); + } + return funcOp; +} + // Replace `gpu.launch` operations with an `gpu.launch_func` operation launching // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with // constant region arguments inlined. @@ -147,10 +159,9 @@ gpu::GPUFuncOp kernelFunc, ValueRange operands) { OpBuilder builder(launchOp); - auto launchFuncOp = builder.create( + builder.create( launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(), launchOp.getBlockSizeOperandValues(), operands); - inlineBeneficiaryOps(kernelFunc, launchFuncOp); launchOp.erase(); } @@ -175,7 +186,10 @@ Block::iterator insertPt(func.getOperation()->getNextNode()); func.walk([&](gpu::LaunchOp op) { llvm::SetVector operands; - gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op, operands); + std::string kernelFnName = + Twine(op.getParentOfType().getName(), "_kernel").str(); + gpu::GPUFuncOp outlinedFunc = + outlineKernelFuncImpl(op, kernelFnName, operands); // Create nested module and insert outlinedFunc. 
The module will // originally get the same name as the function, but may be renamed on diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -51,6 +51,8 @@ // CHECK-NEXT: %[[BDIM:.*]] = "gpu.block_dim"() {dimension = "x"} : () -> index // CHECK-NEXT: = "gpu.block_dim"() {dimension = "y"} : () -> index // CHECK-NEXT: = "gpu.block_dim"() {dimension = "z"} : () -> index +// CHECK-NEXT: br ^[[BLOCK:.*]] +// CHECK-NEXT: ^[[BLOCK]]: // CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> () // CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> () // CHECK-NEXT: = load %[[KERNEL_ARG1]][%[[TID]]] : memref