diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -18,6 +18,7 @@ #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" #include "mlir/IR/SymbolTable.h" +#include "mlir/Support/LLVM.h" #include "mlir/Transforms/RegionUtils.h" using namespace mlir; @@ -32,10 +33,10 @@ } } -// Add operations generating block/thread ids and grid/block dimensions at the -// beginning of the `launchFuncOpBody` region. Add mapping from argument in -// entry block of `launchOpBody`, to the corresponding result value of the added -// operations. +/// Adds operations generating block/thread ids and grid/block dimensions at the +/// beginning of the `launchFuncOpBody` region. Add mapping from argument in +/// entry block of `launchOpBody`, to the corresponding result value of the +/// added operations. static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, Region &launchOpBody, BlockAndValueMapping &map) { @@ -53,8 +54,48 @@ map.map(firstBlock.getArgument(indexOp.index()), indexOp.value()); } +/// Identifies operations that are beneficial to sink into kernels. These +/// operations may not have side-effects, as otherwise sinking (and hence +/// duplicating them) is not legal. static bool isSinkingBeneficiary(Operation *op) { - return isa(op); + return isa(op); +} + +/// For a given operation `op`, computes whether it is beneficial to sink the +/// operation into the kernel. An operation can be sunk if doing so does not +/// introduce new kernel arguments. Whether a value is already available in the +/// kernel (and hence does not introduce new arguments) is checked by +/// querying `availableValues`. +/// If an operand is not yet available, we recursively check whether it can be +/// made available by sinking its defining op. 
+/// Operations that are identified for sinking are added to `beneficiaryOps` in +/// the order they should appear in the kernel. Furthermore, `availableValues` is +/// updated with results that will be available after sinking the identified +/// ops. +static bool extractBeneficiaryOps(Operation *op, + llvm::SetVector &beneficiaryOps, + llvm::SetVector &availableValues) { + if (beneficiaryOps.count(op)) + return true; + + if (!isSinkingBeneficiary(op)) + return false; + + for (Value operand : op->getOperands()) { + // It is already visible in the kernel, keep going. + if (availableValues.count(operand)) + continue; + // Else check whether it can be made available via sinking. + Operation *definingOp = operand.getDefiningOp(); + if (!definingOp || + !extractBeneficiaryOps(definingOp, beneficiaryOps, availableValues)) + return false; + } + // We will sink the operation, mark its results as now available. + beneficiaryOps.insert(op); + for (Value result : op->getResults()) + availableValues.insert(result); + return true; } LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) { @@ -65,59 +106,30 @@ llvm::SetVector sinkCandidates; getUsedValuesDefinedAbove(launchOpBody, sinkCandidates); - llvm::SetVector sunkValues; - llvm::SetVector sunkOperations; - for (Value operand : sinkCandidates) { + SmallVector worklist(sinkCandidates.begin(), sinkCandidates.end()); + llvm::SetVector toBeSunk; + for (Value operand : worklist) { Operation *operandOp = operand.getDefiningOp(); - if (!operandOp || !isSinkingBeneficiary(operandOp)) + if (!operandOp) continue; - // Only sink operations that do not create new sinkCandidates. - if (!llvm::all_of(operandOp->getOperands(), [&sinkCandidates](Value value) { - return sinkCandidates.count(value); - })) - continue; - sunkValues.insert(operand); - sunkOperations.insert(operandOp); + extractBeneficiaryOps(operandOp, toBeSunk, sinkCandidates); } // Insert operations so that the defs get cloned before uses. 
BlockAndValueMapping map; OpBuilder builder(launchOpBody); - DenseSet processed; - SmallVector clonedOps; - while (processed.size() != sunkOperations.size()) { - auto startSize = processed.size(); - for (Operation *sunkOperation : sunkOperations) { - if (processed.count(sunkOperation)) - continue; - - // Operation cant be cloned yet if any of its operands is also being sunk, - // but isnt cloned yet. - if (llvm::any_of( - sunkOperation->getOperands(), [&sunkValues, &map](Value value) { - return sunkValues.count(value) && !map.lookupOrNull(value); - })) - continue; - - Operation *clonedOp = builder.clone(*sunkOperation, map); - // Only replace uses within the launch op. - for (auto result : llvm::enumerate(sunkOperation->getResults())) { - auto replacement = clonedOp->getResult(result.index()); - for (auto &use : llvm::make_early_inc_range(result.value().getUses())) - if (use.getOwner()->getParentOfType() == launchOp) - use.set(replacement); - } - processed.insert(sunkOperation); - } - if (startSize == processed.size()) - return launchOp.emitError( - "found illegal cyclic dependency between operations while sinking"); + for (Operation *op : toBeSunk) { + Operation *clonedOp = builder.clone(*op, map); + // Only replace uses within the launch op. + for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults())) + replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair), + launchOp.body()); } return success(); } -// Outline the `gpu.launch` operation body into a kernel function. Replace -// `gpu.terminator` operations by `gpu.return` in the generated function. +/// Outline the `gpu.launch` operation body into a kernel function. Replace +/// `gpu.terminator` operations by `gpu.return` in the generated function. 
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, StringRef kernelFnName, llvm::SetVector &operands) { @@ -191,9 +203,9 @@ return funcOp; } -// Replace `gpu.launch` operations with an `gpu.launch_func` operation launching -// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with -// constant region arguments inlined. +/// Replace `gpu.launch` operations with an `gpu.launch_func` operation +/// launching `kernelFunc`. The kernel func contains the body of the +/// `gpu.launch` with constant region arguments inlined. static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, gpu::GPUFuncOp kernelFunc, ValueRange operands) { @@ -257,7 +269,7 @@ } private: - // Returns a gpu.module containing kernelFunc and all callees (recursive). + /// Returns a gpu.module containing kernelFunc and all callees (recursive). gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, const SymbolTable &parentSymbolTable) { // TODO: This code cannot use an OpBuilder because it must be inserted into diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -60,7 +60,7 @@ // ----- // CHECK: module attributes {gpu.container_module} - +// CHECK-LABEL: @multiple_launches func @multiple_launches() { // CHECK: %[[CST:.*]] = constant 8 : index %cst = constant 8 : index @@ -88,13 +88,66 @@ // ----- -func @extra_constants(%arg0 : memref) { +// CHECK-LABEL: @extra_constants_not_inlined +func @extra_constants_not_inlined(%arg0: memref) { + // CHECK: %[[CST:.*]] = constant 8 : index + %cst = constant 8 : index + %cst2 = constant 2 : index + %c0 = constant 0 : index + %cst3 = "secret_constant"() : () -> index + // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_not_inlined_kernel::@extra_constants_not_inlined_kernel} : (index, index, index, index, index, index, 
memref, index) -> () + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, + %grid_z = %cst) + threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst, + %block_z = %cst) { + "use"(%cst2, %arg0, %cst3) : (index, memref, index) -> () + gpu.terminator + } + return +} + +// CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref, %{{.*}}: index) +// CHECK: constant 2 + +// ----- + +// CHECK-LABEL: @extra_constants +// CHECK-SAME: %[[ARG0:.*]]: memref +func @extra_constants(%arg0: memref) { // CHECK: %[[CST:.*]] = constant 8 : index %cst = constant 8 : index %cst2 = constant 2 : index %c0 = constant 0 : index %cst3 = dim %arg0, %c0 : memref - // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref, index) -> () + // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]]) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref) -> () + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, + %grid_z = %cst) + threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst, + %block_z = %cst) { + "use"(%cst2, %arg0, %cst3) : (index, memref, index) -> () + gpu.terminator + } + return +} + +// CHECK-LABEL: func @extra_constants_kernel +// CHECK-SAME: %[[KARG0:.*]]: memref +// CHECK: constant 2 +// CHECK: constant 0 +// CHECK: dim %[[KARG0]] + +// ----- + +// CHECK-LABEL: @extra_constants_noarg +// CHECK-SAME: %[[ARG0:.*]]: memref, %[[ARG1:.*]]: memref +func @extra_constants_noarg(%arg0: memref, %arg1: memref) { + // CHECK: %[[CST:.*]] = constant 8 : index + %cst = constant 8 : index + %cst2 = constant 2 : index + %c0 = constant 0 : index + // CHECK: dim %[[ARG1]] + %cst3 = dim %arg1, %c0 : memref + // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], 
%[[CST]], %[[ARG0]], %{{.*}}) {kernel = @extra_constants_noarg_kernel::@extra_constants_noarg_kernel} : (index, index, index, index, index, index, memref, index) -> () gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst) threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst, @@ -105,9 +158,10 @@ return } -// CHECK-LABEL: func @extra_constants_kernel(%{{.*}}: memref, %{{.*}}: index) -// CHECK: constant -// CHECK: constant +// CHECK-LABEL: func @extra_constants_noarg_kernel +// CHECK-SAME: %[[KARG0:.*]]: memref, %[[KARG1:.*]]: index +// CHECK: %[[KCST:.*]] = constant 2 +// CHECK: "use"(%[[KCST]], %[[KARG0]], %[[KARG1]]) // ----- @@ -135,6 +189,7 @@ llvm.mlir.global internal @global(42 : i64) : !llvm.i64 +//CHECK-LABEL: @function_call func @function_call(%arg0 : memref) { %cst = constant 8 : index gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,