diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -166,23 +166,65 @@ if (analysisFailure || backwardSlice.empty()) return; - // Backward slice is a topologically sorted list of ops starting at - // `outermostEnclosingForOp`. - assert(outermostEnclosingForOp == backwardSlice.front()); - - // Filter out the loops whose induction variable is not used to compute the - // padded result. As a first approximation, just look for IVs that have no use - // in the backwardSlice. - // These are the dimensions of reuse that we can exploit to reduce the amount - // of copy / memory. - for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) { - for (Operation *user : forOp.getInductionVar().getUsers()) { - if (backwardSlice.contains(user)) { - packingLoops.insert(forOp); - break; - } + // Compute the index slice of `padTensorOp` and filter out any loops not + // used for packing. A loop is not used for packing if all loop iterations + // access the same data. As a result, all loop iterations can share the same + // buffer which reduces the memory footprint and increases the cache reuse. + // While computing the index slice, we keep track of all index and data + // values consumed by the padding and add only operations producing one of + // these values. The computation overapproximates the index slice since, + // with the exception of ForOp, it is unaware of op semantics. + SetVector<Value> indexAndDataEdges; + indexAndDataEdges.insert(padTensorOp.getResult()); + SetVector<Operation *> indexSlice; + getBackwardSlice(padTensorOp.getOperation(), &indexSlice, [&](Operation *op) { + if (!domInfo.dominates(outermostEnclosingForOp, op)) + return false; + // Adding a loop iteration argument to the index slice makes all enclosing + // loops packing loops possibly leading to over allocation.
At the same + // time, a padding of an output operand passed in by iteration argument + // cannot be hoisted. We thus do not add the iteration arguments to the + // index and data edges to avoid spurious use-def relations when computing + // the index slice. + if (auto forOp = dyn_cast(op)) { + assert(llvm::none_of(forOp.getIterOpOperands(), + [&](OpOperand &opOperand) { + return indexAndDataEdges.contains( + forOp.getRegionIterArgForOpOperand(opOperand)); + }) && + "expect the padding does not depend on an iteration argument"); + assert(llvm::none_of(forOp.getResults(), + [&](Value value) { + return indexAndDataEdges.contains(value); + }) && + "expect the padding does not depend on a loop result"); + if (!indexAndDataEdges.contains(forOp.getInductionVar())) + return false; + indexAndDataEdges.insert(forOp.lowerBound()); + indexAndDataEdges.insert(forOp.upperBound()); + indexAndDataEdges.insert(forOp.step()); + return true; } + // Otherwise, we assume all operands of an operation with a result part of + // the index and data edges are true dependencies of padding and should be + // included in the index slice. This is a conservative overapproximation + // favoring correctness over performance. + if (llvm::none_of(op->getResults(), [&](Value value) { + return indexAndDataEdges.contains(value); + })) + return false; + for (Value value : op->getOperands()) + indexAndDataEdges.insert(value); + return true; + }); + + // Add only the loops part of the index slice to the packing loops. + for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) { + if (indexSlice.contains(forOp)) + packingLoops.insert(forOp); } + if (packingLoops.empty()) + valid = false; // The analysis is valid and hoisting can occur. 
valid = true; diff --git a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir --- a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir +++ b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir @@ -1,25 +1,24 @@ // RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=2,1,0" -cse -canonicalize -split-input-file | FileCheck %s -// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=4,3,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE +// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=3,2,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE // CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (5, -d0 + 24)> -// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (8, -d0 + 12)> +// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (7, -d0 + 25)> // CHECK-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)> #map0 = affine_map<(d0) -> (5, -d0 + 24)> -#map1 = affine_map<(d0) -> (8, -d0 + 12)> -#map2 = affine_map<(d0) -> (7, -d0 + 25)> +#map1 = affine_map<(d0) -> (7, -d0 + 25)> -// CHECK: single_tiling -// CHECK-DOUBLE: single_tiling +// CHECK: static_sizes +// CHECK-DOUBLE: static_sizes // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32> // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32> // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32> -func @single_tiling(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { +func @static_sizes(%arg0: tensor<24x12xf32>, + %arg1: tensor<12x25xf32>, + %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 - // CHECK-DAG: %[[C8:.*]] = arith.constant 8 + // CHECK-DAG: %[[C7:.*]] = arith.constant 7 %c0 = arith.constant 0 : index %c12 = 
arith.constant 12 : index %c25 = arith.constant 25 : index @@ -33,11 +32,11 @@ // Packing the first input operand for all values of IV2 (IV2x5x6). // CHECK: = linalg.init_tensor [2, 5, 6] - // CHECK: %[[PT0:.*]] = scf.for %[[P0IV2:[0-9a-z]+]] = - // CHECK: %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[P0IV2]]) + // CHECK: %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = + // CHECK: %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[PIV0]]) // CHECK: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]]) // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] - // CHECK-SAME: %[[IV0]], %[[P0IV2]] + // CHECK-SAME: %[[IV0]], %[[PIV0]] // CHECK-SAME: %[[TS0]], 6 // CHECK: %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]] // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]] @@ -47,15 +46,15 @@ // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] = %1 = scf.for %arg5 = %c0 to %c25 step %c7 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) { - // Packing the second input operand for all values of IV2 (IV2x6x8). - // CHECK: = linalg.init_tensor [2, 6, 8] - // CHECK: %[[PT1:.*]] = scf.for %[[P1IV2:[0-9a-z]+]] = - // CHECK: %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[P1IV2]]) + // Packing the second input operand for all values of IV2 (IV2x6x7). 
+ // CHECK: = linalg.init_tensor [2, 6, 7] + // CHECK: %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] = + // CHECK: %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]]) // CHECK: %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]]) // CHECK: %[[T3:.*]] = tensor.extract_slice %[[ARG1]] - // CHECK-SAME: %[[P1IV2]], %[[IV1]] + // CHECK-SAME: %[[PIV1]], %[[IV1]] // CHECK-SAME: 6, %[[TS1]] - // CHECK: %[[V1:.*]] = arith.subi %[[C8]], %[[TS1]] + // CHECK: %[[V1:.*]] = arith.subi %[[C7]], %[[TS1]] // CHECK: %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[C0]], %[[V1]] // CHECK: %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0] // CHECK: scf.yield %[[T5:.*]] @@ -63,6 +62,7 @@ // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] = %2 = scf.for %arg7 = %c0 to %c12 step %c6 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) { %3 = affine.min #map0(%arg3) + // Index the packed operands. // CHECK-DAG: %[[IDX:.*]] = affine.apply #[[DIV6]](%[[IV2]]) // CHECK-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]] @@ -91,6 +91,109 @@ // ----- + +// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (5, -d0 + s0)> +// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0)[s0] -> (6, -d0 + s0)> +// CHECK-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<(d0)[s0] -> (7, -d0 + s0)> +// CHECK-DAG: #[[SDIV6:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 6)> +// CHECK-DAG: #[[DDIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)> +#map0 = affine_map<(d0)[s0] -> (5, -d0 + s0)> +#map1 = affine_map<(d0)[s0] -> (6, -d0 + s0)> +#map2 = affine_map<(d0)[s0] -> (7, -d0 + s0)> + +// CHECK: dynamic_sizes +// CHECK-DOUBLE: dynamic_sizes + +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32> +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32> +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32> +func @dynamic_sizes(%arg0: tensor<?x?xf32>, + %arg1: tensor<?x?xf32>, + %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 + // CHECK-DAG: %[[C5:.*]] = arith.constant 5 + // CHECK-DAG: %[[C6:.*]] = arith.constant 6 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c6 = arith.constant 6 : index + %c7 = arith.constant 7 : index + %c5 = arith.constant 5 : index + + // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] + // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] + // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C1]] + %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32> + %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32> + %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32> + + // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] = + %3 = scf.for %arg3 = %c0 to %0 step %c5 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) { + + // Packing the first input operand for all values of IV2 (IV2x5x6). + // CHECK: %[[PS0:.*]] = affine.apply #[[SDIV6]]()[%[[D1]] + // CHECK: = linalg.init_tensor [%[[PS0]], 5, 6] + // CHECK: %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = + // CHECK: %[[PIDX0:.*]] = affine.apply #[[DDIV6]](%[[PIV0]]) + // CHECK: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])[%[[D0]] + // CHECK: %[[TS1:.*]] = affine.min #[[MAP1]](%[[PIV0]])[%[[D1]] + // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] + // CHECK-SAME: %[[IV0]], %[[PIV0]] + // CHECK-SAME: %[[TS0]], %[[TS1]] + // CHECK: %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]] + // CHECK: %[[V1:.*]] = arith.subi %[[C6]], %[[TS1]] + // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]], %[[V1]] + // CHECK: %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0] + // CHECK: scf.yield %[[T2:.*]] + + // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] = + %4 = scf.for %arg5 = %c0 to %2 step %c7 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) { + + // Packing the second input operand for all values of IV2 (IV2x6x7). + // CHECK: = linalg.init_tensor [%[[PS0]], 6, 7] + // CHECK: %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] = + // CHECK: %[[PIDX1:.*]] = affine.apply #[[DDIV6]](%[[PIV1]]) + // CHECK: %[[TS2:.*]] = affine.min #[[MAP1]](%[[PIV1]])[%[[D1]] + // CHECK: %[[TS3:.*]] = affine.min #[[MAP2]](%[[IV1]])[%[[D2]] + // CHECK: %[[T3:.*]] = tensor.extract_slice %[[ARG1]] + // CHECK-SAME: %[[PIV1]], %[[IV1]] + // CHECK-SAME: %[[TS2]], %[[TS3]] + // CHECK: %[[V2:.*]] = arith.subi %[[C6]], %[[TS2]] + // CHECK: %[[V3:.*]] = arith.subi %[[C7]], %[[TS3]] + // CHECK: %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[V2]], %[[V3]] + // CHECK: %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0] + // CHECK: scf.yield %[[T5:.*]] + + // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] = + %5 = scf.for %arg7 = %c0 to %1 step %c6 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) { + %6 = affine.min #map0(%arg3)[%0] + %7 = affine.min #map1(%arg7)[%1] + + // Index the packed operands. + // CHECK-DAG: %[[IDX:.*]] = affine.apply #[[DDIV6]](%[[IV2]]) + // CHECK-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]] + // CHECK-DAG: %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX]] + %8 = tensor.extract_slice %arg0[%arg3, %arg7] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> + %9 = affine.min #map2(%arg5)[%2] + %10 = tensor.extract_slice %arg1[%arg7, %arg5] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> + %11 = tensor.extract_slice %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> + + // Check matmul uses the packed input operands. + // CHECK: = linalg.matmul ins(%[[T6]], %[[T7]] + %12 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32> + %13 = tensor.insert_slice %12 into %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> + scf.yield %13 : tensor<?x?xf32> + } + scf.yield %5 : tensor<?x?xf32> + } + scf.yield %4 : tensor<?x?xf32> + } + return %3 : tensor<?x?xf32> +} + +// ----- + +// CHECK-DOUBLE-DAG: #[[DIV5:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 5)> +// CHECK-DOUBLE-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)> #map0 = affine_map<(d0) -> (15, -d0 + 24)> #map1 = affine_map<(d0) -> (16, -d0 + 25)> #map2 = affine_map<(d0, d1) -> (5, -d0 + d1)> @@ -114,16 +217,17 @@ %c5 = arith.constant 5 : index %c6 = arith.constant 6 : index - // Packing the first input operand. - // CHECK-DOUBLE: = linalg.init_tensor - // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold - // CHECK-DOUBLE: scf.for %[[IV0:[0-9a-zA-Z]*]] = %0 = scf.for %arg3 = %c0 to %c24 step %c15 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) { - // Packing the second input operand. - // CHECK-DOUBLE: = linalg.init_tensor - // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold + // Packing the first input operand. + // CHECK-DOUBLE: = linalg.init_tensor [3, 5, 12] + // CHECK-DOUBLE: %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = + // CHECK-DOUBLE: %[[PIDX0:.*]] = affine.apply #[[DIV5]](%[[PIV0]]) + // CHECK-DOUBLE: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] + // CHECK-DOUBLE: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold + // CHECK-DOUBLE: %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0] + // CHECK-DOUBLE: scf.yield %[[T2:.*]] // CHECK-DOUBLE: scf.for %[[IV1:[0-9a-zA-Z]*]] = %1 = scf.for %arg5 = %c0 to %c25 step %c16 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) { @@ -131,6 +235,15 @@ %3 = affine.min #map1(%arg5) %4 = tensor.extract_slice %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32> + // Packing the second input operand.
+ // CHECK-DOUBLE: = linalg.init_tensor [%{{.*}}, 12, 6] + // CHECK-DOUBLE: %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] = + // CHECK-DOUBLE: %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]]) + // CHECK-DOUBLE: %[[T3:.*]] = tensor.extract_slice %[[ARG1]] + // CHECK-DOUBLE: %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold + // CHECK-DOUBLE: %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0] + // CHECK-DOUBLE: scf.yield %[[T5:.*]] + // CHECK-DOUBLE: scf.for %[[IV2:[0-9a-zA-Z]*]] = %5 = scf.for %arg7 = %c0 to %2 step %c5 iter_args(%arg8 = %4) -> (tensor<?x?xf32>) { @@ -138,6 +251,12 @@ %7 = scf.for %arg9 = %c0 to %3 step %c6 iter_args(%arg10 = %arg8) -> (tensor<?x?xf32>) { %8 = affine.min #map2(%arg7, %2) %9 = affine.apply #map3(%arg7, %arg3) + + // Index the packed operands. + // CHECK-DOUBLE-DAG: %[[IDX0:.*]] = affine.apply #[[DIV5]](%[[IV2]]) + // CHECK-DOUBLE-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX0]] + // CHECK-DOUBLE-DAG: %[[IDX1:.*]] = affine.apply #[[DIV6]](%[[IV3]]) + // CHECK-DOUBLE-DAG: %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX1]] %10 = tensor.extract_slice %arg0[%9, 0] [%8, 12] [1, 1] : tensor<24x12xf32> to tensor<?x12xf32> %11 = affine.min #map4(%arg9, %3) %12 = affine.apply #map3(%arg9, %arg5) @@ -146,9 +265,8 @@ %15 = affine.min #map4(%arg9, %3) %16 = tensor.extract_slice %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> - // Pad the output operand and perform the multiplication. - // CHECK-DOUBLE: = linalg.pad_tensor - // CHECK-DOUBLE: = linalg.matmul + // Check matmul uses the packed input operands. + // CHECK-DOUBLE: = linalg.matmul ins(%[[T6]], %[[T7]] %17 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%10, %13 : tensor<?x12xf32>, tensor<12x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> %18 = tensor.insert_slice %17 into %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> scf.yield %18 : tensor<?x?xf32>