diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -48,6 +48,10 @@
 ///      contains an unknown op with a region.
 ///   4. The backward slice from the pad op to the scf::ForOp to hoist above is
 ///      empty.
+///   5. The source tensor of the pad op is not defined by an extract slice op.
+///   6. The source tensor of the extract slice op is not defined outside of
+///      the outermost enclosing scf::ForOp.
+///   7. There is no enclosing scf::ForOp that indexes the padded data.
 /// Other cases succeed and will trigger hoisting of the pad op.
 struct HoistingAnalysis {
   HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);
@@ -82,6 +86,26 @@
       packingLoops;
 
 private:
+  /// Returns the loops in `backwardSlice` used to index the padded data. The
+  /// method starts from `padTensorOp` and `sliceOp`, follows the use-def
+  /// chains of their index operands, and stores any enclosing loop whose
+  /// induction variable is part of the walked index computation.
+  ///
+  /// Example:
+  /// ```
+  /// %source = linalg.fill(%cst, %arg0)
+  /// scf.for %i
+  ///   scf.for %j
+  ///     scf.for %k // not used to index %source!
+  ///       %ubi = affine.min #map(%i)
+  ///       %ubj = affine.min #map(%j)
+  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
+  ///       %padded_slice = linalg.pad_tensor %slice
+  /// ```
+  /// getIndexingLoops(%padded_slice, %slice) returns [scf.for %i, scf.for %j]
+  SetVector<Operation *> getIndexingLoops(PadTensorOp padTensorOp,
+                                          tensor::ExtractSliceOp sliceOp);
+
   /// Encodes whether the analysis is valid and hoisting can proceed.
   bool valid;
 };
@@ -166,28 +190,114 @@
   if (analysisFailure || backwardSlice.empty())
     return;
 
-  // Backward slice is a topologically sorted list of ops starting at
-  // `outermostEnclosingForOp`.
-  assert(outermostEnclosingForOp == backwardSlice.front());
+  // Get the `sliceOp` that defines the source tensor of `padTensorOp` and
+  // check that its source is defined outside of the outermost loop. This
+  // check ensures the padded data is available for packing before entering
+  // the outermost enclosing loop.
+  //
+  // Example:
+  // ```
+  // %source = linalg.fill(%cst, %arg0)
+  // // %source is available for packing here!
+  // scf.for %i
+  //   scf.for %j
+  //     scf.for %k
+  //       %slice = tensor.extract_slice %source [%i, %j]
+  //       %padded_slice = linalg.pad_tensor %slice
+  // ```
+  auto sliceOp = padTensorOp.source().getDefiningOp<tensor::ExtractSliceOp>();
+  if (!sliceOp) {
+    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
+    return;
+  }
+  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
+    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
+    return;
+  }
 
-  // Filter out the loops whose induction variable is not used to compute the
-  // padded result. As a first approximation, just look for IVs that have no use
-  // in the backwardSlice.
-  // These are the dimensions of reuse that we can exploit to reduce the amount
-  // of copy / memory.
+  // Search the loops found in `backwardSlice` that are used to index the
+  // padded data.
+  SetVector<Operation *> indexingLoops =
+      getIndexingLoops(padTensorOp, sliceOp);
+
+  // Add only the loops that are part of `indexingLoops` to the packing loops.
+  // All other loops are not used to index the padded data and consequently
+  // access the same data in every loop iteration. Adding them to the packing
+  // loops would increase the cache footprint of the packed data by storing
+  // the same data multiple times.
   for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
-    for (Operation *user : forOp.getInductionVar().getUsers()) {
-      if (backwardSlice.contains(user)) {
-        packingLoops.insert(forOp);
-        break;
-      }
-    }
+    if (indexingLoops.contains(forOp))
+      packingLoops.insert(forOp);
   }
+  assert(indexingLoops.size() == packingLoops.size() &&
+         "expect all indexing loops to be enclosing loops");
+  if (packingLoops.empty()) {
+    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
+    return;
+  }
 
   // The analysis is valid and hoisting can occur.
   valid = true;
 }
 
+SetVector<Operation *>
+HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
+                                   tensor::ExtractSliceOp sliceOp) {
+  // Set of all values used for index computation.
+  SetVector<Value> indexEdges;
+
+  // Helper function that adds all index operands of an operation to
+  // `indexEdges`. An operand is an index operand if it is of index type.
+  auto addIndexOperandsToIndexEdges = [&](Operation *op) {
+    for (Value operand : op->getOperands())
+      if (operand.getType().isIndex())
+        indexEdges.insert(operand);
+  };
+
+  // Starting from `padTensorOp` and `sliceOp`, walk the use-def edges of
+  // index type in `backwardSlice`. Add the index operands of an operation to
+  // `indexEdges` if one of its results is an index edge found so far, and
+  // store all loops that are part of the index computation in
+  // `indexingLoops`.
+  //
+  // Example:
+  // ```
+  // %source = linalg.fill(%cst, %arg0)
+  // scf.for %i
+  //   scf.for %j
+  //     scf.for %k // not used to index %source!
+  //       %ubi = affine.min #map(%i)
+  //       %ubj = affine.min #map(%j)
+  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
+  //       %padded_slice = linalg.pad_tensor %slice
+  // ```
+  // After iterating `backwardSlice` we obtain:
+  //   indexEdges = [%i, %j, %ubi, %ubj]
+  //   indexingLoops = [scf.for %i, scf.for %j]
+  SetVector<Operation *> indexingLoops;
+  for (Operation *op : llvm::reverse(backwardSlice)) {
+    // Add the index operands of `padTensorOp` and `sliceOp` to start the
+    // exploration of the index computation.
+    if (op == padTensorOp || op == sliceOp) {
+      addIndexOperandsToIndexEdges(op);
+      continue;
+    }
+    // Add the index operands of the loop if its induction variable is used
+    // for index computation. Additionally, insert the loop into
+    // `indexingLoops`.
+    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+      if (indexEdges.contains(forOp.getInductionVar())) {
+        addIndexOperandsToIndexEdges(op);
+        indexingLoops.insert(forOp);
+        continue;
+      }
+    }
+    // Add the index operands of all other operations if at least one result
+    // is used for index computation.
+    if (llvm::any_of(op->getResults(),
+                     [&](Value result) { return indexEdges.contains(result); }))
+      addIndexOperandsToIndexEdges(op);
+  }
+  return indexingLoops;
+}
+
 static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
   return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<arith::ConstantOp>();
 }
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -141,29 +141,25 @@
 
 // -----
 
-
 // CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
-// CHECK-DAG: #[[$MIN_REST4:[0-9a-z]+]] = affine_map<(d0, d1) -> (4, d0 - d1)>
-// CHECK-DAG: #[[$MIN_REST2:[0-9a-z]+]] = affine_map<(d0, d1) -> (2, d0 - d1)>
 // CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
 // CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
 #map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
-#map1 = affine_map<(d0, d1) -> (4, d0 - d1)>
-#map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
+#map1 = affine_map<(d0, d1) -> (4, -d0 + d1)>
+#map2 = affine_map<(d0, d1) -> (2, -d0 + d1)>
+#map3 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
 
 // CHECK-LABEL: func @dot
 // VERIFIER-ONLY-LABEL: func @dot
 func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
     -> tensor<f32> {
+  %cst = arith.constant 0.000000e+00 : f32
   %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
   %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.000000e+00 : f32
   %c2 = arith.constant 2 : index
-  %c0 = arith.constant 0 : index
-  %1 = tensor.dim %arg0, %c0 : tensor<?xf32>
-  %2 = tensor.dim %arg0, %c0 : tensor<?xf32>
-  %3 = tensor.dim %arg1, %c0 : tensor<?xf32>
+  %0 = tensor.dim %arg0, %c0 : tensor<?xf32>
 
   // CHECK: scf.for %[[I:[0-9a-z]+]] =
   //
@@ -194,39 +190,32 @@
 // CHECK:        %[[B:.*]] = tensor.extract_slice %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
 // CHECK:        linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
 
-  %4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
-    %5 = affine.min #map0(%arg3)[%2]
-    %6 = tensor.extract_slice %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
-    %7 = affine.min #map0(%arg3)[%3]
-    %8 = tensor.extract_slice %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
-    %9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
-      %10 = affine.min #map1(%5, %arg5)
-      %11 = tensor.extract_slice %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
-      %12 = affine.min #map1(%7, %arg5)
-      %13 = tensor.extract_slice %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
-      %14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
-        %15 = affine.min #map2(%10, %arg7)
-        %16 = tensor.extract_slice %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
-        %17 = affine.min #map2(%12, %arg7)
-        %18 = tensor.extract_slice %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
-        %19 = arith.subi %c2, %15 : index
-        %20 = linalg.pad_tensor %16 low[%c0] high[%19] {
+  %1 = scf.for %arg3 = %c0 to %0 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
+    %2 = affine.min #map0(%arg3)[%0]
+    %3 = scf.for %arg5 = %c0 to %2 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
+      %4 = affine.min #map1(%arg5, %2)
+      %5 = scf.for %arg7 = %c0 to %4 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
+        %6 = affine.min #map2(%arg7, %4)
+        %7 = affine.apply #map3(%arg7, %arg5, %arg3)
+        %8 = tensor.extract_slice %arg0[%7] [%6] [1] : tensor<?xf32> to tensor<?xf32>
+        %9 = tensor.extract_slice %arg1[%7] [%6] [1] : tensor<?xf32> to tensor<?xf32>
+        %10 = arith.subi %c2, %6 : index
+        %11 = linalg.pad_tensor %8 low[%c0] high[%10] {
         ^bb0(%arg9: index):  // no predecessors
           linalg.yield %cst : f32
         } : tensor<?xf32> to tensor<2xf32>
-        %21 = arith.subi %c2, %17 : index
-        %22 = linalg.pad_tensor %18 low[%c0] high[%21] {
+        %12 = linalg.pad_tensor %9 low[%c0] high[%10] {
        ^bb0(%arg9: index):  // no predecessors
          linalg.yield %cst : f32
        } : tensor<?xf32> to tensor<2xf32>
-        %23 = linalg.dot ins(%20, %22 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
-        scf.yield %23 : tensor<f32>
+        %13 = linalg.dot ins(%11, %12 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
+        scf.yield %13 : tensor<f32>
       }
-      scf.yield %14 : tensor<f32>
+      scf.yield %5 : tensor<f32>
     }
-    scf.yield %9 : tensor<f32>
+    scf.yield %3 : tensor<f32>
   }
-  return %4 : tensor<f32>
+  return %1 : tensor<f32>
 }
 
 // -----
diff --git a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
--- a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
+++ b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
@@ -1,25 +1,23 @@
 // RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=2,1,0" -cse -canonicalize -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=4,3,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE
+// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=3,2,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE
 
 // CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (5, -d0 + 24)>
-// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (8, -d0 + 12)>
+// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (7, -d0 + 25)>
 // CHECK-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)>
 #map0 = affine_map<(d0) -> (5, -d0 + 24)>
-#map1 = affine_map<(d0) -> (8, -d0 + 12)>
-#map2 = affine_map<(d0) -> (7, -d0 + 25)>
-
-// CHECK: single_tiling
-// CHECK-DOUBLE: single_tiling
+#map1 = affine_map<(d0) -> (7, -d0 + 25)>
 
+// CHECK: static_sizes
+// CHECK-DOUBLE: static_sizes
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-func @single_tiling(%arg0: tensor<24x12xf32>,
-                    %arg1: tensor<12x25xf32>,
-                    %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
+func @static_sizes(%arg0: tensor<24x12xf32>,
+                   %arg1: tensor<12x25xf32>,
+                   %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   // CHECK-DAG: %[[C5:.*]] = arith.constant 5
-  // CHECK-DAG: %[[C8:.*]] = arith.constant 8
+  // CHECK-DAG: %[[C7:.*]] = arith.constant 7
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
@@ -33,11 +31,11 @@
 
   // Packing the first input operand for all values of IV2 (IV2x5x6).
   // CHECK:      = linalg.init_tensor [2, 5, 6]
-  // CHECK:      %[[PT0:.*]] = scf.for %[[P0IV2:[0-9a-z]+]] =
-  // CHECK:        %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[P0IV2]])
+  // CHECK:      %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
+  // CHECK:        %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[PIV0]])
   // CHECK:        %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])
   // CHECK:        %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-  // CHECK-SAME:                                     %[[IV0]], %[[P0IV2]]
+  // CHECK-SAME:                                     %[[IV0]], %[[PIV0]]
   // CHECK-SAME:                                     %[[TS0]], 6
   // CHECK:        %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]]
   // CHECK:        %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]]
@@ -47,15 +45,15 @@
 
   // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
   %1 = scf.for %arg5 = %c0 to %c25 step %c7 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
-    // Packing the second input operand for all values of IV2 (IV2x6x8).
-    // CHECK:      = linalg.init_tensor [2, 6, 8]
-    // CHECK:      %[[PT1:.*]] = scf.for %[[P1IV2:[0-9a-z]+]] =
-    // CHECK:        %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[P1IV2]])
+    // Packing the second input operand for all values of IV2 (IV2x6x7).
+    // CHECK:      = linalg.init_tensor [2, 6, 7]
+    // CHECK:      %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
+    // CHECK:        %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]])
     // CHECK:        %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]])
     // CHECK:        %[[T3:.*]] = tensor.extract_slice %[[ARG1]]
-    // CHECK-SAME:                                     %[[P1IV2]], %[[IV1]]
+    // CHECK-SAME:                                     %[[PIV1]], %[[IV1]]
     // CHECK-SAME:                                     6, %[[TS1]]
-    // CHECK:        %[[V1:.*]] = arith.subi %[[C8]], %[[TS1]]
+    // CHECK:        %[[V1:.*]] = arith.subi %[[C7]], %[[TS1]]
     // CHECK:        %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[C0]], %[[V1]]
     // CHECK:        %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0]
     // CHECK:        scf.yield %[[T5:.*]]
@@ -63,6 +61,7 @@
 
    // CHECK:   scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
    %2 = scf.for %arg7 = %c0 to %c12 step %c6 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) {
      %3 = affine.min #map0(%arg3)
+      // Index the packed operands.
      // CHECK-DAG: %[[IDX:.*]] = affine.apply #[[DIV6]](%[[IV2]])
      // CHECK-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]]
@@ -91,6 +90,108 @@
 
 // -----
 
+// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (5, -d0 + s0)>
+// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0)[s0] -> (6, -d0 + s0)>
+// CHECK-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<(d0)[s0] -> (7, -d0 + s0)>
+// CHECK-DAG: #[[SDIV6:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 6)>
+// CHECK-DAG: #[[DDIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)>
+#map0 = affine_map<(d0)[s0] -> (5, -d0 + s0)>
+#map1 = affine_map<(d0)[s0] -> (6, -d0 + s0)>
+#map2 = affine_map<(d0)[s0] -> (7, -d0 + s0)>
+
+// CHECK: dynamic_sizes
+// CHECK-DOUBLE: dynamic_sizes
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+func @dynamic_sizes(%arg0: tensor<?x?xf32>,
+                    %arg1: tensor<?x?xf32>,
+                    %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK-DAG: %[[C1:.*]] = arith.constant 1
+  // CHECK-DAG: %[[C5:.*]] = arith.constant 5
+  // CHECK-DAG: %[[C6:.*]] = arith.constant 6
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c6 = arith.constant 6 : index
+  %c7 = arith.constant 7 : index
+  %c5 = arith.constant 5 : index
+
+  // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
+  // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]]
+  // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C1]]
+  %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
+  %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+
+  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
+  %3 = scf.for %arg3 = %c0 to %0 step %c5 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
+
+    // Packing the first input operand for all values of IV2 (IV2x5x6).
+    // CHECK:      %[[PS0:.*]] = affine.apply #[[SDIV6]]()[%[[D1]]
+    // CHECK:      = linalg.init_tensor [%[[PS0]], 5, 6]
+    // CHECK:      %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
+    // CHECK:        %[[PIDX0:.*]] = affine.apply #[[DDIV6]](%[[PIV0]])
+    // CHECK:        %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])[%[[D0]]
+    // CHECK:        %[[TS1:.*]] = affine.min #[[MAP1]](%[[PIV0]])[%[[D1]]
+    // CHECK:        %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+    // CHECK-SAME:                                     %[[IV0]], %[[PIV0]]
+    // CHECK-SAME:                                     %[[TS0]], %[[TS1]]
+    // CHECK:        %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]]
+    // CHECK:        %[[V1:.*]] = arith.subi %[[C6]], %[[TS1]]
+    // CHECK:        %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]], %[[V1]]
+    // CHECK:        %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0]
+    // CHECK:        scf.yield %[[T2:.*]]
+
+    // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
+    %4 = scf.for %arg5 = %c0 to %2 step %c7 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
+
+      // Packing the second input operand for all values of IV2 (IV2x6x7).
+      // CHECK:      = linalg.init_tensor [%[[PS0]], 6, 7]
+      // CHECK:      %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
+      // CHECK:        %[[PIDX1:.*]] = affine.apply #[[DDIV6]](%[[PIV1]])
+      // CHECK:        %[[TS2:.*]] = affine.min #[[MAP1]](%[[PIV1]])[%[[D1]]
+      // CHECK:        %[[TS3:.*]] = affine.min #[[MAP2]](%[[IV1]])[%[[D2]]
+      // CHECK:        %[[T3:.*]] = tensor.extract_slice %[[ARG1]]
+      // CHECK-SAME:                                     %[[PIV1]], %[[IV1]]
+      // CHECK-SAME:                                     %[[TS2]], %[[TS3]]
+      // CHECK:        %[[V2:.*]] = arith.subi %[[C6]], %[[TS2]]
+      // CHECK:        %[[V3:.*]] = arith.subi %[[C7]], %[[TS3]]
+      // CHECK:        %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[V2]], %[[V3]]
+      // CHECK:        %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0]
+      // CHECK:        scf.yield %[[T5:.*]]
+
+      // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
+      %5 = scf.for %arg7 = %c0 to %1 step %c6 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
+        %6 = affine.min #map0(%arg3)[%0]
+        %7 = affine.min #map1(%arg7)[%1]
+
+        // Index the packed operands.
+        // CHECK-DAG: %[[IDX:.*]] = affine.apply #[[DDIV6]](%[[IV2]])
+        // CHECK-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]]
+        // CHECK-DAG: %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX]]
+        %8 = tensor.extract_slice %arg0[%arg3, %arg7] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %9 = affine.min #map2(%arg5)[%2]
+        %10 = tensor.extract_slice %arg1[%arg7, %arg5] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %11 = tensor.extract_slice %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+
+        // Check matmul uses the packed input operands.
+        // CHECK: = linalg.matmul ins(%[[T6]], %[[T7]]
+        %12 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
+        %13 = tensor.insert_slice %12 into %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+        scf.yield %13 : tensor<?x?xf32>
+      }
+      scf.yield %5 : tensor<?x?xf32>
+    }
+    scf.yield %4 : tensor<?x?xf32>
+  }
+  return %3 : tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-DOUBLE-DAG: #[[DIV5:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 5)>
+// CHECK-DOUBLE-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)>
 #map0 = affine_map<(d0) -> (15, -d0 + 24)>
 #map1 = affine_map<(d0) -> (16, -d0 + 25)>
 #map2 = affine_map<(d0, d1) -> (5, -d0 + d1)>
@@ -99,7 +200,6 @@
 
 // CHECK: double_tiling
 // CHECK-DOUBLE: double_tiling
-
 // CHECK-DOUBLE-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // CHECK-DOUBLE-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // CHECK-DOUBLE-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
@@ -114,16 +214,17 @@
   %c5 = arith.constant 5 : index
   %c6 = arith.constant 6 : index
 
-  // Packing the first input operand.
-  // CHECK-DOUBLE: = linalg.init_tensor
-  // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold
-
   // CHECK-DOUBLE: scf.for %[[IV0:[0-9a-zA-Z]*]] =
   %0 = scf.for %arg3 = %c0 to %c24 step %c15 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) {
 
-    // Packing the second input operand.
-    // CHECK-DOUBLE: = linalg.init_tensor
-    // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold
+    // Packing the first input operand.
+    // CHECK-DOUBLE: = linalg.init_tensor [3, 5, 12]
+    // CHECK-DOUBLE: %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
+    // CHECK-DOUBLE:   %[[PIDX0:.*]] = affine.apply #[[DIV5]](%[[PIV0]])
+    // CHECK-DOUBLE:   %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+    // CHECK-DOUBLE:   %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold
+    // CHECK-DOUBLE:   %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0]
+    // CHECK-DOUBLE:   scf.yield %[[T2:.*]]
 
     // CHECK-DOUBLE: scf.for %[[IV1:[0-9a-zA-Z]*]] =
     %1 = scf.for %arg5 = %c0 to %c25 step %c16 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
@@ -131,6 +232,15 @@
       %3 = affine.min #map1(%arg5)
       %4 = tensor.extract_slice %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32>
 
+      // Packing the second input operand.
+      // CHECK-DOUBLE: = linalg.init_tensor [%{{.*}}, 12, 6]
+      // CHECK-DOUBLE: %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
+      // CHECK-DOUBLE:   %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]])
+      // CHECK-DOUBLE:   %[[T3:.*]] = tensor.extract_slice %[[ARG1]]
+      // CHECK-DOUBLE:   %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold
+      // CHECK-DOUBLE:   %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0]
+      // CHECK-DOUBLE:   scf.yield %[[T5:.*]]
+
       // CHECK-DOUBLE: scf.for %[[IV2:[0-9a-zA-Z]*]] =
       %5 = scf.for %arg7 = %c0 to %2 step %c5 iter_args(%arg8 = %4) -> (tensor<?x?xf32>) {
 
@@ -138,6 +248,12 @@
       %7 = scf.for %arg9 = %c0 to %3 step %c6 iter_args(%arg10 = %arg8) -> (tensor<?x?xf32>) {
         %8 = affine.min #map2(%arg7, %2)
         %9 = affine.apply #map3(%arg7, %arg3)
+
+        // Index the packed operands.
+        // CHECK-DOUBLE-DAG: %[[IDX0:.*]] = affine.apply #[[DIV5]](%[[IV2]])
+        // CHECK-DOUBLE-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX0]]
+        // CHECK-DOUBLE-DAG: %[[IDX1:.*]] = affine.apply #[[DIV6]](%[[IV3]])
+        // CHECK-DOUBLE-DAG: %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX1]]
         %10 = tensor.extract_slice %arg0[%9, 0] [%8, 12] [1, 1] : tensor<24x12xf32> to tensor<?x12xf32>
         %11 = affine.min #map4(%arg9, %3)
         %12 = affine.apply #map3(%arg9, %arg5)
@@ -146,9 +262,8 @@
         %15 = affine.min #map4(%arg9, %3)
         %16 = tensor.extract_slice %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
 
-        // Pad the output operand and perform the multiplication.
-        // CHECK-DOUBLE: = linalg.pad_tensor
-        // CHECK-DOUBLE: = linalg.matmul
+        // Check matmul uses the packed input operands.
+        // CHECK-DOUBLE: = linalg.matmul ins(%[[T6]], %[[T7]]
         %17 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%10, %13 : tensor<?x12xf32>, tensor<12x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
         %18 = tensor.insert_slice %17 into %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
         scf.yield %18 : tensor<?x?xf32>
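---

For readers who want to experiment with the traversal outside of MLIR, below is a minimal, self-contained C++ sketch of the reverse use-def walk that `HoistingAnalysis::getIndexingLoops` performs. It is an illustration under simplifying assumptions, not the MLIR implementation: ops and SSA values are modeled as plain structs and integer ids, all names in it are hypothetical, and a single seed op stands in for the `padTensorOp`/`sliceOp` pair that seeds the walk in the patch.

```cpp
// Illustrative sketch only: models the reverse use-def walk of
// HoistingAnalysis::getIndexingLoops with plain data structures instead of
// the MLIR API. All names below are hypothetical.
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Op {
  std::string name;
  bool isLoop = false;
  int inductionVar = -1;          // SSA value id of the IV; loops only.
  std::vector<int> results;       // SSA value ids produced by this op.
  std::vector<int> indexOperands; // Operands of index type.
};

// Walks `slice` (topologically sorted, seed op last) backwards: the seed's
// index operands start the edge set; any op whose result (or induction
// variable, for loops) is an index edge contributes its own index operands;
// the loops reached this way are the indexing loops.
std::vector<std::string> getIndexingLoops(const std::vector<Op> &slice) {
  std::set<int> indexEdges(slice.back().indexOperands.begin(),
                           slice.back().indexOperands.end());
  std::vector<std::string> loops;
  for (auto it = slice.rbegin() + 1; it != slice.rend(); ++it) {
    if (it->isLoop && indexEdges.count(it->inductionVar)) {
      indexEdges.insert(it->indexOperands.begin(), it->indexOperands.end());
      loops.push_back(it->name);
      continue;
    }
    if (std::any_of(it->results.begin(), it->results.end(),
                    [&](int r) { return indexEdges.count(r) != 0; }))
      indexEdges.insert(it->indexOperands.begin(), it->indexOperands.end());
  }
  return loops;
}

int main() {
  // Mirrors the doc-comment example: IVs %i=0, %j=1, %k=2; %ubi=3, %ubj=4.
  // The slice op indexes with [%i, %j] [%ubi, %ubj], so %k never becomes an
  // index edge and scf.for %k is not reported.
  std::vector<Op> slice = {
      {"scf.for %i", true, 0, {}, {}},
      {"scf.for %j", true, 1, {}, {}},
      {"scf.for %k", true, 2, {}, {}},
      {"affine.min #map(%i)", false, -1, {3}, {0}},
      {"affine.min #map(%j)", false, -1, {4}, {1}},
      {"tensor.extract_slice", false, -1, {5}, {0, 1, 3, 4}},
  };
  for (const std::string &name : getIndexingLoops(slice))
    std::cout << name << "\n"; // Prints scf.for %j, then scf.for %i.
  return 0;
}
```

Running the sketch reports the two indexing loops and skips `scf.for %k`, matching the `[scf.for %i, scf.for %j]` result in the patch's doc comment; the actual implementation additionally intersects this set with the enclosing loops in program order to populate `packingLoops`.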