diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -794,9 +794,51 @@ // HoistPadOp //===----------------------------------------------------------------------===// +def HoistPadBuildPackingLoopNestOp : + Op, + DeclareOpInterfaceMethods]> { + let description = [{ + Helper transform used to hoist a tensor.pad target operation. This operation + creates the packing loop nest required by the hoist_pad operation and makes + that functionality available independently. + + TODO: In the future, we should consider rewriting as a tensor.pack after + hoisting since this abstraction is now available. + + #### Return modes + + This operation ignores non-tensor.pad ops and drops them in the result. + If any non-tensor.pad is passed, the transform emits a silenceable failure. + + The return handle points to only the subset of successfully created packing + loop nests, which can be empty. + }]; + + // Also allow any !pdl.operation for simpler composition. Non-tensor.pad ops + // will be dropped from the results. + let arguments = + (ins TransformHandleTypeInterface:$target, + TransformHandleTypeInterface:$loop, + DefaultValuedAttr:$transpose); + let results = (outs TransformHandleTypeInterface:$packing_loop); + + let assemblyFormat = [{ + $target + `above` $loop + (`,` `transpose` `by` $transpose^)? + attr-dict + `:` functional-type(operands, results) + }]; + let hasVerifier = 1; +} + def HoistPadOp : Op { + [FunctionalStyleTransformOpTrait, + MemoryEffectsOpInterface, + TransformOpInterface, + TransformEachOpTrait]> { let description = [{ Hoist the tensor.pad target operation by at most the given number of loops. Optionally apply the transpose attribute to the inner dimensions. diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -362,6 +362,26 @@ ArrayRef paddingValues, ArrayRef packPaddings, LinalgOp &paddedOp); +namespace detail { + +/// Helper struct to hold the results of building a packing loop nest. +struct PackingResult { + SmallVector offsets, sizes, strides; + SmallVector clonedLoopIvs, leadingPackedTensorIndexings; + GenericOp maybeTransposeOp; + tensor::PadOp hoistedPadOp; +}; + +/// Build the packing loop nest required to hoist `opToHoist` above +/// `outermostEnclosingForOp`. +/// The loop nest is built just before `outermostEnclosingForOp`. +FailureOr +buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist, + scf::ForOp outermostEnclosingForOp, + ArrayRef transposeVector); + +} // namespace detail + /// Mechanically hoist padding operations on tensors by `numLoops` into a new, /// generally larger tensor. This achieves packing of multiple padding ops into /// a larger tensor. On success, `opToHoist` is replaced by the cloned version diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h --- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h @@ -54,6 +54,18 @@ ValueRange newIterOperands, const NewYieldValueFn &newYieldValuesFn, bool replaceIterOperandsUsesInLoop = true); +// Simpler API if the new yields are just a list of values that can be +// determined ahead of time. +inline scf::ForOp +replaceLoopWithNewYields(OpBuilder &builder, scf::ForOp loop, + ValueRange newIterOperands, ValueRange newYields, + bool replaceIterOperandsUsesInLoop = true) { + auto fn = [&](OpBuilder &b, Location loc, ArrayRef newBBArgs) { + return SmallVector(newYields.begin(), newYields.end()); + }; + return replaceLoopWithNewYields(builder, loop, newIterOperands, fn, + replaceIterOperandsUsesInLoop); +} /// Update a perfectly nested loop nest to yield new values from the innermost /// loop and propagating it up through the loop nest. This function diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -1757,6 +1757,60 @@ // HoistPadOp //===---------------------------------------------------------------------===// +DiagnosedSilenceableFailure transform::HoistPadBuildPackingLoopNestOp::apply( + transform::TransformResults &transformResults, + transform::TransformState &state) { + ArrayRef targetOps = state.getPayloadOps(getTarget()); + ArrayRef loopOps = state.getPayloadOps(getLoop()); + if (targetOps.size() != 1 || loopOps.size() != 1) { + return emitDefiniteFailure() + << "requires exactly one target and one loop handle (got " + << targetOps.size() << " and " << loopOps.size() << ")"; + } + + auto padOp = dyn_cast_or_null(targetOps.front()); + auto loopOp = dyn_cast_or_null(loopOps.front()); + if (!padOp || !loopOp) + return emitDefiniteFailure() << "requires exactly 2 non-null handles"; + + IRRewriter rewriter(getContext()); + FailureOr result = + linalg::detail::buildPackingLoopNest(rewriter, padOp, loopOp, + getTranspose()); + if (failed(result)) + return emitDefiniteFailure() << "could not build packing loop nest"; + + if (result->clonedLoopIvs.empty()) { + transformResults.set(getPackingLoop().cast(), + result->hoistedPadOp.getOperation()); + return DiagnosedSilenceableFailure::success(); + } + auto outerPackedLoop = + scf::getForInductionVarOwner(result->clonedLoopIvs.front()); + transformResults.set(getPackingLoop().cast(), + outerPackedLoop.getOperation()); + return DiagnosedSilenceableFailure::success(); +} + +LogicalResult transform::HoistPadBuildPackingLoopNestOp::verify() { + ArrayRef transpose = getTranspose(); + auto sequence = llvm::to_vector(llvm::seq(0, transpose.size())); + if (!std::is_permutation(sequence.begin(), sequence.end(), transpose.begin(), + transpose.end())) { + return emitOpError() << "expects transpose to be a permutation, found " + << getTranspose(); + } + return success(); +} + +void transform::HoistPadBuildPackingLoopNestOp::getEffects( + SmallVectorImpl &effects) { + transform::onlyReadsHandle(getTarget(), effects); + transform::onlyReadsHandle(getLoop(), effects); + transform::producesHandle(getPackingLoop(), effects); + transform::modifiesPayload(effects); +} + DiagnosedSilenceableFailure transform::HoistPadOp::applyToOne(tensor::PadOp target, transform::ApplyToEachResultList &results, diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -33,6 +33,7 @@ using namespace mlir; using namespace mlir::linalg; +using namespace mlir::linalg::detail; #ifndef NDEBUG static bool debugPrintLoopInShortForm(Operation *op) { @@ -61,6 +62,73 @@ DBGS() << "\n";); } +/// Return at most nLevels of immediately enclosing scf::ForOp loops. +/// Stops at the first parent that is not an scf::ForOp. +/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm. +/// Control-flow and other containing ops with regions are not modeled atm. +static void +getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels, + SmallVector &reverseEnclosingLoops) { + scf::ForOp outermostEnclosingForOp = nullptr; + Operation *nextEnclosingOp = padOp->getParentOp(); + while (nLevels-- > 0 && + (outermostEnclosingForOp = dyn_cast(nextEnclosingOp))) { + LLVM_DEBUG(DBGS() << "loops: "; + debugPrintLoopInShortForm(outermostEnclosingForOp); + dbgs() << "\n"); + reverseEnclosingLoops.push_back(outermostEnclosingForOp); + nextEnclosingOp = outermostEnclosingForOp->getParentOp(); + } +} + +/// Return at most nLevels of immediately enclosing scf::ForOp loops. +/// Stops at the first parent that is not an scf::ForOp. +/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm. +/// Control-flow and other containing ops with regions are not modeled atm. +static void +getEnclosingLoopsUntil(tensor::PadOp padOp, scf::ForOp untilLoop, + SmallVector &reverseEnclosingLoops) { + scf::ForOp outermostEnclosingForOp = nullptr; + Operation *nextEnclosingOp = padOp->getParentOp(); + while (outermostEnclosingForOp != untilLoop && + (outermostEnclosingForOp = dyn_cast(nextEnclosingOp))) { + LLVM_DEBUG(DBGS() << "loops: "; + debugPrintLoopInShortForm(outermostEnclosingForOp); + dbgs() << "\n"); + reverseEnclosingLoops.push_back(outermostEnclosingForOp); + nextEnclosingOp = outermostEnclosingForOp->getParentOp(); + } +} + +// Get all the ops in the backwards slice starting from `padOp` and that +// are dominated by the outermost enclosing loop. +// This also requires tracking ops defining values used in the region but +// defined above. +static void computeBackwardSlice(tensor::PadOp padOp, + scf::ForOp outermostEnclosingForOp, + SetVector &backwardSlice) { + DominanceInfo domInfo(outermostEnclosingForOp); + auto filter = [&](Operation *op) { + return domInfo.dominates(outermostEnclosingForOp, op) && + !padOp->isProperAncestor(op); + }; + // First, add the ops required to compute the region to the backwardSlice. + SetVector valuesDefinedAbove; + getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(), + valuesDefinedAbove); + for (Value v : valuesDefinedAbove) { + getBackwardSlice(v, &backwardSlice, filter, /*inclusive=*/true); + } + // Then, add the backward slice from padOp itself. + getBackwardSlice(padOp.getOperation(), &backwardSlice, filter, + /*inclusive=*/true); +} + +//===----------------------------------------------------------------------===// +// HoistPaddingAnalysis Implementation. +//===----------------------------------------------------------------------===// + +namespace { /// Analysis class to support tensor::PadOp hoisting across multiple enclosing /// loops. The failure conditions are: /// 1. Pad op has a use that is not an input of a LinalgOp. @@ -76,36 +144,48 @@ /// the outermost enclosing scf::ForOp. /// 8. There is no enclosing scf::ForOp that indexes the padded data. /// Other cases succeed and will trigger hoisting of the pad op. -struct HoistingAnalysis { - HoistingAnalysis(tensor::PadOp padOp, int numLoops); +struct HoistPaddingAnalysis { + HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops); + HoistPaddingAnalysis(tensor::PadOp padOp, scf::ForOp outermostEnclosingForOp); - bool isValid() { return valid; } + bool isValid() { return valid.has_value() && valid.value(); } + bool isInvalid() { return valid.has_value() && !valid.value(); } /// Footprint of the packedTensor, computed from the packingLoops. - SmallVector getPackedTensorSizes(RewriterBase &b, Location loc); + SmallVector getPackedTensorSizes(RewriterBase &rewriter, + Location loc) const; - /// The outermost loop, determined by `nLevels` above which `padOp` will - /// be hoisted. - scf::ForOp outermostEnclosingForOp; + /// Performs optional hoisting to enable hoist padding to occur. This may be + /// necessary when `sliceOp` is not defined outside of the outermost enclosing + /// loop we want to hoist above. + /// + /// Example: + /// ``` + /// %source = linalg.fill(%cst, %arg0) + /// // %source is available for packing here! + /// scf.for %i + /// scf.for %j + /// scf.for %k + /// %slice = tensor.extract_slice %source [%i, %j] + /// %padded_slice = tensor.pad %slice + /// ``` + void enableHoistPadding(RewriterBase &rewriter); - /// Backward slice rooted at `padOp` and nested under - /// `outermostEnclosingForOp`. - SetVector backwardSlice; + /// Common analysis builder to finalize the construction of the analysis once + /// optional `enableHoistPadding` has run. + /// `reverseEnclosingLoops.back()` is the loop to hoist above. + void finalizeHoistPaddingAnalysis(); - /// The scf::ForOp immediately enclosing `padOp` such that: - /// 1. they are nested under `outermostEnclosingForOp` (inclusive) - /// 2. whose induction variable is used, directly or indirectly, in the - /// computation of `padOp`. - /// The span of these loops determines the footprint of the packed tensor. - SmallVector packingLoops; +private: + /// Encodes whether the analysis is valid and hoisting can proceed. + std::optional valid; - /// The ExtractSliceOp that feeds the PadOp we want to hoist. - tensor::ExtractSliceOp sliceOp; + /// The padOp to hoist. + tensor::PadOp opToHoist; - /// If non-empty, this is the unique scf::ForOp that consumes the `sliceOp`. - scf::ForOp padConsumingForOp; + /// Immediately enclosing loops considered for hoisting padding. + SmallVector reverseEnclosingLoops; -private: /// Drop any non-index dependencies of `padOp` and `sliceOp` from /// `backwardSlice`. The method follows the use-def chains of the index /// operands consumed by `padOp` and `sliceOp` and drops the operations @@ -130,96 +210,92 @@ /// ``` /// dropNonIndexDependencies(%padded_slice, %slice) /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice. - LogicalResult dropNonIndexDependencies(tensor::PadOp padOp); + LogicalResult dropNonIndexDependencies(); - /// Encodes whether the analysis is valid and hoisting can proceed. - bool valid; -}; +public: + /// The outermost loop, determined by `nLevels` above which `padOp` will + /// be hoisted. + scf::ForOp outermostEnclosingForOp; -/// Return at most nLevels of immediately enclosing scf::ForOp loops. -/// Stops at the first parent that is not an scf::ForOp. -/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm. -/// Control-flow and other containing ops with regions are not modeled atm. -static void -getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels, - SmallVector &reverseEnclosingLoops) { - scf::ForOp outermostEnclosingForOp = nullptr; - Operation *nextEnclosingOp = padOp->getParentOp(); - while (nLevels-- > 0 && - (outermostEnclosingForOp = dyn_cast(nextEnclosingOp))) { - LLVM_DEBUG(DBGS() << "loops: "; - debugPrintLoopInShortForm(outermostEnclosingForOp); - dbgs() << "\n"); - reverseEnclosingLoops.push_back(outermostEnclosingForOp); - nextEnclosingOp = outermostEnclosingForOp->getParentOp(); - } -} + /// Backward slice rooted at `padOp` and nested under + /// `outermostEnclosingForOp`. + SetVector backwardSlice; -// Get all the ops in the backwards slice starting from `padOp` and that -// are dominated by the outermost enclosing loop. -// This also requires tracking ops defining values used in the region but -// defined above. -static void computeBackwardSlice(tensor::PadOp padOp, - scf::ForOp outermostEnclosingForOp, - SetVector &backwardSlice) { - DominanceInfo domInfo(outermostEnclosingForOp); - auto filter = [&](Operation *op) { - return domInfo.dominates(outermostEnclosingForOp, op) && - !padOp->isProperAncestor(op); - }; - // First, add the ops required to compute the region to the backwardSlice. - SetVector valuesDefinedAbove; - getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(), - valuesDefinedAbove); - for (Value v : valuesDefinedAbove) { - getBackwardSlice(v, &backwardSlice, filter, /*inclusive=*/true); - } - // Then, add the backward slice from padOp itself. - getBackwardSlice(padOp.getOperation(), &backwardSlice, filter, - /*inclusive=*/true); -} + /// The scf::ForOp immediately enclosing `padOp` such that: + /// 1. they are nested under `outermostEnclosingForOp` (inclusive) + /// 2. whose induction variable is used, directly or indirectly, in the + /// computation of `padOp`. + /// The span of these loops determines the footprint of the packed tensor. + SmallVector packingLoops; + + /// The ExtractSliceOp that feeds the PadOp we want to hoist. + tensor::ExtractSliceOp sliceOp; + + /// If non-empty, this is the unique scf::ForOp that consumes the `sliceOp`. + scf::ForOp padConsumingForOp; +}; -HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) { - valid = false; +} // namespace +HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops) + : valid(std::nullopt), opToHoist(padOp) { // Get at most `numLoops` of immediately enclosing loops. - SmallVector reverseEnclosingLoops; - getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops); + getAtMostNEnclosingLoops(opToHoist, numLoops, reverseEnclosingLoops); if (reverseEnclosingLoops.empty()) { LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n"); + valid = false; return; } - outermostEnclosingForOp = reverseEnclosingLoops.back(); + sliceOp = opToHoist.getSource().getDefiningOp(); + if (!sliceOp) { + LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n"); + valid = false; + return; + } +} - // Get the `sliceOp` that defines the source tensor of `padOp` and - // check its source is defined outside of the outermost loop. This check - // ensures the padded data is available for packing before entering the - // outermost enclosing loop. - // - // Example: - // ``` - // %source = linalg.fill(%cst, %arg0) - // // %source is available for packing here! - // scf.for %i - // scf.for %j - // scf.for %k - // %slice = tensor.extract_slice %source [%i, %j] - // %padded_slice = tensor.pad %slice - // ``` - sliceOp = padOp.getSource().getDefiningOp(); +HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, + scf::ForOp outermostEnclosingForOp) + : valid(std::nullopt), opToHoist(padOp) { + // Get enclosing loops until outermostEnclosingForOp. + getEnclosingLoopsUntil(opToHoist, outermostEnclosingForOp, + reverseEnclosingLoops); + if (reverseEnclosingLoops.empty()) { + LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n"); + valid = false; + return; + } + this->outermostEnclosingForOp = reverseEnclosingLoops.back(); + if (this->outermostEnclosingForOp != outermostEnclosingForOp) { + LLVM_DEBUG(DBGS() << "--Unexpected outermost enclosing loop -> Skip\n"); + valid = false; + return; + } + sliceOp = opToHoist.getSource().getDefiningOp(); if (!sliceOp) { LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n"); + valid = false; return; } +} + +void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) { + if (isInvalid()) + return; // If the padded data is not yet available before entering the outermost // enclosing loop, try to apply hoisting on this outermost loop. // TODO: we may want finer-grained hoisting of only that particular `sliceOp`. - IRRewriter rewriter(outermostEnclosingForOp->getContext()); if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) { outermostEnclosingForOp = hoistRedundantSubsetExtractInsert(rewriter, outermostEnclosingForOp); } +} + +void HoistPaddingAnalysis::finalizeHoistPaddingAnalysis() { + if (isInvalid()) + return; + if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) { LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n" << outermostEnclosingForOp << "\n" @@ -227,6 +303,7 @@ << "--sliceOp.getSource(): " << sliceOp.getSource() << "\n"); LLVM_DEBUG(DBGS() << "----Source not defined outside of loops -> Skip\n"); + valid = false; return; } if (sliceOp->hasOneUse()) { @@ -236,23 +313,27 @@ // Check the region of `padOp` depends on a constant only. Adding hoisting // support for arbitrary padding regions would require cloning all // dependencies captured by the padding region. - Value paddingValue = padOp.getConstantPaddingValue(); + Value paddingValue = opToHoist.getConstantPaddingValue(); if (!paddingValue || !isa_and_nonnull(paddingValue.getDefiningOp())) { LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n"); + valid = false; return; } - computeBackwardSlice(padOp, outermostEnclosingForOp, backwardSlice); - if (backwardSlice.size() <= 1) + computeBackwardSlice(opToHoist, outermostEnclosingForOp, backwardSlice); + if (backwardSlice.size() <= 1) { + valid = false; return; + } debugPrintBackwardSlice(backwardSlice); // Remove all ops in the backward slice that are not used to index // the padded tensor. In particular, keep `padOp`, `sliceOp`, and // the loop and affine operations used for the index computation. - if (failed(dropNonIndexDependencies(padOp))) { + if (failed(dropNonIndexDependencies())) { LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n"); + valid = false; return; } debugPrintBackwardSlice(backwardSlice); @@ -281,7 +362,7 @@ valid = true; } -LogicalResult HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp) { +LogicalResult HoistPaddingAnalysis::dropNonIndexDependencies() { // Set of all values used for index computation. SetVector indexEdges; @@ -300,7 +381,7 @@ }); }; - // Starting from `padOp` and `sliceOp` walk the use-def edges of index + // Starting from `opToHoist` and `sliceOp` walk the use-def edges of index // type in `backwardSlice`. Add the index operands of an operation to // `indexEdges` and remove all operations from `backwardSlice` that are not // part of the index computation. @@ -322,9 +403,9 @@ // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k] SetVector operationsToRemove; for (Operation *op : llvm::reverse(backwardSlice)) { - // Add the index operands of `padOp` and `sliceOp` to start the + // Add the index operands of `opToHoist` and `sliceOp` to start the // exploration of the index computation. - if (op == padOp || op == sliceOp) { + if (op == opToHoist || op == sliceOp) { addIndexOperandsToIndexEdges(op); continue; } @@ -358,7 +439,7 @@ continue; } // Remove all other operations not used by the index computation. An - // exception are constant operations that may be used by `padOp`. + // exception are constant operations that may be used by `opToHoist`. if (!isa(op)) operationsToRemove.insert(op); } @@ -367,7 +448,8 @@ } SmallVector -HoistingAnalysis::getPackedTensorSizes(RewriterBase &rewriter, Location loc) { +HoistPaddingAnalysis::getPackedTensorSizes(RewriterBase &rewriter, + Location loc) const { SmallVector dynamicTensorSizes; // Upper bound the packing loop lengths to size the packed tensor. Taking @@ -403,6 +485,10 @@ return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant()); } +//===----------------------------------------------------------------------===// +// buildPackingLoopNest Implementation. +//===----------------------------------------------------------------------===// + /// Return the current iteration number in the loop (iv - lb).ceilDiv(step). /// The returned Value is guaranteed not to depend on any loop comprised in /// [`outer`, `forOp`]. @@ -423,12 +509,6 @@ loc, (iv - lb).ceilDiv(step), ValueRange{ivVal, lbVal, stepVal}); } -struct PackingLoopNestResult { - SmallVector offsets, sizes, strides; - SmallVector clonedLoopIvs, leadingPackedTensorIndexings; - GenericOp maybeTransposeOp; -}; - // Build a packing loop nest by iteratively traversing the backward slice and // clone the operations, iteratively stepping into the loops that we encounter. // The implementation proceeds in a stack-like fashion: @@ -439,10 +519,10 @@ // 3. At the innermost loop level, create a InsertSliceOp. // 4. Iteratively pop and yield the result of the InsertSliceOp across the // cloned loops. -static PackingLoopNestResult buildPackingLoopNest( +static PackingResult buildPackingLoopNestImpl( RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist, ArrayRef transposeVector, RankedTensorType transposedTensorType, - tensor::EmptyOp emptyOp, const HoistingAnalysis &analysis) { + tensor::EmptyOp emptyOp, const HoistPaddingAnalysis &analysis) { SmallVector offsets, sizes, strides; SmallVector clonedLoopIvs, leadingPackedTensorIndexings; @@ -452,8 +532,21 @@ RankedTensorType paddedTensorType = opToHoist.getResultType(); int paddedRank = paddedTensorType.getRank(); - Value packedTensor = emptyOp.getResult(); + // Step 0. Populate bvm with opToHoist.getSource if relevant. + BlockArgument bbArg = opToHoist.getSource().dyn_cast(); + while (bbArg) { + auto forOp = dyn_cast(bbArg.getOwner()->getParentOp()); + if (!forOp) + break; + if (forOp != outerLoop && !outerLoop->isAncestor(forOp)) + break; + OpOperand &operand = forOp.getOpOperandForRegionIterArg(bbArg); + bvm.map(bbArg, operand.get()); + bbArg = operand.get().dyn_cast(); + } + // Step 1. iteratively clone loops and push `packedTensor`. + Value packedTensor = emptyOp.getResult(); OpBuilder::InsertionGuard g(rewriter); for (Operation *op : analysis.backwardSlice) { // Specifically sit out in the extract_slice(packedTensor) case: this is @@ -544,14 +637,83 @@ } } - return PackingLoopNestResult{offsets, - sizes, - strides, - clonedLoopIvs, - leadingPackedTensorIndexings, - maybeTransposeOp}; + return PackingResult{ + offsets, + sizes, + strides, + clonedLoopIvs, + leadingPackedTensorIndexings, + maybeTransposeOp, + cast(bvm.lookup(opToHoist.getResult()).getDefiningOp())}; } +/// Build the packing loop nest required to hoist `opToHoist` above +/// `outermostEnclosingForOp`. +/// The loop nest is built just before `outermostEnclosingForOp`. +static FailureOr buildPackingLoopNestImpl( + RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist, + ArrayRef transposeVector, const HoistPaddingAnalysis &analysis) { + // Update actual number of loops, which may be smaller. + int nPackedLoops = analysis.packingLoops.size(); + LLVM_DEBUG(DBGS() << "\n"; + DBGS() << "Func:\n" + << *opToHoist->getParentOfType() << "\n"; + DBGS() << "Start hoisting above " << nPackedLoops << " loops\n"); + + Location loc = opToHoist->getLoc(); + RankedTensorType paddedTensorType = opToHoist.getResultType(); + + // Compute the type of the transposed padded tensor. + FailureOr transposedTensorType = + tensor::computeTransposedType(paddedTensorType, transposeVector); + if (failed(transposedTensorType)) { + LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n"); + return failure(); + } + + // Create the packed tensor. + SmallVector packedShape(nPackedLoops, ShapedType::kDynamic); + // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor. + llvm::append_range(packedShape, transposedTensorType->getShape()); + auto packedTensorType = RankedTensorType::get( + packedShape, transposedTensorType->getElementType()); + + // Set the insertion point right before the outer loop and start packing. + scf::ForOp outerLoop = analysis.outermostEnclosingForOp; + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(outerLoop); + SmallVector dynamicTensorSizes = + analysis.getPackedTensorSizes(rewriter, loc); + auto emptyOp = rewriter.create( + loc, packedTensorType.getShape(), packedTensorType.getElementType(), + dynamicTensorSizes); + + return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector, + *transposedTensorType, emptyOp, analysis); +} + +/// Build the packing loop nest required to hoist `opToHoist` above +/// `outermostEnclosingForOp`. +/// The loop nest is built just before `outermostEnclosingForOp`. +FailureOr mlir::linalg::detail::buildPackingLoopNest( + RewriterBase &rewriter, tensor::PadOp opToHoist, + scf::ForOp outermostEnclosingForOp, ArrayRef transposeVector) { + HoistPaddingAnalysis analysis(opToHoist, outermostEnclosingForOp); + analysis.enableHoistPadding(rewriter); + analysis.finalizeHoistPaddingAnalysis(); + if (!analysis.isValid()) { + LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n"); + return failure(); + } + IRMapping bvm; + return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector, + analysis); +} + +//===----------------------------------------------------------------------===// +// hoistPaddingOnTensors Implementation. +//===----------------------------------------------------------------------===// + // If the original consumer of `sliceOp` was a `forOp` (i.e. through an iter // arg), propagate the `packedTensor` value through the same iter arg. // TODO: for multiple loops we need to track the use to the innermost loop. @@ -574,6 +736,7 @@ std::optional operandNumber = forOp.getIterArgNumberForOpOperand(*pUse); assert(operandNumber.has_value() && "expected a proper iter arg number"); + SmallVector initArgs = forOp.getInitArgs(); initArgs[operandNumber.value()] = casted; rewriter.startRootUpdate(forOp); @@ -584,10 +747,12 @@ /// Produce a tensor extracted from the packingResult. This can be used as a /// replacement for `opToHoist` in callers. -static Value replaceByPackingLoopNestResult( - RewriterBase &rewriter, const IRMapping &bvm, tensor::PadOp opToHoist, - RankedTensorType transposedTensorType, const HoistingAnalysis &analysis, - const PackingLoopNestResult &packingResult) { +static Value replaceByPackingResult(RewriterBase &rewriter, + const IRMapping &bvm, + tensor::PadOp opToHoist, + RankedTensorType transposedTensorType, + const HoistPaddingAnalysis &analysis, + const PackingResult &packingResult) { // The replacement occurs under a single insertion point within the original // loop, just before opToHoist. OpBuilder::InsertionGuard g(rewriter); @@ -625,13 +790,7 @@ ->getResult(0); } else { // If no loops were created, this is just hoisting without packing. - auto padOp = - cast(bvm.lookup(opToHoist.getResult()).getDefiningOp()); - tensor::ExtractSliceOp sliceOp = analysis.sliceOp; - rewriter.startRootUpdate(padOp); - padOp.getSourceMutable().assign(sliceOp.getResult()); - rewriter.finalizeRootUpdate(padOp); - packedTensor = padOp; + packedTensor = bvm.lookup(opToHoist.getResult()); } LLVM_DEBUG(DBGS() << "packedTensor: " << packedTensor << "\n"); @@ -657,59 +816,39 @@ SmallVectorImpl &transposeOps) { LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n"; DBGS() << " by " << numLoops << " loops\n"); - HoistingAnalysis analysis(opToHoist, numLoops); + + HoistPaddingAnalysis analysis(opToHoist, numLoops); + analysis.enableHoistPadding(rewriter); + analysis.finalizeHoistPaddingAnalysis(); if (!analysis.isValid()) { LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n"); return failure(); } - // Update actual number of loops, which may be smaller. - int nPackedLoops = analysis.packingLoops.size(); - LLVM_DEBUG(DBGS() << "\n"; - DBGS() << "Func:\n" - << *opToHoist->getParentOfType() << "\n"; - DBGS() << "Start hoisting above " << nPackedLoops << " loops\n"); - - Location loc = opToHoist->getLoc(); - RankedTensorType paddedTensorType = opToHoist.getResultType(); - - // Compute the type of the transposed padded tensor. - FailureOr transposedTensorType = - tensor::computeTransposedType(paddedTensorType, transposeVector); - if (failed(transposedTensorType)) { - LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n"); + /// Construct the packing loop nest. + IRMapping bvm; + FailureOr packingResult = buildPackingLoopNestImpl( + rewriter, bvm, opToHoist, transposeVector, analysis); + if (failed(packingResult)) { + LLVM_DEBUG(DBGS() << "--buildPackingLoopNestImpl failed -> Skip\n"); return failure(); } - // Create the packed tensor. - SmallVector packedShape(nPackedLoops, ShapedType::kDynamic); - // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor. - llvm::append_range(packedShape, transposedTensorType->getShape()); - auto packedTensorType = RankedTensorType::get( - packedShape, transposedTensorType->getElementType()); - - // Set the insertion point right before the outer loop and start packing. - scf::ForOp outerLoop = analysis.outermostEnclosingForOp; - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(outerLoop); - SmallVector dynamicTensorSizes = - analysis.getPackedTensorSizes(rewriter, loc); - auto emptyOp = rewriter.create( - loc, packedTensorType.getShape(), packedTensorType.getElementType(), - dynamicTensorSizes); - - /// Construct the packing loop nest. - IRMapping bvm; - PackingLoopNestResult packingResult = - buildPackingLoopNest(rewriter, bvm, opToHoist, transposeVector, - *transposedTensorType, emptyOp, analysis); if (!transposeVector.empty()) - transposeOps.push_back(packingResult.maybeTransposeOp); + transposeOps.push_back(packingResult->maybeTransposeOp); + + FailureOr transposedTensorType = + tensor::computeTransposedType(opToHoist.getResultType(), transposeVector); + assert(succeeded(transposedTensorType) && "unexpected failure in type"); // Now the packed tensor is ready, replace the original padding op by a // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1]. - Value newResult = replaceByPackingLoopNestResult( - rewriter, bvm, opToHoist, *transposedTensorType, analysis, packingResult); + Value newResult = + replaceByPackingResult(rewriter, bvm, opToHoist, *transposedTensorType, + analysis, *packingResult); + + Location loc = opToHoist->getLoc(); + RankedTensorType paddedTensorType = opToHoist.getResultType(); if (!transposeVector.empty()) { OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPointAfter(newResult.getDefiningOp()); @@ -729,8 +868,7 @@ << "\n"); // Make the newly cloned `opToHoist` available to the caller. - hoistedOp = - cast(bvm.lookup(opToHoist.getResult()).getDefiningOp()); + hoistedOp = packingResult->hoistedPadOp; LLVM_DEBUG(DBGS() << "--SUCCESS\n"); return newResult; diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -87,6 +87,7 @@ ArrayRef transposeVector) { if (transposeVector.empty()) return rankedTensorType; + if (!isPermutationVector(transposeVector) || transposeVector.size() != static_cast(rankedTensorType.getRank())) return failure(); diff --git a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir copy from mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir copy to mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir --- a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir @@ -1,20 +1,20 @@ -// RUN: mlir-opt --test-transform-dialect-interpreter -canonicalize -split-input-file --verify-diagnostics %s | FileCheck %s +// BUILD-PACKING-LOOP-NEST only checks the creation of packing code but does not connect it. +// Do not run canonicalization as it would be DCE'd away. +// RUN: mlir-opt --test-transform-dialect-interpreter -split-input-file --verify-diagnostics %s | FileCheck %s --check-prefix=BUILD-PACKING-LOOP-NEST func.func @pad_and_hoist_rhs( %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>) - -> tensor<24x25xf32> + -> tensor<24x25xf32> { - // expected-note @below {{payload operation}} %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> func.return %0 : tensor<24x25xf32> } transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): - %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5] %matmul_padded = transform.structured.pad %matmul_l1 { @@ -24,32 +24,29 @@ // In this case, the pad op is actually empty: we only tile the first dimension // and it does not have an impact on the RHS operand. - // expected-error @below {{incompatible payload operation name}} %pad = transform.get_producer_of_operand %matmul_padded[1] - : (!pdl.operation) -> !transform.op<"tensor.pad"> + : (!pdl.operation) -> !pdl.operation - // We do not even reach this transform op. - transform.structured.hoist_pad %pad by 1 loops - : (!transform.op<"tensor.pad">) -> !pdl.operation + // expected-error @below {{requires exactly 2 non-null handles}} + transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1 + : (!pdl.operation, !pdl.operation) -> !pdl.operation } // ----- func.func @pad_and_hoist_init( %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>) - -> tensor<24x25xf32> + -> tensor<24x25xf32> { - // expected-note @below {{when applied to this op}} %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> func.return %0 : tensor<24x25xf32> } transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): - %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5] %matmul_padded = transform.structured.pad %matmul_l1 { @@ -58,40 +55,36 @@ } %pad = transform.get_producer_of_operand %matmul_padded[2] - : (!pdl.operation) -> !transform.op<"tensor.pad"> + : (!pdl.operation) -> !pdl.operation // We do not know yet how to hoist the init. - // expected-error @below {{transform.structured.hoist_pad failed to apply}} - transform.structured.hoist_pad %pad by 1 loops - : (!transform.op<"tensor.pad">) -> !pdl.operation + // expected-error @below {{could not build packing loop nest}} + transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1 + : (!pdl.operation, !pdl.operation) -> !pdl.operation } // ----- -// CHECK-LABEL: pad_and_hoist_lhs +// BUILD-PACKING-LOOP-NEST-LABEL: pad_and_hoist_lhs func.func @pad_and_hoist_lhs( %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>) - -> tensor<24x25xf32> + -> tensor<24x25xf32> { - // CHECK: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor<5x5x12xf32>) { - // CHECK: tensor.pad %{{.*}} - // CHECK: : tensor to tensor<5x12xf32> - // CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 5, 12] [1, 1, 1] - // CHECK-SAME: : tensor<5x12xf32> into tensor<5x5x12xf32> - // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) { - // CHECK: %[[PADDED:.*]] = tensor.extract_slice %[[PACKED]][%{{.*}}, 0, 0] [1, 5, 12] [1, 1, 1] - // CHECK-SAME: : tensor<5x5x12xf32> to tensor<5x12xf32> - // CHECK: linalg.matmul ins(%[[PADDED]] + // BUILD-PACKING-LOOP-NEST: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor) { + // BUILD-PACKING-LOOP-NEST: tensor.pad %{{.*}} + // BUILD-PACKING-LOOP-NEST: : tensor to tensor<5x12xf32> + // BUILD-PACKING-LOOP-NEST: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 5, 12] [1, 1, 1] + // BUILD-PACKING-LOOP-NEST-SAME: : tensor<5x12xf32> into tensor + // BUILD-PACKING-LOOP-NEST: scf.for %{{.*}} -> (tensor<24x25xf32>) %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> func.return %0 : tensor<24x25xf32> } transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): - %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5] %matmul_padded = transform.structured.pad %matmul_l1 { @@ -102,40 +95,34 @@ %pad = transform.get_producer_of_operand %matmul_padded[0] : (!pdl.operation) -> !pdl.operation - transform.structured.hoist_pad %pad by 1 loops - : (!pdl.operation) -> !pdl.operation + transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1 + : (!pdl.operation, !pdl.operation) -> !pdl.operation } // ----- -// CHECK-LABEL: pad_and_hoist_lhs_transpose +// BUILD-PACKING-LOOP-NEST-LABEL: pad_and_hoist_lhs_transpose func.func @pad_and_hoist_lhs_transpose( %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>) - -> tensor<24x25xf32> + -> tensor<24x25xf32> { - // CHECK: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor<5x12x5xf32>) { - // CHECK: tensor.pad %{{.*}} - // CHECK: : tensor to tensor<5x12xf32> - // CHECK: linalg.generic - // CHECK: -> tensor<12x5xf32> - // CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1] - // CHECK-SAME: : tensor<12x5xf32> into tensor<5x12x5xf32> - // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) { - // CHECK: %[[PADDED:.*]] = tensor.extract_slice %[[PACKED]][%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1] - // CHECK-SAME: : tensor<5x12x5xf32> to tensor<12x5xf32> - // CHECK: %[[TRANSPOSED:.*]] = linalg.generic - // CHECK: -> tensor<5x12xf32> - // CHECK: linalg.matmul ins(%[[TRANSPOSED]] + // BUILD-PACKING-LOOP-NEST: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor) { + // BUILD-PACKING-LOOP-NEST: tensor.pad %{{.*}} + // BUILD-PACKING-LOOP-NEST: : tensor to tensor<5x12xf32> + // BUILD-PACKING-LOOP-NEST: linalg.generic + // BUILD-PACKING-LOOP-NEST: -> tensor<12x5xf32> + // BUILD-PACKING-LOOP-NEST: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1] + // BUILD-PACKING-LOOP-NEST-SAME: : tensor<12x5xf32> into tensor + // BUILD-PACKING-LOOP-NEST: scf.for %{{.*}} -> (tensor<24x25xf32>) %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> func.return %0 : tensor<24x25xf32> } transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): - %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5] %matmul_padded = transform.structured.pad %matmul_l1 { @@ -146,38 +133,32 @@ %pad = transform.get_producer_of_operand %matmul_padded[0] : (!pdl.operation) -> !pdl.operation - transform.structured.hoist_pad %pad by 1 loops, transpose by [1, 0] - : (!pdl.operation) -> !pdl.operation + transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1, transpose by [1, 0] + : (!pdl.operation, !pdl.operation) -> !pdl.operation } // ----- -// CHECK-LABEL: pad_and_hoist_init +// BUILD-PACKING-LOOP-NEST-LABEL: pad_and_hoist_init func.func @pad_and_hoist_init( %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>) - -> tensor<24x25xf32> + -> tensor<24x25xf32> { - // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) { - // CHECK: %[[PADDED:.*]] = tensor.pad %{{.*}} - // CHECK: : tensor to tensor<5x25xf32> - // CHECK: scf.for %{{.*}} iter_args(%[[INNER_PADDED:[0-9a-zA-Z]*]] = %[[PADDED]]) -> (tensor<5x25xf32>) - // CHECK: %[[RES:.*]] = linalg.matmul {{.*}} outs(%[[INNER_PADDED]] - // CHECK-SAME: : tensor<5x25xf32> - // CHECK: scf.yield %[[RES]] : tensor<5x25xf32> - // CHECK: %[[CAST:.*]] = tensor.cast %{{.*}} : tensor<5x25xf32> to tensor - // CHECK: tensor.insert_slice %[[CAST]] into %{{.*}}[%{{.*}}, 0] [%{{.*}}, 25] [1, 1] - // CHECK-SAME: : tensor into tensor<24x25xf32> + // BUILD-PACKING-LOOP-NEST: scf.for %{{.*}} -> (tensor<24x25xf32>) { + // BUILD-PACKING-LOOP-NEST: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice + // BUILD-PACKING-LOOP-NEST: %[[PADDED:.*]] = tensor.pad %[[EXTRACTED_SLICE]] + // BUILD-PACKING-LOOP-NEST: : tensor to tensor<5x25xf32> + // BUILD-PACKING-LOOP-NEST: scf.for %{{.*}} iter_args({{.*}} = %[[EXTRACTED_SLICE]]) -> (tensor<24x25xf32>, tensor) { %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> func.return %0 : tensor<24x25xf32> } transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): - %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %matmul_l1, %loops_l1:2 = transform.structured.tile_to_scf_for %matmul [5, 0, 7] %matmul_padded = transform.structured.pad %matmul_l1 { @@ -186,8 +167,8 @@ } %pad = transform.get_producer_of_operand %matmul_padded[2] - : (!pdl.operation) -> !transform.op<"tensor.pad"> + : (!pdl.operation) -> !pdl.operation - transform.structured.hoist_pad %pad by 1 loops - : (!transform.op<"tensor.pad">) -> !pdl.operation + transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1#1 + : (!pdl.operation, !pdl.operation) -> !pdl.operation } diff --git a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir --- a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir @@ -68,7 +68,7 @@ // ----- -// CHECK-LABEL: pad_and_hoist_lhs +// CHECK-LABEL: pad_and_hoist_lhs( func.func @pad_and_hoist_lhs( %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>) -> tensor<24x25xf32>