diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -794,9 +794,51 @@
 // HoistPadOp
 //===----------------------------------------------------------------------===//
 
+def HoistPadBuildPackingLoopNestOp :
+    Op<Transform_Dialect,
+       "structured.hoist_pad.build_packing_loop_nest",
+       [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+        DeclareOpInterfaceMethods<TransformOpInterface>]> {
+  let description = [{
+    Helper transform used to hoist a tensor.pad target operation. This operation
+    creates the packing loop nest required by the hoist_pad operation and makes
+    that functionality available independently.
+
+    TODO: In the future, we should consider rewriting as a tensor.pack after
+    hoisting since this abstraction is now available.
+
+    #### Return modes
+
+    This operation ignores non-tensor.pad ops and drops them in the result.
+    If any non-tensor.pad is passed, the transform emits a silenceable failure.
+
+    The return handle points to only the subset of successfully created packing
+    loop nests, which can be empty.
+  }];
+
+  // Also allow any !pdl.operation for simpler composition. Non-tensor.pad ops
+  // will be dropped from the results.
+  let arguments =
+    (ins TransformHandleTypeInterface:$target,
+         TransformHandleTypeInterface:$loop,
+         DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$transpose);
+  let results = (outs TransformHandleTypeInterface:$packing_loop);
+
+  let assemblyFormat = [{
+    $target
+    `above` $loop
+    (`,` `transpose` `by` $transpose^)?
+    attr-dict
+    `:` functional-type(operands, results)
+  }];
+  let hasVerifier = 1;
+}
+
 def HoistPadOp : Op<Transform_Dialect, "structured.hoist_pad",
-    [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
-     TransformOpInterface, TransformEachOpTrait]> {
+    [FunctionalStyleTransformOpTrait,
+     MemoryEffectsOpInterface,
+     TransformOpInterface,
+     TransformEachOpTrait]> {
   let description = [{
     Hoist the tensor.pad target operation by at most the given number of loops.
     Optionally apply the transpose attribute to the inner dimensions.
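For orientation before the C++ changes below: the declared assembly format corresponds to transform IR of roughly the following shape. This is an illustrative sketch only; `%pad` and `%loop` are placeholder handles, and the two operand types follow `functional-type(operands, results)`:

```mlir
// %pad should resolve to a single tensor.pad payload op and %loop to the
// scf.for to hoist above; the `transpose by` clause is optional and must be
// a permutation of the padded tensor's rank.
%packing_loop = transform.structured.hoist_pad.build_packing_loop_nest
    %pad above %loop, transpose by [1, 0]
  : (!pdl.operation, !pdl.operation) -> !pdl.operation
```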
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -362,6 +362,25 @@
                   ArrayRef<Attribute> paddingValues,
                   ArrayRef<bool> packPaddings, LinalgOp &paddedOp);
 
+namespace detail {
+
+/// Helper struct to hold the results of building a packing loop nest.
+struct PackingLoopNestResult {
+  SmallVector<OpFoldResult> offsets, sizes, strides;
+  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
+  GenericOp maybeTransposeOp;
+};
+
+/// Build the packing loop nest required to hoist `opToHoist` above
+/// `outermostEnclosingForOp`.
+/// The loop nest is built just before `outermostEnclosingForOp`.
+FailureOr<PackingLoopNestResult>
+buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist,
+                     scf::ForOp outermostEnclosingForOp,
+                     ArrayRef<int64_t> transposeVector);
+
+} // namespace detail
+
 /// Mechanically hoist padding operations on tensors by `numLoops` into a new,
 /// generally larger tensor. This achieves packing of multiple padding ops into
 /// a larger tensor. On success, `opToHoist` is replaced by the cloned version
diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
--- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
@@ -54,6 +54,18 @@
     ValueRange newIterOperands, const NewYieldValueFn &newYieldValuesFn,
     bool replaceIterOperandsUsesInLoop = true);
 
+// Simpler API if the new yields are just a list of values that can be
+// determined ahead of time.
+inline scf::ForOp
+replaceLoopWithNewYields(OpBuilder &builder, scf::ForOp loop,
+                         ValueRange newIterOperands, ValueRange newYields,
+                         bool replaceIterOperandsUsesInLoop = true) {
+  auto fn = [&](OpBuilder &b, Location loc,
+                ArrayRef<BlockArgument> newBBArgs) {
+    return SmallVector<Value>(newYields.begin(), newYields.end());
+  };
+  return replaceLoopWithNewYields(builder, loop, newIterOperands, fn,
+                                  replaceIterOperandsUsesInLoop);
+}
 
 /// Update a perfectly nested loop nest to yield new values from the innermost
 /// loop and propagating it up through the loop nest. This function
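A sketch of what the new `replaceLoopWithNewYields` convenience overload does at the IR level (hedged; the SSA names `%extra`, `%init`, etc. are invented). Because the callback ignores the new block arguments and returns `newYields` directly, the new iter_arg is passed through untouched:

```mlir
// Before: replaceLoopWithNewYields(builder, loop, {%extra}, {%extra}).
%r = scf.for %i = %lb to %ub step %s iter_args(%acc = %init) -> (f32) {
  %next = arith.addf %acc, %acc : f32
  scf.yield %next : f32
}

// After: the loop is rebuilt with one more iter_arg; the new yield is the
// ahead-of-time value %extra, not the new block argument %e.
%r:2 = scf.for %i = %lb to %ub step %s
    iter_args(%acc = %init, %e = %extra) -> (f32, f32) {
  %next = arith.addf %acc, %acc : f32
  scf.yield %next, %extra : f32, f32
}
```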
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1757,6 +1757,52 @@
 // HoistPadOp
 //===---------------------------------------------------------------------===//
 
+DiagnosedSilenceableFailure transform::HoistPadBuildPackingLoopNestOp::apply(
+    transform::TransformResults &transformResults,
+    transform::TransformState &state) {
+  ArrayRef<Operation *> targetOps = state.getPayloadOps(getTarget());
+  ArrayRef<Operation *> loopOps = state.getPayloadOps(getLoop());
+  if (targetOps.size() != 1 || loopOps.size() != 1) {
+    return emitDefiniteFailure()
+           << "requires exactly one target and one loop handle (got "
+           << targetOps.size() << " and " << loopOps.size() << ")";
+  }
+
+  auto padOp = dyn_cast_or_null<tensor::PadOp>(targetOps.front());
+  auto loopOp = dyn_cast_or_null<scf::ForOp>(loopOps.front());
+  if (!padOp || !loopOp)
+    return emitSilenceableError() << "requires exactly 2 non-null handles";
+
+  IRRewriter rewriter(getContext());
+  FailureOr<linalg::detail::PackingLoopNestResult> result =
+      linalg::detail::buildPackingLoopNest(rewriter, padOp, loopOp,
+                                           getTranspose());
+  if (failed(result))
+    return emitDefiniteFailure() << "could not build packing loop nest";
+
+  auto outerPackedLoop =
+      scf::getForInductionVarOwner(result->clonedLoopIvs.front());
+  transformResults.set(getPackingLoop().cast<OpResult>(),
+                       outerPackedLoop.getOperation());
+  return DiagnosedSilenceableFailure::success();
+}
+
+LogicalResult transform::HoistPadBuildPackingLoopNestOp::verify() {
+  ArrayRef<int64_t> transpose = getTranspose();
+  auto sequence = llvm::to_vector(llvm::seq<int64_t>(0, transpose.size()));
+  if (!std::is_permutation(sequence.begin(), sequence.end(), transpose.begin(),
+                           transpose.end())) {
+    return emitOpError() << "expects transpose to be a permutation, found "
+                         << getTranspose();
+  }
+  return success();
+}
+
+void transform::HoistPadBuildPackingLoopNestOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  transform::onlyReadsHandle(getTarget(), effects);
+  transform::onlyReadsHandle(getLoop(), effects);
+  transform::producesHandle(getPackingLoop(), effects);
+  transform::modifiesPayload(effects);
+}
+
 DiagnosedSilenceableFailure
 transform::HoistPadOp::applyToOne(tensor::PadOp target,
                                   transform::ApplyToEachResultList &results,
diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -17,9 +17,11 @@
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
 #include "mlir/Dialect/Tensor/Utils/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/AsmState.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/Transforms/RegionUtils.h"
@@ -33,6 +35,7 @@
 
 using namespace mlir;
 using namespace mlir::linalg;
+using namespace mlir::linalg::detail;
 
 #ifndef NDEBUG
 static bool debugPrintLoopInShortForm(Operation *op) {
@@ -61,6 +64,73 @@
              DBGS() << "\n";);
 }
 
+/// Return at most nLevels of immediately enclosing scf::ForOp loops.
+/// Stops at the first parent that is not an scf::ForOp.
+/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
+/// Control-flow and other containing ops with regions are not modeled atm.
+static void
+getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
+                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
+  scf::ForOp outermostEnclosingForOp = nullptr;
+  Operation *nextEnclosingOp = padOp->getParentOp();
+  while (nLevels-- > 0 &&
+         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
+    LLVM_DEBUG(DBGS() << "loops: ";
+               debugPrintLoopInShortForm(outermostEnclosingForOp);
+               dbgs() << "\n");
+    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
+    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
+  }
+}
+
+/// Return the immediately enclosing scf::ForOp loops of `padOp`, up to and
+/// including `untilLoop`.
+/// Stops at the first parent that is not an scf::ForOp.
+/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
+/// Control-flow and other containing ops with regions are not modeled atm.
+static void
+getEnclosingLoopsUntil(tensor::PadOp padOp, scf::ForOp untilLoop,
+                       SmallVector<scf::ForOp> &reverseEnclosingLoops) {
+  scf::ForOp outermostEnclosingForOp = nullptr;
+  Operation *nextEnclosingOp = padOp->getParentOp();
+  while (outermostEnclosingForOp != untilLoop &&
+         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
+    LLVM_DEBUG(DBGS() << "loops: ";
+               debugPrintLoopInShortForm(outermostEnclosingForOp);
+               dbgs() << "\n");
+    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
+    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
+  }
+}
+
+// Get all the ops in the backwards slice starting from `padOp` and that
+// are dominated by the outermost enclosing loop.
+// This also requires tracking ops defining values used in the region but
+// defined above.
+static void computeBackwardSlice(tensor::PadOp padOp,
+                                 scf::ForOp outermostEnclosingForOp,
+                                 SetVector<Operation *> &backwardSlice) {
+  DominanceInfo domInfo(outermostEnclosingForOp);
+  auto filter = [&](Operation *op) {
+    return domInfo.dominates(outermostEnclosingForOp, op) &&
+           !padOp->isProperAncestor(op);
+  };
+  // First, add the ops required to compute the region to the backwardSlice.
+  SetVector<Value> valuesDefinedAbove;
+  getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(),
+                            valuesDefinedAbove);
+  for (Value v : valuesDefinedAbove) {
+    getBackwardSlice(v, &backwardSlice, filter, /*inclusive=*/true);
+  }
+  // Then, add the backward slice from padOp itself.
+  getBackwardSlice(padOp.getOperation(), &backwardSlice, filter,
+                   /*inclusive=*/true);
+}
+
+//===----------------------------------------------------------------------===//
+// HoistPaddingAnalysis Implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
 /// Analysis class to support tensor::PadOp hoisting across multiple enclosing
 /// loops. The failure conditions are:
 ///   1. Pad op has a use that is not an input of a LinalgOp.
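To make the difference between the two traversals above concrete, here is a loose sketch in the style of the IR examples used elsewhere in this file (names invented). For the nest below, `getAtMostNEnclosingLoops(%padded_slice, /*nLevels=*/2, ...)` records the `%k` and `%j` loops, while `getEnclosingLoopsUntil(%padded_slice, %i-loop, ...)` keeps walking and records the `%i` loop as well. Both record loops innermost-first, which is why the out-parameter is called `reverseEnclosingLoops`:

```mlir
scf.for %i        // reached only by getEnclosingLoopsUntil(.., %i-loop, ..)
  scf.for %j      // recorded second by both traversals
    scf.for %k    // recorded first: the walk starts at the pad's parent
      %slice = tensor.extract_slice %source [%i, %j]
      %padded_slice = tensor.pad %slice
```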
@@ -76,36 +146,47 @@
 ///   the outermost enclosing scf::ForOp.
 ///   8. There is no enclosing scf::ForOp that indexes the padded data.
 /// Other cases succeed and will trigger hoisting of the pad op.
-struct HoistingAnalysis {
-  HoistingAnalysis(tensor::PadOp padOp, int numLoops);
+struct HoistPaddingAnalysis {
+  HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops);
+  HoistPaddingAnalysis(tensor::PadOp padOp, scf::ForOp outermostEnclosingForOp);
 
   bool isValid() { return valid; }
 
   /// Footprint of the packedTensor, computed from the packingLoops.
-  SmallVector<Value> getPackedTensorSizes(RewriterBase &b, Location loc);
-
-  /// The outermost loop, determined by `nLevels` above which `padOp` will
-  /// be hoisted.
-  scf::ForOp outermostEnclosingForOp;
+  SmallVector<Value> getPackedTensorSizes(RewriterBase &rewriter,
+                                          Location loc) const;
 
-  /// Backward slice rooted at `padOp` and nested under
-  /// `outermostEnclosingForOp`.
-  SetVector<Operation *> backwardSlice;
+  /// Performs optional hoisting to enable hoist padding to occur. This may be
+  /// necessary when `sliceOp` is not defined outside of the outermost
+  /// enclosing loop we want to hoist above.
+  ///
+  /// Example:
+  /// ```
+  ///    %source = linalg.fill(%cst, %arg0)
+  ///    // %source is available for packing here!
+  ///    scf.for %i
+  ///      scf.for %j
+  ///        scf.for %k
+  ///          %slice = tensor.extract_slice %source [%i, %j]
+  ///          %padded_slice = tensor.pad %slice
+  /// ```
+  void enableHoistPadding(RewriterBase &rewriter);
 
-  /// The scf::ForOp immediately enclosing `padOp` such that:
-  ///   1. they are nested under `outermostEnclosingForOp` (inclusive)
-  ///   2. whose induction variable is used, directly or indirectly, in the
-  ///      computation of `padOp`.
-  /// The span of these loops determines the footprint of the packed tensor.
-  SmallVector<scf::ForOp> packingLoops;
+  /// Common analysis builder to finalize the construction of the analysis
+  /// once optional `enableHoistPadding` has run.
+  /// `reverseEnclosingLoops.back()` is the loop to hoist above.
+  void finalizeHoistPaddingAnalysis();
+
+private:
+  /// Encodes whether the analysis is valid and hoisting can proceed.
+  bool valid;
 
-  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
-  tensor::ExtractSliceOp sliceOp;
+  /// The padOp to hoist.
+  tensor::PadOp opToHoist;
 
-  /// If non-empty, this is the unique scf::ForOp that consumes the `sliceOp`.
-  scf::ForOp padConsumingForOp;
+  /// Immediately enclosing loops considered for hoisting padding.
+  SmallVector<scf::ForOp> reverseEnclosingLoops;
 
-private:
   /// Drop any non-index dependencies of `padOp` and `sliceOp` from
   /// `backwardSlice`. The method follows the use-def chains of the index
   /// operands consumed by `padOp` and `sliceOp` and drops the operations
@@ -130,96 +211,79 @@
   /// ```
   /// dropNonIndexDependencies(%padded_slice, %slice)
   /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
-  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp);
+  LogicalResult dropNonIndexDependencies();
 
-  /// Encodes whether the analysis is valid and hoisting can proceed.
-  bool valid;
-};
+public:
+  /// The outermost loop, determined by `nLevels` above which `padOp` will
+  /// be hoisted.
+  scf::ForOp outermostEnclosingForOp;
 
-/// Return at most nLevels of immediately enclosing scf::ForOp loops.
-/// Stops at the first parent that is not an scf::ForOp.
-/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
-/// Control-flow and other containing ops with regions are not modeled atm.
-static void
-getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
-                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
-  scf::ForOp outermostEnclosingForOp = nullptr;
-  Operation *nextEnclosingOp = padOp->getParentOp();
-  while (nLevels-- > 0 &&
-         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
-    LLVM_DEBUG(DBGS() << "loops: ";
-               debugPrintLoopInShortForm(outermostEnclosingForOp);
-               dbgs() << "\n");
-    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
-    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
-  }
-}
+  /// Backward slice rooted at `padOp` and nested under
+  /// `outermostEnclosingForOp`.
+  SetVector<Operation *> backwardSlice;
 
-// Get all the ops in the backwards slice starting from `padOp` and that
-// are dominated by the outermost enclosing loop.
-// This also requires tracking ops defining values used in the region but
-// defined above.
-static void computeBackwardSlice(tensor::PadOp padOp,
-                                 scf::ForOp outermostEnclosingForOp,
-                                 SetVector<Operation *> &backwardSlice) {
-  DominanceInfo domInfo(outermostEnclosingForOp);
-  auto filter = [&](Operation *op) {
-    return domInfo.dominates(outermostEnclosingForOp, op) &&
-           !padOp->isProperAncestor(op);
-  };
-  // First, add the ops required to compute the region to the backwardSlice.
-  SetVector<Value> valuesDefinedAbove;
-  getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(),
-                            valuesDefinedAbove);
-  for (Value v : valuesDefinedAbove) {
-    getBackwardSlice(v, &backwardSlice, filter, /*inclusive=*/true);
-  }
-  // Then, add the backward slice from padOp itself.
-  getBackwardSlice(padOp.getOperation(), &backwardSlice, filter,
-                   /*inclusive=*/true);
-}
+  /// The scf::ForOp immediately enclosing `padOp` such that:
+  ///   1. they are nested under `outermostEnclosingForOp` (inclusive)
+  ///   2. whose induction variable is used, directly or indirectly, in the
+  ///      computation of `padOp`.
+  /// The span of these loops determines the footprint of the packed tensor.
+  SmallVector<scf::ForOp> packingLoops;
+
+  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
+  tensor::ExtractSliceOp sliceOp;
+
+  /// If non-empty, this is the unique scf::ForOp that consumes the `sliceOp`.
+  scf::ForOp padConsumingForOp;
+};
 
-HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
-  valid = false;
+} // namespace
 
+HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops)
+    : valid(false), opToHoist(padOp) {
   // Get at most `numLoops` of immediately enclosing loops.
-  SmallVector<scf::ForOp> reverseEnclosingLoops;
-  getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
+  getAtMostNEnclosingLoops(opToHoist, numLoops, reverseEnclosingLoops);
   if (reverseEnclosingLoops.empty()) {
     LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
     return;
   }
 
-  outermostEnclosingForOp = reverseEnclosingLoops.back();
-
-  // Get the `sliceOp` that defines the source tensor of `padOp` and
-  // check its source is defined outside of the outermost loop. This check
-  // ensures the padded data is available for packing before entering the
-  // outermost enclosing loop.
-  //
-  // Example:
-  // ```
-  // %source = linalg.fill(%cst, %arg0)
-  // // %source is available for packing here!
-  // scf.for %i
-  //   scf.for %j
-  //     scf.for %k
-  //       %slice = tensor.extract_slice %source [%i, %j]
-  //       %padded_slice = tensor.pad %slice
-  // ```
-  sliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
+  outermostEnclosingForOp = reverseEnclosingLoops.back();
+  sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
   if (!sliceOp) {
     LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
     return;
   }
+}
+
+HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp,
+                                           scf::ForOp outermostEnclosingForOp)
+    : valid(false), opToHoist(padOp) {
+  // Get enclosing loops until outermostEnclosingForOp.
+  getEnclosingLoopsUntil(padOp, outermostEnclosingForOp,
+                         reverseEnclosingLoops);
+  if (reverseEnclosingLoops.empty()) {
+    LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
+    return;
+  }
+  this->outermostEnclosingForOp = reverseEnclosingLoops.back();
+  if (this->outermostEnclosingForOp != outermostEnclosingForOp) {
+    LLVM_DEBUG(DBGS() << "--Unexpected outermost enclosing loop -> Skip\n");
+    return;
+  }
+}
+
+void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) {
   // If the padded data is not yet available before entering the outermost
   // enclosing loop, try to apply hoisting on this outermost loop.
-  // TODO: we may want finer-grained hoisting of only that particular `sliceOp`.
-  IRRewriter rewriter(outermostEnclosingForOp->getContext());
+  // TODO: we may want finer-grained hoisting of only that particular
+  // `sliceOp`.
   if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
     outermostEnclosingForOp =
         hoistRedundantSubsetExtractInsert(rewriter, outermostEnclosingForOp);
   }
+}
+
+void HoistPaddingAnalysis::finalizeHoistPaddingAnalysis() {
   if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
     LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n"
                       << outermostEnclosingForOp << "\n"
@@ -236,14 +300,14 @@
   // Check the region of `padOp` depends on a constant only. Adding hoisting
   // support for arbitrary padding regions would require cloning all
   // dependencies captured by the padding region.
-  Value paddingValue = padOp.getConstantPaddingValue();
+  Value paddingValue = opToHoist.getConstantPaddingValue();
   if (!paddingValue ||
       !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
     LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n");
     return;
   }
 
-  computeBackwardSlice(padOp, outermostEnclosingForOp, backwardSlice);
+  computeBackwardSlice(opToHoist, outermostEnclosingForOp, backwardSlice);
   if (backwardSlice.size() <= 1)
     return;
 
@@ -251,7 +315,7 @@
   // Remove all ops in the backward slice that are not used to index
   // the padded tensor. In particular, keep `padOp`, `sliceOp`, and
   // the loop and affine operations used for the index computation.
-  if (failed(dropNonIndexDependencies(padOp))) {
+  if (failed(dropNonIndexDependencies())) {
     LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n");
     return;
   }
@@ -281,7 +345,8 @@
   valid = true;
 }
 
-LogicalResult HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp) {
+LogicalResult
+HoistPaddingAnalysis::dropNonIndexDependencies() {
   // Set of all values used for index computation.
   SetVector<Value> indexEdges;
 
@@ -300,7 +365,7 @@
     });
   };
 
-  // Starting from `padOp` and `sliceOp` walk the use-def edges of index
+  // Starting from `opToHoist` and `sliceOp` walk the use-def edges of index
   // type in `backwardSlice`. Add the index operands of an operation to
   // `indexEdges` and remove all operations from `backwardSlice` that are not
   // part of the index computation.
@@ -322,9 +387,9 @@
   //      backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
   SetVector<Operation *> operationsToRemove;
   for (Operation *op : llvm::reverse(backwardSlice)) {
-    // Add the index operands of `padOp` and `sliceOp` to start the
+    // Add the index operands of `opToHoist` and `sliceOp` to start the
     // exploration of the index computation.
-    if (op == padOp || op == sliceOp) {
+    if (op == opToHoist || op == sliceOp) {
       addIndexOperandsToIndexEdges(op);
       continue;
     }
@@ -358,7 +423,7 @@
       continue;
     }
     // Remove all other operations not used by the index computation. An
-    // exception are constant operations that may be used by `padOp`.
+    // exception are constant operations that may be used by `opToHoist`.
     if (!isa<arith::ConstantOp>(op))
       operationsToRemove.insert(op);
   }
@@ -367,7 +432,8 @@
 }
 
 SmallVector<Value>
-HoistingAnalysis::getPackedTensorSizes(RewriterBase &rewriter, Location loc) {
+HoistPaddingAnalysis::getPackedTensorSizes(RewriterBase &rewriter,
+                                           Location loc) const {
   SmallVector<Value> dynamicTensorSizes;
 
   // Upper bound the packing loop lengths to size the packed tensor. Taking
@@ -403,6 +469,10 @@
   return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
 }
 
+//===----------------------------------------------------------------------===//
+// buildPackingLoopNest Implementation.
+//===----------------------------------------------------------------------===//
+
 /// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
 /// The returned Value is guaranteed not to depend on any loop comprised in
 /// [`outer`, `forOp`].
@@ -423,12 +493,6 @@
       loc, (iv - lb).ceilDiv(step), ValueRange{ivVal, lbVal, stepVal});
 }
 
-struct PackingLoopNestResult {
-  SmallVector<OpFoldResult> offsets, sizes, strides;
-  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
-  GenericOp maybeTransposeOp;
-};
-
 // Build a packing loop nest by iteratively traversing the backward slice and
 // clone the operations, iteratively stepping into the loops that we encounter.
 // The implementation proceeds in a stack-like fashion:
@@ -439,10 +503,10 @@
 //   3. At the innermost loop level, create a InsertSliceOp.
 //   4. Iteratively pop and yield the result of the InsertSliceOp across the
 //      cloned loops.
-static PackingLoopNestResult buildPackingLoopNest(
+static PackingLoopNestResult buildPackingLoopNestImpl(
     RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
     ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
-    tensor::EmptyOp emptyOp, const HoistingAnalysis &analysis) {
+    tensor::EmptyOp emptyOp, const HoistPaddingAnalysis &analysis) {
   SmallVector<OpFoldResult> offsets, sizes, strides;
   SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
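For intuition, the nest produced by `buildPackingLoopNestImpl` for a single packing loop and no transpose looks roughly as follows. This is a loose sketch in the style of the in-file IR examples, with shapes borrowed from the `pad_and_hoist_lhs` test added further down; it is not verbatim output:

```mlir
%empty = tensor.empty() : tensor<5x5x12xf32>
%packed = scf.for %i ... iter_args(%iter = %empty) -> (tensor<5x5x12xf32>) {
  // Cloned backward slice computing the slice to pad.
  %slice = tensor.extract_slice %source [%i, 0] ... to tensor<?x12xf32>
  %padded = tensor.pad %slice ... : tensor<?x12xf32> to tensor<5x12xf32>
  // Leading offset is the iteration number of %i, i.e. (iv - lb).ceilDiv(step).
  %inserted = tensor.insert_slice %padded into %iter[%iteration, 0, 0]
      [1, 5, 12] [1, 1, 1] : tensor<5x12xf32> into tensor<5x5x12xf32>
  scf.yield %inserted : tensor<5x5x12xf32>
}
```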
@@ -552,6 +616,73 @@
                                  maybeTransposeOp};
 }
 
+/// Build the packing loop nest required to hoist `opToHoist` above
+/// `outermostEnclosingForOp`.
+/// The loop nest is built just before `outermostEnclosingForOp`.
+static FailureOr<PackingLoopNestResult> buildPackingLoopNestImpl(
+    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
+    ArrayRef<int64_t> transposeVector, const HoistPaddingAnalysis &analysis) {
+  // Update actual number of loops, which may be smaller.
+  int nPackedLoops = analysis.packingLoops.size();
+  LLVM_DEBUG(DBGS() << "\n";
+             DBGS() << "Func:\n"
+                    << *opToHoist->getParentOfType<func::FuncOp>() << "\n";
+             DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");
+
+  Location loc = opToHoist->getLoc();
+  RankedTensorType paddedTensorType = opToHoist.getResultType();
+
+  // Compute the type of the transposed padded tensor.
+  FailureOr<RankedTensorType> transposedTensorType =
+      tensor::computeTransposedType(paddedTensorType, transposeVector);
+  if (failed(transposedTensorType)) {
+    LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
+    return failure();
+  }
+
+  // Create the packed tensor.
+  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
+  // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
+  llvm::append_range(packedShape, transposedTensorType->getShape());
+  auto packedTensorType = RankedTensorType::get(
+      packedShape, transposedTensorType->getElementType());
+
+  // Set the insertion point right before the outer loop and start packing.
+  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(outerLoop);
+  SmallVector<Value> dynamicTensorSizes =
+      analysis.getPackedTensorSizes(rewriter, loc);
+  auto emptyOp = rewriter.create<tensor::EmptyOp>(
+      loc, packedTensorType.getShape(), packedTensorType.getElementType(),
+      dynamicTensorSizes);
+
+  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
+                                  *transposedTensorType, emptyOp, analysis);
+}
+
+/// Build the packing loop nest required to hoist `opToHoist` above
+/// `outermostEnclosingForOp`.
+/// The loop nest is built just before `outermostEnclosingForOp`.
+FailureOr<PackingLoopNestResult> mlir::linalg::detail::buildPackingLoopNest(
+    RewriterBase &rewriter, tensor::PadOp opToHoist,
+    scf::ForOp outermostEnclosingForOp, ArrayRef<int64_t> transposeVector) {
+  HoistPaddingAnalysis analysis(opToHoist, outermostEnclosingForOp);
+  analysis.enableHoistPadding(rewriter);
+  analysis.finalizeHoistPaddingAnalysis();
+  if (!analysis.isValid()) {
+    LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
+    return failure();
+  }
+  IRMapping bvm;
+  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
+                                  analysis);
+}
+
+//===----------------------------------------------------------------------===//
+// hoistPaddingOnTensors Implementation.
+//===----------------------------------------------------------------------===//
+
 // If the original consumer of `sliceOp` was a `forOp` (i.e. through an iter
 // arg), propagate the `packedTensor` value through the same iter arg.
 // TODO: for multiple loops we need to track the use to the innermost loop.
@@ -574,6 +705,7 @@
   std::optional<unsigned> operandNumber =
       forOp.getIterArgNumberForOpOperand(*pUse);
   assert(operandNumber.has_value() && "expected a proper iter arg number");
+
   SmallVector<Value> initArgs = forOp.getInitArgs();
   initArgs[operandNumber.value()] = casted;
   rewriter.startRootUpdate(forOp);
@@ -586,7 +718,7 @@
 /// replacement for `opToHoist` in callers.
 static Value replaceByPackingLoopNestResult(
     RewriterBase &rewriter, const IRMapping &bvm, tensor::PadOp opToHoist,
-    RankedTensorType transposedTensorType, const HoistingAnalysis &analysis,
+    RankedTensorType transposedTensorType, const HoistPaddingAnalysis &analysis,
     const PackingLoopNestResult &packingResult) {
   // The replacement occurs under a single insertion point within the original
   // loop, just before opToHoist.
@@ -657,59 +789,38 @@
                                  SmallVectorImpl<GenericOp> &transposeOps) {
   LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
              DBGS() << " by " << numLoops << " loops\n");
-  HoistingAnalysis analysis(opToHoist, numLoops);
+
+  HoistPaddingAnalysis analysis(opToHoist, numLoops);
+  analysis.enableHoistPadding(rewriter);
+  analysis.finalizeHoistPaddingAnalysis();
   if (!analysis.isValid()) {
     LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
     return failure();
   }
 
-  // Update actual number of loops, which may be smaller.
-  int nPackedLoops = analysis.packingLoops.size();
-  LLVM_DEBUG(DBGS() << "\n";
-             DBGS() << "Func:\n"
-                    << *opToHoist->getParentOfType<func::FuncOp>() << "\n";
-             DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");
-
-  Location loc = opToHoist->getLoc();
-  RankedTensorType paddedTensorType = opToHoist.getResultType();
-
-  // Compute the type of the transposed padded tensor.
-  FailureOr<RankedTensorType> transposedTensorType =
-      tensor::computeTransposedType(paddedTensorType, transposeVector);
-  if (failed(transposedTensorType)) {
-    LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
+  /// Construct the packing loop nest.
+  IRMapping bvm;
+  FailureOr<PackingLoopNestResult> packingResult = buildPackingLoopNestImpl(
+      rewriter, bvm, opToHoist, transposeVector, analysis);
+  if (failed(packingResult)) {
+    LLVM_DEBUG(DBGS() << "--buildPackingLoopNestImpl failed -> Skip\n");
     return failure();
   }
 
-  // Create the packed tensor.
-  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
-  // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
-  llvm::append_range(packedShape, transposedTensorType->getShape());
-  auto packedTensorType = RankedTensorType::get(
-      packedShape, transposedTensorType->getElementType());
-
-  // Set the insertion point right before the outer loop and start packing.
-  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
-  OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPoint(outerLoop);
-  SmallVector<Value> dynamicTensorSizes =
-      analysis.getPackedTensorSizes(rewriter, loc);
-  auto emptyOp = rewriter.create<tensor::EmptyOp>(
-      loc, packedTensorType.getShape(), packedTensorType.getElementType(),
-      dynamicTensorSizes);
-
-  /// Construct the packing loop nest.
-  IRMapping bvm;
-  PackingLoopNestResult packingResult =
-      buildPackingLoopNest(rewriter, bvm, opToHoist, transposeVector,
-                           *transposedTensorType, emptyOp, analysis);
   if (!transposeVector.empty())
-    transposeOps.push_back(packingResult.maybeTransposeOp);
+    transposeOps.push_back(packingResult->maybeTransposeOp);
 
   // Now the packed tensor is ready, replace the original padding op by a
   // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
+  auto outerPackedLoop =
+      scf::getForInductionVarOwner(packingResult->clonedLoopIvs.front());
+  auto packedTensorType =
+      outerPackedLoop.getResult(0).getType().cast<RankedTensorType>();
   Value newResult = replaceByPackingLoopNestResult(
-      rewriter, bvm, opToHoist, *transposedTensorType, analysis, packingResult);
+      rewriter, bvm, opToHoist, packedTensorType, analysis, *packingResult);
+
+  Location loc = opToHoist->getLoc();
+  RankedTensorType paddedTensorType = opToHoist.getResultType();
   if (!transposeVector.empty()) {
     OpBuilder::InsertionGuard g(rewriter);
     rewriter.setInsertionPointAfter(newResult.getDefiningOp());
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -87,6 +87,7 @@
                                ArrayRef<int64_t> transposeVector) {
   if (transposeVector.empty())
     return rankedTensorType;
+
   if (!isPermutationVector(transposeVector) ||
       transposeVector.size() !=
           static_cast<size_t>(rankedTensorType.getRank()))
    return failure();
diff --git a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
@@ -186,8 +186,201 @@
 }
 
   %pad = transform.get_producer_of_operand %matmul_padded[2]
-    : (!pdl.operation) -> !transform.op<"tensor.pad">
+    : (!pdl.operation) -> !pdl.operation
 
-  transform.structured.hoist_pad %pad by 1 loops
-    : (!transform.op<"tensor.pad">) -> !pdl.operation
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1#1
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
 }
+
+// -----
+
+func.func @pad_and_hoist_rhs(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // expected-note @below {{payload operation}}
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  // In this case, the pad op is actually empty: we only tile the first
+  // dimension and it does not have an impact on the RHS operand.
+  // expected-error @below {{incompatible payload operation name}}
+  %pad = transform.get_producer_of_operand %matmul_padded[1]
+    : (!pdl.operation) -> !transform.op<"tensor.pad">
+
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1
+    : (!transform.op<"tensor.pad">, !pdl.operation) -> !pdl.operation
+}
+
+// -----
+
+func.func @pad_and_hoist_init(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // expected-note @below {{when applied to this op}}
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  %pad = transform.get_producer_of_operand %matmul_padded[2]
+    : (!pdl.operation) -> !pdl.operation
+
+  // We do not know yet how to hoist the init.
+  // expected-error @below {{could not build packing loop nest}}
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
+}
+
+// -----
+
+// CHECK-LABEL: pad_and_hoist_lhs
+func.func @pad_and_hoist_lhs(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // CHECK: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor<5x5x12xf32>) {
+  // CHECK:   tensor.pad %{{.*}}
+  // CHECK:     : tensor<?x12xf32> to tensor<5x12xf32>
+  // CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 5, 12] [1, 1, 1]
+  // CHECK-SAME:   : tensor<5x12xf32> into tensor<5x5x12xf32>
+  // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) {
+  // CHECK:   %[[PADDED:.*]] = tensor.extract_slice %[[PACKED]][%{{.*}}, 0, 0] [1, 5, 12] [1, 1, 1]
+  // CHECK-SAME:   : tensor<5x5x12xf32> to tensor<5x12xf32>
+  // CHECK:   linalg.matmul ins(%[[PADDED]]
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  %pad = transform.get_producer_of_operand %matmul_padded[0]
+    : (!pdl.operation) -> !pdl.operation
+
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
+}
+
+// -----
+
+// CHECK-LABEL: pad_and_hoist_lhs_transpose
+func.func @pad_and_hoist_lhs_transpose(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // CHECK: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor<5x12x5xf32>) {
+  // CHECK:   tensor.pad %{{.*}}
+  // CHECK:     : tensor<?x12xf32> to tensor<5x12xf32>
+  // CHECK:   linalg.generic
+  // CHECK:     -> tensor<12x5xf32>
+  // CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1]
+  // CHECK-SAME:   : tensor<12x5xf32> into tensor<5x12x5xf32>
+  // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) {
+  // CHECK:   %[[PADDED:.*]] = tensor.extract_slice %[[PACKED]][%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1]
+  // CHECK-SAME:   : tensor<5x12x5xf32> to tensor<12x5xf32>
+  // CHECK:   %[[TRANSPOSED:.*]] = linalg.generic
+  // CHECK:     -> tensor<5x12xf32>
+  // CHECK:   linalg.matmul ins(%[[TRANSPOSED]]
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  %pad = transform.get_producer_of_operand %matmul_padded[0]
+    : (!pdl.operation) -> !pdl.operation
+
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1, transpose by [1, 0]
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
+}
+
+// -----
+
+// CHECK-LABEL: pad_and_hoist_init
+func.func @pad_and_hoist_init(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) {
+  // CHECK:   %[[PADDED:.*]] = tensor.pad %{{.*}}
+  // CHECK:     : tensor<?x25xf32> to tensor<5x25xf32>
+  // CHECK:   scf.for %{{.*}} iter_args(%[[INNER_PADDED:[0-9a-zA-Z]*]] = %[[PADDED]]) -> (tensor<5x25xf32>)
+  // CHECK:     %[[RES:.*]] = linalg.matmul {{.*}} outs(%[[INNER_PADDED]]
+  // CHECK-SAME:   : tensor<5x25xf32>
+  // CHECK:     scf.yield %[[RES]] : tensor<5x25xf32>
+  // CHECK:   %[[CAST:.*]] = tensor.cast %{{.*}} : tensor<5x25xf32> to tensor<?x25xf32>
+  // CHECK:   tensor.insert_slice %[[CAST]] into %{{.*}}[%{{.*}}, 0] [%{{.*}}, 25] [1, 1]
+  // CHECK-SAME:   : tensor<?x25xf32> into tensor<24x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1:2 = transform.structured.tile_to_scf_for %matmul [5, 0, 7]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  %pad = transform.get_producer_of_operand %matmul_padded[2]
+    : (!pdl.operation) -> !pdl.operation
+
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1#1
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
+}
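A final note on the `transpose` attribute, since both the verifier added in LinalgTransformOps.cpp and `tensor::computeTransposedType` enforce it: the array must be a permutation of the padded tensor's rank. Illustrative sketch with placeholder handles:

```mlir
// OK: [1, 0] is a permutation; tensor<5x12xf32> is transposed to
// tensor<12x5xf32> by a linalg.generic before insertion into the packed
// tensor, as checked in pad_and_hoist_lhs_transpose above.
transform.structured.hoist_pad.build_packing_loop_nest %pad above %loop, transpose by [1, 0]
  : (!pdl.operation, !pdl.operation) -> !pdl.operation

// Rejected by the verifier: "expects transpose to be a permutation".
transform.structured.hoist_pad.build_packing_loop_nest %pad above %loop, transpose by [0, 0]
  : (!pdl.operation, !pdl.operation) -> !pdl.operation
```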