diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -411,6 +411,12 @@
 /// }
 /// ```
 FailureOr<Value>
+hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist,
+                      int64_t numLoops, ArrayRef<int64_t> transposeVector,
+                      tensor::PadOp &hoistedOp,
+                      SmallVectorImpl<GenericOp> &transposeOps);
+/// Calls into `hoistPaddingOnTensors` with a local IRRewriter.
+FailureOr<Value>
 hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
                       ArrayRef<int64_t> transposeVector,
                       tensor::PadOp &hoistedOp,
diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
--- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
@@ -31,6 +31,12 @@
 SmallVector<Value> createDimValues(OpBuilder &b, Location loc,
                                    Value rankedTensor);
 
+/// Returns the transposed `rankedTensorType` if `transposeVector` is
+/// non-empty. Fail if `transposeVector` is not a permutation matching the
+/// tensor rank.
+FailureOr<RankedTensorType>
+computeTransposedType(RankedTensorType rankedTensorType,
+                      ArrayRef<int64_t> transposeVector);
+
 } // namespace tensor
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1779,12 +1779,12 @@
 transform::HoistPadOp::applyToOne(tensor::PadOp target,
                                   transform::ApplyToEachResultList &results,
                                   transform::TransformState &state) {
-  IRRewriter rewriter(target->getContext());
   tensor::PadOp hoistedPadOp;
   SmallVector<GenericOp> transposeOps;
-  // TODO: Pass rewriter down to hoistPaddingOnTensors, in a followup commit.
-  FailureOr<Value> result = hoistPaddingOnTensors(
-      target, getNumLoops(), getTranspose(), hoistedPadOp, transposeOps);
+  IRRewriter rewriter(target->getContext());
+  FailureOr<Value> result =
+      hoistPaddingOnTensors(rewriter, target, getNumLoops(), getTranspose(),
+                            hoistedPadOp, transposeOps);
   if (succeeded(result)) {
     // We need to perform our own replacement here because this API is still
     // used in patterns that "pad and hoist", for which the replacement values
diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -14,20 +14,15 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
-#include "mlir/Dialect/SCF/Utils/Utils.h"
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Tensor/Utils/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
 #include "mlir/IR/AsmState.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dominance.h"
-
 #include "mlir/IR/Matchers.h"
 #include "mlir/Transforms/RegionUtils.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
 
 using llvm::dbgs;
@@ -39,6 +34,31 @@
 using namespace mlir;
 using namespace mlir::linalg;
 
+static bool debugPrintLoopInShortForm(Operation *op) {
+  AsmState state(op->getParentOfType<func::FuncOp>());
+  (void)state;
+  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+    forOp.getInductionVar().printAsOperand(dbgs(), state);
+    dbgs() << " @ " << forOp.getOperation();
+    return true;
+  }
+  return false;
+}
+
+static void debugPrintBackwardSlice(SetVector<Operation *> &backwardSlice) {
+  LLVM_DEBUG(llvm::interleaveComma(backwardSlice, DBGS() << "--backwardSlice:",
+                                   [](Operation *op) {
+                                     dbgs() << "\n";
+                                     DBGS() << "----";
+                                     if (debugPrintLoopInShortForm(op)) {
+                                       dbgs() << "\n";
+                                       return;
+                                     }
+                                     dbgs() << *op << "\n";
+                                   });
+             DBGS() << "\n";);
+}
+
 /// Analysis class to support tensor::PadOp hoisting across multiple enclosing
 /// loops. The failure conditions are:
 ///   1. Pad op has a use that is not an input of a LinalgOp.
@@ -60,7 +80,7 @@
   bool isValid() { return valid; }
 
   /// Footprint of the packedTensor, computed from the packingLoops.
-  SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);
+  SmallVector<Value> getPackedTensorSizes(RewriterBase &b, Location loc);
 
   /// The outermost loop, determined by `nLevels` above which `padOp` will
   /// be hoisted.
@@ -77,22 +97,27 @@
   /// The span of these loops determines the footprint of the packed tensor.
   SmallVector<scf::ForOp> packingLoops;
 
+  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
+  tensor::ExtractSliceOp sliceOp;
+
 private:
   /// Drop any non-index dependencies of `padOp` and `sliceOp` from
   /// `backwardSlice`. The method follows the use-def chains of the index
   /// operands consumed by `padOp` and `sliceOp` and drops the operations
   /// not part of this index computation. Afterwards, the filtered
-  /// `backwardSlice` contains only the loops whose induction variable is used,
-  /// directly or indirectly, to index the padded tensor. The method returns
-  /// failure if the filtered backward slice contains an unexpected operation.
+  /// `backwardSlice` contains only the loops whose induction variable is
+  /// used, directly or indirectly, to index the padded tensor. The method
+  /// returns failure if the filtered backward slice contains an unexpected
+  /// operation.
   ///
   /// Example:
   /// ```
   /// %source = linalg.fill(%cst, %arg0)
   /// scf.for %i
-  ///   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
-  ///   scf.for %j (%arg2 = %unrelated)
-  ///     scf.for %k                             // not used to index %source!
+  ///   %unrelated = linalg.fill(%cst, %arg1)  // not used to index %source!
+  ///   scf.for %j (%arg2 = %unrelated)
+  ///     scf.for %k                           // not used to index %source!
   ///       %ubi = affine.min #map(%i)
   ///       %ubj = affine.min #map(%j)
   ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
@@ -100,29 +125,12 @@
   /// ```
   /// dropNonIndexDependencies(%padded_slice, %slice)
   /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
-  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
-                                         tensor::ExtractSliceOp sliceOp);
+  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp);
 
   /// Encodes whether the analysis is valid and hoisting can proceed.
   bool valid;
 };
 
-/// Return true if all uses of `padOp` are an input tensor of some
-/// LinalgOp.
-static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp) {
-  for (OpOperand &use : padOp.getResult().getUses()) {
-    auto linalgUser = dyn_cast<LinalgOp>(use.getOwner());
-    if (!linalgUser || !linalgUser.isDpsInput(&use)) {
-      LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
-                        << "\nthat is not an input tensor of a LinalgOp, "
-                        << "cannot hoist\n"
-                        << *(use.getOwner()) << "\n");
-      return false;
-    }
-  }
-  return true;
-}
-
 /// Return at most nLevels of immediately enclosing scf::ForOp loops.
 /// Stops at the first parent that is not an scf::ForOp.
 /// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
@@ -130,42 +138,18 @@
 static void
 getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
                          SmallVector<scf::ForOp> &reverseEnclosingLoops) {
-  AsmState state(padOp->getParentOfType<func::FuncOp>());
-  (void)state;
   scf::ForOp outermostEnclosingForOp = nullptr;
   Operation *nextEnclosingOp = padOp->getParentOp();
   while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
-    LLVM_DEBUG(
-        DBGS() << "loops: ";
-        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
-        dbgs() << "\n");
+    LLVM_DEBUG(DBGS() << "loops: ";
+               debugPrintLoopInShortForm(outermostEnclosingForOp);
+               dbgs() << "\n");
     reverseEnclosingLoops.push_back(outermostEnclosingForOp);
     nextEnclosingOp = outermostEnclosingForOp->getParentOp();
   }
 }
 
-/// Returns the transposed `rankedTensorType` if `transposeVector` is non-empty.
-/// Fail if `transposeVector` is no permutation matching the tensor rank.
-static FailureOr<RankedTensorType>
-computeTransposedType(RankedTensorType rankedTensorType,
-                      ArrayRef<int64_t> transposeVector) {
-  if (transposeVector.empty())
-    return rankedTensorType;
-  if (!isPermutationVector(transposeVector) ||
-      transposeVector.size() !=
-          static_cast<size_t>(rankedTensorType.getRank()))
-    return failure();
-
-  SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
-                                       rankedTensorType.getShape().end());
-  applyPermutationToVector(transposedShape, transposeVector);
-
-  using RTTBuilder = RankedTensorType::Builder;
-  RankedTensorType transposedTensorType =
-      RTTBuilder(rankedTensorType).setShape(transposedShape);
-  return transposedTensorType;
-}
-
 // Get all the ops in the backwards slice starting from `padOp` and that
 // are dominated by the outermost enclosing loop.
 // This also requires tracking ops defining values used in the region but
@@ -193,16 +177,11 @@
 HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
   valid = false;
 
-  // Bail on any use that isn't an input of a LinalgOp.
-  // Hoisting of inplace updates happens after vectorization.
-  if (!isOnlyUsedAsInputOfLinalgOp(padOp))
-    return;
-
   // Get at most `numLoops` of immediately enclosing loops.
   SmallVector<scf::ForOp> reverseEnclosingLoops;
   getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
   if (reverseEnclosingLoops.empty()) {
-    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
+    LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
     return;
   }
 
@@ -223,13 +202,26 @@
   //   %slice = tensor.extract_slice %source [%i, %j]
   //   %padded_slice = tensor.pad %slice
   // ```
-  auto sliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
+  sliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
   if (!sliceOp) {
-    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
+    LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
     return;
   }
 
+  // If the padded data is not yet available before entering the outermost
+  // enclosing loop, try to apply hoisting on this outermost loop.
+  // TODO: we may want finer-grained hoisting of only that particular
+  // `sliceOp`.
+  IRRewriter rewriter(outermostEnclosingForOp->getContext());
+  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
+    outermostEnclosingForOp =
+        hoistRedundantSubsetExtractInsert(rewriter, outermostEnclosingForOp);
+  }
   if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
-    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
+    LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n"
+                      << outermostEnclosingForOp << "\n"
+                      << "--sliceOp: " << sliceOp << "\n"
+                      << "--sliceOp.getSource(): " << sliceOp.getSource()
+                      << "\n");
+    LLVM_DEBUG(DBGS() << "----Source not defined outside of loops -> Skip\n");
     return;
   }
 
@@ -239,7 +231,7 @@
   Value paddingValue = padOp.getConstantPaddingValue();
   if (!paddingValue ||
       !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
-    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
+    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n");
     return;
   }
 
@@ -247,37 +239,39 @@
   if (backwardSlice.size() <= 1)
     return;
 
-  // Remove all ops in the backward slice that are not used to index the padded
-  // tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
-  // affine operations used for the index computation.
-  if (failed(dropNonIndexDependencies(padOp, sliceOp)))
+  debugPrintBackwardSlice(backwardSlice);
+  // Remove all ops in the backward slice that are not used to index
+  // the padded tensor. In particular, keep `padOp`, `sliceOp`, and
+  // the loop and affine operations used for the index computation.
+  if (failed(dropNonIndexDependencies(padOp))) {
+    LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n");
    return;
-
-  // Add only the loops part of the filtered `backwardSlice` to the packing
-  // loops. All other loops are not used to index the padded data and
-  // consequently access the same data in every loop iteration. Adding them to
-  // the packing loops would increase the cache footprint of the packed data
-  // by storing the same data multiple times.
+  }
+  debugPrintBackwardSlice(backwardSlice);
+
+  // Add only the loops part of the filtered `backwardSlice` to the
+  // packing loops.
+  // All other loops are not used to index the padded data and consequently
+  // access the same data in every loop iteration. Adding them to the packing
+  // loops would increase the cache footprint of the packed data by storing
+  // the same data multiple times.
   for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
     if (backwardSlice.contains(forOp))
       packingLoops.push_back(forOp);
-  if (packingLoops.empty()) {
-    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
-    return;
-  }
+
+  // Note: at this point, packing loops may be empty but we would still like
+  // to hoist the padding if so specified.
 
   // The analysis is valid and hoisting can occur.
   valid = true;
 }
 
-LogicalResult
-HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
-                                           tensor::ExtractSliceOp sliceOp) {
+LogicalResult HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp) {
   // Set of all values used for index computation.
   SetVector<Value> indexEdges;
 
-  // Add all index operands of `operation` to `indexEdges`. An index operand is
-  // an operand of type index.
+  // Add all index operands of `operation` to `indexEdges`. An index operand
+  // is an operand of type index.
   auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
     for (Value operand : operation->getOperands())
       if (operand.getType().isIndex())
@@ -327,15 +321,15 @@
         continue;
       }
     }
-    // Add the index operands of all other operations if at least one result is
-    // used for index computation.
+    // Add the index operands of all other operations if at least one result
+    // is used for index computation.
     if (hasIndexResult(op)) {
       addIndexOperandsToIndexEdges(op);
       // Check the operands of the remaining operations all have index type.
       if (llvm::any_of(op->getOperandTypes(),
                        [](Type type) { return !type.isIndex(); })) {
         LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
-                          << op << " -> skip\n");
+                          << op << " -> Skip\n");
         return failure();
       }
       // Check the remaining operations do not have regions or memory effects.
@@ -343,7 +337,7 @@
       bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
       if (hasMemoryEffect || op->getNumRegions() != 0) {
         LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
-                          << op << " -> skip\n");
+                          << op << " -> Skip\n");
         return failure();
       }
       continue;
@@ -358,30 +352,32 @@
 }
 
 SmallVector<Value>
-HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
+HoistingAnalysis::getPackedTensorSizes(RewriterBase &rewriter, Location loc) {
   SmallVector<Value> dynamicTensorSizes;
 
   // Upper bound the packing loop lengths to size the packed tensor. Taking
   // upper bounds can make the sizes of the packed tensor independent of the
   // enclosing loops. This independence is a prerequisite for reusing the same
-  // buffer for all enclosing loop iterations and hoisting its allocation out of
-  // the enclosing loops.
+  // buffer for all enclosing loop iterations and hoisting its allocation out
+  // of the enclosing loops.
   for (auto forOp : packingLoops) {
     // Compute an upper bound `ubVal` for the upper bound of `forOp`.
     AffineMap boundMap;
     SmallVector<Value> boundOperands;
     getUpperBoundForIndex(forOp.getUpperBound(), boundMap, boundOperands);
-    Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
+    Value ubVal =
+        rewriter.createOrFold<AffineMinOp>(loc, boundMap, boundOperands);
    // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
    // store the result to `dynamicTensorSizes`.
     // TODO: instead of using the lower bound of `forOp` directly, implement a
     // lower bound computation similar to the upper bound computation.
     AffineExpr lb, ub, step;
-    bindDims(b.getContext(), lb, ub);
-    bindSymbols(b.getContext(), step);
-    Value res = b.createOrFold<AffineApplyOp>(
-        (ub - lb).ceilDiv(step), ValueRange{forOp.getLowerBound(), ubVal,
-                                            cast<scf::ForOp>(forOp).getStep()});
+    bindDims(rewriter.getContext(), lb, ub);
+    bindSymbols(rewriter.getContext(), step);
+    Value res = rewriter.createOrFold<AffineApplyOp>(
+        loc, (ub - lb).ceilDiv(step),
+        ValueRange{forOp.getLowerBound(), ubVal,
+                   cast<scf::ForOp>(forOp).getStep()});
     dynamicTensorSizes.push_back(res);
   }
 
@@ -396,7 +392,7 @@
 /// The returned Value is guaranteed not to depend on any loop comprised in
 /// [`outer`, `forOp`].
 /// Return null if such a loop-independent quantity cannot be computed.
-static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
+static Value buildLoopIterationCount(RewriterBase &rewriter, scf::ForOp outer,
                                      scf::ForOp forOp) {
   MLIRContext *ctx = forOp->getContext();
   AffineExpr iv, lb, step;
@@ -408,84 +404,66 @@
   Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
         stepVal = forOp.getStep();
   auto loc = forOp->getLoc();
-  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
-                                       ValueRange{ivVal, lbVal, stepVal});
+  return rewriter.createOrFold<AffineApplyOp>(
+      loc, (iv - lb).ceilDiv(step), ValueRange{ivVal, lbVal, stepVal});
 }
 
-FailureOr<Value>
-mlir::linalg::hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
-                                    ArrayRef<int64_t> transposeVector,
-                                    tensor::PadOp &hoistedOp,
-                                    SmallVectorImpl<GenericOp> &transposeOps) {
-  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
-                    << " loops\n");
-  HoistingAnalysis analysis(opToHoist, numLoops);
-  if (!analysis.isValid()) {
-    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
-    return failure();
-  }
-
-  scf::ForOp outer = analysis.outermostEnclosingForOp;
-  ImplicitLocOpBuilder b(outer->getLoc(), outer);
+struct PackingLoopNestResult {
+  SmallVector<OpFoldResult> offsets, sizes, strides;
+  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
+  GenericOp maybeTransposeOp;
+};
 
-  SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);
+// Build a packing loop nest by iteratively traversing the backward slice and
+// clone the operations, iteratively stepping into the loops that we encounter.
+// The implementation proceeds in a stack-like fashion:
+//   1. Iteratively clone and step into the loops, pushing the `packedTensor`
+//      deeper in the stack.
+//   2. At the innermost loop level, create a GenericOp if `transposeVector`
+//      is non-empty.
+//   3. At the innermost loop level, create an InsertSliceOp.
+//   4. Iteratively pop and yield the result of the InsertSliceOp across the
+//      cloned loops.
+static PackingLoopNestResult buildPackingLoopNest(
+    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
+    ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
+    tensor::EmptyOp emptyOp, const HoistingAnalysis &analysis) {
+  SmallVector<OpFoldResult> offsets, sizes, strides;
+  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
 
-  // Update actual number of loops, which may be smaller.
-  int nPackedLoops = analysis.packingLoops.size();
+  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
 
   Location loc = opToHoist->getLoc();
   RankedTensorType paddedTensorType = opToHoist.getResultType();
   int paddedRank = paddedTensorType.getRank();
 
-  // Compute the type of the transposed padded tensor.
-  FailureOr<RankedTensorType> transposedTensorType =
-      computeTransposedType(paddedTensorType, transposeVector);
-  if (failed(transposedTensorType))
-    return failure();
-
-  // Create the packed tensor into which we amortize
-  // padding.
-  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
-  // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
-  // tensor.
-  llvm::append_range(packedShape, transposedTensorType->getShape());
-  auto packedTensorType = RankedTensorType::get(
-      packedShape, transposedTensorType->getElementType());
-  Value packedTensor = b.create<tensor::EmptyOp>(
-      loc, packedTensorType.getShape(), packedTensorType.getElementType(),
-      dynamicTensorSizes);
-
-  // Clone the operations involved in the backward slice, iteratively stepping
-  // into the loops that we encounter.
-  // The implementation proceeds in a stack-like fashion:
-  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
-  //      deeper in the stack.
-  //   2. Create a GenericOp if `transposeVector` is non-empty.
-  //   3. Create a InsertSliceOp at the top of the stack.
-  //   4. Iteratively pop and yield the result of the InsertSliceOp across
-  //      the cloned loops.
-  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
-  clonedLoopIvs.reserve(nPackedLoops);
-  leadingPackedTensorIndexings.reserve(nPackedLoops);
-  IRMapping bvm;
-  // Stack step 1. iteratively clone loops and push `packedTensor`.
+  Value packedTensor = emptyOp.getResult();
+  // Step 1. iteratively clone loops and push `packedTensor`.
+  OpBuilder::InsertionGuard g(rewriter);
   for (Operation *op : analysis.backwardSlice) {
-    // Specifically sit out in the extract_slice(packedTensor) case: this is the
-    // piece we seek to replace.
-    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
-      if (bvm.lookupOrDefault(sliceOp.getSource()) == packedTensor)
+    // Specifically sit out in the extract_slice(packedTensor) case: this is
+    // the piece we seek to replace.
+    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) {
+      if (bvm.lookupOrDefault(sliceOp.getSource()) == packedTensor) {
+        LLVM_DEBUG(DBGS() << "--Skip: " << sliceOp << "\n");
        continue;
-    // Clone all operations except it is a loop.
+      }
+    }
+
+    // Clone all operations except loops which require special handling.
     auto forOp = dyn_cast<scf::ForOp>(op);
     if (!forOp) {
-      b.clone(*op, bvm);
+      // We are at the right insertion point within the loop nest.
+      rewriter.clone(*op, bvm);
       continue;
     }
+
     // Create a packing loop that takes `packedTensor` as iteration argument.
-    auto clonedForOp = b.create<scf::ForOp>(
+    auto clonedForOp = rewriter.create<scf::ForOp>(
         loc, bvm.lookupOrDefault(forOp.getLowerBound()),
         bvm.lookupOrDefault(forOp.getUpperBound()),
         bvm.lookupOrDefault(forOp.getStep()), packedTensor);
+
     // Map the induction var, region args and results to the `clonedForOp`.
     bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
     bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
@@ -493,9 +471,11 @@
     assert(clonedForOp->getNumRegions() == 1);
     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
 
-    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
+    // Do not insert guard here, we get deeper into the loop nest.
+    rewriter.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
     Value loopIndependentIterationCount =
-        buildLoopIterationCount(b, outer, clonedForOp);
+        buildLoopIterationCount(rewriter, outerLoop, clonedForOp);
+
    // Assert the loop-independent iteration count can be computed.
     if (!loopIndependentIterationCount)
       llvm_unreachable("loop independence prerequisite not met");
@@ -503,75 +483,213 @@
     packedTensor = clonedForOp.getRegionIterArgs().front();
   }
 
+  // Step 2. Construct offsets, sizes and strides for the innermost level of
+  // the packing loop.
+  int64_t nPackedLoops = clonedLoopIvs.size();
   // offsets = [clonedLoopIvs, 0 .. 0].
-  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
-                                    leadingPackedTensorIndexings.end());
-  offsets.append(paddedRank, b.getIndexAttr(0));
+  offsets = SmallVector<OpFoldResult>{leadingPackedTensorIndexings.begin(),
+                                      leadingPackedTensorIndexings.end()};
+  offsets.append(paddedRank, rewriter.getIndexAttr(0));
   // sizes = [1 .. 1, transposedShape].
-  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
-  for (int64_t sz : transposedTensorType->getShape()) {
-    // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
+  sizes = SmallVector<OpFoldResult>(nPackedLoops, rewriter.getIndexAttr(1));
+  for (int64_t sz : transposedTensorType.getShape()) {
+    // TODO: go grab dims when needed, atm tensor::PadOp yields a static
    // tensor.
     assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
-    sizes.push_back(b.getIndexAttr(sz));
+    sizes.push_back(rewriter.getIndexAttr(sz));
   }
   // strides = [1 .. 1].
-  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
-                                    b.getIndexAttr(1));
+  strides = SmallVector<OpFoldResult>(nPackedLoops + paddedRank,
+                                      rewriter.getIndexAttr(1));
 
-  // Stack step 2. create GenericOp if `transposeVector` is non-empty.
+  // Step 3. Optionally transpose the padded tensor.
+  GenericOp maybeTransposeOp;
   Value paddedTensor = bvm.lookup(opToHoist.getResult());
   if (!transposeVector.empty()) {
-    Value outputTensor = b.create<tensor::ExtractSliceOp>(
-        loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
-    transposeOps.push_back(
-        makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector));
-    paddedTensor = transposeOps.back()->getResult(0);
+    Value outputTensor = rewriter.create<tensor::ExtractSliceOp>(
+        loc, transposedTensorType, packedTensor, offsets, sizes, strides);
+    maybeTransposeOp = makeTransposeOp(rewriter, loc, paddedTensor,
+                                       outputTensor, transposeVector);
+    paddedTensor = maybeTransposeOp.getResult(0);
   }
 
-  // Stack step 3. create InsertSliceOp at the top of the stack.
-  Value inserted = b.create<tensor::InsertSliceOp>(
+  // Step 4. Create InsertSliceOp at the innermost loop level, inserting an
+  // optionally transposed padded slice into the packed tensor.
+  Value inserted = rewriter.create<tensor::InsertSliceOp>(
       loc, paddedTensor, packedTensor, offsets, sizes, strides);
 
-  // Stack step 4. iteratively pop the stack and propagate the yield.
+  // Step 5. Iteratively pop the stack and propagate the yield.
   Value valueToYield = inserted;
   for (Value iv : llvm::reverse(clonedLoopIvs)) {
     auto forOp = scf::getForInductionVarOwner(iv);
-    b.setInsertionPointToEnd(&forOp.getRegion().front());
-    b.create<scf::YieldOp>(loc, valueToYield);
+    rewriter.setInsertionPointToEnd(&forOp.getRegion().front());
+    rewriter.create<scf::YieldOp>(loc, valueToYield);
     valueToYield = forOp.getResult(0);
   }
 
-  // Now the packed tensor is ready, replace the original padding op by a
-  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
-  b.setInsertionPoint(opToHoist);
-  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
-      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
-        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
-      }));
-  // Assert all loop iteration counts can be computed.
-  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
-    llvm_unreachable("loop independence prerequisite not met");
-  // offsets = [originalLoopIvs, 0 .. 0].
-  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
-  offsets.append(paddedRank, b.getIndexAttr(0));
-  // sizes = [1 .. 1, transposedShape] (defined above).
+  return PackingLoopNestResult{offsets,
+                               sizes,
+                               strides,
+                               clonedLoopIvs,
+                               leadingPackedTensorIndexings,
+                               maybeTransposeOp};
+}
+
+/// Produce a tensor extracted from the packingResult. This can be used as a
+/// replacement for `opToHoist` in callers.
+static Value replaceByPackingLoopNestResult(
+    RewriterBase &rewriter, const IRMapping &bvm, tensor::PadOp opToHoist,
+    RankedTensorType transposedTensorType, const HoistingAnalysis &analysis,
+    const PackingLoopNestResult &packingResult) {
+  // The replacement occurs under a single insertion point within the original
+  // loop, just before opToHoist.
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(opToHoist);
+
+  Location loc = opToHoist->getLoc();
+  RankedTensorType paddedTensorType = opToHoist.getResultType();
+  int paddedRank = paddedTensorType.getRank();
+
+  int64_t nPackedLoops = packingResult.clonedLoopIvs.size();
+  LLVM_DEBUG(DBGS() << "nPackedLoops: " << nPackedLoops << " loops\n");
+
+  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
+  ArrayRef<scf::ForOp> packingLoops = analysis.packingLoops;
+
+  Value packedTensor;
+  SmallVector<Value> loopIterationCounts;
+  SmallVector<OpFoldResult> offsets(nPackedLoops + paddedRank,
+                                    rewriter.getIndexAttr(0));
+  if (nPackedLoops > 0) {
+    loopIterationCounts =
+        llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
+          return buildLoopIterationCount(rewriter, outerLoop,
+                                         cast<scf::ForOp>(loop));
+        }));
+    // Assert all loop iteration counts can be computed.
+    if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
+      llvm_unreachable("loop independence prerequisite not met");
+
+    // offsets = [maybe_leading_ivs = originalLoopIvs, 0 .. 0].
+    std::copy(loopIterationCounts.begin(), loopIterationCounts.end(),
+              offsets.begin());
+    packedTensor =
+        scf::getForInductionVarOwner(packingResult.clonedLoopIvs.front())
+            ->getResult(0);
+  } else {
+    // If no loops were created, this is just hoisting without packing.
+    auto padOp =
+        cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp());
+    tensor::ExtractSliceOp sliceOp = analysis.sliceOp;
+    rewriter.startRootUpdate(padOp);
+    padOp.getSourceMutable().assign(sliceOp.getResult());
+    rewriter.finalizeRootUpdate(padOp);
+    packedTensor = padOp;
+  }
+
+  LLVM_DEBUG(DBGS() << "packedTensor: " << packedTensor << "\n");
+
+  // TODO: atm we are missing the plumbing of packedTensor through the loop
+  // bbarg when required (i.e. when hoisting init tensors).
+
+  // offsets = [maybe_leading_ivs, 0 .. 0].
+  // sizes = [1 .. 1, transposedShape] (defined above).
   // strides = [1 .. 1] (defined above)
-  packedTensor =
-      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
-  Value newResult = b.create<tensor::ExtractSliceOp>(
-      loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
+  return rewriter.create<tensor::ExtractSliceOp>(
+      loc, transposedTensorType, packedTensor, offsets, packingResult.sizes,
+      packingResult.strides);
+}
+
+FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
+    RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops,
+    ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
+    SmallVectorImpl<GenericOp> &transposeOps) {
+  LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
+             DBGS() << " by " << numLoops << " loops\n");
+  HoistingAnalysis analysis(opToHoist, numLoops);
+  if (!analysis.isValid()) {
+    LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
+    return failure();
+  }
 
-  // Transpose the packed tensor back to the original storage order.
+  // Update actual number of loops, which may be smaller.
+  int nPackedLoops = analysis.packingLoops.size();
+  LLVM_DEBUG(DBGS() << "\n";
+             DBGS() << "Func:\n"
+                    << *opToHoist->getParentOfType<func::FuncOp>() << "\n";
+             DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");
+
+  Location loc = opToHoist->getLoc();
+  RankedTensorType paddedTensorType = opToHoist.getResultType();
+
+  // Compute the type of the transposed padded tensor.
+  FailureOr<RankedTensorType> transposedTensorType =
+      tensor::computeTransposedType(paddedTensorType, transposeVector);
+  if (failed(transposedTensorType)) {
+    LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
+    return failure();
+  }
+
+  // Create the packed tensor.
+  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
+  // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
+  llvm::append_range(packedShape, transposedTensorType->getShape());
+  auto packedTensorType = RankedTensorType::get(
+      packedShape, transposedTensorType->getElementType());
+
+  // Set the insertion point right before the outer loop and start packing.
+  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(outerLoop);
+  SmallVector<Value> dynamicTensorSizes =
+      analysis.getPackedTensorSizes(rewriter, loc);
+  auto emptyOp = rewriter.create<tensor::EmptyOp>(
+      loc, packedTensorType.getShape(), packedTensorType.getElementType(),
+      dynamicTensorSizes);
+
+  /// Construct the packing loop nest.
+  IRMapping bvm;
+  PackingLoopNestResult packingResult =
+      buildPackingLoopNest(rewriter, bvm, opToHoist, transposeVector,
+                           *transposedTensorType, emptyOp, analysis);
+  if (!transposeVector.empty())
+    transposeOps.push_back(packingResult.maybeTransposeOp);
+
+  // Now the packed tensor is ready, replace the original padding op by a
+  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
+  Value newResult = replaceByPackingLoopNestResult(
+      rewriter, bvm, opToHoist, *transposedTensorType, analysis, packingResult);
   if (!transposeVector.empty()) {
-    Value emptyTensor = b.create<tensor::EmptyOp>(
+    OpBuilder::InsertionGuard g(rewriter);
+    rewriter.setInsertionPointAfter(newResult.getDefiningOp());
+    // Transpose the packed tensor back to the original storage order.
+    Value emptyTensor = rewriter.create<tensor::EmptyOp>(
         loc, paddedTensorType.getShape(), paddedTensorType.getElementType());
-    transposeOps.push_back(
-        makeTransposeOp(b, loc, newResult, emptyTensor, transposeVector));
-    newResult = transposeOps.back()->getResult(0);
+    GenericOp unTransposeOp =
+        makeTransposeOp(rewriter, loc, newResult, emptyTensor, transposeVector);
+    newResult = unTransposeOp.getResult(0);
+    transposeOps.push_back(unTransposeOp);
   }
 
+  LLVM_DEBUG(DBGS() << "newResult: " << newResult << "\n");
+  LLVM_DEBUG(
+      DBGS() << "After hoisting: "
+             << newResult.getDefiningOp()->getParentOfType<func::FuncOp>()
             << "\n");
+
   // Make the newly cloned `opToHoist` available to the caller.
   hoistedOp =
       cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp());
+
+  LLVM_DEBUG(DBGS() << "--SUCCESS\n");
   return newResult;
 }
+
+FailureOr<Value>
+mlir::linalg::hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
+                                    ArrayRef<int64_t> transposeVector,
+                                    tensor::PadOp &hoistedOp,
+                                    SmallVectorImpl<GenericOp> &transposeOps) {
+  IRRewriter rewriter(opToHoist.getContext());
+  return hoistPaddingOnTensors(rewriter, opToHoist, numLoops, transposeVector,
+                               hoistedOp, transposeOps);
+}
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -2449,6 +2449,15 @@
   auto resultType = getResult().getType().cast<RankedTensorType>();
   auto expectedType =
       PadOp::inferResultType(sourceType, getStaticLow(), getStaticHigh());
+  if (!expectedType) {
+    return emitError("failed to infer expectedType from sourceType ")
+           << sourceType << ", specified resultType is " << resultType;
+  }
+  if (resultType.getRank() != expectedType.getRank()) {
+    return emitError("specified type ")
+           << resultType << " does not match the inferred type "
+           << expectedType;
+  }
   for (int i = 0, e = sourceType.getRank(); i < e; ++i) {
     if (resultType.getDimSize(i) == expectedType.getDimSize(i))
       continue;
@@ -2490,10 +2499,12 @@
                                         ArrayRef<int64_t> staticHigh,
                                         ArrayRef<int64_t> resultShape) {
   unsigned rank = sourceType.getRank();
-  assert(staticLow.size() == rank && "unexpected staticLow size mismatch");
-  assert(staticHigh.size() == rank && "unexpected staticHigh size mismatch");
-  assert((resultShape.empty() || resultShape.size() == rank) &&
-         "unexpected resultShape size mismatch");
+  if (staticLow.size() != rank)
+    return RankedTensorType();
+  if (staticHigh.size() != rank)
+    return RankedTensorType();
+  if (!(resultShape.empty() || resultShape.size() == rank))
+    return RankedTensorType();
 
   SmallVector<int64_t> inferredShape;
   for (auto i : llvm::seq<unsigned>(0, rank)) {
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -14,6 +14,7 @@
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
 
 using namespace mlir;
 using namespace mlir::tensor;
@@ -65,3 +66,22 @@
   }
   return dims;
 }
+
+FailureOr<RankedTensorType>
+mlir::tensor::computeTransposedType(RankedTensorType rankedTensorType,
+                                    ArrayRef<int64_t> transposeVector) {
+  if (transposeVector.empty())
+    return rankedTensorType;
+  if (!isPermutationVector(transposeVector) ||
+      transposeVector.size() !=
+          static_cast<size_t>(rankedTensorType.getRank()))
+    return failure();
+
+  SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
+                                       rankedTensorType.getShape().end());
+  applyPermutationToVector(transposedShape, transposeVector);
+
+  using RTTBuilder = RankedTensorType::Builder;
+  RankedTensorType transposedTensorType =
+      RTTBuilder(rankedTensorType).setShape(transposedShape);
+  return transposedTensorType;
+}
diff --git a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
@@ -149,3 +149,46 @@
   transform.structured.hoist_pad %pad by 1 loops, transpose by [1, 0]
      : (!pdl.operation) -> !pdl.operation
 }
+
+// -----
+
+// CHECK-LABEL: pad_and_hoist_init
+func.func @pad_and_hoist_init(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+
+  //      CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) {
+  //      CHECK:   %[[PADDED:.*]] = tensor.pad %{{.*}}
+  //      CHECK:     : tensor<?x25xf32> to tensor<5x25xf32>
+  //      CHECK:   scf.for %{{.*}} -> (tensor<?x25xf32>) {
+  //      CHECK:     %[[RES:.*]] = linalg.matmul {{.*}} outs(%[[PADDED]] : tensor<5x25xf32>
+  //
+  // TODO: atm we are missing the plumbing of packedTensor through the loop bbarg
+  // when required (i.e. when hoisting init tensors).
+  //      CHECK:     %[[RES_EXTRACTED:.*]] = tensor.extract_slice %[[RES]][0, 0] [%{{.*}}, 25] [1, 1]
+  // CHECK-SAME:       : tensor<5x25xf32> to tensor<?x25xf32>
+  //      CHECK:     scf.yield %[[RES_EXTRACTED]] : tensor<?x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+
+  %matmul_l1, %loops_l1:2 = transform.structured.tile_to_scf_for %matmul [5, 0, 7]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  %pad = transform.get_producer_of_operand %matmul_padded[2]
+    : (!pdl.operation) -> !transform.op<"tensor.pad">
+
+  transform.structured.hoist_pad %pad by 1 loops
+     : (!transform.op<"tensor.pad">) -> !pdl.operation
+}
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -5491,6 +5491,7 @@
     deps = [
         ":AffineDialect",
        ":ArithDialect",
+        ":DialectUtils",
         ":TensorDialect",
         "//llvm:Support",
     ],
@@ -8467,6 +8468,7 @@
         ":LinalgPassIncGen",
         ":LinalgStructuredOpsIncGen",
         ":LinalgUtils",
+        ":LoopLikeInterface",
         ":MaskableOpInterface",
         ":MathDialect",
         ":MemRefDialect",
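
---

Two usage sketches for reviewers, not part of the patch. First, the newly exposed tensor utility: the wrapper name `getTransposedTypeOrOriginal` and its fallback behavior are illustrative assumptions; only the `tensor::computeTransposedType` signature comes from the diff above.

```cpp
// Sketch only: exercises tensor::computeTransposedType as exposed by this
// patch in mlir/Dialect/Tensor/Utils/Utils.h.
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LogicalResult.h"

using namespace mlir;

static RankedTensorType getTransposedTypeOrOriginal(RankedTensorType type,
                                                    ArrayRef<int64_t> perm) {
  // An empty permutation leaves the type unchanged; a vector that is not a
  // permutation of the tensor rank makes the utility return failure.
  FailureOr<RankedTensorType> transposed =
      tensor::computeTransposedType(type, perm);
  if (failed(transposed))
    return type; // Fallback chosen for this sketch only.
  return *transposed;
}
```

Second, the new `RewriterBase`-taking overload of `hoistPaddingOnTensors`, driven the same way `HoistPadOp::applyToOne` does in the diff; the wrapper `hoistOnePad` is hypothetical and omits the replacement of the hoisted op's uses that the transform op performs afterwards.

```cpp
// Sketch only: calls the overload added by this patch, which threads the
// caller's rewriter through instead of building a local IRRewriter inside.
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

static FailureOr<Value> hoistOnePad(tensor::PadOp target, int64_t numLoops,
                                    ArrayRef<int64_t> transpose) {
  IRRewriter rewriter(target->getContext());
  tensor::PadOp hoistedPadOp;
  SmallVector<linalg::GenericOp> transposeOps;
  return linalg::hoistPaddingOnTensors(rewriter, target, numLoops, transpose,
                                       hoistedPadOp, transposeOps);
}
```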