diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -410,6 +410,12 @@
 ///      }
 ///    }
 /// ```
+FailureOr<Value>
+hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist,
+                      int numLoops, ArrayRef<int64_t> transposeVector,
+                      tensor::PadOp &hoistedOp,
+                      SmallVectorImpl<GenericOp> &transposeOps);
+/// Calls into `hoistPaddingOnTensors` with a local IRRewriter.
 FailureOr<Value> hoistPaddingOnTensors(
     tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
     tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps);
diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
--- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
@@ -31,6 +31,12 @@
 SmallVector<Value> createDimValues(OpBuilder &b, Location loc,
                                    Value rankedTensor);
 
+/// Returns the transposed `rankedTensorType` if `transposeVector` is non-empty.
+/// Fail if `transposeVector` is not a permutation matching the tensor rank.
+FailureOr<RankedTensorType>
+computeTransposedType(RankedTensorType rankedTensorType,
+                      ArrayRef<int64_t> transposeVector);
+
 } // namespace tensor
 } // namespace mlir
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1780,9 +1780,17 @@
                                      transform::TransformState &state) {
   tensor::PadOp hoistedPadOp;
   SmallVector<GenericOp> transposeOps;
+  IRRewriter rewriter(target->getContext());
+  // TODO: Make hoistPaddingOnTensors take our rewriter.
   FailureOr<Value> result = hoistPaddingOnTensors(
       target, getNumLoops(), getTranspose(), hoistedPadOp, transposeOps);
   if (succeeded(result)) {
+    // We need to perform our own replacement here because this API is still
+    // used in patterns that "pad and hoist", for which the replacement values
+    // need to be different.
+    // TODO: clean this up and stop "pad and hoist" behavior more globally now
+    // that we have more composable abstractions.
+    rewriter.replaceOp(target, *result);
     results.push_back(hoistedPadOp);
     return DiagnosedSilenceableFailure::success();
   }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -14,17 +14,17 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
-#include "mlir/Dialect/SCF/Utils/Utils.h"
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Utils/IndexingUtils.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/Dialect/Tensor/Utils/Utils.h"
 #include "mlir/IR/AsmState.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/Matchers.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
 
@@ -37,6 +37,31 @@
 using namespace mlir;
 using namespace mlir::linalg;
 
+static bool debugPrintLoopInShortForm(Operation *op) {
+  AsmState state(op->getParentOfType<func::FuncOp>());
+  (void)state;
+  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+    forOp.getInductionVar().printAsOperand(dbgs(), state);
+    dbgs() << " @ " << forOp.getOperation();
+    return true;
+  }
+  return false;
+}
+
+static void debugPrintBackwardSlice(SetVector<Operation *> &backwardSlice) {
+  LLVM_DEBUG(llvm::interleaveComma(backwardSlice, DBGS() << "--backwardSlice:",
+                                   [](Operation *op) {
+                                     dbgs() << "\n";
+                                     DBGS() << "----";
+                                     if (debugPrintLoopInShortForm(op)) {
+                                       dbgs() << "\n";
+                                       return;
+                                     }
+                                     dbgs() << *op << "\n";
+                                   });
+             DBGS() << "\n";);
+}
+
 /// Analysis class to support tensor::PadOp hoisting across multiple enclosing
 /// loops. The failure conditions are:
 ///   1. Pad op has a use that is not an input of a LinalgOp.
@@ -58,7 +83,7 @@
   bool isValid() { return valid; }
 
   /// Footprint of the packedTensor, computed from the packingLoops.
-  SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);
+  SmallVector<Value> getPackedTensorSizes(RewriterBase &rewriter, Location loc);
 
   /// The outermost loop, determined by `nLevels` above which `padOp` will
   /// be hoisted.
@@ -75,22 +100,27 @@
   /// The span of these loops determines the footprint of the packed tensor.
   SmallVector<scf::ForOp> packingLoops;
 
+  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
+  tensor::ExtractSliceOp sliceOp;
+
 private:
   /// Drop any non-index dependencies of `padOp` and `sliceOp` from
   /// `backwardSlice`. The method follows the use-def chains of the index
   /// operands consumed by `padOp` and `sliceOp` and drops the operations
   /// not part of this index computation. Afterwards, the filtered
-  /// `backwardSlice` contains only the loops whose induction variable is used,
-  /// directly or indirectly, to index the padded tensor. The method returns
-  /// failure if the filtered backward slice contains an unexpected operation.
+  /// `backwardSlice` contains only the loops whose induction variable is
+  /// used, directly or indirectly, to index the padded tensor. The method
+  /// returns failure if the filtered backward slice contains an unexpected
+  /// operation.
   ///
   /// Example:
   /// ```
   /// %source = linalg.fill(%cst, %arg0)
   /// scf.for %i
-  ///   %unrelated = linalg.fill(%cst, %arg1)  // not used to index %source!
-  ///   scf.for %j (%arg2 = %unrelated)
-  ///     scf.for %k                           // not used to index %source!
+  ///   %unrelated = linalg.fill(%cst, %arg1)  // not used to index
+  ///                                          // %source!
+  ///   scf.for %j (%arg2 = %unrelated)
+  ///     scf.for %k                           // not used to index
+  ///                                          // %source!
   ///       %ubi = affine.min #map(%i)
   ///       %ubj = affine.min #map(%j)
   ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
@@ -98,29 +128,12 @@
   /// ```
   /// dropNonIndexDependencies(%padded_slice, %slice)
   /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
-  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
-                                         tensor::ExtractSliceOp sliceOp);
+  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp);
 
   /// Encodes whether the analysis is valid and hoisting can proceed.
   bool valid;
 };
 
-/// Return true if all uses of `padOp` are an input tensor of some
-/// LinalgOp.
-static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp) {
-  for (OpOperand &use : padOp.getResult().getUses()) {
-    auto linalgUser = dyn_cast<LinalgOp>(use.getOwner());
-    if (!linalgUser || !linalgUser.isDpsInput(&use)) {
-      LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
-                        << "\nthat is not an input tensor of a LinalgOp, "
-                        << "cannot hoist\n"
-                        << *(use.getOwner()) << "\n");
-      return false;
-    }
-  }
-  return true;
-}
-
 /// Return at most nLevels of immediately enclosing scf::ForOp loops.
 /// Stops at the first parent that is not an scf::ForOp.
 /// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
@@ -128,55 +141,25 @@
 static void
 getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
                          SmallVector<scf::ForOp> &reverseEnclosingLoops) {
-  AsmState state(padOp->getParentOfType<func::FuncOp>());
-  (void)state;
   scf::ForOp outermostEnclosingForOp = nullptr;
   Operation *nextEnclosingOp = padOp->getParentOp();
   while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
-    LLVM_DEBUG(
-        DBGS() << "loops: ";
-        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
-        dbgs() << "\n");
+    LLVM_DEBUG(DBGS() << "loops: ";
+               debugPrintLoopInShortForm(outermostEnclosingForOp);
+               dbgs() << "\n");
     reverseEnclosingLoops.push_back(outermostEnclosingForOp);
     nextEnclosingOp = outermostEnclosingForOp->getParentOp();
   }
 }
 
-/// Returns the transposed `rankedTensorType` if `transposeVector` is non-empty.
-/// Fail if `transposeVector` is no permutation matching the tensor rank.
-static FailureOr<RankedTensorType>
-computeTransposedType(RankedTensorType rankedTensorType,
-                      ArrayRef<int64_t> transposeVector) {
-  if (transposeVector.empty())
-    return rankedTensorType;
-  if (!isPermutationVector(transposeVector) ||
-      transposeVector.size() !=
-          static_cast<size_t>(rankedTensorType.getRank()))
-    return failure();
-
-  SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
-                                       rankedTensorType.getShape().end());
-  applyPermutationToVector(transposedShape, transposeVector);
-
-  using RTTBuilder = RankedTensorType::Builder;
-  RankedTensorType transposedTensorType =
-      RTTBuilder(rankedTensorType).setShape(transposedShape);
-  return transposedTensorType;
-}
-
 HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
   valid = false;
 
-  // Bail on any use that isn't an input of a LinalgOp.
-  // Hoisting of inplace updates happens after vectorization.
-  if (!isOnlyUsedAsInputOfLinalgOp(padOp))
-    return;
-
   // Get at most `numLoops` of immediately enclosing loops.
   SmallVector<scf::ForOp> reverseEnclosingLoops;
   getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
   if (reverseEnclosingLoops.empty()) {
-    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
+    LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
     return;
   }
 
@@ -197,13 +180,26 @@
   //   %slice = tensor.extract_slice %source [%i, %j]
   //   %padded_slice = tensor.pad %slice
   // ```
-  auto sliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
+  sliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
   if (!sliceOp) {
-    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
+    LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
     return;
   }
+
+  // If the padded data is not yet available before entering the outermost
+  // enclosing loop, try to apply hoisting on this outermost loop.
+  // TODO: we may want finer-grained hoisting of only that particular
+  // `sliceOp`.
+  IRRewriter rewriter(outermostEnclosingForOp->getContext());
+  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
+    outermostEnclosingForOp =
+        hoistRedundantSubsetExtractInsert(rewriter, outermostEnclosingForOp);
+  }
   if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
-    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
+    LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n"
+                      << outermostEnclosingForOp << "\n"
+                      << "--sliceOp: " << sliceOp << "\n"
+                      << "--sliceOp.getSource(): " << sliceOp.getSource()
+                      << "\n");
+    LLVM_DEBUG(DBGS() << "----Source not defined outside of loops -> Skip\n");
     return;
   }
 
@@ -213,7 +209,7 @@
   Value paddingValue = padOp.getConstantPaddingValue();
   if (!paddingValue ||
       !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
-    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
+    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n");
     return;
   }
 
@@ -223,42 +219,46 @@
   getBackwardSlice(padOp.getOperation(), &backwardSlice, [&](Operation *op) {
     return domInfo.dominates(outermostEnclosingForOp, op);
   });
-  if (backwardSlice.empty())
+  if (backwardSlice.empty()) {
+    LLVM_DEBUG(DBGS() << "--Empty backward slice -> Skip\n");
     return;
+  }
 
   // Add `padOp` itself to the backward slice.
   backwardSlice.insert(padOp.getOperation());
 
-  // Remove all ops in the backward slice that are not used to index the padded
-  // tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
-  // affine operations used for the index computation.
-  if (failed(dropNonIndexDependencies(padOp, sliceOp)))
+  debugPrintBackwardSlice(backwardSlice);
+  // Remove all ops in the backward slice that are not used to index the
+  // padded tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
+  // affine operations used for the index computation.
+  if (failed(dropNonIndexDependencies(padOp))) {
+    LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n");
     return;
-
-  // Add only the loops part of the filtered `backwardSlice` to the packing
-  // loops. All other loops are not used to index the padded data and
-  // consequently access the same data in every loop iteration. Adding them to
-  // the packing loops would increase the cache footprint of the packed data
-  // by storing the same data multiple times.
+  }
+  debugPrintBackwardSlice(backwardSlice);
+
+  // Add only the loops part of the filtered `backwardSlice` to the packing
+  // loops. All other loops are not used to index the padded data and
+  // consequently access the same data in every loop iteration. Adding them
+  // to the packing loops would increase the cache footprint of the packed
+  // data by storing the same data multiple times.
   for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
     if (backwardSlice.contains(forOp))
      packingLoops.push_back(forOp);
-  if (packingLoops.empty()) {
-    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
-    return;
-  }
+
+  // Note: at this point, packing loops may be empty but we would still like
+  // to hoist the padding if so specified.
 
   // The analysis is valid and hoisting can occur.
   valid = true;
 }
 
-LogicalResult
-HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
-                                           tensor::ExtractSliceOp sliceOp) {
+LogicalResult HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp) {
   // Set of all values used for index computation.
   SetVector<Value> indexEdges;
 
-  // Add all index operands of `operation` to `indexEdges`. An index operand is
-  // an operand of type index.
+  // Add all index operands of `operation` to `indexEdges`. An index operand
+  // is an operand of type index.
   auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
     for (Value operand : operation->getOperands())
       if (operand.getType().isIndex())
@@ -308,15 +308,15 @@
         continue;
       }
     }
-    // Add the index operands of all other operations if at least one result is
-    // used for index computation.
+    // Add the index operands of all other operations if at least one result
+    // is used for index computation.
    if (hasIndexResult(op)) {
       addIndexOperandsToIndexEdges(op);
       // Check the operands of the remaining operations all have index type.
       if (llvm::any_of(op->getOperandTypes(),
                        [](Type type) { return !type.isIndex(); })) {
         LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
-                          << op << " -> skip\n");
+                          << op << " -> Skip\n");
         return failure();
       }
       // Check the remaining operations do not have regions or memory effects.
@@ -324,7 +324,7 @@
       bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
       if (hasMemoryEffect || op->getNumRegions() != 0) {
         LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
-                          << op << " -> skip\n");
+                          << op << " -> Skip\n");
         return failure();
       }
       continue;
@@ -339,30 +339,32 @@
 }
 
 SmallVector<Value>
-HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
+HoistingAnalysis::getPackedTensorSizes(RewriterBase &rewriter, Location loc) {
   SmallVector<Value> dynamicTensorSizes;
 
   // Upper bound the packing loop lengths to size the packed tensor. Taking
   // upper bounds can make the sizes of the packed tensor independent of the
   // enclosing loops. This independence is a prerequisite for reusing the same
-  // buffer for all enclosing loop iterations and hoisting its allocation out of
-  // the enclosing loops.
+  // buffer for all enclosing loop iterations and hoisting its allocation out
+  // of the enclosing loops.
   for (auto forOp : packingLoops) {
     // Compute an upper bound `ubVal` for the upper bound of `forOp`.
     AffineMap boundMap;
     SmallVector<Value> boundOperands;
     getUpperBoundForIndex(forOp.getUpperBound(), boundMap, boundOperands);
-    Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
+    Value ubVal =
+        rewriter.createOrFold<AffineMinOp>(loc, boundMap, boundOperands);
     // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
     // store the result to `dynamicTensorSizes`.
    // TODO: instead of using the lower bound of `forOp` directly, implement a
    // lower bound computation similar to the upper bound computation.
     AffineExpr lb, ub, step;
-    bindDims(b.getContext(), lb, ub);
-    bindSymbols(b.getContext(), step);
-    Value res = b.createOrFold<AffineApplyOp>(
-        (ub - lb).ceilDiv(step), ValueRange{forOp.getLowerBound(), ubVal,
-                                            cast<scf::ForOp>(forOp).getStep()});
+    bindDims(rewriter.getContext(), lb, ub);
+    bindSymbols(rewriter.getContext(), step);
+    Value res = rewriter.createOrFold<AffineApplyOp>(
+        loc, (ub - lb).ceilDiv(step),
+        ValueRange{forOp.getLowerBound(), ubVal,
+                   cast<scf::ForOp>(forOp).getStep()});
     dynamicTensorSizes.push_back(res);
   }
 
@@ -377,7 +379,7 @@
 /// The returned Value is guaranteed not to depend on any loop comprised in
 /// [`outer`, `forOp`].
 /// Return null if such a loop-independent quantity cannot be computed.
-static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
+static Value buildLoopIterationCount(RewriterBase &rewriter, scf::ForOp outer,
                                      scf::ForOp forOp) {
   MLIRContext *ctx = forOp->getContext();
   AffineExpr iv, lb, step;
@@ -389,82 +391,66 @@
   Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
         stepVal = forOp.getStep();
   auto loc = forOp->getLoc();
-  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
-                                       ValueRange{ivVal, lbVal, stepVal});
+  return rewriter.createOrFold<AffineApplyOp>(
+      loc, (iv - lb).ceilDiv(step), ValueRange{ivVal, lbVal, stepVal});
 }
 
-FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
-    tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
-    tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps) {
-  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
-                    << " loops\n");
-  HoistingAnalysis analysis(opToHoist, numLoops);
-  if (!analysis.isValid()) {
-    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
-    return failure();
-  }
-
-  scf::ForOp outer = analysis.outermostEnclosingForOp;
-  ImplicitLocOpBuilder b(outer->getLoc(), outer);
+struct PackingLoopNestResult {
+  SmallVector<OpFoldResult> offsets, sizes, strides;
+  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
+  GenericOp maybeTransposeOp;
+};
 
-  SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);
+// Build a packing loop nest by iteratively traversing the backward slice and
+// cloning the operations, iteratively stepping into the loops that we
+// encounter. The implementation proceeds in a stack-like fashion:
+//   1. Iteratively clone and step into the loops, pushing the `packedTensor`
+//      deeper in the stack.
+//   2. At the innermost loop level, create a GenericOp if `transposeVector`
+//      is non-empty.
+//   3. At the innermost loop level, create an InsertSliceOp.
+//   4. Iteratively pop and yield the result of the InsertSliceOp across the
+//      cloned loops.
+static PackingLoopNestResult buildPackingLoopNest(
+    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
+    ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
+    tensor::EmptyOp emptyOp, const HoistingAnalysis &analysis) {
+  SmallVector<OpFoldResult> offsets, sizes, strides;
+  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
 
-  // Update actual number of loops, which may be smaller.
-  int nPackedLoops = analysis.packingLoops.size();
+  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
 
   Location loc = opToHoist->getLoc();
   RankedTensorType paddedTensorType = opToHoist.getResultType();
   int paddedRank = paddedTensorType.getRank();
 
-  // Compute the type of the transposed padded tensor.
-  FailureOr<RankedTensorType> transposedTensorType =
-      computeTransposedType(paddedTensorType, transposeVector);
-  if (failed(transposedTensorType))
-    return failure();
-
-  // Create the packed tensor into which we amortize padding.
-  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
-  // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
-  // tensor.
-  llvm::append_range(packedShape, transposedTensorType->getShape());
-  auto packedTensorType = RankedTensorType::get(
-      packedShape, transposedTensorType->getElementType());
-  Value packedTensor = b.create<tensor::EmptyOp>(
-      loc, packedTensorType.getShape(), packedTensorType.getElementType(),
-      dynamicTensorSizes);
-
-  // Clone the operations involved in the backward slice, iteratively stepping
-  // into the loops that we encounter.
-  // The implementation proceeds in a stack-like fashion:
-  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
-  //      deeper in the stack.
-  //   2. Create a GenericOp if `transposeVector` is non-empty.
-  //   3. Create a InsertSliceOp at the top of the stack.
-  //   4. Iteratively pop and yield the result of the InsertSliceOp across
-  //      the cloned loops.
-  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
-  clonedLoopIvs.reserve(nPackedLoops);
-  leadingPackedTensorIndexings.reserve(nPackedLoops);
-  IRMapping bvm;
-  // Stack step 1. iteratively clone loops and push `packedTensor`.
+  Value packedTensor = emptyOp.getResult();
+  // Step 1. iteratively clone loops and push `packedTensor`.
+  OpBuilder::InsertionGuard g(rewriter);
   for (Operation *op : analysis.backwardSlice) {
-    // Specifically sit out in the extract_slice(packedTensor) case: this is the
-    // piece we seek to replace.
-    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
-      if (bvm.lookupOrDefault(sliceOp.getSource()) == packedTensor)
+    // Specifically sit out in the extract_slice(packedTensor) case: this is
+    // the piece we seek to replace.
+    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) {
+      if (bvm.lookupOrDefault(sliceOp.getSource()) == packedTensor) {
+        LLVM_DEBUG(DBGS() << "--Skip: " << sliceOp << "\n");
         continue;
-    // Clone all operations except it is a loop.
+      }
+    }
+
+    // Clone all operations except loops, which require special handling.
     auto forOp = dyn_cast<scf::ForOp>(op);
     if (!forOp) {
-      b.clone(*op, bvm);
+      // We are at the right insertion point within the loop nest.
+      rewriter.clone(*op, bvm);
       continue;
     }
+
     // Create a packing loop that takes `packedTensor` as iteration argument.
-    auto clonedForOp = b.create<scf::ForOp>(
+    auto clonedForOp = rewriter.create<scf::ForOp>(
         loc, bvm.lookupOrDefault(forOp.getLowerBound()),
         bvm.lookupOrDefault(forOp.getUpperBound()),
         bvm.lookupOrDefault(forOp.getStep()), packedTensor);
+
     // Map the induction var, region args and results to the `clonedForOp`.
     bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
     bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
@@ -472,9 +458,11 @@
     assert(clonedForOp->getNumRegions() == 1);
     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
 
-    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
+    // Do not insert guard here, we get deeper into the loop nest.
+    rewriter.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
     Value loopIndependentIterationCount =
-        buildLoopIterationCount(b, outer, clonedForOp);
+        buildLoopIterationCount(rewriter, outerLoop, clonedForOp);
+
     // Assert the loop-independent iteration count can be computed.
     if (!loopIndependentIterationCount)
       llvm_unreachable("loop independence prerequisite not met");
@@ -482,75 +470,205 @@
     packedTensor = clonedForOp.getRegionIterArgs().front();
   }
 
+  // Step 2. Construct offsets, sizes and strides for the innermost level of
+  // the packing loop.
+  int64_t nPackedLoops = clonedLoopIvs.size();
   // offsets = [clonedLoopIvs, 0 .. 0].
-  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
-                                    leadingPackedTensorIndexings.end());
-  offsets.append(paddedRank, b.getIndexAttr(0));
+  offsets = SmallVector<OpFoldResult>{leadingPackedTensorIndexings.begin(),
+                                      leadingPackedTensorIndexings.end()};
+  offsets.append(paddedRank, rewriter.getIndexAttr(0));
   // sizes = [1 .. 1, transposedShape].
-  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
-  for (int64_t sz : transposedTensorType->getShape()) {
-    // TODO: go grab dims when necessary, for now tensor::PadOp returns a
-    // static tensor.
+  sizes = SmallVector<OpFoldResult>(nPackedLoops, rewriter.getIndexAttr(1));
+  for (int64_t sz : transposedTensorType.getShape()) {
+    // TODO: go grab dims when needed, atm tensor::PadOp yields a static
+    // tensor.
     assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
-    sizes.push_back(b.getIndexAttr(sz));
+    sizes.push_back(rewriter.getIndexAttr(sz));
   }
   // strides = [1 .. 1].
-  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
-                                    b.getIndexAttr(1));
+  strides = SmallVector<OpFoldResult>(nPackedLoops + paddedRank,
+                                      rewriter.getIndexAttr(1));
 
-  // Stack step 2. create GenericOp if `transposeVector` is non-empty.
+  // Step 3. Optionally transpose the padded tensor.
+  GenericOp maybeTransposeOp;
   Value paddedTensor = bvm.lookup(opToHoist.getResult());
   if (!transposeVector.empty()) {
-    Value outputTensor = b.create<tensor::ExtractSliceOp>(
-        loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
-    transposeOps.push_back(
-        makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector));
-    paddedTensor = transposeOps.back()->getResult(0);
+    Value outputTensor = rewriter.create<tensor::ExtractSliceOp>(
+        loc, transposedTensorType, packedTensor, offsets, sizes, strides);
+    maybeTransposeOp = makeTransposeOp(rewriter, loc, paddedTensor,
+                                       outputTensor, transposeVector);
+    paddedTensor = maybeTransposeOp.getResult(0);
   }
 
-  // Stack step 3. create InsertSliceOp at the top of the stack.
-  Value inserted = b.create<tensor::InsertSliceOp>(
+  // Step 4. Create an InsertSliceOp at the innermost loop level, inserting an
+  // optionally transposed padded slice into the packed tensor.
+  Value inserted = rewriter.create<tensor::InsertSliceOp>(
       loc, paddedTensor, packedTensor, offsets, sizes, strides);
 
-  // Stack step 4. iteratively pop the stack and propagate the yield.
+  // Step 5. Iteratively pop the stack and propagate the yield.
   Value valueToYield = inserted;
   for (Value iv : llvm::reverse(clonedLoopIvs)) {
     auto forOp = scf::getForInductionVarOwner(iv);
-    b.setInsertionPointToEnd(&forOp.getRegion().front());
-    b.create<scf::YieldOp>(loc, valueToYield);
+    rewriter.setInsertionPointToEnd(&forOp.getRegion().front());
+    rewriter.create<scf::YieldOp>(loc, valueToYield);
     valueToYield = forOp.getResult(0);
   }
 
-  // Now the packed tensor is ready, replace the original padding op by a
-  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
-  b.setInsertionPoint(opToHoist);
-  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
-      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
-        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
-      }));
-  // Assert all loop iteration counts can be computed.
-  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
-    llvm_unreachable("loop independence prerequisite not met");
-  // offsets = [originalLoopIvs, 0 .. 0].
-  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
-  offsets.append(paddedRank, b.getIndexAttr(0));
-  // sizes = [1 .. 1, transposedShape] (defined above).
+  return PackingLoopNestResult{offsets,
+                               sizes,
+                               strides,
+                               clonedLoopIvs,
+                               leadingPackedTensorIndexings,
+                               maybeTransposeOp};
+}
+
+/// Produce a tensor extracted from the packingResult. This can be used as a
+/// replacement for `opToHoist` in callers.
+static Value replaceByPackingLoopNestResult(
+    RewriterBase &rewriter, const IRMapping &bvm, tensor::PadOp opToHoist,
+    RankedTensorType transposedTensorType, const HoistingAnalysis &analysis,
+    const PackingLoopNestResult &packingResult) {
+  // The replacement occurs under a single insertion point within the original
+  // loop, just before opToHoist.
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(opToHoist);
+
+  Location loc = opToHoist->getLoc();
+  RankedTensorType paddedTensorType = opToHoist.getResultType();
+  int paddedRank = paddedTensorType.getRank();
+
+  int64_t nPackedLoops = packingResult.clonedLoopIvs.size();
+  LLVM_DEBUG(DBGS() << "nPackedLoops: " << nPackedLoops << " loops\n");
+
+  SmallVector<OpFoldResult> offsets = packingResult.offsets;
+
+  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
+  ArrayRef<scf::ForOp> packingLoops = analysis.packingLoops;
+
+  SmallVector<Value> loopIterationCounts;
+  Value packedTensor;
+  if (nPackedLoops > 0) {
+    loopIterationCounts =
+        llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
+          return buildLoopIterationCount(rewriter, outerLoop,
+                                         cast<scf::ForOp>(loop));
+        }));
+    // Assert all loop iteration counts can be computed.
+    if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
+      llvm_unreachable("loop independence prerequisite not met");
+
+    // offsets = [originalLoopIvs, 0 .. 0].
+    offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
+    offsets.append(paddedRank, rewriter.getIndexAttr(0));
+    packedTensor =
+        scf::getForInductionVarOwner(packingResult.clonedLoopIvs.front())
+            ->getResult(0);
+  } else {
+    // If no loops were created, this is just hoisting without packing.
+    // offsets = [0 .. 0].
+    offsets.append(paddedRank, rewriter.getIndexAttr(0));
+    auto padOp =
+        cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp());
+    tensor::ExtractSliceOp sliceOp = analysis.sliceOp;
+    rewriter.startRootUpdate(padOp);
+    padOp.getSourceMutable().assign(sliceOp.getResult());
+    rewriter.finalizeRootUpdate(padOp);
+    packedTensor = padOp.getResult();
+  }
+
+  LLVM_DEBUG(DBGS() << "packedTensor: " << packedTensor << "\n");
+
+  // sizes = [1 .. 1, transposedShape] (defined above).
   // strides = [1 .. 1] (defined above)
-  packedTensor =
-      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
-  Value newResult = b.create<tensor::ExtractSliceOp>(
-      loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
+  return rewriter.create<tensor::ExtractSliceOp>(
+      loc, transposedTensorType, packedTensor, offsets, packingResult.sizes,
+      packingResult.strides);
+}
+
+FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
+    RewriterBase &rewriter, tensor::PadOp opToHoist, int numLoops,
+    ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
+    SmallVectorImpl<GenericOp> &transposeOps) {
+  LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
+             DBGS() << " by " << numLoops << " loops\n");
+  HoistingAnalysis analysis(opToHoist, numLoops);
+  if (!analysis.isValid()) {
+    LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
+    return failure();
+  }
 
-  // Transpose the packed tensor back to the original storage order.
+  // Update actual number of loops, which may be smaller.
+  int nPackedLoops = analysis.packingLoops.size();
+  LLVM_DEBUG(DBGS() << "\n";
+             DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");
+
+  Location loc = opToHoist->getLoc();
+  RankedTensorType paddedTensorType = opToHoist.getResultType();
+
+  // Compute the type of the transposed padded tensor.
+  FailureOr<RankedTensorType> transposedTensorType =
+      tensor::computeTransposedType(paddedTensorType, transposeVector);
+  if (failed(transposedTensorType)) {
+    LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
+    return failure();
+  }
+
+  // Create the packed tensor.
+  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
+  // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
+  llvm::append_range(packedShape, transposedTensorType->getShape());
+  auto packedTensorType = RankedTensorType::get(
+      packedShape, transposedTensorType->getElementType());
+
+  // Set the insertion point right before the outer loop and start packing.
+  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(outerLoop);
+  SmallVector<Value> dynamicTensorSizes =
+      analysis.getPackedTensorSizes(rewriter, loc);
+  auto emptyOp = rewriter.create<tensor::EmptyOp>(
+      loc, packedTensorType.getShape(), packedTensorType.getElementType(),
+      dynamicTensorSizes);
+
+  /// Construct the packing loop nest.
+  IRMapping bvm;
+  PackingLoopNestResult packingResult =
+      buildPackingLoopNest(rewriter, bvm, opToHoist, transposeVector,
+                           *transposedTensorType, emptyOp, analysis);
+  if (!transposeVector.empty())
+    transposeOps.push_back(packingResult.maybeTransposeOp);
+
+  // Now the packed tensor is ready, replace the original padding op by a
+  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
+  Value newResult = replaceByPackingLoopNestResult(
+      rewriter, bvm, opToHoist, *transposedTensorType, analysis, packingResult);
   if (!transposeVector.empty()) {
-    Value emptyTensor = b.create<tensor::EmptyOp>(
+    // Transpose the packed tensor back to the original storage order.
+    Value emptyTensor = rewriter.create<tensor::EmptyOp>(
         loc, paddedTensorType.getShape(), paddedTensorType.getElementType());
-    transposeOps.push_back(
-        makeTransposeOp(b, loc, newResult, emptyTensor, transposeVector));
-    newResult = transposeOps.back()->getResult(0);
+    GenericOp unTransposeOp =
+        makeTransposeOp(rewriter, loc, newResult, emptyTensor, transposeVector);
+    newResult = unTransposeOp.getResult(0);
+    transposeOps.push_back(unTransposeOp);
   }
 
+  LLVM_DEBUG(DBGS() << "newResult: " << newResult << "\n");
+  LLVM_DEBUG(
+      DBGS() << "After hoisting: "
+             << newResult.getDefiningOp()->getParentOfType<func::FuncOp>()
+             << "\n");
+
+  // Make the newly cloned `opToHoist` available to the caller.
   hoistedOp =
       cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp());
+
+  LLVM_DEBUG(DBGS() << "--SUCCESS\n");
   return newResult;
 }
+
+FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
+    tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
+    tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps) {
+  IRRewriter rewriter(opToHoist.getContext());
+  return hoistPaddingOnTensors(rewriter, opToHoist, numLoops, transposeVector,
+                               hoistedOp, transposeOps);
+}
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -2446,6 +2446,15 @@
   auto resultType = getResult().getType().cast<RankedTensorType>();
   auto expectedType =
       PadOp::inferResultType(sourceType, getStaticLow(), getStaticHigh());
+  if (!expectedType) {
+    return emitError("failed to infer expectedType from sourceType ")
+           << sourceType << ", specified resultType is " << resultType;
+  }
+  if (resultType.getRank() != expectedType.getRank()) {
+    return emitError("specified type ")
+           << resultType << " does not match the inferred type "
+           << expectedType;
+  }
   for (int i = 0, e = sourceType.getRank(); i < e; ++i) {
     if (resultType.getDimSize(i) == expectedType.getDimSize(i))
       continue;
@@ -2487,10 +2496,12 @@
                                         ArrayRef<int64_t> staticHigh,
                                         ArrayRef<int64_t> resultShape) {
   unsigned rank = sourceType.getRank();
-  assert(staticLow.size() == rank && "unexpected staticLow size mismatch");
-  assert(staticHigh.size() == rank && "unexpected staticHigh size mismatch");
-  assert((resultShape.empty() || resultShape.size() == rank) &&
-         "unexpected resultShape size mismatch");
+  if (staticLow.size() != rank)
+    return RankedTensorType();
+  if (staticHigh.size() != rank)
+    return RankedTensorType();
+  if (!(resultShape.empty() || resultShape.size() == rank))
+    return RankedTensorType();
 
   SmallVector<int64_t> inferredShape;
   for (auto i : llvm::seq<unsigned>(0, rank)) {
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
 
 using namespace mlir;
 using namespace mlir::tensor;
@@ -65,3 +66,22 @@
   }
   return dims;
 }
+
+FailureOr<RankedTensorType>
+mlir::tensor::computeTransposedType(RankedTensorType rankedTensorType,
+                                    ArrayRef<int64_t> transposeVector) {
+  if (transposeVector.empty())
+    return rankedTensorType;
+  if (!isPermutationVector(transposeVector) ||
+      transposeVector.size() !=
+          static_cast<size_t>(rankedTensorType.getRank()))
+    return failure();
+
+  SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
+                                       rankedTensorType.getShape().end());
+  applyPermutationToVector(transposedShape, transposeVector);
+
+  using RTTBuilder = RankedTensorType::Builder;
+  RankedTensorType transposedTensorType =
+      RTTBuilder(rankedTensorType).setShape(transposedShape);
+  return transposedTensorType;
+}
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -5491,6 +5491,7 @@
     deps = [
         ":AffineDialect",
         ":ArithDialect",
+        ":DialectUtils",
        ":TensorDialect",
        "//llvm:Support",
     ],
@@ -8466,6 +8467,7 @@
        ":LinalgPassIncGen",
        ":LinalgStructuredOpsIncGen",
        ":LinalgUtils",
+        ":LoopLikeInterface",
        ":MaskableOpInterface",
        ":MathDialect",
        ":MemRefDialect",
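
Illustrative usage, not part of the patch: a minimal sketch of how the new
rewriter-taking overload of `hoistPaddingOnTensors` declared above might be
driven from a caller, assuming the usual MLIR includes and `using namespace
mlir;`. The names `padOp` (a previously matched `tensor::PadOp`) and
`kNumLoops` (a caller-chosen hoisting depth) are hypothetical placeholders;
the transpose vector is left empty and error handling is elided.

  // Hoist the pad above `kNumLoops` enclosing loops and replace its uses with
  // the slice extracted from the packed tensor returned by the transform.
  IRRewriter rewriter(padOp.getContext());
  tensor::PadOp hoistedPad;
  SmallVector<GenericOp> transposeOps;
  FailureOr<Value> replacement = linalg::hoistPaddingOnTensors(
      rewriter, padOp, /*numLoops=*/kNumLoops, /*transposeVector=*/{},
      hoistedPad, transposeOps);
  if (succeeded(replacement))
    rewriter.replaceOp(padOp, *replacement);

This mirrors what the HoistPadOp change in LinalgTransformOps.cpp does, except
that the transform op still calls the rewriter-less overload and performs the
replacement itself.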