diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h copy from mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h copy to mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h @@ -1,4 +1,4 @@ -//===- Hoisting.h - Linalg hoisting transformations -------------*- C++ -*-===// +//===- HoistPadding.h - Hoisting transformation for PadTensorOp -*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,35 +6,15 @@ // //===----------------------------------------------------------------------===// -#ifndef MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_ -#define MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_ +#ifndef MLIR_DIALECT_LINALG_TRANSFORMS_HOIST_PADDING_H_ +#define MLIR_DIALECT_LINALG_TRANSFORMS_HOIST_PADDING_H_ namespace mlir { -class FuncOp; struct LogicalResult; namespace linalg { class PadTensorOp; -/// Hoist vector.transfer_read/vector.transfer_write on buffers pairs out of -/// immediately enclosing scf::ForOp iteratively, if the following conditions -/// are true: -/// 1. The two ops access the same memref with the same indices. -/// 2. All operands are invariant under the enclosing scf::ForOp. -/// 3. No uses of the memref either dominate the transfer_read or are -/// dominated by the transfer_write (i.e. no aliasing between the write and -/// the read across the loop) -/// To improve hoisting opportunities, call the `moveLoopInvariantCode` helper -/// function on the candidate loop above which to hoist. Hoisting the transfers -/// results in scf::ForOp yielding the value that originally transited through -/// memory. -// TODO: generalize on a per-need basis. -void hoistRedundantVectorTransfers(FuncOp func); - -/// Same behavior as `hoistRedundantVectorTransfers` but works on tensors -/// instead of buffers. -void hoistRedundantVectorTransfersOnTensor(FuncOp func); - /// Mechanically hoist padding operations on tensors by `nLoops` into a new, /// generally larger tensor. This achieves packing of multiple padding ops into /// a larger tensor. On success, `padTensorOp` is replaced by the cloned version @@ -77,9 +57,9 @@ /// } /// } /// ``` -LogicalResult hoistPaddingOnTensors(PadTensorOp &padTensorOp, unsigned nLoops); +LogicalResult hoistPaddingOnTensors(PadTensorOp &padTensorOp, int nLoops); } // namespace linalg } // namespace mlir -#endif // MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_ +#endif // MLIR_DIALECT_LINALG_TRANSFORMS_HOIST_PADDING_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h @@ -11,10 +11,8 @@ namespace mlir { class FuncOp; -struct LogicalResult; namespace linalg { -class PadTensorOp; /// Hoist vector.transfer_read/vector.transfer_write on buffers pairs out of /// immediately enclosing scf::ForOp iteratively, if the following conditions @@ -35,50 +33,6 @@ /// instead of buffers. void hoistRedundantVectorTransfersOnTensor(FuncOp func); -/// Mechanically hoist padding operations on tensors by `nLoops` into a new, -/// generally larger tensor. This achieves packing of multiple padding ops into -/// a larger tensor. On success, `padTensorOp` is replaced by the cloned version -/// in the packing loop so the caller can continue reasoning about the padding -/// operation. -/// -/// Example in pseudo-mlir: -/// ======================= -/// -/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR. -/// ``` -/// scf.for (%i, %j, %k) -/// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor -/// %0 = linalg.pad_tensor %st0 low[0, 0] high[...] { -/// ^bb0( ... ): -/// linalg.yield %pad -/// } : tensor to tensor<4x8xf32> -/// compute(%0) -/// ``` -/// -/// IR resembling the following is produced: -/// -/// ``` -/// scf.for (%i) { -/// %packed_init = linalg.init_tensor range(%j) : tensor -/// %packed = scf.for (%k) iter_args(%p : %packed_init) { -/// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor -/// %0 = linalg.pad_tensor %st0 low[0, 0] high[...] { -/// ^bb0( ... ): -/// linalg.yield %pad -/// } : tensor to tensor<4x8xf32> -/// %1 = tensor.insert_slice %0 ... -/// : tensor<4x8xf32> to tensor -/// scf.yield %1: tensor -/// } -> tensor -/// scf.for (%j, %k) { -/// %st0 = tensor.extract_slice %packed [%k, 0, 0][1, 4, 8][1, 1, 1] : -/// tensor to tensor<4x8xf32> -/// compute(%st0) -/// } -/// } -/// ``` -LogicalResult hoistPaddingOnTensors(PadTensorOp &padTensorOp, unsigned nLoops); - } // namespace linalg } // namespace mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -0,0 +1,549 @@ +//===- HoistPadding.cpp - Hoisting transformation for PadTensorOp ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements functions concerned with hoisting invariant operations +// in the context of Linalg transformations. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h" +#include "mlir/Analysis/AffineStructures.h" +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/Utils.h" +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/SCF/Utils.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Vector/VectorOps.h" +#include "mlir/Dialect/Vector/VectorUtils.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Dominance.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/LoopUtils.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" + +using llvm::dbgs; + +#define DEBUG_TYPE "hoist-padding" + +#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ") + +using namespace mlir; +using namespace mlir::linalg; + +struct HoistingAnalysis { + HoistingAnalysis(PadTensorOp padTensorOp, int nLevels); + + bool isValid() { return valid; } + + // Footprint of the packedTensor, computed from the packingLoops and + // `backwardSlice`. + FailureOr> getPackedTensorSizes(ImplicitLocOpBuilder &b); + + // The padTensorOp that needs to be hoisted. + PadTensorOp padTensorOp; + + // The maximum number of immediately enclosing scf::ForOp to hoist over. + int nLevels; + + // The outermost loop, determined by `nLevels` above which `padTensorOp` will + // be hoisted. + scf::ForOp outermostEnclosingForOp; + + // Backward slice rooted at `padTensorOp` and nested under + // `outermostEnclosingForOp`. + SetVector backwardSlice; + + // The scf::ForOp immediately enclosing `padTensorOp` such that: + // 1. they are nested under `outermostEnclosingForOp` (inclusive) + // 2. whose induction variable is used, directly or indirectly, in the + // computation of `padTensorOp`. + // The span of these loops determines the footprint of the packed tensor. + // SmallSetVector packingLoops; + SetVector, DenseSet> + packingLoops; + +private: + // Encodes whether the analysis is valid and hoisting can proceed. + bool valid; +}; + +/// Return true if all uses of `padTensorOp` are an input tensor of some +/// LinalgOp. +static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) { + for (OpOperand &use : padTensorOp.result().getUses()) { + auto linalgUser = dyn_cast(use.getOwner()); + if (!linalgUser || !linalgUser.isInputTensor(&use)) { + LLVM_DEBUG(DBGS() << "Found a use of " << *(padTensorOp) + << "\nthat is not an input tensor of a LinalgOp, " + << "cannot hoist\n" + << *(use.getOwner()) << "\n"); + return false; + } + } + return true; +} + +/// Return at most nLevels of immediately enclosing scf::ForOp loops. +/// Stops at the first parent that is not an scf::ForOp. +/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm. +/// Control-flow and other containing ops with regions are not modeled atm. +static void +getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels, + SmallVector &reverseEnclosingLoops) { + AsmState state(padTensorOp->getParentOfType()); + scf::ForOp outermostEnclosingForOp = nullptr; + Operation *nextEnclosingOp = padTensorOp->getParentOp(); + while (nLevels-- > 0 && + (outermostEnclosingForOp = dyn_cast(nextEnclosingOp))) { + LLVM_DEBUG( + DBGS() << "loops: "; + outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state); + dbgs() << "\n"); + reverseEnclosingLoops.push_back(outermostEnclosingForOp); + nextEnclosingOp = outermostEnclosingForOp->getParentOp(); + } +} + +HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels) + : padTensorOp(padTensorOp), nLevels(nLevels), valid(false) { + AsmState state(padTensorOp->getParentOfType()); + + // Bail on any use that isn't an input of a Linalg op. + // Hoisting of inplace updates happens after vectorization. + if (!isOnlyUsedAsInputOfLinalgOp(padTensorOp)) + return; + + // Get at most nLevels of immediately enclosing loops. + SmallVector reverseEnclosingLoops; + getAtMostNEnclosingLoops(padTensorOp, nLevels, reverseEnclosingLoops); + if (reverseEnclosingLoops.empty()) { + LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n"); + return; + } + + outermostEnclosingForOp = reverseEnclosingLoops.back(); + + // Get all the ops in the backwards slice starting from `padTensorOp` and that + // are dominated by the outermost enclosing loop. + // Bail on any op with a region that is not either a scf::ForOp or a LinalgOp. + bool analysisFailure = false; + DominanceInfo domInfo(outermostEnclosingForOp); + getBackwardSlice( + padTensorOp.getOperation(), &backwardSlice, [&](Operation *op) { + if (!domInfo.dominates(outermostEnclosingForOp, op)) + return false; + if (op != padTensorOp && op->getNumRegions() > 0 && + !isa(op)) { + analysisFailure = true; + LLVM_DEBUG(DBGS() + << "Unsupported op with region: " << *op << " -> skip\n"); + return false; + } + return true; + }); + + if (analysisFailure || backwardSlice.empty()) + return; + + // Backward slice is a topologically sorted list of ops starting at + // `outermostEnclosingForOp`. + assert(outermostEnclosingForOp == backwardSlice.front()); + + // Filter out the loops whose induction variable is not used to compute the + // padded result. As a first approximation, just look for IVs that have no use + // in the backwardSlice. + // These are the dimensions of reuse that we can exploit to reduce the amount + // of copy / memory. + for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) { + for (Operation *user : forOp.getInductionVar().getUsers()) { + if (backwardSlice.contains(user)) { + packingLoops.insert(forOp); + break; + } + } + } + + // The analysis is valid and hoisting can occur. + valid = true; +} + +/// Given a set of loops, assumed to be scf::ForOp, create a constraint set +/// containing the inequalities `iv - lb >= 0` and `-iv + ub - 1 >= 0` for each +/// loop. The order of the constraints follows: +/// +/// ivs | lbs | ubs | eq/ineq +/// ----+-----+-----+--------- +/// 1 -1 0 >= 0 +/// ----+-----+-----+--------- +/// -1 0 1 >= 0 +/// +static FlatAffineValueConstraints +initLoopIvsAndBounds(ArrayRef loops) { + FlatAffineValueConstraints constraints; + // Append dims for all ivs, lbs, ubs: the order is important. + for (scf::ForOp op : loops) + constraints.appendDimId(op.getInductionVar()); + for (scf::ForOp op : loops) + constraints.appendDimId(op.lowerBound()); + for (scf::ForOp op : loops) + constraints.appendDimId(op.upperBound()); + int numLoops = loops.size(); + for (int ivIdx = 0, e = numLoops; ivIdx < e; ++ivIdx) { + // iv - lb >= 0 + SmallVector ineqLb(constraints.getNumCols(), 0); + ineqLb[ivIdx] = 1; + ineqLb[ivIdx + numLoops] = -1; + // -iv + ub >= 0 + SmallVector ineqUb(constraints.getNumCols(), 0); + ineqUb[ivIdx] = -1; + ineqUb[ivIdx + 2 * numLoops] = 1; + ineqUb[constraints.getNumCols() - 1] = -1; + constraints.addInequality(ineqLb); + constraints.addInequality(ineqUb); + } + return constraints; +} + +static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) { + return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp(); +} + +/// For each loop in `loops`, determine the ops involved in the construction of +/// its upper bound---up to the outerLimit loop--- and fold them as new +/// inequalities in the constraint set. +/// This is achieved by computing the backwardSlice of the loop's upper bound +/// and iteratively folding each op in reverse topological order to guarantee +/// use-def ordering. +/// As operations are folded in, their result is projected out of the +/// constraints set. +/// The following operations are supported: +/// - scf::ForOp are simply skipped. +/// - AffineApplyOp are composed to replace the result by an equality. +/// - AffineMinOp are composed by adding each entry as an upper bound. +/// If any other operation is met, return failure. +// TODO: extend on a per-need basis. +static LogicalResult +foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints, + scf::ForOp outerLimit, + ArrayRef loops) { + SetVector toProjectOut; + for (scf::ForOp loop : loops) { + auto ub = loop.upperBound(); + if (isDefinedOutsideOrConstant(outerLimit, ub)) + continue; + + // Compute a backward slice up to, but not including, `outerLimit`. + SetVector backwardSlice; + getBackwardSlice(ub, &backwardSlice, [&](Operation *op) { + return outerLimit->isProperAncestor(op); + }); + backwardSlice.insert(ub.getDefiningOp()); + + // Iterate over all ops in the slice and compose them in the constraints. + for (Operation *op : llvm::reverse(backwardSlice)) { + if (!isa(op)) + return failure(); + if (isa(op)) + continue; + // Ensure there is a + auto ensureIdFailed = [&](Value v) { + if (constraints.containsId(v)) { + unsigned pos; + constraints.findId(v, &pos); + return pos >= constraints.getNumDimIds(); + } + constraints.appendDimId(v); + return false; + }; + + // Ensure all ids exist and add results for later projection. + if (llvm::any_of(op->getResults(), ensureIdFailed) || + llvm::any_of(op->getOperands(), ensureIdFailed)) + return failure(); + + // All supported ops have 1 result. + // TODO: extend when needed. + toProjectOut.insert(op->getResult(0)); + + // Compose supported ops. + if (auto affineApplyOp = dyn_cast(op)) { + AffineValueMap avm(affineApplyOp.getAffineMap(), + affineApplyOp.getOperands(), + affineApplyOp.getResult()); + if (failed(constraints.composeMap(&avm))) + return failure(); + continue; + } + auto affineMinOp = cast(op); + unsigned pos; + bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos); + (void)foundMinOp; + assert(foundMinOp); + AffineMap alignedMap = constraints.computeAlignedMap( + affineMinOp.getAffineMap(), affineMinOp.getOperands()); + if (failed( + constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap))) + return failure(); + } + } + for (Value v : toProjectOut) + constraints.projectOut(v); + return success(); +} + +// Footprint of the packedTensor, computed from the packingLoops and +// `backwardSlice`. +FailureOr> +HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) { + FlatAffineValueConstraints constraints = + initLoopIvsAndBounds(packingLoops.getArrayRef()); + if (failed(foldUpperBoundsIntoConstraintsSet( + constraints, outermostEnclosingForOp, packingLoops.getArrayRef()))) + return failure(); + + int nPackedLoops = packingLoops.size(); + SmallVector lbs(nPackedLoops), ubs(nPackedLoops); + // Compute the bounds of the first positions, assuming the others are fixed. + constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops, + outermostEnclosingForOp->getContext(), &lbs, &ubs); + + SmallVector allValues; + constraints.getAllValues(&allValues); + SmallVector allNonLoopValues(allValues.begin() + nPackedLoops, + allValues.end()); + + // For each packingLoop, create the extent by (ub - lb).ceilDiv(step). + // IP just before the outermost loop considered that we hoist above. + assert(nPackedLoops == lbs.size() && "expected matching lb sizes"); + assert(nPackedLoops == ubs.size() && "expected matching ub sizes"); + SmallVector dynamicTensorSizes; + for (auto it : llvm::zip(packingLoops, lbs, ubs)) { + scf::ForOp loop = std::get<0>(it); + AffineMap lbMap = std::get<1>(it); + AffineMap ubMap = std::get<2>(it); + SmallVector lbOperands(allNonLoopValues); + canonicalizeMapAndOperands(&lbMap, &lbOperands); + Value lbVal = b.createOrFold(lbMap, lbOperands); + + SmallVector ubOperands(allNonLoopValues); + canonicalizeMapAndOperands(&ubMap, &ubOperands); + Value ubVal = b.createOrFold(ubMap, ubOperands); + + AffineExpr lb, ub, step; + bindDims(b.getContext(), lb, ub); + bindSymbols(b.getContext(), step); + Value res = b.createOrFold( + (ub - lb).ceilDiv(step), + ValueRange{lbVal, ubVal, cast(loop).step()}); + + dynamicTensorSizes.push_back(res); + } + return dynamicTensorSizes; +} + +/// Return success if `v` is a value that is only transitively defined by ops of +/// type in `OpTypeList`. +template +static bool backwardsSliceOnlyHasOpsOfType(scf::ForOp outerLimit, Value v) { + // Compute a backward slice up to, but not including, `outerLimit`. + SetVector backwardSlice; + getBackwardSlice(v, &backwardSlice, [&](Operation *op) { + return outerLimit->isProperAncestor(op); + }); + // Traverse the backward slice and ensure we can perform the computation to + // hoist. + for (Operation *op : backwardSlice) { + if (isa(op)) + continue; + LLVM_DEBUG(DBGS() << "Abort: unadmissible op in slice " << *op << "\n"); + return false; + } + return true; +} + +/// Return the current iteration number in the loop (iv - lb).ceilDiv(step). +/// The returned Value is guaranteed not to depend on any loop comprised in +/// [`outer`, `forOp`]. +/// Return null if such a loop-independent quantity cannot be computed. +static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer, + scf::ForOp forOp) { + MLIRContext *ctx = forOp->getContext(); + AffineExpr iv, lb, step; + bindDims(ctx, iv, lb); + bindSymbols(ctx, step); + if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) || + !isDefinedOutsideOrConstant(outer, forOp.step())) + return Value(); + Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(), + stepVal = forOp.step(); + auto loc = forOp->getLoc(); + return b.createOrFold(loc, (iv - lb).ceilDiv(step), + ValueRange{ivVal, lbVal, stepVal}); +} + +LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp, + int nLoops) { + LLVM_DEBUG(DBGS() << "Try to hoist " << *(padTensorOp) << " by " << nLoops + << " loops\n"); + HoistingAnalysis analysis(padTensorOp, nLoops); + if (!analysis.isValid()) { + LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n"); + return failure(); + } + + scf::ForOp outer = analysis.outermostEnclosingForOp; + ImplicitLocOpBuilder b(outer->getLoc(), outer); + + auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b); + if (failed(maybeDynamicTensorSizes)) + return failure(); + SmallVector dynamicTensorSizes = *maybeDynamicTensorSizes; + + // Update actual number of loops, which may be smaller. + int nPackedLoops = analysis.packingLoops.size(); + + Location loc = padTensorOp->getLoc(); + RankedTensorType paddedTensorType = padTensorOp.getResultType(); + int paddedRank = paddedTensorType.getRank(); + + // Create the packed tensor into which we amortize + // padding. + SmallVector packedShape(nPackedLoops, ShapedType::kDynamicSize); + // TODO: go grab dims when necessary, for now PadTensorOp returns a static + // tensor. + llvm::append_range(packedShape, paddedTensorType.getShape()); + auto packedTensorType = + RankedTensorType::get(packedShape, paddedTensorType.getElementType()); + Value packedTensor = b.create( + loc, dynamicTensorSizes, packedTensorType.getShape(), + packedTensorType.getElementType()); + + // Clone the operations involved in the backward slice, iteratively stepping + // into the loops that we encounter. + // The implementation proceeds in a stack-like fashion: + // 1. Iteratively clone and step into the loops, pushing the `packedTensor` + // deeper in the stack. + // 2. Create a InsertSliceOp at the top of the stack. + // 3. Iteratively pop and yield the result of the InsertSliceOp across + // the cloned loops. + SmallVector clonedLoopIvs, leadingPackedTensorIndexings; + clonedLoopIvs.reserve(nPackedLoops); + leadingPackedTensorIndexings.reserve(nPackedLoops); + BlockAndValueMapping bvm; + // Insert `padTensorOp` into the backwardSlice so we clone it too. + analysis.backwardSlice.insert(padTensorOp); + // Stack step 1. iteratively clone loops and push `packedTensor`. + for (Operation *op : analysis.backwardSlice) { + // Specifically sit out in the extract_slice(packedTensor) case: this is the + // piece we seek to replace. + if (auto sliceOp = dyn_cast(op)) + if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor) + continue; + auto effects = dyn_cast(op); + bool hasNoEffects = !effects || effects.hasNoEffect(); + if (hasNoEffects && + (op->getNumRegions() == 0 || isa(op))) { + b.clone(*op, bvm); + continue; + } + // TODO: support more cases as they appear. + auto forOp = dyn_cast(op); + assert(forOp && "Expected scf::ForOp when hoisting pad ops"); + // Unused loop, just skip it. + if (!analysis.packingLoops.contains(forOp)) + continue; + + auto clonedForOp = + b.create(loc, bvm.lookupOrDefault(forOp.lowerBound()), + bvm.lookupOrDefault(forOp.upperBound()), + bvm.lookupOrDefault(forOp.step()), packedTensor); + // Map the induction var, region args and results to the `clonedForOp`. + bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar()); + bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs()); + bvm.map(forOp.getResults(), clonedForOp.getResults()); + assert(clonedForOp->getNumRegions() == 1); + clonedLoopIvs.push_back(clonedForOp.getInductionVar()); + + b.setInsertionPointToStart(&clonedForOp->getRegion(0).front()); + Value loopIndependentIterationCount = + buildLoopIterationCount(b, outer, clonedForOp); + // Assert the loop-independent iteration count can be computed. + if (!loopIndependentIterationCount) + llvm_unreachable("loop independence prerequisite not met"); + leadingPackedTensorIndexings.push_back(loopIndependentIterationCount); + packedTensor = clonedForOp.getRegionIterArgs().front(); + } + + // Stack step 2. create InsertSliceOp at the top of the stack. + // offsets = [clonedLoopIvs, 0 .. 0]. + SmallVector offsets(leadingPackedTensorIndexings.begin(), + leadingPackedTensorIndexings.end()); + offsets.append(paddedRank, b.getIndexAttr(0)); + // sizes = [1 .. 1, paddedShape]. + SmallVector sizes(nPackedLoops, b.getIndexAttr(1)); + for (int64_t sz : paddedTensorType.getShape()) { + // TODO: go grab dims when necessary, for now PadTensorOp returns a static + // tensor. + assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes"); + sizes.push_back(b.getIndexAttr(sz)); + } + // strides = [1 .. 1]. + SmallVector strides(nPackedLoops + paddedRank, + b.getIndexAttr(1)); + + Value inserted = + b.create(loc, bvm.lookup(padTensorOp.result()), + packedTensor, offsets, sizes, strides); + + // Stack step 3. iteratively pop the stack and propagate the yield. + Value valueToYield = inserted; + for (Value iv : llvm::reverse(clonedLoopIvs)) { + auto forOp = scf::getForInductionVarOwner(iv); + b.setInsertionPointToEnd(&forOp.getRegion().front()); + b.create(loc, valueToYield); + valueToYield = forOp.getResult(0); + } + + // Now the packed tensor is ready, replace the original padding op by a + // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1]. + b.setInsertionPoint(padTensorOp); + SmallVector loopIterationCounts = llvm::to_vector<4>( + llvm::map_range(analysis.packingLoops, [&](Operation *loop) { + return buildLoopIterationCount(b, outer, cast(loop)); + })); + // Assert all loop iteration counts can be computed. + if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; })) + llvm_unreachable("loop independence prerequisite not met"); + // offsets = [originalLoopIvs, 0 .. 0]. + offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end()); + offsets.append(paddedRank, b.getIndexAttr(0)); + // sizes = [1 .. 1, paddedShape] (definedabove). + // strides = [1 .. 1] (defined above) + packedTensor = + scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0); + padTensorOp.replaceAllUsesWith( + b.create(loc, padTensorOp.getResultType(), + packedTensor, offsets, sizes, strides) + ->getResult(0)); + + Operation *toErase = padTensorOp; + + // Make the newly cloned `padTensorOp` available to the caller. + padTensorOp = + cast(bvm.lookup(padTensorOp.result()).getDefiningOp()); + + toErase->erase(); + + return success(); +} diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp @@ -509,445 +509,3 @@ }); } } - -/// Return success if `v` is a value that is only transitively defined by ops of -/// type in `OpTypeList`. -template -static bool backwardsSliceOnlyHasOpsOfType(scf::ForOp outerLimit, Value v) { - // Compute a backward slice up to, but not including, `outerLimit`. - SetVector backwardSlice; - getBackwardSlice(v, &backwardSlice, [&](Operation *op) { - return outerLimit->isProperAncestor(op); - }); - // Traverse the backward slice and ensure we can perform the computation to - // hoist. - for (Operation *op : backwardSlice) { - if (isa(op)) - continue; - LLVM_DEBUG(DBGS() << "Abort: unadmissible op in slice " << *op << "\n"); - return false; - } - return true; -} - -bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) { - return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp(); -} - -/// Return the current iteration number in the loop (iv - lb).ceilDiv(step). -/// The returned Value is guaranteed not to depend on any loop comprised in -/// [`outer`, `forOp`]. -/// Return null if such a loop-independent quantity cannot be computed. -static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer, - scf::ForOp forOp) { - MLIRContext *ctx = forOp->getContext(); - AffineExpr iv, lb, step; - bindDims(ctx, iv, lb); - bindSymbols(ctx, step); - if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) || - !isDefinedOutsideOrConstant(outer, forOp.step())) - return Value(); - Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(), - stepVal = forOp.step(); - auto loc = forOp->getLoc(); - return b.createOrFold(loc, (iv - lb).ceilDiv(step), - ValueRange{ivVal, lbVal, stepVal}); -} - -/// Given a set of loops, assumed to be scf::ForOp, create a constraint set -/// containing the inequalities `iv - lb >= 0` and `-iv + ub - 1 >= 0` for each -/// loop. -static FlatAffineValueConstraints -initLoopIvsAndBounds(ArrayRef loops) { - FlatAffineValueConstraints constraints; - for (Operation *op : loops) - constraints.appendDimId(cast(op).getInductionVar()); - for (Operation *op : loops) - constraints.appendDimId(cast(op).lowerBound()); - for (Operation *op : loops) - constraints.appendDimId(cast(op).upperBound()); - unsigned numLoops = loops.size(); - for (unsigned ivIdx = 0, e = numLoops; ivIdx < e; ++ivIdx) { - // iv - lb >= 0 - SmallVector ineqLb(constraints.getNumCols(), 0); - ineqLb[ivIdx] = 1; - ineqLb[ivIdx + numLoops] = -1; - // -iv + ub >= 0 - SmallVector ineqUb(constraints.getNumCols(), 0); - ineqUb[ivIdx] = -1; - ineqUb[ivIdx + 2 * numLoops] = 1; - ineqUb[constraints.getNumCols() - 1] = -1; - constraints.addInequality(ineqLb); - constraints.addInequality(ineqUb); - } - return constraints; -} - -/// For each loop in `loops`, determine the ops involved in the construction of -/// its upper bound---up to the outerLimit loop--- and fold them as new -/// inequalities in the constraint set. -/// This is achieved by computing the backwardSlice of the loop's upper bound -/// and iteratively folding each op in reverse topological order to guarantee -/// use-def ordering. -/// As operations are folded in, their result is projected out of the -/// constraints set. -/// The following operations are supported: -/// - scf::ForOp are simply skipped. -/// - AffineApplyOp are composed to replace the result by an equality. -/// - AffineMinOp are composed by adding each entry as an upper bound. -/// If any other operation is met, return failure. -// TODO: extend on a per-need basis. -static LogicalResult -foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints, - scf::ForOp outerLimit, - ArrayRef loops) { - SetVector toProjectOut; - for (Operation *loop : loops) { - auto ub = cast(loop).upperBound(); - if (isDefinedOutsideOrConstant(outerLimit, ub)) - continue; - - // Compute a backward slice up to, but not including, `outerLimit`. - SetVector backwardSlice; - getBackwardSlice(ub, &backwardSlice, [&](Operation *op) { - return outerLimit->isProperAncestor(op); - }); - backwardSlice.insert(ub.getDefiningOp()); - - // Iterate over all ops in the slice and compose them in the constraints. - for (Operation *op : llvm::reverse(backwardSlice)) { - if (!isa(op)) - return failure(); - if (isa(op)) - continue; - // Ensure there is a - auto ensureIdFailed = [&](Value v) { - if (constraints.containsId(v)) { - unsigned pos; - constraints.findId(v, &pos); - return pos >= constraints.getNumDimIds(); - } - constraints.appendDimId(v); - return false; - }; - - // Ensure all ids exist and add results for later projection. - if (llvm::any_of(op->getResults(), ensureIdFailed) || - llvm::any_of(op->getOperands(), ensureIdFailed)) - return failure(); - - // All supported ops have 1 result. - // TODO: extend when needed. - toProjectOut.insert(op->getResult(0)); - - // Compose supported ops. - if (auto affineApplyOp = dyn_cast(op)) { - AffineValueMap avm(affineApplyOp.getAffineMap(), - affineApplyOp.getOperands(), - affineApplyOp.getResult()); - if (failed(constraints.composeMap(&avm))) - return failure(); - continue; - } - auto affineMinOp = cast(op); - unsigned pos; - bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos); - (void)foundMinOp; - assert(foundMinOp); - AffineMap alignedMap = constraints.computeAlignedMap( - affineMinOp.getAffineMap(), affineMinOp.getOperands()); - if (failed( - constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap))) - return failure(); - } - } - for (Value v : toProjectOut) - constraints.projectOut(v); - return success(); -} - -/// Ensure prerequisites that guarantee pad op hoisting can occur. -/// Return failure in the cases when we cannot perform hoisting; i.e. if either: -/// 1. There exists a use of `padTensorOp` that is not a linalg input operand. -/// 2. There isn't an enclosing `outermostEnclosingForOp` loop. -/// 3. There exists an op with a region that is dominated by -/// `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a -/// LinalgOp. -/// 4. There exists an op with side effects that is dominated by -/// `outermostEnclosingForOp` and that isn't a LoopLikeInterface. -/// 5. The lower bound, upper bound and step of all the loops involved in the -/// hoisting can be -/// -/// While ensuring prerequisites: -/// 1. Fill the `backwardSlice` to contain the topologically sorted ops -/// dominated by `outermostEnclosingForOp`. -/// 2. Fill the `packingLoops` to contain only the enclosing loops of -/// `backwardSlice` whose IV is actually used in computing padding. Loops that -/// remain in `backwardSlice` but that are not in `packingLoops` are -/// dimensions of reuse. -static LogicalResult -hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels, - SetVector &backwardSlice, - SetVector &packingLoops, - SmallVector &dynamicTensorSizes) { - // Bail on any use that isn't an input of a Linalg op. - // Hoisting of inplace updates happens after vectorization. - for (OpOperand &use : padTensorOp.result().getUses()) { - auto linalgUser = dyn_cast(use.getOwner()); - if (!linalgUser || !linalgUser.isInputTensor(&use)) - return failure(); - } - - // Get at most nLevels of enclosing loops. - SmallVector reverseEnclosingLoops; - Operation *outermostEnclosingForOp = nullptr, - *nextEnclosingForOp = - padTensorOp->getParentOfType(); - while (nLevels-- > 0 && nextEnclosingForOp) { - outermostEnclosingForOp = nextEnclosingForOp; - reverseEnclosingLoops.push_back(outermostEnclosingForOp); - nextEnclosingForOp = - nextEnclosingForOp->getParentOfType(); - } - if (!outermostEnclosingForOp) - return failure(); - - // Get the backwards slice from `padTensorOp` that is dominated by the - // outermost enclosing loop. - DominanceInfo domInfo(outermostEnclosingForOp); - getBackwardSlice(padTensorOp.getOperation(), &backwardSlice, - [&](Operation *op) { - return domInfo.dominates(outermostEnclosingForOp, op); - }); - - // Bail on any op with a region that is not a LoopLikeInterface or a LinalgOp. - if (llvm::any_of(backwardSlice, [](Operation *op) { - return op->getNumRegions() > 0 && !isa(op) && - !isa(op); - })) - return failure(); - - // Filter out the loops whose induction variable is not used to compute the - // padded result. As a first approximation, just look for IVs that have no use - // in the backwardSlice. - // These are the dimensions of reuse that we can exploit to reduce the amount - // of work / memory. - // TODO: would this optimization compose better as a canonicalization? - for (LoopLikeOpInterface loop : llvm::reverse(reverseEnclosingLoops)) { - auto forOp = dyn_cast(loop.getOperation()); - if (!forOp) - continue; - for (Operation *user : forOp.getInductionVar().getUsers()) { - if (backwardSlice.contains(user)) { - packingLoops.insert(forOp); - break; - } - } - } - - // Backward slice is a topologically sorted list of ops starting at - // `outermostEnclosingForOp`. - assert(outermostEnclosingForOp == backwardSlice.front()); - - scf::ForOp outer = cast(outermostEnclosingForOp); - - FlatAffineValueConstraints constraints = - initLoopIvsAndBounds(packingLoops.getArrayRef()); - if (failed(foldUpperBoundsIntoConstraintsSet(constraints, outer, - packingLoops.getArrayRef()))) - return failure(); - - unsigned numLoops = packingLoops.size(); - SmallVector lbs(numLoops), ubs(numLoops); - // Compute the bounds of the first positions, assuming the others are fixed. - constraints.getSliceBounds(/*pos=*/0, /*num=*/packingLoops.size(), - outer->getContext(), &lbs, &ubs); - - SmallVector allValues; - constraints.getAllValues(&allValues); - SmallVector allNonLoopValues(allValues.begin() + numLoops, - allValues.end()); - - // For each packingLoop, create the extent by (ub - lb).ceilDiv(step). - // IP just before the outermost loop considered that we hoist above. - ImplicitLocOpBuilder b(outer->getLoc(), outer); - assert(packingLoops.size() == lbs.size() && "expected matching lb sizes"); - assert(packingLoops.size() == ubs.size() && "expected matching ub sizes"); - for (auto it : llvm::zip(packingLoops, lbs, ubs)) { - scf::ForOp loop = cast(std::get<0>(it)); - AffineMap lbMap = std::get<1>(it); - AffineMap ubMap = std::get<2>(it); - SmallVector lbOperands(allNonLoopValues); - canonicalizeMapAndOperands(&lbMap, &lbOperands); - Value lbVal = b.createOrFold(lbMap, lbOperands); - - SmallVector ubOperands(allNonLoopValues); - canonicalizeMapAndOperands(&ubMap, &ubOperands); - Value ubVal = b.createOrFold(ubMap, ubOperands); - - AffineExpr lb, ub, step; - bindDims(b.getContext(), lb, ub); - bindSymbols(b.getContext(), step); - Value res = b.createOrFold( - (ub - lb).ceilDiv(step), - ValueRange{lbVal, ubVal, cast(loop).step()}); - - dynamicTensorSizes.push_back(res); - } - - return success(); -} - -LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp, - unsigned nLoops) { - SmallVector dynamicTensorSizes; - SetVector backwardSlice, packingLoops; - if (failed(hoistPaddingOnTensorsPrerequisites(padTensorOp, nLoops, - backwardSlice, packingLoops, - dynamicTensorSizes))) - return failure(); - - // Update actual number of loops, which may be smaller. - nLoops = packingLoops.size(); - - Location loc = padTensorOp->getLoc(); - RankedTensorType paddedTensorType = padTensorOp.getResultType(); - unsigned paddedRank = paddedTensorType.getRank(); - - // Backward slice is a topologically sorted list of ops starting at - // `outermostEnclosingForOp`. - Operation *outermostEnclosingForOp = backwardSlice.front(); - // IP just before the outermost loop considered that we hoist above. - OpBuilder b(outermostEnclosingForOp); - - // Create the packed tensor into which we amortize - // padding. - SmallVector packedShape(nLoops, ShapedType::kDynamicSize); - // TODO: go grab dims when necessary, for now PadTensorOp returns a static - // tensor. - llvm::append_range(packedShape, paddedTensorType.getShape()); - auto packedTensorType = - RankedTensorType::get(packedShape, paddedTensorType.getElementType()); - Value packedTensor = b.create( - loc, dynamicTensorSizes, packedTensorType.getShape(), - packedTensorType.getElementType()); - - // Clone the operations involved in the backward slice, iteratively stepping - // into the loops that we encounter. - // The implementation proceeds in a stack-like fashion: - // 1. Iteratively clone and step into the loops, pushing the `packedTensor` - // deeper in the stack. - // 2. Create a InsertSliceOp at the top of the stack. - // 3. Iteratively pop and yield the result of the InsertSliceOp across - // the cloned loops. - SmallVector clonedLoopIvs, leadingPackedTensorIndexings; - clonedLoopIvs.reserve(nLoops); - leadingPackedTensorIndexings.reserve(nLoops); - BlockAndValueMapping bvm; - // Insert `padTensorOp` into the backwardSlice so we clone it too. - backwardSlice.insert(padTensorOp); - // Stack step 1. iteratively clone loops and push `packedTensor`. - for (Operation *op : backwardSlice) { - // Specifically sit out in the extract_slice(packedTensor) case: this is the - // piece we seek to replace. - if (auto sliceOp = dyn_cast(op)) - if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor) - continue; - auto effects = dyn_cast(op); - bool hasNoEffects = !effects || effects.hasNoEffect(); - if (hasNoEffects && - (op->getNumRegions() == 0 || isa(op))) { - b.clone(*op, bvm); - continue; - } - // TODO: support more cases as they appear. - auto forOp = dyn_cast(op); - assert(forOp && "Expected scf::ForOp when hoisting pad ops"); - // Unused loop, just skip it. - if (!packingLoops.contains(forOp)) - continue; - - auto clonedForOp = - b.create(loc, bvm.lookupOrDefault(forOp.lowerBound()), - bvm.lookupOrDefault(forOp.upperBound()), - bvm.lookupOrDefault(forOp.step()), packedTensor); - // Map the induction var, region args and results to the `clonedForOp`. - bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar()); - bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs()); - bvm.map(forOp.getResults(), clonedForOp.getResults()); - assert(clonedForOp->getNumRegions() == 1); - clonedLoopIvs.push_back(clonedForOp.getInductionVar()); - - b.setInsertionPointToStart(&clonedForOp->getRegion(0).front()); - Value loopIndependentIterationCount = buildLoopIterationCount( - b, cast(outermostEnclosingForOp), clonedForOp); - // Assert the loop-independent iteration count can be computed. - if (!loopIndependentIterationCount) - llvm_unreachable("loop independence prerequisite not met"); - leadingPackedTensorIndexings.push_back(loopIndependentIterationCount); - packedTensor = clonedForOp.getRegionIterArgs().front(); - } - - // Stack step 2. create InsertSliceOp at the top of the stack. - // offsets = [clonedLoopIvs, 0 .. 0]. - SmallVector offsets(leadingPackedTensorIndexings.begin(), - leadingPackedTensorIndexings.end()); - offsets.append(paddedRank, b.getIndexAttr(0)); - // sizes = [1 .. 1, paddedShape]. - SmallVector sizes(nLoops, b.getIndexAttr(1)); - for (int64_t sz : paddedTensorType.getShape()) { - // TODO: go grab dims when necessary, for now PadTensorOp returns a static - // tensor. - assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes"); - sizes.push_back(b.getIndexAttr(sz)); - } - // strides = [1 .. 1]. - SmallVector strides(nLoops + paddedRank, b.getIndexAttr(1)); - - Value inserted = - b.create(loc, bvm.lookup(padTensorOp.result()), - packedTensor, offsets, sizes, strides); - - // Stack step 3. iteratively pop the stack and propagate the yield. - Value valueToYield = inserted; - for (Value iv : llvm::reverse(clonedLoopIvs)) { - auto forOp = scf::getForInductionVarOwner(iv); - b.setInsertionPointToEnd(&forOp.getRegion().front()); - b.create(loc, valueToYield); - valueToYield = forOp.getResult(0); - } - - // Now the packed tensor is ready, replace the original padding op by a - // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1]. - b.setInsertionPoint(padTensorOp); - SmallVector loopIterationCounts = - llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) { - return buildLoopIterationCount( - b, cast(outermostEnclosingForOp), - cast(loop)); - })); - // Assert all loop iteration counts can be computed. - if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; })) - llvm_unreachable("loop independence prerequisite not met"); - // offsets = [originalLoopIvs, 0 .. 0]. - offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end()); - offsets.append(paddedRank, b.getIndexAttr(0)); - // sizes = [1 .. 1, paddedShape] (definedabove). - // strides = [1 .. 1] (defined above) - packedTensor = - scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0); - padTensorOp.replaceAllUsesWith( - b.create(loc, padTensorOp.getResultType(), - packedTensor, offsets, sizes, strides) - ->getResult(0)); - - Operation *toErase = padTensorOp; - - // Make the newly cloned `padTensorOp` available to the caller. - padTensorOp = - cast(bvm.lookup(padTensorOp.result()).getDefiningOp()); - - toErase->erase(); - - return success(); -} diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp --- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h" #include "mlir/Dialect/Linalg/Transforms/Hoisting.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6184,6 +6184,7 @@ "include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h", "include/mlir/Dialect/Linalg/Passes.h", "include/mlir/Dialect/Linalg/Transforms/CodegenStrategy.h", + "include/mlir/Dialect/Linalg/Transforms/HoistPadding.h", "include/mlir/Dialect/Linalg/Transforms/Hoisting.h", "include/mlir/Dialect/Linalg/Transforms/Transforms.h", "include/mlir/Dialect/Linalg/Utils/Utils.h",