diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -11,9 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
-#include "mlir/Analysis/AffineStructures.h"
 #include "mlir/Analysis/SliceAnalysis.h"
-#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -26,7 +24,6 @@
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dominance.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
@@ -58,9 +55,8 @@
   bool isValid() { return valid; }
 
-  /// Footprint of the packedTensor, computed from the packingLoops and
-  /// `backwardSlice`.
-  FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b);
+  /// Footprint of the packedTensor, computed from the packingLoops.
+  SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);
 
   /// The outermost loop, determined by `nLevels` above which `padTensorOp` will
   /// be hoisted.
@@ -229,21 +225,20 @@
   valid = true;
 }
 
-/// Add all index operands of `operation` to `indexEdges`. An index operand is
-/// an operand of type index.
-static void addIndexOperandsToIndexEdges(Operation *operation,
-                                         SetVector<Value> &indexEdges) {
-  for (Value operand : operation->getOperands())
-    if (operand.getType().isIndex())
-      indexEdges.insert(operand);
-}
-
 SmallVector<scf::ForOp>
 HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
                                    tensor::ExtractSliceOp sliceOp) {
   // Set of all values used for index computation.
   SetVector<Value> indexEdges;
 
+  // Add all index operands of `operation` to `indexEdges`. An index operand is
+  // an operand of type index.
+  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
+    for (Value operand : operation->getOperands())
+      if (operand.getType().isIndex())
+        indexEdges.insert(operand);
+  };
+
   // Starting from `padTensorOp` and `sliceOp` walk the use-def edges of index
   // type in `backwardSlice`. Add the index operands of an operation to
   // `indexEdges` if one of its results is an index edge found so far and store
@@ -268,7 +263,7 @@
     // Add the index operands of `padTensorOp` and `sliceOp` to start the
     // exploration of the index computation.
     if (op == padTensorOp || op == sliceOp) {
-      addIndexOperandsToIndexEdges(op, indexEdges);
+      addIndexOperandsToIndexEdges(op);
       continue;
     }
     // Add the index operands of the loop if its induction variable is
@@ -276,7 +271,7 @@
     // `indexingLoops`
    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
       if (indexEdges.contains(forOp.getInductionVar())) {
-        addIndexOperandsToIndexEdges(op, indexEdges);
+        addIndexOperandsToIndexEdges(op);
         indexingLoops.push_back(forOp);
         continue;
       }
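Note on the hunks above: the file-scope helper `addIndexOperandsToIndexEdges` becomes a lambda that captures `indexEdges`, so the accumulator no longer has to be threaded through every call site. A minimal standalone sketch of the resulting traversal pattern, with illustrative names that are not part of this patch:

```
#include "mlir/IR/Operation.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"

using namespace mlir;

// Walk a backward slice in reverse order and accumulate all index-typed
// operands; the lambda captures the accumulator by reference.
static void walkIndexEdges(ArrayRef<Operation *> slice) {
  llvm::SetVector<Value> indexEdges;
  auto addIndexOperands = [&](Operation *op) {
    for (Value operand : op->getOperands())
      if (operand.getType().isIndex())
        indexEdges.insert(operand);
  };
  for (Operation *op : llvm::reverse(slice))
    addIndexOperands(op); // the capture replaces the explicit set parameter
}
```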
@@ -285,199 +280,46 @@
     // used for index computation.
     if (llvm::any_of(op->getResults(),
                      [&](Value result) { return indexEdges.contains(result); }))
-      addIndexOperandsToIndexEdges(op, indexEdges);
+      addIndexOperandsToIndexEdges(op);
   }
   return indexingLoops;
 }
 
-static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
-  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<arith::ConstantOp>();
-}
-
-/// For each loop in `loops`, determine the ops involved in the construction of
-/// its upper bound---up to the outerLimit loop--- and fold them as new
-/// inequalities in the constraint set.
-/// This is achieved by computing the backwardSlice of the loop's upper bound
-/// and iteratively folding each op in reverse topological order to guarantee
-/// use-def ordering.
-/// As operations are folded in, their result is projected out of the
-/// constraints set.
-/// The following operations are supported:
-/// - scf::ForOp are simply skipped.
-/// - AffineApplyOp are composed to replace the result by an equality.
-/// - AffineMinOp are composed by adding each entry as an upper bound.
-/// Additionally, the following terminal operations are handled:
-/// - DimOp and ConstantOp are skipped.
-/// If any other operation is met, return failure.
-// TODO: extend on a per-need basis.
-static LogicalResult
-foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints,
-                                  scf::ForOp outerLimit,
-                                  ArrayRef<scf::ForOp> loops) {
-  SetVector<Value> toProjectOut;
-  for (scf::ForOp loop : loops) {
-    auto ub = loop.upperBound();
-
-    // Set of all values used for index computation.
-    SetVector<Value> indexEdges;
-    indexEdges.insert(ub);
-
-    // Compute the backward slice `indexSlice` containing the index computation
-    // performed to obtain the upper bound `ub`. Starting from `ub` add the
-    // index operands of an operation to `indexEdges` if one of its results is
-    // an index edge. Otherwise, stop the slice computation. For a loop, check
-    // if its induction variable is an index edge.
-    //
-    // Example:
-    // ```
-    // %c0 = arith.constant 0
-    // scf.for %i = %c0 to ...
-    //   scf.for %j = %c0 to ...
-    //     %ub = affine.min #map(%i)
-    //     scf.for %k = %c0 to %ub
-    // ```
-    // After computing the backward slice we obtain:
-    // indexEdges = [%ub, %i, %c0]
-    // indexSlice = [arith.constant 0, scf.for %i, affine.min #map(%i)]
-    SetVector<Operation *> indexSlice;
-    getBackwardSlice(ub, &indexSlice, [&](Operation *op) {
-      // Continue only along the index operands of the ForOp.
-      if (auto forOp = dyn_cast<scf::ForOp>(op)) {
-        // Consider only loops part of the enclosing loops.
-        if (!outerLimit->isAncestor(op))
-          return false;
-        if (!indexEdges.contains(forOp.getInductionVar()))
-          return false;
-        addIndexOperandsToIndexEdges(op, indexEdges);
-        return true;
-      }
-      // All supported index operations have one result.
-      assert(op->getNumResults() == 1 &&
-             "expect operations to have one result");
-      if (!indexEdges.contains(op->getResult(0)))
-        return false;
-      addIndexOperandsToIndexEdges(op, indexEdges);
-      return true;
-    });
-    indexSlice.insert(ub.getDefiningOp());
-
-    // Iterate over all ops in the slice and compose them in the constraints.
-    for (Operation *op : llvm::reverse(indexSlice)) {
-      // All ForOps have previously been added to the constraints and ConstantOp
-      // and DimOp are terminals of the index computation.
-      if (isa<scf::ForOp, arith::ConstantOp, tensor::DimOp>(op))
-        continue;
-      // Check all index computation operations are supported.
-      if (!isa<AffineApplyOp, AffineMinOp>(op))
-        return failure();
-      // Ensure there is an id.
-      auto ensureIdFailed = [&](Value v) {
-        if (constraints.containsId(v)) {
-          unsigned pos;
-          constraints.findId(v, &pos);
-          return pos >= constraints.getNumDimIds();
-        }
-        constraints.appendDimId(v);
-        return false;
-      };
-
-      // Ensure all ids exist and add results for later projection.
-      if (llvm::any_of(op->getResults(), ensureIdFailed) ||
-          llvm::any_of(op->getOperands(), ensureIdFailed))
-        return failure();
-
-      // All supported ops have 1 result.
-      // TODO: extend when needed.
-      assert(op->getNumResults() == 1 &&
-             "expect operations to have one result");
-      toProjectOut.insert(op->getResult(0));
-
-      // Compose supported ops.
-      if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
-        AffineValueMap avm(affineApplyOp.getAffineMap(),
-                           affineApplyOp.getOperands(),
-                           affineApplyOp.getResult());
-        if (failed(constraints.composeMap(&avm)))
-          return failure();
-        continue;
-      }
-      auto affineMinOp = cast<AffineMinOp>(op);
-      unsigned pos;
-      bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos);
-      (void)foundMinOp;
-      assert(foundMinOp);
-      AffineMap alignedMap = constraints.computeAlignedMap(
-          affineMinOp.getAffineMap(), affineMinOp.getOperands());
-      if (failed(
-              constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap)))
-        return failure();
-    }
-  }
-  for (Value v : toProjectOut)
-    constraints.projectOut(v);
-  return success();
-}
-
-// Footprint of the packedTensor, computed from the packingLoops and
-// `backwardSlice`.
-FailureOr<SmallVector<Value>>
+SmallVector<Value>
 HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
-  // Create the base affine constaints for the packedLoops.
-  auto constraints = FlatAffineValueConstraints::getHyperrectangular(
-      llvm::to_vector<8>(llvm::map_range(
-          packingLoops, [](scf::ForOp op) { return op.getInductionVar(); })),
-      llvm::to_vector<8>(llvm::map_range(
-          packingLoops, [](scf::ForOp op) { return op.lowerBound(); })),
-      llvm::to_vector<8>(llvm::map_range(
-          packingLoops, [](scf::ForOp op) { return op.upperBound(); })));
-
-  // Iteratively try to fold the upper bounds into the constraints set.
-  if (failed(foldUpperBoundsIntoConstraintsSet(
-          constraints, outermostEnclosingForOp, packingLoops)))
-    return failure();
-
-  int nPackedLoops = packingLoops.size();
-  SmallVector<AffineMap> lbs(nPackedLoops), ubs(nPackedLoops);
-  // Compute the bounds of the first positions, assuming the others are fixed.
-  constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops,
-                             outermostEnclosingForOp->getContext(), &lbs, &ubs);
-
-  SmallVector<Value> allValues;
-  constraints.getAllValues(&allValues);
-  SmallVector<Value> allNonLoopValues(allValues.begin() + nPackedLoops,
-                                      allValues.end());
-
-  // For each packingLoop, create the extent by (ub - lb).ceilDiv(step).
-  // IP just before the outermost loop considered that we hoist above.
-  assert(nPackedLoops == static_cast<int64_t>(lbs.size()) &&
-         "expected matching lb sizes");
-  assert(nPackedLoops == static_cast<int64_t>(ubs.size()) &&
-         "expected matching ub sizes");
   SmallVector<Value> dynamicTensorSizes;
-  for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
-    scf::ForOp loop = std::get<0>(it);
-    AffineMap lbMap = std::get<1>(it);
-    AffineMap ubMap = std::get<2>(it);
-    SmallVector<Value> lbOperands(allNonLoopValues);
-    canonicalizeMapAndOperands(&lbMap, &lbOperands);
-    Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);
-
-    SmallVector<Value> ubOperands(allNonLoopValues);
-    canonicalizeMapAndOperands(&ubMap, &ubOperands);
-    Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);
+
+  // Upper bound the packing loop lengths to size the packed tensor. Taking
+  // upper bounds can make the sizes of the packed tensor independent of the
+  // enclosing loops. This independence is a prerequisite for reusing the same
+  // buffer for all enclosing loop iterations and hoisting its allocation out of
+  // the enclosing loops.
+  for (auto forOp : packingLoops) {
+    // Compute an upper bound `ubVal` for the upper bound of `forOp`.
+    AffineMap boundMap;
+    SmallVector<Value> boundOperands;
+    getUpperBoundForIndex(forOp.upperBound(), boundMap, boundOperands);
+    Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
+    // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
+    // store the result to `dynamicTensorSizes`.
+    // TODO: instead of using the lower bound of `forOp` directly, implement a
+    // lower bound computation similar to the upper bound computation.
     AffineExpr lb, ub, step;
     bindDims(b.getContext(), lb, ub);
     bindSymbols(b.getContext(), step);
     Value res = b.createOrFold<AffineApplyOp>(
         (ub - lb).ceilDiv(step),
-        ValueRange{lbVal, ubVal, cast<scf::ForOp>(loop).step()});
-
+        ValueRange{forOp.lowerBound(), ubVal, cast<scf::ForOp>(forOp).step()});
     dynamicTensorSizes.push_back(res);
   }
+
   return dynamicTensorSizes;
 }
 
+static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
+  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<arith::ConstantOp>();
+}
+
 /// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
 /// The returned Value is guaranteed not to depend on any loop comprised in
 /// [`outer`, `forOp`].
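Note on `getPackedTensorSizes` above: instead of folding loop bounds into a `FlatAffineValueConstraints` set, each packing loop extent is now sized as `(ub - lb).ceilDiv(step)`, where `getUpperBoundForIndex` computes an upper bound for the loop's upper bound that, when possible, no longer references the enclosing induction variables. A worked sketch under assumed IR; the map, the constants, and the helper below are illustrative, not part of the patch:

```
#include <cstdint>

// Hypothetical tiled nest:
//   scf.for %i = %c0 to %c11 step %c4 {
//     %ub = affine.min affine_map<(d0) -> (4, 11 - d0)>(%i)
//     scf.for %k = %c0 to %ub step %c1 { ... }
//   }
// An upper bound for %ub that is independent of %i is 4, so the %k packing
// loop is sized identically for every iteration of %i, and one buffer of
// that extent can be allocated above the %i loop.

// The extent computation (ub - lb).ceilDiv(step), folded for the constant
// bounds of the example; valid for a positive step.
static int64_t extent(int64_t lb, int64_t ub, int64_t step) {
  return (ub - lb + step - 1) / step; // extent(0, 4, 1) == 4
}
```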
@@ -512,10 +354,7 @@
   scf::ForOp outer = analysis.outermostEnclosingForOp;
   ImplicitLocOpBuilder b(outer->getLoc(), outer);
 
-  auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b);
-  if (failed(maybeDynamicTensorSizes))
-    return failure();
-  SmallVector<Value> dynamicTensorSizes = *maybeDynamicTensorSizes;
+  SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);
 
   // Update actual number of loops, which may be smaller.
   int nPackedLoops = analysis.packingLoops.size();
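Note on the final hunk: the `FailureOr` unwrapping disappears because the rewritten analysis can no longer fail. Presumably `getUpperBoundForIndex` always materializes some bound rather than returning failure on unsupported index computations; that is an assumption about its contract, since this diff only shows that no failure path remains. A condensed sketch of the call-site pattern, with stand-in functions that are not the patch's API:

```
#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;
using llvm::SmallVector;

// Illustrative stand-ins for the analysis API before and after this change.
static FailureOr<SmallVector<int>> getSizesOrFailure() {
  return SmallVector<int>{4, 8};
}
static SmallVector<int> getSizes() { return {4, 8}; }

static LogicalResult useSizesBefore() {
  auto maybeSizes = getSizesOrFailure();
  if (failed(maybeSizes)) // every caller had to propagate the failure
    return failure();
  SmallVector<int> sizes = *maybeSizes;
  (void)sizes;
  return success();
}

static void useSizesAfter() {
  SmallVector<int> sizes = getSizes(); // infallible, no unwrapping needed
  (void)sizes;
}
```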