diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -75,6 +75,11 @@
     OpBuilder<(ins "AffineMap":$map, "ValueRange":$mapOperands),
     [{
       build($_builder, $_state, $_builder.getIndexType(), map, mapOperands);
+    }]>,
+    OpBuilder<(ins "ArrayRef<AffineExpr> ":$exprList, "ValueRange":$mapOperands),
+    [{
+      build($_builder, $_state, $_builder.getIndexType(),
+            AffineMap::inferFromExprList(exprList).front(), mapOperands);
     }]>
   ];
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -13,7 +13,9 @@
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -503,6 +505,134 @@
   }
 }
 
+/// Return true if `v` is a value that is only transitively defined by ops of
+/// type in `OpTypeList`.
+template <typename... OpTypeList>
+static bool backwardsSliceOnlyHasOpsOfType(scf::ForOp outerLimit, Value v) {
+  // Compute a backward slice up to, but not including, `outerLimit`.
+  llvm::SetVector<Operation *> backwardSlice;
+  getBackwardSlice(v, &backwardSlice, [&](Operation *op) {
+    return outerLimit->isProperAncestor(op);
+  });
+  // Traverse the backward slice and ensure we can perform the computation to
+  // hoist.
+  for (Operation *op : backwardSlice) {
+    if (isa<OpTypeList...>(op))
+      continue;
+    LLVM_DEBUG(DBGS() << "Abort: unadmissible op in slice " << *op << "\n");
+    return false;
+  }
+  return true;
+}
+
+bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
+  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
+}
+
+/// Compute the tightest lower bound with quantities that are all defined
+/// outside of `outer`.
+/// Return null if such a bound cannot be computed.
+Value computeLoopIndependentLowerBound(OpBuilder &b, scf::ForOp outer,
+                                       Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+  return Value();
+}
+
+/// Compute the tightest upper bound with quantities that are all defined
+/// outside of `outer`.
+/// Expects all ops in the backward slice of `v` up to `outer` to be either
+/// scf.for, affine.min or affine.apply.
+static Value computeLoopIndependentUpperBound(OpBuilder &b, scf::ForOp outer,
+                                              Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+
+  LLVM_DEBUG(DBGS() << "Begin loopIndependentUpperBound for: " << v << "\n");
+
+  bool ok =
+      backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp, AffineApplyOp>(
+          outer, v);
+  assert(ok && "expected to only be defined by scf::ForOp and AffineMinOp");
+
+  // Compute a backward slice up to, but not including, `outer`.
+  llvm::SetVector<Operation *> backwardSlice;
+  getBackwardSlice(v, &backwardSlice,
+                   [&](Operation *op) { return outer->isProperAncestor(op); });
+  backwardSlice.insert(v.getDefiningOp());
+
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(outer);
+  Value res = v;
+  BlockAndValueMapping bvm;
+  for (Operation *op : backwardSlice) {
+    if (isa<scf::ForOp>(op))
+      continue;
+    if (isa<AffineApplyOp>(op)) {
+      b.clone(*op, bvm);
+      continue;
+    }
+    auto sliceMinOp = cast<AffineMinOp>(op);
+    // Perform the substitution of the operands of AffineMinOp.
+    auto mapAndOperands = substituteMin(
+        sliceMinOp, [&](Operation *op) { return outer->isAncestor(op); });
+    SmallVector<Value> resultOperands = mapAndOperands.dims;
+    llvm::append_range(resultOperands, mapAndOperands.symbols);
+    AffineMap map = mapAndOperands.map;
+    canonicalizeMapAndOperands(&map, &resultOperands);
+    map = simplifyAffineMap(map);
+    res = b.create<AffineApplyOp>(
+        outer->getLoc(), map,
+        llvm::to_vector<4>(llvm::map_range(resultOperands, [&](Value operand) {
+          return bvm.lookupOrDefault(operand);
+        })));
+    bvm.map(sliceMinOp, res);
+  }
+  LLVM_DEBUG(DBGS() << "End loopIndependentUpperBound with: " << res << "\n");
+  return res;
+}
+
+/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
+/// The returned Value is guaranteed not to depend on any loop comprised in
+/// [`outer`, `forOp`].
+/// Return null if such a loop-independent quantity cannot be computed.
+static Value buildLoopTripCount(OpBuilder &b, scf::ForOp outer,
+                                scf::ForOp forOp) {
+  MLIRContext *ctx = forOp->getContext();
+  AffineExpr lb, ub, step;
+  bindDims(ctx, lb, ub);
+  bindSymbols(ctx, step);
+  Value lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        ubVal = computeLoopIndependentUpperBound(b, outer, forOp.upperBound()),
+        stepVal = forOp.step();
+  if (!lbVal || !ubVal || !stepVal)
+    return Value();
+  auto loc = forOp->getLoc();
+  Value res = b.create<AffineApplyOp>(loc, (ub - lb).ceilDiv(step),
+                                      ValueRange{lbVal, ubVal, stepVal});
+  return res;
+}
+
+/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
+/// The returned Value is guaranteed not to depend on any loop comprised in
+/// [`outer`, `forOp`].
+/// Return null if such a loop-independent quantity cannot be computed.
+static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
+                                     scf::ForOp forOp) {
+  MLIRContext *ctx = forOp->getContext();
+  AffineExpr iv, lb, step;
+  bindDims(ctx, iv, lb);
+  bindSymbols(ctx, step);
+  Value ivVal = forOp.getInductionVar(),
+        lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        stepVal = forOp.step();
+  if (!ivVal || !lbVal || !stepVal)
+    return Value();
+  auto loc = forOp->getLoc();
+  return b.create<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
+                                 ValueRange{ivVal, lbVal, stepVal});
+}
+
 /// Ensure prerequisites that guarantee pad op hoisting can occur.
 /// Return failure in the cases when we cannot perform hoisting; i.e. if either:
 /// 1. There exists a use of `padTensorOp` that is not a linalg input operand.
@@ -510,8 +640,10 @@
 /// 3. There exists an op with a region that is dominated by
 ///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a
 ///    LinalgOp.
-/// 3. There exists an op with side effects that is dominated by
-///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
+/// 4. There exists an op with side effects that is dominated by
+///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
+/// 5. The lower bound, upper bound and step of all the loops involved in the
+///    hoisting can be computed outside of the outermost enclosing loop.
 ///
 /// While ensuring prerequisites:
 /// 1. Fill the `backwardSlice` to contain the topologically sorted ops
@@ -523,7 +655,8 @@
 static LogicalResult
 hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
                                    llvm::SetVector<Operation *> &backwardSlice,
-                                   llvm::SetVector<Operation *> &packingLoops) {
+                                   llvm::SetVector<Operation *> &packingLoops,
+                                   SmallVector<Value> &dynamicTensorSizes) {
   // Bail on any use that isn't an input of a Linalg op.
   // Hoisting of inplace updates happens after vectorization.
   for (OpOperand &use : padTensorOp.result().getUses()) {
@@ -583,36 +716,39 @@
   // `outermostEnclosingForOp`.
   assert(outermostEnclosingForOp == backwardSlice.front());
 
-  return success();
-}
-
-/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
-static Value buildLoopTripCount(OpBuilder &b, scf::ForOp forOp) {
-  MLIRContext *ctx = forOp->getContext();
-  AffineExpr lb, ub, step;
-  bindDims(ctx, lb, ub);
-  bindSymbols(ctx, step);
-  return b.create<AffineApplyOp>(
-      forOp->getLoc(), AffineMap::get(2, 1, {(ub - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
-}
+  scf::ForOp outer = cast<scf::ForOp>(outermostEnclosingForOp);
+  if (llvm::any_of(packingLoops, [&](Operation *op) {
+        scf::ForOp forOp = cast<scf::ForOp>(op);
+        Value lb = forOp.lowerBound(), ub = forOp.upperBound(),
+              step = forOp.step();
+        return !isDefinedOutsideOrConstant(outer, lb) ||
+               !(isDefinedOutsideOrConstant(outer, ub) ||
+                 backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp,
                                                 AffineApplyOp>(outer, ub)) ||
+               !isDefinedOutsideOrConstant(outer, step);
+      }))
+    return failure();
 
-/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
-static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp forOp) {
-  MLIRContext *ctx = forOp->getContext();
-  AffineExpr iv, lb, step;
-  bindDims(ctx, iv, lb);
-  bindSymbols(ctx, step);
-  return b.create<AffineApplyOp>(
-      forOp->getLoc(), AffineMap::get(2, 1, {(iv - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.getInductionVar(), forOp.lowerBound(), forOp.step()});
+  // IP just before the outermost loop considered that we hoist above.
+  OpBuilder b(outermostEnclosingForOp);
+  dynamicTensorSizes =
+      llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
+        return buildLoopTripCount(b, cast<scf::ForOp>(outermostEnclosingForOp),
+                                  cast<scf::ForOp>(op));
+      }));
+  // Assert all loop trip counts can be computed.
+  if (!llvm::all_of(dynamicTensorSizes, [](Value v) { return v; }))
+    llvm_unreachable("loop independence prerequisite not met");
+  return success();
 }
 
 LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
                                                   unsigned nLoops) {
+  SmallVector<Value> dynamicTensorSizes;
   llvm::SetVector<Operation *> backwardSlice, packingLoops;
   if (failed(hoistPaddingOnTensorsPrerequisites(padTensorOp, nLoops,
-                                                backwardSlice, packingLoops)))
+                                                backwardSlice, packingLoops,
+                                                dynamicTensorSizes)))
     return failure();
 
   // Update actual number of loops, which may be smaller.
@@ -636,12 +772,8 @@
   llvm::append_range(packedShape, paddedTensorType.getShape());
   auto packedTensorType =
       RankedTensorType::get(packedShape, paddedTensorType.getElementType());
-  auto dynamicSizes =
-      llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
-        return buildLoopTripCount(b, cast<scf::ForOp>(op));
-      }));
   Value packedTensor = b.create<linalg::InitTensorOp>(
-      loc, dynamicSizes, packedTensorType.getShape(),
+      loc, dynamicTensorSizes, packedTensorType.getShape(),
       packedTensorType.getElementType());
 
   // Clone the operations involved in the backward slice, iteratively stepping
@@ -656,9 +788,9 @@
   clonedLoopIvs.reserve(nLoops);
   leadingPackedTensorIndexings.reserve(nLoops);
   BlockAndValueMapping bvm;
-  // Stack step 1. iteratively clone loops and push `packedTensor`.
   // Insert `padTensorOp` into the backwardSlice so we clone it too.
   backwardSlice.insert(padTensorOp);
+  // Stack step 1. iteratively clone loops and push `packedTensor`.
   for (Operation *op : backwardSlice) {
     if (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op)) {
       b.clone(*op, bvm);
@@ -670,15 +802,23 @@
       // Unused loop, just skip it.
       if (!packingLoops.contains(forOp))
         continue;
+
       auto clonedForOp =
-          b.create<scf::ForOp>(loc, forOp.lowerBound(), forOp.upperBound(),
-                               forOp.step(), packedTensor);
+          b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
+                               bvm.lookupOrDefault(forOp.upperBound()),
+                               bvm.lookupOrDefault(forOp.step()), packedTensor);
+
+      bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
       assert(clonedForOp->getNumRegions() == 1);
       clonedLoopIvs.push_back(clonedForOp.getInductionVar());
+
       b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
-      leadingPackedTensorIndexings.push_back(
-          buildLoopIterationCount(b, clonedForOp));
-      bvm.map(forOp.getInductionVar(), clonedLoopIvs.back());
+      Value loopIndependentIterationCount = buildLoopIterationCount(
+          b, cast<scf::ForOp>(outermostEnclosingForOp), clonedForOp);
+      // Assert the loop-independent iteration count can be computed.
+      if (!loopIndependentIterationCount)
+        llvm_unreachable("loop independence prerequisite not met");
+      leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
       packedTensor = clonedForOp.getRegionIterArgs().front();
     }
 
@@ -716,8 +856,13 @@
   b.setInsertionPoint(padTensorOp);
   SmallVector<Value> loopIterationCounts =
       llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
-        return buildLoopIterationCount(b, cast<scf::ForOp>(loop));
+        return buildLoopIterationCount(
+            b, cast<scf::ForOp>(outermostEnclosingForOp),
+            cast<scf::ForOp>(loop));
       }));
+  // Assert all loop iteration counts can be computed.
+  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
+    llvm_unreachable("loop independence prerequisite not met");
   // offsets = [originalLoopIvs, 0 .. 0].
   offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
   offsets.append(paddedRank, b.getIndexAttr(0));
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -1,16 +1,15 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
 
+// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
+// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
 #map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
 #map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
 #map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
 #map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
 #map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
 
-// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
-// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
-// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
-// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
-
 // CHECK-LABEL: func @matmul_tensors
 //  CHECK-SAME:   %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
 //  CHECK-SAME:   %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
@@ -129,3 +128,92 @@
   }
   return %3 : tensor<?x?xf32>
 }
+
+// -----
+
+// CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+// CHECK-DAG: #[[$MIN_MOD4:[0-9a-z]+]] = affine_map<(d0) -> (4, d0 - ((d0 - 1) floordiv 4) * 4)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
+#map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+#map1 = affine_map<(d0, d1) -> (4, d0 - d1)>
+#map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
+
+// CHECK-LABEL: func @dot
+func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
+    -> tensor<f32>
+{
+  %c8 = constant 8 : index
+  %c4 = constant 4 : index
+  %cst = constant 0.000000e+00 : f32
+  %c2 = constant 2 : index
+  %c0 = constant 0 : index
+  %1 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %2 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %3 = memref.dim %arg1, %c0 : tensor<?xf32>
+
+  // CHECK: scf.for %[[I:[0-9a-z]+]] =
+  //
+  // CHECK:   %[[MR8:.*]] = affine.min #[[$MIN_REST8]](%[[I]])
+  // CHECK:   %[[D0:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  // CHECK:   %[[MM4:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  // CHECK:   %[[D1:.*]] = affine.apply #[[$DIV2]](%[[MM4]])
+  // Init tensor and pack.
+  // CHECK:   %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], %[[D1]], 2] : tensor<?x?x2xf32>
+  // CHECK:   %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
+  // CHECK:     scf.for %[[III:[0-9a-z]+]] =
+  // CHECK:       subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  //
+  // CHECK:   %[[D0_2:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  // CHECK:   %[[MM4_2:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  // CHECK:   %[[D1_2:.*]] = affine.apply #[[$DIV2]](%[[MM4_2]])
+  // Init tensor and pack.
+  // CHECK:   %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], %[[D1_2]], 2] : tensor<?x?x2xf32>
+  // CHECK:   %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
+  // CHECK:     scf.for %[[III_2:[0-9a-z]+]] =
+  // CHECK:       subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  // Compute.
+  // CHECK:   scf.for %[[II_3:[0-9a-z]+]] =
+  // CHECK:     scf.for %[[III_3:[0-9a-z]+]] = {{.*}} iter_args(%[[C:.*]] = %{{.*}}) -> (tensor<f32>) {
+  // CHECK:       %[[IDX0:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  // CHECK:       %[[IDX1:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  // CHECK:       %[[A:.*]] = subtensor %[[PACKED_A]][%[[IDX0]], %[[IDX1]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  // CHECK:       %[[IDX0_2:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  // CHECK:       %[[IDX1_2:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  // CHECK:       %[[B:.*]] = subtensor %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  // CHECK:       linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
+
+  %4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
+    %5 = affine.min #map0(%arg3)[%2]
+    %6 = subtensor %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
+    %7 = affine.min #map0(%arg3)[%3]
+    %8 = subtensor %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
+    %9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
+      %10 = affine.min #map1(%5, %arg5)
+      %11 = subtensor %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
+      %12 = affine.min #map1(%7, %arg5)
+      %13 = subtensor %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
+      %14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
+        %15 = affine.min #map2(%10, %arg7)
+        %16 = subtensor %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
+        %17 = affine.min #map2(%12, %arg7)
+        %18 = subtensor %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
+        %19 = subi %c2, %15 : index
+        %20 = linalg.pad_tensor %16 low[%c0] high[%19] {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %21 = subi %c2, %17 : index
+        %22 = linalg.pad_tensor %18 low[%c0] high[%21] {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %23 = linalg.dot ins(%20, %22 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
+        scf.yield %23 : tensor<f32>
+      }
+      scf.yield %14 : tensor<f32>
+    }
+    scf.yield %9 : tensor<f32>
+  }
+  return %4 : tensor<f32>
+}
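
Note (illustrative sketch, not part of the patch): the `ArrayRef<AffineExpr>` builder added to AffineOps.td above lets callers build an `affine.apply` directly from an expression, as the new `buildLoopTripCount`/`buildLoopIterationCount` helpers do. The standalone helper below only demonstrates that builder for a trip count `(ub - lb) ceildiv step`; the function name `buildTripCountSketch` is hypothetical and the snippet assumes the API vintage of this patch (e.g. `scf::ForOp::lowerBound()`).

  #include "mlir/Dialect/Affine/IR/AffineOps.h"
  #include "mlir/Dialect/SCF/SCF.h"

  using namespace mlir;

  // Materialize `(ub - lb) ceildiv step` as an affine.apply right before
  // `forOp`, using the ArrayRef<AffineExpr> builder introduced in this patch.
  static Value buildTripCountSketch(OpBuilder &b, scf::ForOp forOp) {
    MLIRContext *ctx = forOp->getContext();
    AffineExpr lb, ub, step;
    bindDims(ctx, lb, ub);   // lb -> d0, ub -> d1.
    bindSymbols(ctx, step);  // step -> s0.
    OpBuilder::InsertionGuard g(b);
    b.setInsertionPoint(forOp);
    // A single AffineExpr converts to a one-element ArrayRef<AffineExpr>; the
    // builder infers the map (d0, d1)[s0] -> ((d1 - d0) ceildiv s0).
    return b.create<AffineApplyOp>(
        forOp.getLoc(), (ub - lb).ceilDiv(step),
        ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
  }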