diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -13,7 +13,9 @@
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -503,6 +505,73 @@
 }
 }
 
+bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
+  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
+}
+
+/// Compute the tightest lower bound with quantities that are all defined
+/// outside of `outer`.
+Value computeLoopIndependentLowerBound(OpBuilder &b, scf::ForOp outer,
+                                       Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+  v.dump();
+  llvm_unreachable("Unsupported loop independent lower bound case");
+}
+
+/// Compute the tightest upper bound with quantities that are all defined
+/// outside of `outer`.
+Value computeLoopIndependentUpperBound(OpBuilder &b, scf::ForOp outer,
+                                       Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+
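+  // Only the `affine.min` case is handled below: rebuild the min above
+  // `outer`, substituting every quantity defined inside `outer` with a
+  // loop-independent bound.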
+  auto minOp = v.getDefiningOp<AffineMinOp>();
+  if (minOp) {
+    LLVM_DEBUG(DBGS() << "Begin loopIndependentUpperBound for: " << v << "\n");
+    // Compute a backward slice up to, but not including, `outer`.
+    llvm::SetVector<Operation *> backwardSlice;
+    getBackwardSlice(minOp.getOperation(), &backwardSlice, [&](Operation *op) {
+      return outer->isProperAncestor(op);
+    });
+    backwardSlice.insert(minOp);
+
+    OpBuilder::InsertionGuard g(b);
+    b.setInsertionPoint(outer);
+    Value res;
+    BlockAndValueMapping bvm;
+    for (Operation *op : backwardSlice) {
+      // For now, skip op with regions. This avoids pulling in the whole loops
+      // for which we should only indirectly depend on the induction variables.
+      // TODO: revisit this assumption when new data comes in.
+      if (op->getNumRegions() > 0)
+        continue;
+      auto sliceMinOp = dyn_cast<AffineMinOp>(op);
+      if (!sliceMinOp) {
+        b.clone(*op, bvm);
+        continue;
+      }
+      auto mapAndOperands = substituteMin(
+          sliceMinOp, [&](Operation *op) { return outer->isAncestor(op); });
+      SmallVector<Value> resultOperands = mapAndOperands.dims;
+      llvm::append_range(resultOperands, mapAndOperands.symbols);
+      AffineMap map = mapAndOperands.map;
+      canonicalizeMapAndOperands(&map, &resultOperands);
+      map = simplifyAffineMap(map);
+      res = b.create<AffineMinOp>(outer->getLoc(), map,
+                                  llvm::to_vector<4>(llvm::map_range(
+                                      resultOperands, [&](Value operand) {
+                                        return bvm.lookupOrDefault(operand);
+                                      })));
+      bvm.map(sliceMinOp, res);
+    }
+    LLVM_DEBUG(DBGS() << "End loopIndependentUpperBound with: " << res << "\n");
+    return res;
+  }
+  v.dump();
+  llvm_unreachable("Unsupported loop independent upper bound case");
+}
+
 /// Ensure prerequisites that guarantee pad op hoisting can occur.
 /// Return failure in the cases when we cannot perform hoisting; i.e. if either:
 /// 1. There exists a use of `padTensorOp` that is not a linalg input operand.
@@ -510,7 +579,7 @@
 /// 3. There exists an op with a region that is dominated by
 ///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a
 ///    LinalgOp.
-/// 3. There exists an op with side effects that is dominated by
+/// 4. There exists an op with side effects that is dominated by
 ///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
 ///
 /// While ensuring prerequisites:
@@ -587,25 +656,39 @@
 }
 
 /// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
-static Value buildLoopTripCount(OpBuilder &b, scf::ForOp forOp) {
+static Value buildLoopTripCount(OpBuilder &b, scf::ForOp outer,
+                                scf::ForOp forOp) {
   MLIRContext *ctx = forOp->getContext();
   AffineExpr lb, ub, step;
   bindDims(ctx, lb, ub);
   bindSymbols(ctx, step);
-  return b.create<AffineApplyOp>(
+  Value lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        ubVal = computeLoopIndependentUpperBound(b, outer, forOp.upperBound()),
+        stepVal = forOp.step();
+  assert(isDefinedOutsideOrConstant(outer, lbVal));
+  assert(isDefinedOutsideOrConstant(outer, ubVal));
+  assert(isDefinedOutsideOrConstant(outer, stepVal));
+  Value res = b.create<AffineApplyOp>(
       forOp->getLoc(), AffineMap::get(2, 1, {(ub - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
+      ValueRange{lbVal, ubVal, stepVal});
+  return res;
 }
 
 /// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
-static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp forOp) {
+static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
+                                     scf::ForOp forOp) {
   MLIRContext *ctx = forOp->getContext();
   AffineExpr iv, lb, step;
   bindDims(ctx, iv, lb);
   bindSymbols(ctx, step);
+  Value ivVal = forOp.getInductionVar(),
+        lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        stepVal = forOp.step();
+  assert(isDefinedOutsideOrConstant(outer, lbVal));
+  assert(isDefinedOutsideOrConstant(outer, stepVal));
   return b.create<AffineApplyOp>(
       forOp->getLoc(), AffineMap::get(2, 1, {(iv - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.getInductionVar(), forOp.lowerBound(), forOp.step()});
+      ValueRange{ivVal, lbVal, stepVal});
 }
 
 LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
@@ -634,11 +717,13 @@
   // TODO: go grab dims when necessary, for now PadTensorOp returns a static
   // tensor.
   llvm::append_range(packedShape, paddedTensorType.getShape());
+
   auto packedTensorType =
       RankedTensorType::get(packedShape, paddedTensorType.getElementType());
   auto dynamicSizes =
       llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
-        return buildLoopTripCount(b, cast<scf::ForOp>(op));
+        return buildLoopTripCount(b, cast<scf::ForOp>(outermostEnclosingForOp),
+                                  cast<scf::ForOp>(op));
       }));
   Value packedTensor = b.create<linalg::InitTensorOp>(
       loc, dynamicSizes, packedTensorType.getShape(),
@@ -656,9 +741,9 @@
   clonedLoopIvs.reserve(nLoops);
   leadingPackedTensorIndexings.reserve(nLoops);
   BlockAndValueMapping bvm;
-  // Stack step 1. iteratively clone loops and push `packedTensor`.
   // Insert `padTensorOp` into the backwardSlice so we clone it too.
   backwardSlice.insert(padTensorOp);
+  // Stack step 1. iteratively clone loops and push `packedTensor`.
   for (Operation *op : backwardSlice) {
     if (op->getNumRegions() == 0 || isa<PadTensorOp>(op)) {
       b.clone(*op, bvm);
@@ -670,15 +755,19 @@
     // Unused loop, just skip it.
     if (!packingLoops.contains(forOp))
       continue;
+
     auto clonedForOp =
-        b.create<scf::ForOp>(loc, forOp.lowerBound(), forOp.upperBound(),
-                             forOp.step(), packedTensor);
+        b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
+                             bvm.lookupOrDefault(forOp.upperBound()),
+                             bvm.lookupOrDefault(forOp.step()), packedTensor);
+
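+    // Eagerly map the induction variable: ops cloned below, including the
+    // bounds of nested loops, must resolve to the cloned IV.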
+    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
     assert(clonedForOp->getNumRegions() == 1);
     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
+
     b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
-    leadingPackedTensorIndexings.push_back(
-        buildLoopIterationCount(b, clonedForOp));
-    bvm.map(forOp.getInductionVar(), clonedLoopIvs.back());
+    leadingPackedTensorIndexings.push_back(buildLoopIterationCount(
+        b, cast<scf::ForOp>(outermostEnclosingForOp), clonedForOp));
     packedTensor = clonedForOp.getRegionIterArgs().front();
   }
@@ -716,7 +805,9 @@
   b.setInsertionPoint(padTensorOp);
   SmallVector<Value> loopIterationCounts =
       llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
-        return buildLoopIterationCount(b, cast<scf::ForOp>(loop));
+        return buildLoopIterationCount(
+            b, cast<scf::ForOp>(outermostEnclosingForOp),
+            cast<scf::ForOp>(loop));
       }));
   // offsets = [originalLoopIvs, 0 .. 0].
   offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -1,16 +1,15 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+
+// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
+// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
 #map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
 #map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
 #map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
 #map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
 #map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
-// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
-// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
-// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
-// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
-
 // CHECK-LABEL: func @matmul_tensors
 // CHECK-SAME:    %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
 // CHECK-SAME:    %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
@@ -129,3 +128,92 @@
   }
   return %3 : tensor<?x?xf32>
 }
+
+// -----
+
+// CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+// CHECK-DAG: #[[$MIN_MOD4:[0-9a-z]+]] = affine_map<(d0) -> (4, d0 - ((d0 - 1) floordiv 4) * 4)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
+#map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+#map1 = affine_map<(d0, d1) -> (4, d0 - d1)>
+#map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
+
+// CHECK-LABEL: func @dot
+func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
+    -> tensor<f32>
+{
+  %c8 = constant 8 : index
+  %c4 = constant 4 : index
+  %cst = constant 0.000000e+00 : f32
+  %c2 = constant 2 : index
+  %c0 = constant 0 : index
+  %1 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %2 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %3 = memref.dim %arg1, %c0 : tensor<?xf32>
+
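+  // The two linalg.pad_tensor ops are hoisted above the packing loops:
+  // each packed tensor is allocated and filled once per outer iteration,
+  // then only read from inside the compute loops.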
+  // CHECK: scf.for %[[I:[0-9a-z]+]] =
+  //
+  // CHECK: %[[MR8:.*]] = affine.min #[[$MIN_REST8]](%[[I]])
+  // CHECK: %[[D0:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  // CHECK: %[[MM4:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  // CHECK: %[[D1:.*]] = affine.apply #[[$DIV2]](%[[MM4]])
+  // Init tensor and pack.
+  // CHECK: %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], %[[D1]], 2] : tensor<?x?x2xf32>
+  // CHECK: %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
+  // CHECK: scf.for %[[III:[0-9a-z]+]] =
+  // CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  //
+  // CHECK: %[[D0_2:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  // CHECK: %[[MM4_2:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  // CHECK: %[[D1_2:.*]] = affine.apply #[[$DIV2]](%[[MM4_2]])
+  // Init tensor and pack.
+  // CHECK: %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], %[[D1_2]], 2] : tensor<?x?x2xf32>
+  // CHECK: %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
+  // CHECK: scf.for %[[III_2:[0-9a-z]+]] =
+  // CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  // Compute.
+  // CHECK: scf.for %[[II_3:[0-9a-z]+]] =
+  // CHECK: scf.for %[[III_3:[0-9a-z]+]] = {{.*}} iter_args(%[[C:.*]] = %{{.*}}) -> (tensor<f32>) {
+  // CHECK: %[[IDX0:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  // CHECK: %[[IDX1:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  // CHECK: %[[A:.*]] = subtensor %[[PACKED_A]][%[[IDX0]], %[[IDX1]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  // CHECK: %[[IDX0_2:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  // CHECK: %[[IDX1_2:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  // CHECK: %[[B:.*]] = subtensor %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  // CHECK: linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
+
+  %4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
+    %5 = affine.min #map0(%arg3)[%2]
+    %6 = subtensor %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
+    %7 = affine.min #map0(%arg3)[%3]
+    %8 = subtensor %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
+    %9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
+      %10 = affine.min #map1(%5, %arg5)
+      %11 = subtensor %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
+      %12 = affine.min #map1(%7, %arg5)
+      %13 = subtensor %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
+      %14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
+        %15 = affine.min #map2(%10, %arg7)
+        %16 = subtensor %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
+        %17 = affine.min #map2(%12, %arg7)
+        %18 = subtensor %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
+        %19 = subi %c2, %15 : index
+        %20 = linalg.pad_tensor %16 low[%c0] high[%19] {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %21 = subi %c2, %17 : index
+        %22 = linalg.pad_tensor %18 low[%c0] high[%21] {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %23 = linalg.dot ins(%20, %22 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
+        scf.yield %23 : tensor<f32>
+      }
+      scf.yield %14 : tensor<f32>
+    }
+    scf.yield %9 : tensor<f32>
+  }
+  return %4 : tensor<f32>
+}