diff --git a/mlir/include/mlir/Dialect/SCF/Passes.h b/mlir/include/mlir/Dialect/SCF/Passes.h
--- a/mlir/include/mlir/Dialect/SCF/Passes.h
+++ b/mlir/include/mlir/Dialect/SCF/Passes.h
@@ -24,6 +24,9 @@
 /// vectorization.
 std::unique_ptr<Pass> createForLoopSpecializationPass();
 
+/// Creates a pass that specializes upper for loop bounds for vectorization.
+std::unique_ptr<Pass> createForLoopBoundSpecializationPass();
+
 /// Creates a loop fusion pass which fuses parallel loops.
 std::unique_ptr<Pass> createParallelLoopFusionPass();
 
diff --git a/mlir/include/mlir/Dialect/SCF/Passes.td b/mlir/include/mlir/Dialect/SCF/Passes.td
--- a/mlir/include/mlir/Dialect/SCF/Passes.td
+++ b/mlir/include/mlir/Dialect/SCF/Passes.td
@@ -17,6 +17,13 @@
   let dependentDialects = ["memref::MemRefDialect"];
 }
 
+def SCFForLoopBoundSpecialization
+    : FunctionPass<"for-loop-bound-specialization"> {
+  let summary = "Specialize upper bounds of `for` loops for vectorization";
+  let constructor = "mlir::createForLoopBoundSpecializationPass()";
+  let dependentDialects = ["AffineDialect"];
+}
+
 def SCFForLoopSpecialization : FunctionPass<"for-loop-specialization"> {
   let summary = "Specialize `for` loops for vectorization";
 
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms.h
--- a/mlir/include/mlir/Dialect/SCF/Transforms.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms.h
@@ -18,14 +18,18 @@
 namespace mlir {
 
 class ConversionTarget;
+class LogicalResult;
 class MLIRContext;
 class Region;
+class RewriterBase;
 class TypeConverter;
 class RewritePatternSet;
 using OwningRewritePatternList = RewritePatternSet;
 
 namespace scf {
 
+class IfOp;
+class ForOp;
 class ParallelOp;
 
 /// Fuses all adjacent scf.parallel operations with identical bounds and step
@@ -33,6 +37,45 @@
 /// analysis.
 void naivelyFuseParallelOps(Region &region);
 
+/// Rewrite a for loop with bounds/step that potentially do not divide evenly
+/// into a for loop where the step divides the iteration space evenly, followed
+/// by an if statement for the last (uneven) iteration. Other patterns can
+/// simplify/canonicalize operations in the body of the new for loop. This is
+/// beneficial for a wide range of transformations such as vectorization or
+/// loop tiling.
+///
+/// E.g., assuming a lower bound of 0 (for illustration purposes):
+/// ```
+/// scf.for %iv = %c0 to %ub step %c4 {
+///   (loop body)
+/// }
+/// ```
+/// is rewritten into the following pseudo IR:
+/// ```
+/// %newUb = %ub - (%ub mod %c4)
+/// scf.for %iv = %c0 to %newUb step %c4 {
+///   (loop body)
+/// }
+/// scf.if %newUb < %ub {
+///   (loop body)
+/// }
+/// ```
+///
+/// This function returns the new loop via `mainLoop` and the corresponding
+/// if statement for the last iteration via `ifOp`.
+///
+/// Note: This pattern also simplifies affine.min ops of the following form,
+/// enabling additional canonicalizations:
+/// ```
+/// #map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
+/// %m = affine.min #map(%ub, %iv)[%step]
+/// ```
+/// is rewritten into the following pseudo IR:
+/// * Inside the new scf.for: `%m = %step`
+/// * Inside the scf.if: `%m = %ub mod %step`
+LogicalResult specializeUpperForLoopBound(RewriterBase &b, ForOp forOp,
+                                          ForOp &mainLoop, scf::IfOp &ifOp);
+
 /// Tile a parallel loop of the form
 ///   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
 ///                            step (%arg4, %arg5)
diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
--- a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
@@ -15,9 +15,14 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/SCF/Passes.h"
 #include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/SCF/Transforms.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/DenseMap.h"
 
 using namespace mlir;
 using scf::ForOp;
@@ -89,6 +94,133 @@
   op.erase();
 }
 
+// TODO: Instead of searching for very specific AffineMinOps, perform a range
+// analysis. (As part of a canonicalization pattern in the Affine dialect.)
+static void tryRewriteAffineMinOp(RewriterBase &b, AffineMinOp minOp, Value iv,
+                                  Value ub, Value step, bool insideLoop) {
+  auto minMap = minOp.getAffineMap();
+  auto *ctx = minMap.getContext();
+  llvm::DenseMap<Value, AffineExpr> exprs;
+  for (const auto &it : llvm::enumerate(minOp.getDimOperands()))
+    exprs[it.value()] = getAffineDimExpr(it.index(), ctx);
+  for (const auto &it : llvm::enumerate(minOp.getSymbolOperands()))
+    exprs[it.value()] = getAffineSymbolExpr(it.index(), ctx);
+  if (!exprs[step] || !exprs[ub] || !exprs[iv])
+    return;
+
+  // Try to match for AffineMap: {step, ub - iv}
+  auto expected = AffineMap::get(minMap.getNumDims(), minMap.getNumSymbols(),
+                                 {exprs[step], exprs[ub] - exprs[iv]}, ctx);
+  if (expected.getResults() != minMap.getResults())
+    return;
+
+  if (insideLoop) {
+    // AffineMinOp inside loop: Replace with step.
+    b.replaceOp(minOp, step);
+  } else {
+    // AffineMinOp inside if statement: Replace with ub - iv.
+    auto remMap = AffineMap::get(minMap.getNumDims(), minMap.getNumSymbols(),
+                                 {exprs[ub] - exprs[iv]});
+    OpBuilder::InsertionGuard guard(b);
+    b.setInsertionPoint(minOp);
+    b.replaceOpWithNewOp<AffineApplyOp>(minOp, remMap, minOp.getOperands());
+  }
+}
+
+/// Rewrite a for loop with bounds/step that potentially do not divide evenly
+/// into a for loop where the step divides the iteration space evenly, followed
+/// by an if statement for the last (uneven) iteration.
+LogicalResult mlir::scf::specializeUpperForLoopBound(RewriterBase &b,
+                                                     ForOp forOp,
+                                                     ForOp &mainLoop,
+                                                     scf::IfOp &ifOp) {
+  auto lbInt = getConstantIntValue(forOp.lowerBound());
+  auto ubInt = getConstantIntValue(forOp.upperBound());
+  auto stepInt = getConstantIntValue(forOp.step());
+
+  // No specialization necessary if step already divides upper bound evenly.
+  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
+    return failure();
+  // No specialization necessary if step size is 1.
+  if (stepInt == static_cast<int64_t>(1))
+    return failure();
+
+  auto loc = forOp.getLoc();
+  AffineExpr dim0, dim1, dim2;
+  bindDims(b.getContext(), dim0, dim1, dim2);
+  // New upper bound: %ub - (%ub - %lb) mod %step
+  auto modMap = AffineMap::get(3, 0, {dim1 - ((dim1 - dim0) % dim2)});
+  Value splitBound = b.createOrFold<AffineApplyOp>(
+      loc, modMap,
+      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
+
+  // Create loop with new upper bound.
+  mainLoop = cast<ForOp>(b.clone(*forOp.getOperation()));
+  mainLoop.upperBoundMutable().assign(splitBound);
+
+  // Do we need one more iteration?
+  Value hasMoreIter = b.create<CmpIOp>(loc, CmpIPredicate::slt, splitBound,
+                                       forOp.upperBound());
+
+  // Create IfOp for last iteration.
+  auto resultTypes = llvm::to_vector<4>(
+      llvm::map_range(forOp.initArgs(), [](Value v) { return v.getType(); }));
+  ifOp = b.create<scf::IfOp>(loc, resultTypes, hasMoreIter,
+                             /*withElseRegion=*/!resultTypes.empty());
+  // Build then case.
+  BlockAndValueMapping bvm;
+  bvm.map(forOp.region().getArgument(0), splitBound);
+  for (auto it : llvm::zip(forOp.region().getArguments().drop_front(),
+                           mainLoop->getResults())) {
+    bvm.map(std::get<0>(it), std::get<1>(it));
+  }
+  b.cloneRegionBefore(forOp.region(), ifOp.thenRegion(),
+                      ifOp.thenRegion().begin(), bvm);
+  // Build else case.
+  if (!resultTypes.empty())
+    ifOp.getElseBodyBuilder().create<scf::YieldOp>(loc, mainLoop->getResults());
+
+  // Find `affine.min(%step, %ub - %iv)` ops and simplify them, enabling
+  // further canonicalizations.
+  mainLoop.walk([&](AffineMinOp minOp) {
+    Value iv = mainLoop.region().getArgument(0);
+    tryRewriteAffineMinOp(b, minOp, /*iv=*/iv, /*ub=*/forOp.upperBound(),
+                          /*step=*/forOp.step(),
+                          /*insideLoop=*/true);
+  });
+  ifOp.walk([&](AffineMinOp minOp) {
+    tryRewriteAffineMinOp(b, minOp, /*iv=*/splitBound,
+                          /*ub=*/forOp.upperBound(),
+                          /*step=*/forOp.step(), /*insideLoop=*/false);
+  });
+
+  b.replaceOp(forOp, ifOp->getResults());
+  return success();
+}
+
+namespace {
+struct ForLoopBoundSpecializationPattern : public OpRewritePattern<ForOp> {
+  using OpRewritePattern<ForOp>::OpRewritePattern;
+
+  static constexpr char kLoopLabel[] = "__splitted_loop__";
+
+  LogicalResult matchAndRewrite(ForOp forOp,
+                                PatternRewriter &rewriter) const override {
+    if (forOp->hasAttr(kLoopLabel))
+      return failure();
+
+    ForOp mainLoop;
+    scf::IfOp ifOp;
+    if (failed(specializeUpperForLoopBound(rewriter, forOp, mainLoop, ifOp)))
+      return failure();
+    // Apply label, so that the same loop is not rewritten a second time.
+    mainLoop->setAttr(kLoopLabel, rewriter.getUnitAttr());
+
+    return success();
+  }
+};
+} // namespace
+
 namespace {
 struct ParallelLoopSpecialization
     : public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
@@ -104,6 +236,22 @@
     getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
   }
 };
+
+struct ForLoopBoundSpecialization
+    : public SCFForLoopBoundSpecializationBase<ForLoopBoundSpecialization> {
+  void runOnFunction() override {
+    FuncOp funcOp = getFunction();
+    MLIRContext *ctx = funcOp.getContext();
+    RewritePatternSet patterns(ctx);
+    patterns.add<ForLoopBoundSpecializationPattern>(ctx);
+    (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
+
+    // Drop the marker.
+    funcOp.walk([](ForOp op) {
+      op->removeAttr(ForLoopBoundSpecializationPattern::kLoopLabel);
+    });
+  }
+};
 } // namespace
 
 std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
@@ -113,3 +261,7 @@
 std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
   return std::make_unique<ForLoopSpecialization>();
 }
+
+std::unique_ptr<Pass> mlir::createForLoopBoundSpecializationPass() {
+  return std::make_unique<ForLoopBoundSpecialization>();
+}
diff --git a/mlir/test/Dialect/SCF/for-loop-bound-specialization.mlir b/mlir/test/Dialect/SCF/for-loop-bound-specialization.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/SCF/for-loop-bound-specialization.mlir
@@ -0,0 +1,147 @@
+// RUN: mlir-opt %s -for-loop-bound-specialization -canonicalize -split-input-file | FileCheck %s
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0, s1, s2] -> (s1 - (s1 - (s1 - s0) mod s2))>
+// CHECK: func @fully_dynamic_bounds(
+// CHECK-SAME: %[[LB:.*]]: index, %[[UB:.*]]: index, %[[STEP:.*]]: index
+// CHECK: %[[C0_I32:.*]] = constant 0 : i32
+// CHECK: %[[NEW_UB:.*]] = affine.apply #[[MAP0]]()[%[[LB]], %[[UB]], %[[STEP]]]
+// CHECK: %[[LOOP:.*]] = scf.for %[[IV:.*]] = %[[LB]] to %[[NEW_UB]]
+// CHECK-SAME: step %[[STEP]] iter_args(%[[ACC:.*]] = %[[C0_I32]]) -> (i32) {
+// CHECK: %[[CAST:.*]] = index_cast %[[STEP]] : index to i32
+// CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[CAST]] : i32
+// CHECK: scf.yield %[[ADD]]
+// CHECK: }
+// CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
+// CHECK: %[[RESULT:.*]] = scf.if %[[HAS_MORE]] -> (i32) {
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[LB]], %[[UB]], %[[STEP]]]
+// CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
+// CHECK: %[[ADD2:.*]] = addi %[[LOOP]], %[[CAST2]]
+// CHECK: scf.yield %[[ADD2]]
+// CHECK: } else {
+// CHECK: scf.yield %[[LOOP]]
+// CHECK: }
+// CHECK: return %[[RESULT]]
+#map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
+func @fully_dynamic_bounds(%lb : index, %ub: index, %step: index) -> i32 {
+  %c0 = constant 0 : i32
+  %r = scf.for %iv = %lb to %ub step %step iter_args(%arg = %c0) -> i32 {
+    %s = affine.min #map(%ub, %iv)[%step]
+    %casted = index_cast %s : index to i32
+    %0 = addi %arg, %casted : i32
+    scf.yield %0 : i32
+  }
+  return %r : i32
+}
+
+// -----
+
+// CHECK: func @fully_static_bounds(
+// CHECK-DAG: %[[C0_I32:.*]] = constant 0 : i32
+// CHECK-DAG: %[[C1_I32:.*]] = constant 1 : i32
+// CHECK-DAG: %[[C4_I32:.*]] = constant 4 : i32
+// CHECK-DAG: %[[C0:.*]] = constant 0 : index
+// CHECK-DAG: %[[C4:.*]] = constant 4 : index
+// CHECK-DAG: %[[C16:.*]] = constant 16 : index
+// CHECK: %[[LOOP:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C16]]
+// CHECK-SAME: step %[[C4]] iter_args(%[[ACC:.*]] = %[[C0_I32]]) -> (i32) {
+// CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[C4_I32]] : i32
+// CHECK: scf.yield %[[ADD]]
+// CHECK: }
+// CHECK: %[[RESULT:.*]] = addi %[[LOOP]], %[[C1_I32]] : i32
+// CHECK: return %[[RESULT]]
+#map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
+func @fully_static_bounds() -> i32 {
+  %c0_i32 = constant 0 : i32
+  %lb = constant 0 : index
+  %step = constant 4 : index
+  %ub = constant 17 : index
+  %r = scf.for %iv = %lb to %ub step %step
+      iter_args(%arg = %c0_i32) -> i32 {
+    %s = affine.min #map(%ub, %iv)[%step]
+    %casted = index_cast %s : index to i32
+    %0 = addi %arg, %casted : i32
+    scf.yield %0 : i32
+  }
+  return %r : i32
+}
+
+// -----
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 mod 4)>
+// CHECK: func @dynamic_upper_bound(
+// CHECK-SAME: %[[UB:.*]]: index
+// CHECK-DAG: %[[C0_I32:.*]] = constant 0 : i32
+// CHECK-DAG: %[[C4_I32:.*]] = constant 4 : i32
+// CHECK-DAG: %[[C0:.*]] = constant 0 : index
+// CHECK-DAG: %[[C4:.*]] = constant 4 : index
+// CHECK: %[[NEW_UB:.*]] = affine.apply #[[MAP0]]()[%[[UB]]]
+// CHECK: %[[LOOP:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[NEW_UB]]
+// CHECK-SAME: step %[[C4]] iter_args(%[[ACC:.*]] = %[[C0_I32]]) -> (i32) {
+// CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[C4_I32]] : i32
+// CHECK: scf.yield %[[ADD]]
+// CHECK: }
+// CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
+// CHECK: %[[RESULT:.*]] = scf.if %[[HAS_MORE]] -> (i32) {
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]]]
+// CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
+// CHECK: %[[ADD2:.*]] = addi %[[LOOP]], %[[CAST2]]
+// CHECK: scf.yield %[[ADD2]]
+// CHECK: } else {
+// CHECK: scf.yield %[[LOOP]]
+// CHECK: }
+// CHECK: return %[[RESULT]]
+#map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
+func @dynamic_upper_bound(%ub : index) -> i32 {
+  %c0_i32 = constant 0 : i32
+  %lb = constant 0 : index
+  %step = constant 4 : index
+  %r = scf.for %iv = %lb to %ub step %step
+      iter_args(%arg = %c0_i32) -> i32 {
+    %s = affine.min #map(%ub, %iv)[%step]
+    %casted = index_cast %s : index to i32
+    %0 = addi %arg, %casted : i32
+    scf.yield %0 : i32
+  }
+  return %r : i32
+}
+
+// -----
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 mod 4)>
+// CHECK: func @no_loop_results(
+// CHECK-SAME: %[[UB:.*]]: index, %[[MEMREF:.*]]: memref<i32>
+// CHECK-DAG: %[[C4_I32:.*]] = constant 4 : i32
+// CHECK-DAG: %[[C0:.*]] = constant 0 : index
+// CHECK-DAG: %[[C4:.*]] = constant 4 : index
+// CHECK: %[[NEW_UB:.*]] = affine.apply #[[MAP0]]()[%[[UB]]]
+// CHECK: scf.for %[[IV:.*]] = %[[C0]] to %[[NEW_UB]] step %[[C4]] {
+// CHECK: %[[LOAD:.*]] = memref.load %[[MEMREF]][]
+// CHECK: %[[ADD:.*]] = addi %[[LOAD]], %[[C4_I32]] : i32
+// CHECK: memref.store %[[ADD]], %[[MEMREF]]
+// CHECK: }
+// CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
+// CHECK: scf.if %[[HAS_MORE]] {
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]]]
+// CHECK: %[[LOAD2:.*]] = memref.load %[[MEMREF]][]
+// CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
+// CHECK: %[[ADD2:.*]] = addi %[[LOAD2]], %[[CAST2]]
+// CHECK: memref.store %[[ADD2]], %[[MEMREF]]
+// CHECK: }
+// CHECK: return
+#map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
+func @no_loop_results(%ub : index, %d : memref<i32>) {
+  %c0_i32 = constant 0 : i32
+  %lb = constant 0 : index
+  %step = constant 4 : index
+  scf.for %iv = %lb to %ub step %step {
+    %s = affine.min #map(%ub, %iv)[%step]
+    %r = memref.load %d[] : memref<i32>
+    %casted = index_cast %s : index to i32
+    %0 = addi %r, %casted : i32
+    memref.store %0, %d[] : memref<i32>
+  }
+  return
+}
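
Usage note (not part of the patch): below is a minimal sketch of how the new pass could be scheduled from C++, mirroring the `-for-loop-bound-specialization -canonicalize` pipeline exercised by the test file above. The wrapper function name is hypothetical, and the headers and pass-manager calls assume the MLIR revision this patch targets (where `FuncOp` is still declared in `BuiltinOps.h`).

```
#include "mlir/Dialect/SCF/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

// Hypothetical helper: split the upper loop bounds in `module`, then
// canonicalize; this is the same pipeline as the RUN line in the new test.
static mlir::LogicalResult runForLoopBoundSpecialization(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  // Both passes are function passes, so nest them under the module.
  pm.addNestedPass<mlir::FuncOp>(mlir::createForLoopBoundSpecializationPass());
  // Canonicalization folds the simplified affine.min ops and, for statically
  // known bounds, the constant scf.if conditions away.
  pm.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
  return pm.run(module);
}
```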