diff --git a/mlir/include/mlir/Dialect/SCF/Passes.h b/mlir/include/mlir/Dialect/SCF/Passes.h --- a/mlir/include/mlir/Dialect/SCF/Passes.h +++ b/mlir/include/mlir/Dialect/SCF/Passes.h @@ -20,6 +20,10 @@ class Pass; +/// Creates a pass that specializes for loops for unrolling and +/// vectorization. +std::unique_ptr createForLoopSpecializationPass(); + /// Creates a loop fusion pass which fuses parallel loops. std::unique_ptr createParallelLoopFusionPass(); diff --git a/mlir/include/mlir/Dialect/SCF/Passes.td b/mlir/include/mlir/Dialect/SCF/Passes.td --- a/mlir/include/mlir/Dialect/SCF/Passes.td +++ b/mlir/include/mlir/Dialect/SCF/Passes.td @@ -1,4 +1,4 @@ -//===-- Passes.td - Loop pass definition file --------------*- tablegen -*-===// +//===-- Passes.td - SCF pass definition file ---------------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,18 +11,24 @@ include "mlir/Pass/PassBase.td" -def LoopParallelLoopFusion : Pass<"parallel-loop-fusion"> { +def SCFForLoopSpecialization + : FunctionPass<"for-loop-specialization"> { + let summary = "Specialize `for` loops for vectorization"; + let constructor = "mlir::createForLoopSpecializationPass()"; +} + +def SCFParallelLoopFusion : Pass<"parallel-loop-fusion"> { let summary = "Fuse adjacent parallel loops"; let constructor = "mlir::createParallelLoopFusionPass()"; } -def LoopParallelLoopSpecialization +def SCFParallelLoopSpecialization : FunctionPass<"parallel-loop-specialization"> { let summary = "Specialize parallel loops for vectorization"; let constructor = "mlir::createParallelLoopSpecializationPass()"; } -def LoopParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> { +def SCFParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> { let summary = "Tile parallel loops"; let constructor = "mlir::createParallelLoopTilingPass()"; let options = [ diff --git 
a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt @@ -1,6 +1,6 @@ add_mlir_dialect_library(MLIRSCFTransforms + LoopSpecialization.cpp ParallelLoopFusion.cpp - ParallelLoopSpecialization.cpp ParallelLoopTiling.cpp Utils.cpp diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopSpecialization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp rename from mlir/lib/Dialect/SCF/Transforms/ParallelLoopSpecialization.cpp rename to mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp --- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopSpecialization.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp @@ -1,4 +1,4 @@ -//===- ParallelLoopSpecialization.cpp - scf.parallel specialization ------===// +//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,8 @@ // //===----------------------------------------------------------------------===// // -// Specializes parallel loops for easier unrolling and vectorization. +// Specializes parallel loops and for loops for easier unrolling and +// vectorization. // //===----------------------------------------------------------------------===// @@ -19,13 +20,14 @@ #include "mlir/IR/BlockAndValueMapping.h" using namespace mlir; +using scf::ForOp; using scf::ParallelOp; -/// Rewrite a loop with bounds defined by an affine.min with a constant into 2 -/// loops after checking if the bounds are equal to that constant. This is -/// beneficial if the loop will almost always have the constant bound and that -/// version can be fully unrolled and vectorized. 
-static void specializeLoopForUnrolling(ParallelOp op) { +/// Rewrite a parallel loop with bounds defined by an affine.min with a constant +/// into 2 loops after checking if the bounds are equal to that constant. This +/// is beneficial if the loop will almost always have the constant bound and +/// that version can be fully unrolled and vectorized. +static void specializeParallelLoopForUnrolling(ParallelOp op) { SmallVector constantIndices; constantIndices.reserve(op.upperBound().size()); for (auto bound : op.upperBound()) { @@ -33,7 +35,7 @@ if (!minOp) return; int64_t minConstant = std::numeric_limits::max(); - for (auto expr : minOp.map().getResults()) { + for (AffineExpr expr : minOp.map().getResults()) { if (auto constantIndex = expr.dyn_cast()) minConstant = std::min(minConstant, constantIndex.getValue()); } @@ -58,11 +60,48 @@ op.erase(); } +/// Rewrite a for loop with bounds defined by an affine.min with a constant into +/// 2 loops after checking if the bounds are equal to that constant. This is +/// beneficial if the loop will almost always have the constant bound and that +/// version can be fully unrolled and vectorized. 
+static void specializeForLoopForUnrolling(ForOp op) { + auto bound = op.upperBound(); + auto minOp = bound.getDefiningOp(); + if (!minOp) + return; + int64_t minConstant = std::numeric_limits::max(); + for (AffineExpr expr : minOp.map().getResults()) { + if (auto constantIndex = expr.dyn_cast()) + minConstant = std::min(minConstant, constantIndex.getValue()); + } + if (minConstant == std::numeric_limits::max()) + return; + + OpBuilder b(op); + BlockAndValueMapping map; + Value constant = b.create(op.getLoc(), minConstant); + Value cond = + b.create(op.getLoc(), CmpIPredicate::eq, bound, constant); + map.map(bound, constant); + auto ifOp = b.create(op.getLoc(), cond, /*withElseRegion=*/true); + ifOp.getThenBodyBuilder().clone(*op.getOperation(), map); + ifOp.getElseBodyBuilder().clone(*op.getOperation()); + op.erase(); +} + namespace { struct ParallelLoopSpecialization - : public LoopParallelLoopSpecializationBase { + : public SCFParallelLoopSpecializationBase { + void runOnFunction() override { + getFunction().walk( + [](ParallelOp op) { specializeParallelLoopForUnrolling(op); }); + } +}; + +struct ForLoopSpecialization + : public SCFForLoopSpecializationBase { void runOnFunction() override { - getFunction().walk([](ParallelOp op) { specializeLoopForUnrolling(op); }); + getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); }); } }; } // namespace @@ -70,3 +109,7 @@ std::unique_ptr mlir::createParallelLoopSpecializationPass() { return std::make_unique(); } + +std::unique_ptr mlir::createForLoopSpecializationPass() { + return std::make_unique(); +} diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp --- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp @@ -160,7 +160,7 @@ namespace { struct ParallelLoopFusion - : public LoopParallelLoopFusionBase { + : public SCFParallelLoopFusionBase { void runOnOperation() 
override { getOperation()->walk([&](Operation *child) { for (Region &region : child->getRegions()) diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp --- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp @@ -119,7 +119,7 @@ namespace { struct ParallelLoopTiling - : public LoopParallelLoopTilingBase { + : public SCFParallelLoopTilingBase { ParallelLoopTiling() = default; explicit ParallelLoopTiling(ArrayRef tileSizes) { this->tileSizes = tileSizes; diff --git a/mlir/test/Dialect/SCF/for-loop-specialization.mlir b/mlir/test/Dialect/SCF/for-loop-specialization.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SCF/for-loop-specialization.mlir @@ -0,0 +1,39 @@ +// RUN: mlir-opt %s -for-loop-specialization -split-input-file | FileCheck %s + +#map0 = affine_map<()[s0, s1] -> (1024, s0 - s1)> +#map1 = affine_map<()[s0, s1] -> (64, s0 - s1)> + +func @for(%outer: index, %A: memref, %B: memref, + %C: memref, %result: memref) { + %c0 = constant 0 : index + %c1 = constant 1 : index + %d0 = dim %A, %c0 : memref + %b0 = affine.min #map0()[%d0, %outer] + scf.for %i0 = %c0 to %b0 step %c1 { + %B_elem = load %B[%i0] : memref + %C_elem = load %C[%i0] : memref + %sum_elem = addf %B_elem, %C_elem : f32 + store %sum_elem, %result[%i0] : memref + } + return +} + +// CHECK-LABEL: func @for( +// CHECK-SAME: [[ARG0:%.*]]: index, [[ARG1:%.*]]: memref, [[ARG2:%.*]]: memref, [[ARG3:%.*]]: memref, [[ARG4:%.*]]: memref) { +// CHECK: [[CST_0:%.*]] = constant 0 : index +// CHECK: [[CST_1:%.*]] = constant 1 : index +// CHECK: [[DIM_0:%.*]] = dim [[ARG1]], [[CST_0]] : memref +// CHECK: [[MIN:%.*]] = affine.min #map0(){{\[}}[[DIM_0]], [[ARG0]]] +// CHECK: [[CST_1024:%.*]] = constant 1024 : index +// CHECK: [[PRED:%.*]] = cmpi "eq", [[MIN]], [[CST_1024]] : index +// CHECK: scf.if [[PRED]] { +// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[CST_1024]] 
step [[CST_1]] { +// CHECK: store +// CHECK: } +// CHECK: } else { +// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[MIN]] step [[CST_1]] { +// CHECK: store +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: }