diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h --- a/mlir/include/mlir/Dialect/Linalg/Passes.h +++ b/mlir/include/mlir/Dialect/Linalg/Passes.h @@ -29,6 +29,9 @@ std::unique_ptr> createLinalgTilingToParallelLoopsPass(ArrayRef tileSizes = {}); +std::unique_ptr> +createLinalgTilingToTiledLoopPass(ArrayRef tileSizes = {}); + std::unique_ptr> createLinalgPromotionPass(bool dynamicBuffers, bool useAlloca); std::unique_ptr> createLinalgPromotionPass(); diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td --- a/mlir/include/mlir/Dialect/Linalg/Passes.td +++ b/mlir/include/mlir/Dialect/Linalg/Passes.td @@ -121,8 +121,7 @@ "scf::SCFDialect" ]; let options = [ - ListOption<"tileSizes", "linalg-tile-sizes", "int64_t", - "Test generation of dynamic promoted buffers", + ListOption<"tileSizes", "linalg-tile-sizes", "int64_t", "Tile sizes", "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> ]; } @@ -132,8 +131,23 @@ let summary = "Tile operations in the linalg dialect to parallel loops"; let constructor = "mlir::createLinalgTilingToParallelLoopsPass()"; let options = [ - ListOption<"tileSizes", "linalg-tile-sizes", "int64_t", - "Test generation of dynamic promoted buffers", + ListOption<"tileSizes", "linalg-tile-sizes", "int64_t", "Tile sizes", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> + ]; + let dependentDialects = [ + "AffineDialect", + "linalg::LinalgDialect", + "memref::MemRefDialect", + "scf::SCFDialect" + ]; +} + +def LinalgTilingToTiledLoops + : FunctionPass<"linalg-tile-to-tiled-loop"> { + let summary = "Tile operations in the linalg dialect to linalg.tiled_loop"; + let constructor = "mlir::createLinalgTilingToTiledLoopPass()"; + let options = [ + ListOption<"tileSizes", "linalg-tile-sizes", "int64_t", "Tile sizes", "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> ]; let dependentDialects = [ diff --git 
a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -424,6 +424,7 @@ Loops = 0, AffineLoops = 1, ParallelLoops = 2, + TiledLoops = 3, }; using TileSizeComputationFunction = diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -253,7 +253,7 @@ edsc::intrinsics::MemRefIndexedValue>::type; static void - doit(ArrayRef loopRanges, ValueRange iterArgInitValues, + doit(ArrayRef loopRanges, LinalgOp linalgOp, ArrayRef iteratorTypes, function_ref bodyBuilderFn, Optional = None); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp @@ -473,7 +473,7 @@ SmallVector allIvs; GenerateLoopNest::doit( - loopRanges, /*iterInitArgs=*/{}, iteratorTypes, + loopRanges, linalgOp, iteratorTypes, [&](ValueRange ivs, ValueRange iterArgs) -> scf::ValueVector { assert(iterArgs.empty() && "unexpected iterArgs"); allIvs.append(ivs.begin(), ivs.end()); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -312,9 +312,8 @@ // 2. Create the tiled loops. 
LinalgOp res = op; SmallVector ivs, tensorResults; - auto outputTensors = op.getOutputTensors(); GenerateLoopNest::doit( - loopRanges, /*iterArgInitValues*/ outputTensors, iteratorTypes, + loopRanges, op, iteratorTypes, [&](ValueRange localIvs, ValueRange iterArgs) -> scf::ValueVector { auto &b = ScopedContext::getBuilderRef(); auto loc = ScopedContext::getLocation(); @@ -439,6 +438,8 @@ return tileLinalgOpImpl(b, op, options); case LinalgTilingLoopType::ParallelLoops: return tileLinalgOpImpl(b, op, options); + case LinalgTilingLoopType::TiledLoops: + return tileLinalgOpImpl(b, op, options); default:; } return llvm::None; @@ -567,6 +568,17 @@ } }; +struct LinalgTilingToTiledLoopsPass + : public LinalgTilingToTiledLoopsBase { + LinalgTilingToTiledLoopsPass() = default; + LinalgTilingToTiledLoopsPass(ArrayRef sizes) { tileSizes = sizes; } + + void runOnFunction() override { + applyTilingToLoopPatterns(LinalgTilingLoopType::TiledLoops, getFunction(), + tileSizes); + } +}; + } // namespace std::unique_ptr> @@ -578,3 +590,8 @@ mlir::createLinalgTilingToParallelLoopsPass(ArrayRef tileSizes) { return std::make_unique(tileSizes); } + +std::unique_ptr> +mlir::createLinalgTilingToTiledLoopPass(ArrayRef tileSizes) { + return std::make_unique(tileSizes); +} diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -142,6 +142,7 @@ template struct mlir::linalg::GenerateLoopNest; template struct mlir::linalg::GenerateLoopNest; template struct mlir::linalg::GenerateLoopNest; +template struct mlir::linalg::GenerateLoopNest; /// Given a list of subview ranges, extract individual values for lower, upper /// bounds and steps and put them into the corresponding vectors. @@ -186,10 +187,11 @@ /// Specialization to build an scf "for" nest. 
template <> void GenerateLoopNest::doit( - ArrayRef loopRanges, ValueRange iterArgInitValues, + ArrayRef loopRanges, LinalgOp linalgOp, ArrayRef iteratorTypes, function_ref bodyBuilderFn, Optional distributionOptions) { + auto iterArgInitValues = linalgOp.getOutputTensors(); // Create procInfo so it dominates loops, if appropriate. OpBuilder &builder = edsc::ScopedContext::getBuilderRef(); Location loc = edsc::ScopedContext::getLocation(); @@ -216,10 +218,11 @@ /// Specialization to build affine "for" nest. template <> void GenerateLoopNest::doit( - ArrayRef loopRanges, ValueRange iterArgInitValues, + ArrayRef loopRanges, LinalgOp linalgOp, ArrayRef iteratorTypes, function_ref bodyBuilderFn, Optional) { + auto iterArgInitValues = linalgOp.getOutputTensors(); assert(iterArgInitValues.empty() && "unexpected AffineForOp init values"); SmallVector lbs, ubs, steps; unpackRanges(loopRanges, lbs, ubs, steps); @@ -240,6 +243,44 @@ bodyBuilderWithoutIterArgsFn); } +/// Specialization to build an linalg.tiled_loop +template <> +void GenerateLoopNest::doit( + ArrayRef loopRanges, LinalgOp linalgOp, + ArrayRef iteratorTypes, + function_ref bodyBuilderFn, + Optional) { + OpBuilder &builder = edsc::ScopedContext::getBuilderRef(); + Location loc = edsc::ScopedContext::getLocation(); + SmallVector procInfo; + + SmallVector lbs, ubs, steps; + unpackRanges(loopRanges, lbs, ubs, steps); + + auto wrappedBuilderFn = [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange ivs, ValueRange inputs, + ValueRange outputs) { + ScopedContext context(nestedBuilder, nestedLoc); + scf::ValueVector results = bodyBuilderFn(ivs, linalgOp.getOutputTensors()); + nestedBuilder.create(nestedLoc, results); + }; + + auto tiledLoop = builder.create( + loc, lbs, ubs, steps, linalgOp.getInputs(), linalgOp.getOutputs(), + builder.getArrayAttr(iteratorTypes), wrappedBuilderFn); + + // Replace inputs/outputs with the corresponding region args. 
+ auto isInsideTiledLoop = [&](OpOperand &operand) { + return operand.getOwner()->getBlock() == tiledLoop.getBody(); + }; + for (auto it : + llvm::zip(linalgOp.getInputs(), tiledLoop.getRegionInputArgs())) + std::get<0>(it).replaceUsesWithIf(std::get<1>(it), isInsideTiledLoop); + for (auto it : + llvm::zip(linalgOp.getOutputs(), tiledLoop.getRegionOutputArgs())) + std::get<0>(it).replaceUsesWithIf(std::get<1>(it), isInsideTiledLoop); +} + /// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`. void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc, Value procId, Value nprocs, Value &lb, @@ -373,10 +414,11 @@ /// Specialization for generating a mix of parallel and sequential scf loops. template <> void GenerateLoopNest::doit( - ArrayRef loopRanges, ValueRange iterArgInitValues, + ArrayRef loopRanges, LinalgOp linalgOp, ArrayRef iteratorTypes, function_ref bodyBuilderFn, Optional distributionOptions) { + auto iterArgInitValues = linalgOp.getOutputTensors(); assert(iterArgInitValues.empty() && "unexpected ParallelOp init values"); // This function may be passed more iterator types than ranges. 
assert(iteratorTypes.size() >= loopRanges.size() && diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir --- a/mlir/test/Dialect/Linalg/tile-tensors.mlir +++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -linalg-tile-to-tiled-loop="linalg-tile-sizes=2,3,4" -split-input-file | FileCheck %s -check-prefix=TLOOP // CHECK-LABEL: func @matmul_tensors( // CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor @@ -27,6 +28,38 @@ return %0 : tensor } +// TLOOP-LABEL: func @matmul_tensors +// TLOOP-SAME: (%[[ARG_0:.*]]: [[TY:.*]], %[[ARG_1:.*]]: [[TY]], +// TLOOP-SAME: %[[ARG_2:.*]]: [[TY]]) -> [[TY]] { + +// TLOOP-DAG: %[[C0:.*]] = constant 0 : index +// TLOOP-DAG: %[[C1:.*]] = constant 1 : index +// TLOOP-DAG: %[[C2:.*]] = constant 2 : index +// TLOOP-DAG: %[[C3:.*]] = constant 3 : index +// TLOOP-DAG: %[[C4:.*]] = constant 4 : index + +// TLOOP: %[[ARG_0_X:.*]] = memref.dim %[[ARG_0]], %[[C0]] : [[TY]] +// TLOOP: %[[ARG_0_Y:.*]] = memref.dim %[[ARG_0]], %[[C1]] : [[TY]] +// TLOOP: %[[ARG_1_Y:.*]] = memref.dim %[[ARG_1]], %[[C1]] : [[TY]] + +// TLOOP: %{{.*}} = linalg.tiled_loop (%[[I:.*]], %[[J:.*]], %[[K:.*]]) = +// TLOOP-SAME: (%[[C0]], %[[C0]], %[[C0]]) +// TLOOP-SAME: to (%[[ARG_0_X]], %[[ARG_1_Y]], %[[ARG_0_Y]]) +// TLOOP-SAME: step (%[[C2]], %[[C3]], %[[C4]]) +// TLOOP-SAME: ins (%[[A0:.*]] = %[[ARG_0]]: [[TY]], %[[A1:.*]] = %[[ARG_1]]: [[TY]]) +// TLOOP-SAME: outs (%[[A2:.*]] = %[[ARG_2]]: [[TY]]) +// TLOOP-SAME: iterators["parallel", "parallel", "reduction"] { + +// TLOOP: %[[SUB_ARG_0:.*]] = subtensor %[[A0]][%[[I]], %[[K]]] +// TLOOP: %[[SUB_ARG_1:.*]] = subtensor %[[A1]][%[[K]], %[[J]]] +// TLOOP: %[[SUB_ARG_2:.*]] = subtensor %[[A2]][%[[I]], %[[J]]] + +// TLOOP: %[[PROD:.*]] = linalg.matmul ins(%[[SUB_ARG_0]], %[[SUB_ARG_1]] +// TLOOP-SAME: outs(%[[SUB_ARG_2]] : [[TY]]) -> [[TY]] + +// TLOOP: %[[O:.*]] = 
subtensor_insert %[[PROD]] into %[[A2]][%[[I]], %[[J]]] +// TLOOP: linalg.yield %[[O]] : [[TY]] + // ----- func @generic_op_tensors( @@ -74,6 +107,28 @@ // CHECK: } // CHECK: return %[[TD0]] +// TLOOP-LABEL: func @generic_op_tensors( +// TLOOP-SAME: %[[ARG_0:.*]]: [[TY:.*]], +// TLOOP-SAME: %[[ARG_1:.*]]: [[TY]]) -> [[TY]] { + +// TLOOP-DAG: %[[C0:.*]] = constant 0 : index +// TLOOP-DAG: %[[C1:.*]] = constant 1 : index +// TLOOP-DAG: %[[C2:.*]] = constant 2 : index +// TLOOP-DAG: %[[C3:.*]] = constant 3 : index +// TLOOP-DAG: %[[C4:.*]] = constant 4 : index + +// TLOOP: %[[INIT:.*]] = linalg.init_tensor +// TLOOP: %[[ARG_0_X:.*]] = memref.dim %[[ARG_0]], %[[C0]] : [[TY]] +// TLOOP: %[[ARG_0_Y:.*]] = memref.dim %[[ARG_0]], %[[C1]] : [[TY]] +// TLOOP: %[[ARG_0_Z:.*]] = memref.dim %[[ARG_0]], %[[C2]] : [[TY]] + +// TLOOP: %{{.*}} = linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = +// TLOOP-SAME: (%[[C0]], %[[C0]], %[[C0]]) +// TLOOP-SAME: to (%[[ARG_0_X]], %[[ARG_0_Y]], %[[ARG_0_Z]]) +// TLOOP-SAME: step (%[[C2]], %[[C3]], %[[C4]]) +// TLOOP-SAME: ins (%{{.*}} = %[[ARG_0]]: [[TY]], %{{.*}} = %[[ARG_1]]: [[TY]]) +// TLOOP-SAME: outs (%{{.*}} = %[[INIT]]: [[TY]]) + // ----- func @indexed_generic_op_tensors(