diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h
--- a/mlir/include/mlir/IR/AffineMap.h
+++ b/mlir/include/mlir/IR/AffineMap.h
@@ -327,6 +327,21 @@
 /// ```
 AffineMap concatAffineMaps(ArrayRef<AffineMap> maps);
 
+/// Returns the map that results from projecting out the dimensions specified
+/// in `projectedDimensions`. The projected dimensions are set to 0.
+///
+/// Example:
+/// 1) map                  : affine_map<(d0, d1, d2) -> (d0, d1)>
+///    projected_dimensions : {2}
+///    result               : affine_map<(d0, d1) -> (d0, d1)>
+///
+/// 2) map                  : affine_map<(d0, d1) -> (d0 + d1)>
+///    projected_dimensions : {1}
+///    result               : affine_map<(d0) -> (d0)>
+///
+/// 3) map                  : affine_map<(d0, d1, d2) -> (d0, d1)>
+///    projected_dimensions : {1}
+///    result               : affine_map<(d0, d1) -> (d0, 0)>
 AffineMap getProjectedMap(AffineMap map,
                           ArrayRef<unsigned> projectedDimensions);
 
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -1938,6 +1938,11 @@
       newOperands.push_back(fold ? tensorCastOp.getOperand() : v);
       newResultTypes.push_back(newOperands.back().getType());
     }
+    if (linalgOp.getNumInitTensors() == 0) {
+      for (Value v : linalgOp.getOperation()->getResults()) {
+        newResultTypes.push_back(v.getType());
+      }
+    }
     auto extraOperands = linalgOp.getAssumedNonShapedOperands();
     newOperands.append(extraOperands.begin(), extraOperands.end());
     // Clone op.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/SCF/EDSC/Builders.h"
 #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/AffineMap.h"
@@ -220,9 +221,8 @@
 static SmallVector<Value, 4>
 makeTiledShapes(OpBuilder &b, Location loc, LinalgOp linalgOp,
-                ValueRange operands, AffineMap map, ValueRange ivs,
+                ArrayRef<Value> tiledOperands, AffineMap map, ValueRange ivs,
                 ValueRange tileSizes, ValueRange allShapeSizes) {
-  assert(operands.size() == linalgOp.getShapedOperands().size());
   assert(ivs.size() == static_cast<size_t>(llvm::count_if(
                            llvm::make_range(tileSizes.begin(), tileSizes.end()),
                            [](Value v) { return !isZero(v); })) &&
@@ -242,11 +242,9 @@
     subShapeSizes.push_back(size - std_constant_index(1));
   }
 
-  auto *op = linalgOp.getOperation();
-
   SmallVector<Value, 4> res;
-  res.reserve(op->getNumOperands());
-  for (auto en : llvm::enumerate(operands)) {
+  res.reserve(tiledOperands.size());
+  for (auto en : llvm::enumerate(tiledOperands)) {
     Value shapedOp = en.value();
     ShapedType shapedType = shapedOp.getType().cast<ShapedType>();
     unsigned rank = shapedType.getRank();
@@ -313,6 +311,40 @@
   return res;
 }
 
+/// Linalg operations on tensors might not have an `init` tensor if they don't
+/// have reduction semantics. Tiling, however, requires an init tensor, since
+/// it is the tensor into which the subtensor created by the body of the tiled
+/// loop is inserted. This struct provides a uniform interface to reason about
+/// init tensors during tiling.
+struct TiledOp {
+  TiledOp(OpBuilder &b, LinalgOp op) : op(op) {
+    if (op.getOperation()->getNumResults() != 0 &&
+        op.getNumInitTensors() == 0) {
+      for (Value result : op.getOperation()->getResults()) {
+        ShapedType resultType = result.getType().cast<ShapedType>();
+        SmallVector<Value, 4> shape = llvm::to_vector<4>(
+            llvm::map_range(llvm::seq<int64_t>(0, resultType.getRank()),
+                            [&](int64_t v) -> Value {
+                              return b.create<DimOp>(op.getLoc(), result, v);
+                            }));
+        initTensors.push_back(b.create<linalg::InitTensorOp>(
+            op.getLoc(), shape, resultType.getElementType()));
+      }
+    } else {
+      initTensors = llvm::to_vector<1>(op.getInitTensors());
+    }
+  }
+  SmallVector<Value, 4> getTiledOperands() {
+    SmallVector<Value, 4> tiledOperands =
+        llvm::to_vector<4>(op.getShapedOperands());
+    if (op.getNumInitTensors() == 0)
+      tiledOperands.append(initTensors.begin(), initTensors.end());
+    return tiledOperands;
+  }
+  LinalgOp op;
+  SmallVector<Value, 1> initTensors;
+};
+
 template <typename LoopTy>
 static Optional<TiledLinalgOp>
 tileLinalgOpImpl(OpBuilder &b, LinalgOp op, ValueRange tileSizes,
@@ -341,6 +373,7 @@
   LoopIndexToRangeIndexMap loopIndexToRangeIndex;
   std::tie(loopRanges, loopIndexToRangeIndex) = makeTiledLoopRanges(
       b, op.getLoc(), shapeSizesToLoopsMap, allShapeSizes, tileSizes);
+
   SmallVector<Attribute, 4> iteratorTypes;
   for (auto attr :
        enumerate(op.iterator_types().cast<ArrayAttr>().getValue())) {
@@ -374,9 +407,9 @@
   // 2. Create the tiled loops.
   LinalgOp res = op;
   SmallVector<Value, 4> ivs, tensorResults;
-  auto initTensors = op.getInitTensors();
+  TiledOp tiledOp(b, op);
   GenerateLoopNest<LoopTy>::doit(
-      loopRanges, /*iterArgInitValues*/ initTensors, iteratorTypes,
+      loopRanges, /*iterArgInitValues*/ tiledOp.initTensors, iteratorTypes,
       [&](ValueRange localIvs, ValueRange iterArgs) -> scf::ValueVector {
         auto &b = ScopedContext::getBuilderRef();
         auto loc = ScopedContext::getLocation();
@@ -391,47 +424,42 @@
         else
           interchangedIvs.assign(ivs.begin(), ivs.end());
 
-        assert(op.getNumInitTensors() == iterArgs.size() &&
+        assert((tiledOp.initTensors.size() == iterArgs.size()) &&
                "num init tensors must match number of loop iter arguments");
-        // This uses knowledge about position of the init tensor in the list
-        // of operands.
-        auto operands = llvm::to_vector<4>(op.getShapedOperands());
+        // Replace the init tensors of the original operands with the arguments
+        // of the loop to get the init tensors of the tiled operation.
+        // This uses knowledge about position of the init tensor in the list of
+        // operands.
+        SmallVector<Value, 4> tiledOperands = tiledOp.getTiledOperands();
         std::copy(iterArgs.begin(), iterArgs.end(),
-                  operands.begin() + op.getNumInputsAndOutputBuffers());
+                  tiledOperands.begin() + op.getNumInputsAndOutputBuffers());
 
-        SmallVector<Value, 4> tiledOperands =
-            makeTiledShapes(b, loc, op, operands, shapeSizesToLoopsMap,
+        SmallVector<Value, 4> tiledValues =
+            makeTiledShapes(b, loc, op, tiledOperands, shapeSizesToLoopsMap,
                             interchangedIvs, tileSizes, allShapeSizes);
-        auto nonShapedOperands = op.getAssumedNonShapedOperands();
-        tiledOperands.append(nonShapedOperands.begin(),
-                             nonShapedOperands.end());
-
-        // If LinalgOp has results, they must all be tied to init tensors.
-        // We enforce this to ensure all tiled ops have been rewritten in
-        // "init tensor" form. This ensures tiling has anchor values into which
-        // to subtensor / subtensor_insert. Otherwise tiling would need to
-        // allocate which is not acceptable.
-        // This would not be the case with a special terminator op that
-        // generates the whole tensor (instead of inserting a subtensor). But
-        // the generator-based abstraction has other issues.
-        assert(op.getNumInitTensors() == op->getNumResults() &&
-               "expected same number of init tensors as number of results");
 
         // Handle init tensor operands.
         // This uses knowledge about position of the init tensor in the list
         // of operands.
         // TODO: InterfaceAdaptor ?
         SmallVector<Type, 4> resultTensorTypes;
-        for (auto idx : llvm::seq<unsigned>(0, op.getNumInitTensors()))
+        for (auto idx :
+             llvm::seq<unsigned>(0, op.getOperation()->getNumResults()))
           resultTensorTypes.push_back(
-              tiledOperands[op.getNumInputsAndOutputBuffers() + idx].getType());
+              tiledValues[op.getNumInputsAndOutputBuffers() + idx].getType());
 
-        res = op.clone(b, loc, resultTensorTypes, tiledOperands);
+        SmallVector<Value, 4> clonedOpOperands = llvm::to_vector<4>(
+            ArrayRef<Value>(tiledValues).take_front(op.getNumShapedOperands()));
+        auto nonShapedOperands = op.getAssumedNonShapedOperands();
+        clonedOpOperands.append(nonShapedOperands.begin(),
+                                nonShapedOperands.end());
+        res = op.clone(b, loc, resultTensorTypes, clonedOpOperands);
 
         // Insert a subtensor_insert for each init subtensor.
-        for (unsigned idx = 0, e = op.getNumInitTensors(); idx != e; ++idx) {
+        for (unsigned idx = 0, e = tiledOp.initTensors.size(); idx != e;
+             ++idx) {
           Value initTensor =
-              tiledOperands[op.getNumInputsAndOutputBuffers() + idx];
+              tiledValues[op.getNumInputsAndOutputBuffers() + idx];
           if (auto subtensor = initTensor.getDefiningOp<SubTensorOp>()) {
             tensorResults.push_back(b.create<SubTensorInsertOp>(
                 loc, subtensor.source().getType(), res->getResult(idx),
@@ -581,10 +609,10 @@
 static void insertTilingPatterns(OwningRewritePatternList &patterns,
                                  const LinalgTilingOptions &options,
                                  MLIRContext *ctx) {
-  RewritePatternList<
+  RewritePatternList<GenericOp, IndexedGenericOp,
 #define GET_OP_LIST
 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
-      >::insert(patterns, options, ctx);
+                     >::insert(patterns, options, ctx);
 }
 
 static void applyTilingToLoopPatterns(LinalgTilingLoopType loopType,
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -125,15 +125,8 @@
   if (failed(marker.checkAndNotify(rewriter, linalgOp)))
     return failure();
 
-  // If LinalgOp has results, they must all be tied to init tensors.
-  // We enforce this to ensure all tiled ops have been rewritten in
-  // "init tensor" form. This ensures tiling has anchor values into which to
-  // subtensor / subtensor_insert. Otherwise tiling would need to allocate which
-  // is not acceptable.
-  // This would not be the case with a special terminator op that generates the
-  // whole tensor (instead of inserting a subtensor). But the generator-based
-  // abstraction has other issues.
-  if (linalgOp.getNumInitTensors() != linalgOp->getNumResults())
+  if (linalgOp.getNumInitTensors() != 0 &&
+      linalgOp.getNumInitTensors() != linalgOp->getNumResults())
     return failure();
 
   Optional<TiledLinalgOp> res = tileLinalgOp(rewriter, linalgOp, options);
diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir
--- a/mlir/test/Dialect/Linalg/tile-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" -mlir-disable-threading=true | FileCheck %s
+// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" -split-input-file | FileCheck %s
 
 // CHECK-LABEL: func @matmul_tensors(
 // CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
@@ -26,3 +26,40 @@
 // CHECK: return %[[TD0]] : tensor<?x?xf32>
   return %0 : tensor<?x?xf32>
 }
+
+// -----
+
+func @generic_op_tensors(
+  %arg0 : tensor<?x?x?xf32>, %arg1 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
+  %0 = linalg.generic
+    {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                      affine_map<(d0, d1, d2) -> (d0, d2, d1)>,
+                      affine_map<(d0, d1, d2) -> (d2, d1, d0)>],
+     iterator_types = ["parallel", "parallel", "parallel"]}
+    ins(%arg0, %arg1 : tensor<?x?x?xf32>, tensor<?x?x?xf32>) {
+    ^bb0(%arg2 : f32, %arg3: f32):
+      %1 = addf %arg2, %arg3 : f32
+      linalg.yield %1 : f32
+    } -> tensor<?x?x?xf32>
+  return %0 : tensor<?x?x?xf32>
+}
+
+// CHECK-LABEL: func @generic_op_tensors
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
+// CHECK: %[[INIT:.+]] = linalg.init_tensor
+// CHECK: %[[TD0:.+]] = scf.for %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[TC0:.+]] = %[[INIT]]) -> (tensor<?x?x?xf32>) {
+// CHECK: %[[TD1:.+]] = scf.for %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[TC1:.+]] = %[[TC0]]) -> (tensor<?x?x?xf32>) {
+// CHECK: %[[TD2:.+]] = scf.for %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[TC2:.+]] = %[[TC1]]) -> (tensor<?x?x?xf32>) {
+// CHECK: %[[STARG0:.+]] = subtensor %[[ARG0]][{{.+}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
+// CHECK: %[[STARG1:.+]] = subtensor %[[ARG1]][{{.+}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
+// CHECK: %[[STRETURN:.+]] = linalg.generic
+// CHECK-SAME: ins(%[[STARG0]], %[[STARG1]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>)
+// CHECK: %[[TD:.+]] = subtensor_insert %[[STRETURN]] into %[[TC2]]
+// CHECK: scf.yield %[[TD]]
+// CHECK: }
+// CHECK: scf.yield %[[TD2]]
+// CHECK: }
+// CHECK: scf.yield %[[TD1]]
+// CHECK: }
+// CHECK: return %[[TD0]]
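
For readers skimming the patch: the new `generic_op_tensors` test above encodes the behavior this change enables. When a `linalg.generic` with tensor results has no init tensor, the `TiledOp` helper materializes one (via `dim` plus `linalg.init_tensor`) and threads it through the tiled loop nest as `iter_args`, writing each tile back with `subtensor_insert`. The following is a hand-written sketch of that shape, not compiler output; the function name, SSA names, elided inner loops, and subtensor offsets are illustrative only.

func @tiled_generic_sketch(%A: tensor<?x?x?xf32>, %B: tensor<?x?x?xf32>)
    -> tensor<?x?x?xf32> {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %c2 = constant 2 : index
  %d0 = dim %A, %c0 : tensor<?x?x?xf32>
  %d1 = dim %A, %c1 : tensor<?x?x?xf32>
  %d2 = dim %A, %c2 : tensor<?x?x?xf32>
  // The missing init tensor is materialized from the result shape.
  %init = linalg.init_tensor [%d0, %d1, %d2] : tensor<?x?x?xf32>
  // Outer tiled loop (tile size 2); the init tensor flows through iter_args.
  // The two inner loops (tile sizes 3 and 4) follow the same pattern.
  %res = scf.for %iv = %c0 to %d0 step %c2 iter_args(%acc = %init)
      -> (tensor<?x?x?xf32>) {
    // Body: take subtensors of %A and %B, run the tiled linalg.generic, and
    // insert the resulting tile into %acc via subtensor_insert before yielding.
    scf.yield %acc : tensor<?x?x?xf32>
  }
  return %res : tensor<?x?x?xf32>
}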