diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -512,6 +512,15 @@
   assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops");
 }
 
+static Value fullyComposeAndAffineApply(OpBuilder &b, Location loc,
+                                        AffineExpr expr, ValueRange operands) {
+  AffineMap map = AffineMap::inferFromExprList({expr}).front();
+  SmallVector<Value> normalizedOperands(operands.begin(), operands.end());
+  mlir::fullyComposeAffineMapAndOperands(&map, &normalizedOperands);
+  canonicalizeMapAndOperands(&map, &normalizedOperands);
+  return b.createOrFold<AffineApplyOp>(loc, map, normalizedOperands);
+}
+
 Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
                      ValueRange tileSizes, AffineMap map, ValueRange lbs,
                      ValueRange ubs, ValueRange subShapeSizes) {
@@ -547,16 +556,21 @@
         applyMapToValues(builder, loc, m, subShapeSizes).front();
     // Resulting size needs to be made half open interval again.
     AffineExpr s0 = getAffineSymbolExpr(0, builder.getContext());
-    Value size = makeComposedAffineApply(builder, loc, s0 + 1, closedIntSize);
+    Value size =
+        fullyComposeAndAffineApply(builder, loc, s0 + 1, closedIntSize);
     LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: raw size: " << size << "\n");
 
     // The size of the subview / extract_slice should be trimmed to avoid
-    // out-of-bounds accesses, unless we statically know the subshape size
-    // divides the shape size evenly.
+    // out-of-bounds accesses, unless:
+    // a. We statically know the subshape size divides the shape size evenly.
+    // b. The subshape size is 1. According to the way the loops are set up,
+    //    tensors with "0" dimensions would never be constructed.
     int64_t shapeSize = shape[r];
     auto sizeCst = size.getDefiningOp<ConstantIndexOp>();
-    if (ShapedType::isDynamic(shapeSize) || !sizeCst ||
-        (shapeSize % sizeCst.getValue()) != 0) {
+    auto hasTileSizeOne = sizeCst && sizeCst.getValue() == 1;
+    auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) &&
+                         ((shapeSize % sizeCst.getValue()) == 0);
+    if (!hasTileSizeOne && !dividesEvenly) {
       LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: shapeSize=" << shapeSize
                               << ", size: " << size
                               << ": make sure in bound with affine.min\n");
@@ -570,6 +584,7 @@
       Value d = applyMapToValues(builder, loc, m, ubs).front();
       SmallVector<Value, 4> operands{size, d, offset};
       fullyComposeAffineMapAndOperands(&minMap, &operands);
+      canonicalizeMapAndOperands(&minMap, &operands);
       size = builder.create<AffineMinOp>(loc, builder.getIndexType(), minMap,
                                          operands);
     }
@@ -616,7 +631,7 @@
     // Before composing, we need to make range a closed interval.
     Value size = isTiled ? tileSizes[idx] : sizeBounds[idx];
     AffineExpr d0 = getAffineDimExpr(0, b.getContext());
-    sizes.push_back(makeComposedAffineApply(b, loc, d0 - 1, size));
+    sizes.push_back(fullyComposeAndAffineApply(b, loc, d0 - 1, size));
     LLVM_DEBUG(llvm::dbgs() << "computeTileSizes: " << sizes.back() << "\n");
   }
   return sizes;
diff --git a/mlir/test/Dialect/Linalg/fusion-sequence.mlir b/mlir/test/Dialect/Linalg/fusion-sequence.mlir
--- a/mlir/test/Dialect/Linalg/fusion-sequence.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-sequence.mlir
@@ -212,7 +212,6 @@
   }
 }
 
-// CHaECK: #[[MAP0:.+]] = affine_map<(d0, d1) -> (16, d0 - d1)>
 // CHECK: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
 // CHECK: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s0, 16, -d0 + s1)>
 
diff --git a/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir b/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir
--- a/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir
+++ b/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir
@@ -25,3 +25,50 @@
       outs(%out: tensor<?x2000xf32>) -> tensor<?x2000xf32>
   return %r : tensor<?x2000xf32>
 }
+
+// -----
+
+// The input IR of this test case is a tiled and peeled linalg.matmul op.
+
+// CHECK-LABEL: func @tiled_and_peeled_matmul(
+// CHECK: linalg.matmul ins({{.*}} : tensor<32x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<32x258xf32>) -> tensor<32x258xf32>
+// CHECK: linalg.matmul ins({{.*}} : tensor<1x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<1x258xf32>) -> tensor<1x258xf32>
+#map0 = affine_map<(d0) -> (64, -d0 + 257)>
+#map1 = affine_map<()[s0] -> ((s0 floordiv 32) * 32)>
+#map2 = affine_map<(d0)[s0] -> (d0 - (s0 floordiv 32) * 32)>
+
+func @tiled_and_peeled_matmul(%arg0: tensor<257x259xf32>, %arg1: tensor<259x258xf32>, %arg2: tensor<257x258xf32>) -> tensor<257x258xf32> {
+  %c257 = constant 257 : index
+  %c64 = constant 64 : index
+  %cst = constant 0.000000e+00 : f32
+  %c0 = constant 0 : index
+  %c32 = constant 32 : index
+  %0 = linalg.fill(%cst, %arg2) : f32, tensor<257x258xf32> -> tensor<257x258xf32>
+  %1 = scf.for %arg3 = %c0 to %c257 step %c64 iter_args(%arg4 = %0) -> (tensor<257x258xf32>) {
+    %2 = affine.min #map0(%arg3)
+    %3 = tensor.extract_slice %arg0[%arg3, 0] [%2, 259] [1, 1] : tensor<257x259xf32> to tensor<?x259xf32>
+    %4 = tensor.extract_slice %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<257x258xf32> to tensor<?x258xf32>
+    %5 = affine.apply #map1()[%2]
+    %6 = scf.for %arg5 = %c0 to %5 step %c32 iter_args(%arg6 = %4) -> (tensor<?x258xf32>) {
+      %10 = tensor.extract_slice %3[%arg5, 0] [32, 259] [1, 1] : tensor<?x259xf32> to tensor<32x259xf32>
+      %11 = tensor.extract_slice %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<?x258xf32> to tensor<32x258xf32>
+      %12 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%10, %arg1 : tensor<32x259xf32>, tensor<259x258xf32>) outs(%11 : tensor<32x258xf32>) -> tensor<32x258xf32>
+      %13 = tensor.insert_slice %12 into %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<32x258xf32> into tensor<?x258xf32>
+      scf.yield %13 : tensor<?x258xf32>
+    }
+    %7 = cmpi slt, %5, %2 : index
+    %8 = scf.if %7 -> (tensor<?x258xf32>) {
+      %10 = affine.apply #map2(%2)[%2]
+      %11 = tensor.extract_slice %3[%5, 0] [%10, 259] [1, 1] : tensor<?x259xf32> to tensor<?x259xf32>
+      %12 = tensor.extract_slice %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> to tensor<?x258xf32>
+      %13 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%11, %arg1 : tensor<?x259xf32>, tensor<259x258xf32>) outs(%12 : tensor<?x258xf32>) -> tensor<?x258xf32>
+      %14 = tensor.insert_slice %13 into %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> into tensor<?x258xf32>
+      scf.yield %14 : tensor<?x258xf32>
+    } else {
+      scf.yield %6 : tensor<?x258xf32>
+    }
+    %9 = tensor.insert_slice %8 into %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<?x258xf32> into tensor<257x258xf32>
+    scf.yield %9 : tensor<257x258xf32>
+  }
+  return %1 : tensor<257x258xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir
--- a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir
+++ b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-1d | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-2d | FileCheck %s
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-1d | FileCheck %s -check-prefix=CHECK-1D
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-2d | FileCheck %s -check-prefix=CHECK-2D
 func @matmul(%A: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
              %B: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
              %C: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>) {
@@ -11,17 +11,36 @@
   return
 }
 
-// CHECK-LABEL:func @matmul
-// CHECK: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
-// CHECK: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
-// CHECK: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
+// CHECK-1D-LABEL:func @matmul
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
 //
-// CHECK: linalg.copy
-// CHECK: linalg.copy
-// CHECK: linalg.copy
+// CHECK-1D: vector.transfer_read {{.*}} : memref<8x16xf32, #{{.*}}>, vector<8x16xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32, #{{.*}}>
+// CHECK-1D: vector.transfer_read {{.*}} : memref<16x12xf32, #{{.*}}>, vector<16x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32, #{{.*}}>
+// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}>
 //
-// CHECK: vector.contract
-// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"]
-// CHECK-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
+// CHECK-1D: vector.contract
+// CHECK-1D-SAME: iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK-1D-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
 //
-// CHECK: linalg.copy
+// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}>
+
+// CHECK-2D-LABEL:func @matmul
+// CHECK-2D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+// CHECK-2D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
+// CHECK-2D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
+//
+// CHECK-2D: linalg.copy
+// CHECK-2D: linalg.copy
+// CHECK-2D: linalg.copy
+//
+// CHECK-2D: vector.contract
+// CHECK-2D-SAME: iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK-2D-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
+//
+// CHECK-2D: linalg.copy
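A hypothetical before/after sketch of what exemption (b) in the Utils.cpp hunk changes for a unit tile size; the value names %t, %iv, %ub and the exact min map below are illustrative, not taken from the tests. Previously the slice size was still guarded by an affine.min, so the peeled operand kept a dynamic type:

  %size = affine.min affine_map<(d0)[s0] -> (1, -d0 + s0)>(%iv)[%ub]
  %lhs = tensor.extract_slice %t[%iv, 0] [%size, 259] [1, 1] : tensor<?x259xf32> to tensor<?x259xf32>

With this change a constant tile size of 1 is used directly, so the slice folds to a statically shaped type, which is what the new linalg.matmul CHECK lines in tile-scalarize-dynamic-dims.mlir rely on:

  %lhs = tensor.extract_slice %t[%iv, 0] [1, 259] [1, 1] : tensor<?x259xf32> to tensor<1x259xf32>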