diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -306,14 +306,21 @@ // Bufferize ops top-to-bottom. When creating a new op, we should ideally // know the exact memref type of all operands. Otherwise, we have to use a // memref type with a fully dynamic layout map, which has to canonicalize - // away. - // Moreover, if "fullyDynamicLayoutMaps = false", we may otherwise have to - // insert buffer copies to fold ("finalize") to_memref(to_tensor(x)) ops with - // non-cast-compatible layout maps. + // away. This is less efficient. + // + // Note: If "fullyDynamicLayoutMaps = false", we may have to insert buffer + // copies to fold ("finalize") to_memref(to_tensor(x)) ops with non-cast- + // compatible layout maps when doing a traversal other than top-to-bottom. + // There are currently no canonicalization patterns to fold these away. GreedyRewriteConfig config; config.useTopDownTraversal = true; - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + // TODO: Perform a preorder walk instead of the greedy pattern rewriter. This + // would be more efficient because every bufferization pattern is guaranteed + // to apply only a single time (otherwise, an assertion would be triggered). + // However, there are restrictions wrt. erasing ops during a preorder walk, + // which would likely require a larger refactoring. 
+ if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config))) return failure(); return checkBufferizationResult(op, state.getOptions()); diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir @@ -132,8 +132,6 @@ // ----- -// CHECK: #[[$MAP3:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> - // CHECK-LABEL: func @unknown_op_may_read( func @unknown_op_may_read(%v: vector<5xf32>) -> (tensor<10xf32>, tensor<10xf32>) { @@ -142,27 +140,25 @@ // One alloc for the init_tensor, another one because the transfer_write // bufferizes out-of-place. - // CHECK: %[[m1:.*]] = memref.alloc() {{.*}} : memref<10xf32> // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32> - // CHECK: %[[alloc_casted:.*]] = memref.cast %[[alloc]] : memref<10xf32> to memref<10xf32, #[[$MAP3]]> - // CHECK: %[[m1_casted:.*]] = memref.cast %[[m1]] : memref<10xf32> to memref<10xf32, #[[$MAP3]]> + // CHECK: %[[m1:.*]] = memref.alloc() {{.*}} : memref<10xf32> %t1 = linalg.init_tensor [10] : tensor<10xf32> // CHECK: linalg.fill(%{{.*}}, %[[m1]]) - // CHECK: %[[filled_tensor:.*]] = bufferization.to_tensor %[[m1_casted]] + // CHECK: %[[filled_tensor:.*]] = bufferization.to_tensor %[[m1]] %filled = linalg.fill(%cst, %t1) : f32, tensor<10xf32> -> tensor<10xf32> // The transfer_write is out-of-place because "dummy_op" may read. 
// CHECK: memref.copy %[[m1]], %[[alloc]] // CHECK: vector.transfer_write %{{.*}}, %[[alloc]] - // CHECK: %[[alloc_tensor:.*]] = bufferization.to_tensor %[[alloc_casted]] + // CHECK: %[[alloc_tensor:.*]] = bufferization.to_tensor %[[alloc]] %1 = vector.transfer_write %v, %filled[%idx] : vector<5xf32>, tensor<10xf32> // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[filled_tensor]]) %2 = "test.dummy_op"(%filled) : (tensor<10xf32>) -> (tensor<10xf32>) - // CHECK: memref.dealloc %[[alloc]] - // CHECK: memref.dealloc %[[m1]] + // CHECK-DAG: memref.dealloc %[[alloc]] + // CHECK-DAG: memref.dealloc %[[m1]] // CHECK: return %[[alloc_tensor]], %[[dummy]] return %1, %2 : tensor<10xf32>, tensor<10xf32> } diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-alloca.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-alloca.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-alloca.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-alloca.mlir @@ -31,19 +31,19 @@ %v1 = arith.constant 1.0 : f32 %v2 = arith.constant 2.0 : f32 - // CHECK-NEXT: %[[A:.*]] = memref.alloca() {alignment = 128 : i64} : memref<64xf32> - // CHECK-NEXT: %[[B:.*]] = memref.alloca() {alignment = 128 : i64} : memref<64xf32> // CHECK-NEXT: %[[C:.*]] = memref.alloca() {alignment = 128 : i64} : memref - // CHECK-NEXT: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> - // CHECK-NEXT: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> - // CHECK-NEXT: %[[cC:.*]] = memref.cast %[[C]] : memref to memref + // CHECK-NEXT: %[[B:.*]] = memref.alloca() {alignment = 128 : i64} : memref<64xf32> + // CHECK-NEXT: %[[A:.*]] = memref.alloca() {alignment = 128 : i64} : memref<64xf32> + // CHECK-DAG: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> + // CHECK-DAG: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> + // CHECK-DAG: 
%[[cC:.*]] = memref.cast %[[C]] : memref to memref %A = linalg.init_tensor [64] : tensor<64xf32> %B = linalg.init_tensor [64] : tensor<64xf32> %C = linalg.init_tensor [] : tensor - // CHECK-NEXT: linalg.fill(%[[C1]], %[[A]]) : f32, memref<64xf32> - // CHECK-NEXT: linalg.fill(%[[C2]], %[[B]]) : f32, memref<64xf32> - // CHECK-NEXT: linalg.fill(%[[C0]], %[[C]]) : f32, memref + // CHECK-DAG: linalg.fill(%[[C1]], %[[A]]) : f32, memref<64xf32> + // CHECK-DAG: linalg.fill(%[[C2]], %[[B]]) : f32, memref<64xf32> + // CHECK-DAG: linalg.fill(%[[C0]], %[[C]]) : f32, memref %AA = linalg.fill(%v1, %A) : f32, tensor<64xf32> -> tensor<64xf32> %BB = linalg.fill(%v2, %B) : f32, tensor<64xf32> -> tensor<64xf32> %CC = linalg.fill(%v0, %C) : f32, tensor -> tensor diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir @@ -20,7 +20,6 @@ // insert_slice. InitTensorOp replaces the init_tensor with an out-of-place // extract_slice. 
// CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]]) - // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1] %a = linalg.init_tensor[%sz] : tensor // CHECK: linalg.fill({{.*}}, %[[EXTRACT_SLICE_ALLOC]]) : f32, memref @@ -31,6 +30,7 @@ // CHECK: memref.copy %[[EXTRACT_SLICE_ALLOC]], %[[SV0_ALLOC]] : memref to memref %r0 = tensor.insert_slice %f into %t[0][%sz][1]: tensor into tensor + // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1] // CHECK: memref.copy %[[EXTRACT_SLICE_ALLOC]], %[[T_SUBVIEW]] %r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor into tensor diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir @@ -187,9 +187,9 @@ -> (tensor, tensor, tensor, tensor) { // Hoisted allocs. - // CHECK: %[[REALLOC3:.*]] = memref.alloc - // CHECK: %[[REALLOC2:.*]] = memref.alloc // CHECK: %[[REALLOC1:.*]] = memref.alloc + // CHECK: %[[REALLOC2:.*]] = memref.alloc + // CHECK: %[[REALLOC3:.*]] = memref.alloc // Alloc and copy the whole result tensor. Copy the tensor.extract_slice. 
// CHECK: memref.copy %[[A0]], %[[REALLOC3]] @@ -414,12 +414,12 @@ // CHECK: func @main() func @main() { -// CHECK: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32> +// CHECK-DAG: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32> %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> -// CHECK: %[[alloc:.*]] = memref.alloc -// CHECK: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]> -// CHECK: memref.copy %[[A]], %[[alloc]] +// CHECK-DAG: %[[alloc:.*]] = memref.alloc +// CHECK-DAG: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]> +// CHECK-DAG: memref.copy %[[A]], %[[alloc]] // CHECK: call @some_external_func(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> () call @some_external_func(%A) : (tensor<4xi32>) -> () @@ -436,12 +436,12 @@ // CHECK: func @main() func @main() { -// CHECK: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32> +// CHECK-DAG: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32> %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> -// CHECK: %[[alloc:.*]] = memref.alloc -// CHECK: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]> -// CHECK: memref.copy %[[A]], %[[alloc]] +// CHECK-DAG: %[[alloc:.*]] = memref.alloc +// CHECK-DAG: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]> +// CHECK-DAG: memref.copy %[[A]], %[[alloc]] // CHECK: call @some_external_func_within_scf_execute(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> () scf.execute_region { call @some_external_func_within_scf_execute(%A) : (tensor<4xi32>) -> () @@ -552,14 +552,14 @@ %lb : index, %ub : index, %step : index) -> (tensor, tensor) { -// CHECK: call @scf_for_with_tensor_insert_slice(%[[A]], %[[B]], %[[C]] +// CHECK-DAG: call @scf_for_with_tensor_insert_slice(%[[A]], %[[B]], %[[C]] %r0:2 = call @scf_for_with_tensor_insert_slice(%A, %B, %C, %lb, %ub, %step) : (tensor, tensor, 
tensor<4xf32>, index, index, index) -> (tensor, tensor) // %r0#0 requires a copy because we have no idea what the function is doing. -// CHECK: %[[alloc:.*]] = memref.alloc -// CHECK: %[[casted:.*]] = memref.cast %[[alloc]] +// CHECK-DAG: %[[alloc:.*]] = memref.alloc +// CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]] // CHECK: memref.copy %[[B]], %[[alloc]] // CHECK-NEXT: call @some_external_func(%[[casted]]) : (memref) -> () call @some_external_func(%r0#0) : (tensor) -> () @@ -601,19 +601,19 @@ %v1 = arith.constant 1.0 : f32 %v2 = arith.constant 2.0 : f32 - // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> - // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> // CHECK-NEXT: %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref - // CHECK-NEXT: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> - // CHECK-NEXT: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> - // CHECK-NEXT: %[[cC:.*]] = memref.cast %[[C]] : memref to memref + // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> + // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> + // CHECK-DAG: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> + // CHECK-DAG: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> + // CHECK-DAG: %[[cC:.*]] = memref.cast %[[C]] : memref to memref %A = linalg.init_tensor [64] : tensor<64xf32> %B = linalg.init_tensor [64] : tensor<64xf32> %C = linalg.init_tensor [] : tensor - // CHECK-NEXT: linalg.fill(%[[C1]], %[[A]]) : f32, memref<64xf32> - // CHECK-NEXT: linalg.fill(%[[C2]], %[[B]]) : f32, memref<64xf32> - // CHECK-NEXT: linalg.fill(%[[C0]], %[[C]]) : f32, memref + // CHECK-DAG: linalg.fill(%[[C1]], %[[A]]) : f32, memref<64xf32> + // CHECK-DAG: linalg.fill(%[[C2]], %[[B]]) : f32, memref<64xf32> + // CHECK-DAG: 
linalg.fill(%[[C0]], %[[C]]) : f32, memref<f32> %AA = linalg.fill(%v1, %A) : f32, tensor<64xf32> -> tensor<64xf32> %BB = linalg.fill(%v2, %B) : f32, tensor<64xf32> -> tensor<64xf32> %CC = linalg.fill(%v0, %C) : f32, tensor<f32> -> tensor<f32> @@ -725,7 +725,6 @@ tensor<256x192xf32> to tensor<256x16xf32> // %4 does not match an insert_slice, it cannot be bufferized inplace and needs to alloc. - // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1] %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] : tensor<128x192xf32> to tensor<8x16xf32> @@ -751,6 +750,7 @@ // insert_slice is inplace but its source comes from an equivalent buffer // that is not in place. So we must insert a copy of the small buffer into // the bigger buffer. + // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1] // CHECK: memref.copy %[[ALLOC]], %[[T]] %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<128x192xf32>