This is required for bufferization of scf::IfOp, which is added in a subsequent commit.
Some ops (scf::ForOp, TiledLoopOp) require PreOrder traversal to make sure that bbArgs are mapped before bufferizing the loop body.
Differential D111924
[mlir][linalg][bufferize] Bufferize using PostOrder traversal
Authored by springerm on Oct 15 2021, 8:22 PM.
Details: This is required for bufferization of scf::IfOp, which is added in a subsequent commit. Some ops (scf::ForOp, TiledLoopOp) require PreOrder traversal to make sure that bbArgs are mapped before bufferizing the loop body.
Diff Detail
Event Timeline

Comment Actions: Bufferization crashes in various ways on the following example.

module {
  func @reduction_2d_on_tensors(%arg0: tensor<4096x1024xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<4096xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true}) -> tensor<4096xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
    %cst = arith.constant 0.000000e+00 : f32
    %c16 = arith.constant 16 : index
    %c4096 = arith.constant 4096 : index
    %c1024 = arith.constant 1024 : index
    %c0 = arith.constant 0 : index
    %0 = linalg.fill(%cst, %arg1) : f32, tensor<4096xf32> -> tensor<4096xf32>
    %1 = linalg.init_tensor [16] : tensor<16xf32>
    %2 = linalg.fill(%cst, %1) : f32, tensor<16xf32> -> tensor<16xf32>
    %3 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : tensor<16xf32>, vector<16xf32>
    %4 = scf.for %arg2 = %c0 to %c4096 step %c16 iter_args(%arg3 = %0) -> (tensor<4096xf32>) {
      %5 = vector.transfer_read %0[%arg2], %cst {in_bounds = [true]} : tensor<4096xf32>, vector<16xf32>
      %6 = scf.for %arg4 = %c0 to %c1024 step %c16 iter_args(%arg5 = %arg3) -> (tensor<4096xf32>) {
        %7 = vector.transfer_read %arg0[%arg2, %arg4], %cst {in_bounds = [true, true]} : tensor<4096x1024xf32>, vector<16x16xf32>
        %8 = vector.multi_reduction #vector.kind<add>, %7 [1] : vector<16x16xf32> to vector<16xf32>
        %9 = arith.addf %8, %3 : vector<16xf32>
        %10 = arith.addf %5, %9 : vector<16xf32>
        %11 = vector.transfer_write %10, %arg5[%arg2] {in_bounds = [true]} : vector<16xf32>, tensor<4096xf32>
        scf.yield %11 : tensor<4096xf32>
      }
      scf.yield %6 : tensor<4096xf32>
    }
    return %4 : tensor<4096xf32>
  }
  func public @main(%arg0: tensor<4096x1024xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<4096xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true}, %arg2: index) -> tensor<4096xf32> attributes {llvm.emit_c_interface} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %0 = scf.for %arg3 = %c0 to %arg2 step %c1 iter_args(%arg4 = %arg1) -> (tensor<4096xf32>) {
      %1 = call @reduction_2d_on_tensors(%arg0, %arg4) : (tensor<4096x1024xf32>, tensor<4096xf32>) -> tensor<4096xf32>
      scf.yield %1 : tensor<4096xf32>
    }
    return %0 : tensor<4096xf32>
  }
}