This is an archive of the discontinued LLVM Phabricator instance.

[mlir][linalg][bufferize] Bufferize using PostOrder traversal
ClosedPublic

Authored by springerm on Oct 15 2021, 8:22 PM.

Details

Summary

This is required for bufferization of scf::IfOp, which is added in a subsequent commit.

Some ops (scf::ForOp, TiledLoopOp) still require a PreOrder traversal to make sure that their bbArgs are mapped before the loop body is bufferized.
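
As a rough illustration (not part of this diff), the two walk orders referred to above are the ones provided by mlir::Operation::walk; the callback bodies below are placeholders for the per-op bufferization logic, and only the traversal structure is the point:

#include "mlir/IR/Operation.h"
#include "mlir/IR/Visitors.h"

using namespace mlir;

void illustrateWalkOrders(Operation *root) {
  // Post-order (the default): ops nested in a region are visited before the
  // op that owns the region, e.g. the body of an scf.if before the scf.if.
  root->walk([](Operation *op) {
    // bufferize `op`; everything nested inside it has already been visited
  });

  // Pre-order: the enclosing op is visited first, so an scf.for or
  // linalg.tiled_loop can have its region bbArgs mapped before the ops in
  // its body are visited.
  root->walk<WalkOrder::PreOrder>([](Operation *op) {
    // bufferize `op` before descending into its regions
  });
}

How exactly the pass mixes the two orders for the ops listed above is best read from the diff itself; the sketch only shows the underlying walk API.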

Diff Detail

Event Timeline

springerm created this revision. Oct 15 2021, 8:22 PM
springerm requested review of this revision. Oct 15 2021, 8:22 PM
springerm updated this revision to Diff 381134. Oct 20 2021, 7:47 PM

fix bug with nested for loop

This revision is now accepted and ready to land. Oct 20 2021, 11:54 PM
This revision was landed with ongoing or failed builds. Oct 21 2021, 1:22 AM
This revision was automatically updated to reflect the committed changes.

Bufferization crashes in various ways on the following example.
The IR may not be in good form to start with, but we still should not crash at callOp->erase(). If I make that removal conditional, I instead hit an assertion failure (see the note after the example):

assert.h assertion failed at third_party/llvm/llvm-project/llvm/include/llvm/ADT/EquivalenceClasses.h:172 in const ElemTy &llvm::EquivalenceClasses<mlir::linalg::BufferizationAliasInfo::ValueWrapper>::getLeaderValue(const ElemTy &) const [ElemTy = mlir::linalg::BufferizationAliasInfo::ValueWrapper]: MI != member_end() && "Value is not in the set!"

module  {
  func @reduction_2d_on_tensors(%arg0: tensor<4096x1024xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<4096xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true}) -> tensor<4096xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
    %cst = arith.constant 0.000000e+00 : f32
    %c16 = arith.constant 16 : index
    %c4096 = arith.constant 4096 : index
    %c1024 = arith.constant 1024 : index
    %c0 = arith.constant 0 : index
    %0 = linalg.fill(%cst, %arg1) : f32, tensor<4096xf32> -> tensor<4096xf32> 
    %1 = linalg.init_tensor [16] : tensor<16xf32>
    %2 = linalg.fill(%cst, %1) : f32, tensor<16xf32> -> tensor<16xf32> 
    %3 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : tensor<16xf32>, vector<16xf32>
    %4 = scf.for %arg2 = %c0 to %c4096 step %c16 iter_args(%arg3 = %0) -> (tensor<4096xf32>) {
      %5 = vector.transfer_read %0[%arg2], %cst {in_bounds = [true]} : tensor<4096xf32>, vector<16xf32>
      %6 = scf.for %arg4 = %c0 to %c1024 step %c16 iter_args(%arg5 = %arg3) -> (tensor<4096xf32>) {
        %7 = vector.transfer_read %arg0[%arg2, %arg4], %cst {in_bounds = [true, true]} : tensor<4096x1024xf32>, vector<16x16xf32>
        %8 = vector.multi_reduction #vector.kind<add>, %7 [1] : vector<16x16xf32> to vector<16xf32>
        %9 = arith.addf %8, %3 : vector<16xf32>
        %10 = arith.addf %5, %9 : vector<16xf32>
        %11 = vector.transfer_write %10, %arg5[%arg2] {in_bounds = [true]} : vector<16xf32>, tensor<4096xf32>
        scf.yield %11 : tensor<4096xf32>
      }
      scf.yield %6 : tensor<4096xf32>
    }
    return %4 : tensor<4096xf32>
  }
  func public @main(%arg0: tensor<4096x1024xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<4096xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true}, %arg2: index) -> tensor<4096xf32> attributes {llvm.emit_c_interface} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %0 = scf.for %arg3 = %c0 to %arg2 step %c1 iter_args(%arg4 = %arg1) -> (tensor<4096xf32>) {
      %1 = call @reduction_2d_on_tensors(%arg0, %arg4) : (tensor<4096x1024xf32>, tensor<4096xf32>) -> tensor<4096xf32>
      scf.yield %1 : tensor<4096xf32>
    }
    return %0 : tensor<4096xf32>
  }
}
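
For reference, the assertion quoted above comes from llvm::EquivalenceClasses<T>::getLeaderValue, which asserts as soon as it is asked for the leader of a value that was never inserted into the set; here that suggests BufferizationAliasInfo is being queried for a Value it never registered. A minimal standalone sketch of the API behavior (not related to the reproducer itself, only to the assertion):

#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::EquivalenceClasses<int> Classes;
  Classes.unionSets(1, 2); // inserts 1 and 2 and puts them in one class

  // Fine: 1 is a member of the set, so it has a leader.
  llvm::outs() << "leader of 1: " << Classes.getLeaderValue(1) << "\n";

  // 3 was never inserted; findLeader reports that without asserting ...
  if (Classes.findLeader(3) == Classes.member_end())
    llvm::outs() << "3 is not in the set\n";

  // ... whereas this call would trip the exact assertion quoted above:
  //   MI != member_end() && "Value is not in the set!"
  // (void)Classes.getLeaderValue(3);
  return 0;
}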