diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -707,6 +707,10 @@ if (Stmt *Ret = S.getReturnStmt()) EmitStmt(Ret); + + // LLVM requires the frontend to add the function attribute. See + // Coroutines.rst. + CurFn->addFnAttr("coroutine.presplit", "0"); } // Emit coroutine intrinsic and patch up arguments of the token type. diff --git a/clang/test/CodeGenCoroutines/coro-always-inline.cpp b/clang/test/CodeGenCoroutines/coro-always-inline.cpp --- a/clang/test/CodeGenCoroutines/coro-always-inline.cpp +++ b/clang/test/CodeGenCoroutines/coro-always-inline.cpp @@ -48,3 +48,15 @@ // CHECK: [[CAST3:%[0-9]+]] = bitcast %"struct.std::awaitable"* %ref.tmp{{.*}} to i8* // CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* [[CAST3]]) void foo() { co_return; } + +// Check that bar is not inlined even if it's marked as always_inline. + +// CHECK-LABEL: define {{.*}} void @_Z3bazv() +// CHECK: call void @_Z3barv( +__attribute__((__always_inline__)) void bar() { + co_return; +} +void baz() { + bar(); + co_return; +} diff --git a/clang/test/CodeGenCoroutines/coro-attributes.cpp b/clang/test/CodeGenCoroutines/coro-attributes.cpp new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCoroutines/coro-attributes.cpp @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s +#include "Inputs/coroutine.h" + +using namespace std; + +struct coro { + struct promise_type { + coro get_return_object(); + suspend_never initial_suspend(); + suspend_never final_suspend() noexcept; + void return_void(); + static void unhandled_exception(); + }; +}; + +// CHECK: void @_Z3foov() #[[FOO_ATTR_NUM:[0-9]+]] +// CHECK: attributes #[[FOO_ATTR_NUM]] = { {{.*}} "coroutine.presplit"="0" +coro foo() { + co_await suspend_always{}; +} diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst --- 
a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -1175,6 +1175,8 @@ A frontend should emit exactly one `coro.id` intrinsic per coroutine. +A frontend should emit the function attribute `"coroutine.presplit"` for the coroutine. + .. _coro.id.async: 'llvm.coro.id.async' Intrinsic @@ -1214,6 +1216,8 @@ A frontend should emit exactly one `coro.id.async` intrinsic per coroutine. +A frontend should emit the function attribute `"coroutine.presplit"` for the coroutine. + .. _coro.id.retcon: 'llvm.coro.id.retcon' Intrinsic @@ -1266,6 +1270,11 @@ The sixth argument must be a reference to a global function that will be used to deallocate memory. It must take a pointer and return ``void``. +Semantics: +"""""""""" + +A frontend should emit the function attribute `"coroutine.presplit"` for the coroutine. + 'llvm.coro.id.retcon.once' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: @@ -1287,6 +1296,11 @@ continuation prototype must be `void` instead of matching the coroutine's return type. +Semantics: +"""""""""" + +A frontend should emit the function attribute `"coroutine.presplit"` for the coroutine. + .. _coro.end: 'llvm.coro.end' Intrinsic diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -176,11 +176,14 @@ lowerCoroNoop(cast(&I)); break; case Intrinsic::coro_id: - // Mark a function that comes out of the frontend that has a coro.id - // with a coroutine attribute. 
if (auto *CII = cast(&I)) { if (CII->getInfo().isPreSplit()) { - F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT); + assert(F.hasFnAttribute(CORO_PRESPLIT_ATTR) && + F.getFnAttribute(CORO_PRESPLIT_ATTR).getValueAsString() == + UNPREPARED_FOR_SPLIT && + "The frontend uses Swtich-Resumed ABI should emit " + "\"coroutine.presplit\" attribute with value \"0\" for the " + "coroutine."); setCannotDuplicate(CII); CII->setCoroutineSelf(); CoroId = cast(&I); @@ -190,6 +193,8 @@ case Intrinsic::coro_id_retcon: case Intrinsic::coro_id_retcon_once: case Intrinsic::coro_id_async: + // TODO: Remove the line once we support it in the corresponding + // frontend. F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT); break; case Intrinsic::coro_resume: diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -36,6 +36,11 @@ // adds coroutine subfunctions to the SCC to be processed by IPO pipeline. // Async lowering similarily triggers a restart of the pipeline after it has // split the coroutine. +// +// FIXME: Refactor these attributes as LLVM attributes instead of string +// attributes since these attributes are already used outside LLVM's +// coroutine module. +// FIXME: Remove these values once we remove the Legacy PM. 
#define CORO_PRESPLIT_ATTR "coroutine.presplit" #define UNPREPARED_FOR_SPLIT "0" #define PREPARED_FOR_SPLIT "1" diff --git a/llvm/test/Transforms/Coroutines/coro-async.ll b/llvm/test/Transforms/Coroutines/coro-async.ll --- a/llvm/test/Transforms/Coroutines/coro-async.ll +++ b/llvm/test/Transforms/Coroutines/coro-async.ll @@ -61,7 +61,7 @@ } -define swiftcc void @my_async_function(i8* swiftasync %async.ctxt, %async.task* %task, %async.actor* %actor) !dbg !1 { +define swiftcc void @my_async_function(i8* swiftasync %async.ctxt, %async.task* %task, %async.actor* %actor) "coroutine.presplit"="1" !dbg !1 { entry: %tmp = alloca { i64, i64 }, align 8 %vector = alloca <4 x double>, align 16 @@ -203,7 +203,7 @@ i32 128 ; Initial async context size without space for frame }> -define swiftcc void @my_async_function2(%async.task* %task, %async.actor* %actor, i8* %async.ctxt) "frame-pointer"="all" !dbg !6 { +define swiftcc void @my_async_function2(%async.task* %task, %async.actor* %actor, i8* %async.ctxt) "coroutine.presplit"="1" "frame-pointer"="all" !dbg !6 { entry: %id = call token @llvm.coro.id.async(i32 128, i32 16, i32 2, i8* bitcast (<{i32, i32}>* @my_async_function2_fp to i8*)) @@ -325,7 +325,7 @@ ret void } -define swiftcc void @dont_crash_on_cf(i8* %async.ctxt, %async.task* %task, %async.actor* %actor) { +define swiftcc void @dont_crash_on_cf(i8* %async.ctxt, %async.task* %task, %async.actor* %actor) "coroutine.presplit"="1" { entry: %id = call token @llvm.coro.id.async(i32 128, i32 16, i32 0, i8* bitcast (<{i32, i32}>* @dont_crash_on_cf_fp to i8*)) @@ -371,7 +371,7 @@ ret void } -define swiftcc void @multiple_coro_end_async(i8* %async.ctxt, %async.task* %task, %async.actor* %actor) { +define swiftcc void @multiple_coro_end_async(i8* %async.ctxt, %async.task* %task, %async.actor* %actor) "coroutine.presplit"="1" { entry: %id = call token @llvm.coro.id.async(i32 128, i32 16, i32 0, i8* bitcast (<{i32, i32}>* @dont_crash_on_cf_fp to i8*)) @@ -427,7 +427,7 @@ i32 64 ; 
Initial async context size without space for frame }> -define swiftcc void @polymorphic_suspend_return(i8* swiftasync %async.ctxt, %async.task* %task, %async.actor* %actor) { +define swiftcc void @polymorphic_suspend_return(i8* swiftasync %async.ctxt, %async.task* %task, %async.actor* %actor) "coroutine.presplit"="1" { entry: %tmp = alloca { i64, i64 }, align 8 %proj.1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %tmp, i64 0, i32 0 @@ -496,7 +496,7 @@ i32 128 ; Initial async context size without space for frame }> -define swiftcc void @no_coro_suspend(i8* %async.ctx) { +define swiftcc void @no_coro_suspend(i8* %async.ctx) "coroutine.presplit"="1" { entry: %some_alloca = alloca i64 %id = call token @llvm.coro.id.async(i32 128, i32 16, i32 0, @@ -523,7 +523,7 @@ declare void @do_with_swifterror(i64** swifterror) -define swiftcc void @no_coro_suspend_swifterror(i8* %async.ctx) { +define swiftcc void @no_coro_suspend_swifterror(i8* %async.ctx) "coroutine.presplit"="1" { entry: %some_alloca = alloca swifterror i64* %id = call token @llvm.coro.id.async(i32 128, i32 16, i32 0, @@ -553,7 +553,7 @@ declare void @crash() declare void @use(i8*) -define swiftcc void @undefined_coro_async_resume(i8 *%async.ctx) { +define swiftcc void @undefined_coro_async_resume(i8 *%async.ctx) "coroutine.presplit"="1" { entry: %id = call token @llvm.coro.id.async(i32 24, i32 16, i32 0, i8* bitcast (<{i32, i32}>* @undefined_coro_async_resume_fp to i8*)) %hdl = call i8* @llvm.coro.begin(token %id, i8* null) diff --git a/llvm/test/Transforms/Coroutines/coro-debug-O2.ll b/llvm/test/Transforms/Coroutines/coro-debug-O2.ll --- a/llvm/test/Transforms/Coroutines/coro-debug-O2.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-O2.ll @@ -9,7 +9,7 @@ ; CHECK: ![[PROMISEVAR_RESUME]] = !DILocalVariable(name: "__promise" %promise_type = type { i32, i32, double } -define void @f() !dbg !8 { +define void @f() "coroutine.presplit"="0" !dbg !8 { entry: %__promise = alloca %promise_type, align 8 %0 = 
bitcast %promise_type* %__promise to i8* diff --git a/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll b/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll --- a/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll @@ -56,7 +56,7 @@ declare void @pi64(i64*) declare void @pdouble(double*) -define void @f(i32 %a, i32 %b, i64 %c, double %d) !dbg !8 { +define void @f(i32 %a, i32 %b, i64 %c, double %d) "coroutine.presplit"="0" !dbg !8 { entry: %__promise = alloca %promise_type, align 8 %0 = bitcast %promise_type* %__promise to i8* @@ -182,7 +182,7 @@ } -define void @bar(i32 %a, i64 %c, double %d) !dbg !19 { +define void @bar(i32 %a, i64 %c, double %d) "coroutine.presplit"="0" !dbg !19 { entry: %__promise = alloca %promise_type, align 8 %0 = bitcast %promise_type* %__promise to i8* diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll --- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll @@ -13,7 +13,7 @@ source_filename = "../llvm/test/Transforms/Coroutines/coro-debug-dbg.values-O2.ll" -define void @f(i32 %i, i32 %j) !dbg !8 { +define void @f(i32 %i, i32 %j) "coroutine.presplit"="0" !dbg !8 { entry: %__promise = alloca i8, align 8 %x = alloca [10 x i32], align 16 diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll --- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll @@ -19,7 +19,7 @@ source_filename = "../llvm/test/Transforms/Coroutines/coro-debug-dbg.values-O2.ll" declare void @consume(i32) -define void @f(i32 %i, i32 %j) !dbg !8 { +define void @f(i32 %i, i32 %j) "coroutine.presplit"="0" !dbg !8 { entry: %__promise = alloca i8, 
align 8 %x = alloca [10 x i32], align 16 diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll --- a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll @@ -62,7 +62,7 @@ ; CHECK: ![[IVAR_RESUME]] = !DILocalVariable(name: "i" ; CHECK: ![[JVAR_RESUME]] = !DILocalVariable(name: "j" ; CHECK: ![[JDBGLOC_RESUME]] = !DILocation(line: 32, column: 7, scope: ![[RESUME_SCOPE]]) -define void @f() { +define void @f() "coroutine.presplit"="0" { entry: %__promise = alloca i8, align 8 %i = alloca i32, align 4 diff --git a/llvm/test/Transforms/Coroutines/coro-noalias-param.ll b/llvm/test/Transforms/Coroutines/coro-noalias-param.ll --- a/llvm/test/Transforms/Coroutines/coro-noalias-param.ll +++ b/llvm/test/Transforms/Coroutines/coro-noalias-param.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -S -passes=coro-early | FileCheck %s %struct.A = type <{ i64, i64, i32, [4 x i8] }> -define void @f(%struct.A* nocapture readonly noalias align 8 %a) { +define void @f(%struct.A* nocapture readonly noalias align 8 %a) "coroutine.presplit"="0" { %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() %alloc = call i8* @malloc(i32 %size) diff --git a/llvm/test/Transforms/Coroutines/coro-split-01.ll b/llvm/test/Transforms/Coroutines/coro-split-01.ll --- a/llvm/test/Transforms/Coroutines/coro-split-01.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-01.ll @@ -1,7 +1,7 @@ ; Tests that a coroutine is split, inlined into the caller and devirtualized. 
; RUN: opt < %s -S -enable-coroutines -passes='default' | FileCheck %s -define i8* @f() { +define i8* @f() "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) diff --git a/llvm/test/Transforms/Coroutines/coro-split-recursive.ll b/llvm/test/Transforms/Coroutines/coro-split-recursive.ll --- a/llvm/test/Transforms/Coroutines/coro-split-recursive.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-recursive.ll @@ -13,7 +13,7 @@ ; CHECK: call void @foo() ; CHECK-LABEL: define {{.*}}void @foo.destroy( -define void @foo() { +define void @foo() "coroutine.presplit"="0" { entry: %__promise = alloca i32, align 8 %0 = bitcast i32* %__promise to i8* diff --git a/llvm/test/Transforms/Coroutines/ex0.ll b/llvm/test/Transforms/Coroutines/ex0.ll --- a/llvm/test/Transforms/Coroutines/ex0.ll +++ b/llvm/test/Transforms/Coroutines/ex0.ll @@ -1,7 +1,7 @@ ; First example from Doc/Coroutines.rst (two block loop) ; RUN: opt < %s -enable-coroutines -aa-pipeline=basic-aa -passes='default' -preserve-alignment-assumptions-during-inlining=false -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() diff --git a/llvm/test/Transforms/Coroutines/ex1.ll b/llvm/test/Transforms/Coroutines/ex1.ll --- a/llvm/test/Transforms/Coroutines/ex1.ll +++ b/llvm/test/Transforms/Coroutines/ex1.ll @@ -1,7 +1,7 @@ ; First example from Doc/Coroutines.rst (one block loop) ; RUN: opt < %s -aa-pipeline=basic-aa -passes='default' -enable-coroutines -preserve-alignment-assumptions-during-inlining=false -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() diff --git a/llvm/test/Transforms/Coroutines/ex2.ll 
b/llvm/test/Transforms/Coroutines/ex2.ll --- a/llvm/test/Transforms/Coroutines/ex2.ll +++ b/llvm/test/Transforms/Coroutines/ex2.ll @@ -1,7 +1,7 @@ ; Second example from Doc/Coroutines.rst (custom alloc and free functions) ; RUN: opt < %s -passes='default' -enable-coroutines -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) diff --git a/llvm/test/Transforms/Coroutines/ex3.ll b/llvm/test/Transforms/Coroutines/ex3.ll --- a/llvm/test/Transforms/Coroutines/ex3.ll +++ b/llvm/test/Transforms/Coroutines/ex3.ll @@ -1,7 +1,7 @@ ; Third example from Doc/Coroutines.rst (two suspend points) ; RUN: opt < %s -aa-pipeline=basic-aa -passes='default' -enable-coroutines -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() diff --git a/llvm/test/Transforms/Coroutines/ex4.ll b/llvm/test/Transforms/Coroutines/ex4.ll --- a/llvm/test/Transforms/Coroutines/ex4.ll +++ b/llvm/test/Transforms/Coroutines/ex4.ll @@ -1,7 +1,7 @@ ; Fourth example from Doc/Coroutines.rst (coroutine promise) ; RUN: opt < %s -passes='default' -enable-coroutines -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %promise = alloca i32 %pv = bitcast i32* %promise to i8* diff --git a/llvm/test/Transforms/Coroutines/ex5.ll b/llvm/test/Transforms/Coroutines/ex5.ll --- a/llvm/test/Transforms/Coroutines/ex5.ll +++ b/llvm/test/Transforms/Coroutines/ex5.ll @@ -1,7 +1,7 @@ ; Fifth example from Doc/Coroutines.rst (final suspend) ; RUN: opt < %s -aa-pipeline=basic-aa -passes='default' -enable-coroutines -preserve-alignment-assumptions-during-inlining=false -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = 
call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() diff --git a/llvm/test/Transforms/Coroutines/phi-coro-end.ll b/llvm/test/Transforms/Coroutines/phi-coro-end.ll --- a/llvm/test/Transforms/Coroutines/phi-coro-end.ll +++ b/llvm/test/Transforms/Coroutines/phi-coro-end.ll @@ -1,7 +1,7 @@ ; Verify that we correctly handle suspend when the coro.end block contains phi ; RUN: opt < %s -aa-pipeline=basic-aa -passes='default' -enable-coroutines -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() diff --git a/llvm/test/Transforms/Coroutines/restart-trigger.ll b/llvm/test/Transforms/Coroutines/restart-trigger.ll --- a/llvm/test/Transforms/Coroutines/restart-trigger.ll +++ b/llvm/test/Transforms/Coroutines/restart-trigger.ll @@ -10,7 +10,7 @@ ; CHECK-NEWPM-NOT: CoroSplit: Processing coroutine 'f' state: 1 -define void @f() { +define void @f() "coroutine.presplit"="0" { %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() %alloc = call i8* @malloc(i32 %size) diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncToAsyncRuntime.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncToAsyncRuntime.cpp --- a/mlir/lib/Dialect/Async/Transforms/AsyncToAsyncRuntime.cpp +++ b/mlir/lib/Dialect/Async/Transforms/AsyncToAsyncRuntime.cpp @@ -190,6 +190,13 @@ } } + // A coroutine based on the switch-resumed ABI should be marked with the + // "coroutine.presplit" attribute with value "0" to mark the function as a + // coroutine. 
+ func->setAttr("passthrough", builder.getArrayAttr(builder.getArrayAttr( + {builder.getStringAttr("coroutine.presplit"), + builder.getStringAttr("0")}))); + CoroMachinery machinery; machinery.func = func; machinery.asyncToken = retToken; diff --git a/mlir/test/mlir-opt/async.mlir b/mlir/test/mlir-opt/async.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/mlir-opt/async.mlir @@ -0,0 +1,80 @@ +// Check that MLIR marks the corresponding functions with the required coroutine attribute. +// +// RUN: mlir-opt %s -async-to-async-runtime \ +// RUN: -async-runtime-ref-counting \ +// RUN: -async-runtime-ref-counting-opt \ +// RUN: -convert-async-to-llvm \ +// RUN: -convert-linalg-to-loops \ +// RUN: -convert-scf-to-std \ +// RUN: -convert-linalg-to-llvm \ +// RUN: -convert-memref-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-std-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | FileCheck %s + +// CHECK: llvm.func @async_execute_fn{{.*}}attributes{{.*}}"coroutine.presplit", "0" +// CHECK: llvm.func @async_execute_fn_0{{.*}}attributes{{.*}}"coroutine.presplit", "0" +// CHECK: llvm.func @async_execute_fn_1{{.*}}attributes{{.*}}"coroutine.presplit", "0" + +func @main() { + %i0 = arith.constant 0 : index + %i1 = arith.constant 1 : index + %i2 = arith.constant 2 : index + %i3 = arith.constant 3 : index + + %c0 = arith.constant 0.0 : f32 + %c1 = arith.constant 1.0 : f32 + %c2 = arith.constant 2.0 : f32 + %c3 = arith.constant 3.0 : f32 + %c4 = arith.constant 4.0 : f32 + + %A = memref.alloc() : memref<4xf32> + linalg.fill(%c0, %A) : f32, memref<4xf32> + + %U = memref.cast %A : memref<4xf32> to memref<*xf32> + call @print_memref_f32(%U): (memref<*xf32>) -> () + + memref.store %c1, %A[%i0]: memref<4xf32> + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call @print_memref_f32(%U): (memref<*xf32>) -> () + + %outer = async.execute { + memref.store %c2, %A[%i1]: memref<4xf32> + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call 
@print_memref_f32(%U): (memref<*xf32>) -> () + + // No op async region to create a token for testing async dependency. + %noop = async.execute { + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + async.yield + } + + %inner = async.execute [%noop] { + memref.store %c3, %A[%i2]: memref<4xf32> + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call @print_memref_f32(%U): (memref<*xf32>) -> () + + async.yield + } + async.await %inner : !async.token + + memref.store %c4, %A[%i3]: memref<4xf32> + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call @print_memref_f32(%U): (memref<*xf32>) -> () + + async.yield + } + async.await %outer : !async.token + + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call @print_memref_f32(%U): (memref<*xf32>) -> () + + memref.dealloc %A : memref<4xf32> + + return +} + +func private @mlirAsyncRuntimePrintCurrentThreadId() -> () + +func private @print_memref_f32(memref<*xf32>) attributes { llvm.emit_c_interface }