Index: clang/lib/CodeGen/CGCoroutine.cpp =================================================================== --- clang/lib/CodeGen/CGCoroutine.cpp +++ clang/lib/CodeGen/CGCoroutine.cpp @@ -703,6 +703,9 @@ if (Stmt *Ret = S.getReturnStmt()) EmitStmt(Ret); + + assert(CurFn); + CurFn->addFnAttr("coroutine.presplit", "0"); } // Emit coroutine intrinsic and patch up arguments of the token type. Index: clang/test/CodeGenCoroutines/coro-always-inline.cpp =================================================================== --- clang/test/CodeGenCoroutines/coro-always-inline.cpp +++ clang/test/CodeGenCoroutines/coro-always-inline.cpp @@ -48,3 +48,15 @@ // CHECK: [[CAST3:%[0-9]+]] = bitcast %"struct.std::awaitable"* %ref.tmp{{.*}} to i8* // CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* [[CAST3]]) void foo() { co_return; } + +// Check that bar is not inlined even if it's marked as always_inline. + +// CHECK-LABEL: define {{.*}} void @_Z3bazv() +// CHECK: call void @_Z3barv( +__attribute__((__always_inline__)) void bar() { + co_return; +} +void baz() { + bar(); + co_return; +} Index: clang/test/CodeGenCoroutines/coro-attributes.cpp =================================================================== --- /dev/null +++ clang/test/CodeGenCoroutines/coro-attributes.cpp @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s +#include "Inputs/coroutine.h" + +using namespace std; + +struct coro { + struct promise_type { + coro get_return_object(); + suspend_never initial_suspend(); + suspend_never final_suspend() noexcept; + void return_void(); + static void unhandled_exception(); + }; +}; + +// CHECK: void @_Z3foov() #[[FOO_ATTR_NUM:[0-9]+]] +// CHECK: attributes #[[FOO_ATTR_NUM]] = { {{.*}} "coroutine.presplit"="0" +coro foo() { + co_await suspend_always{}; +} Index: llvm/docs/Coroutines.rst =================================================================== --- llvm/docs/Coroutines.rst +++ 
llvm/docs/Coroutines.rst @@ -1175,6 +1175,9 @@ A frontend should emit exactly one `coro.id` intrinsic per coroutine. +The frontend should add attribute `"coroutine.presplit"` with value `"0"` for the coroutine +containing `coro.id`. + .. _coro.id.async: 'llvm.coro.id.async' Intrinsic Index: llvm/lib/Transforms/Coroutines/CoroEarly.cpp =================================================================== --- llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -180,7 +180,12 @@ // with a coroutine attribute. if (auto *CII = cast(&I)) { if (CII->getInfo().isPreSplit()) { - F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT); + assert(F.hasFnAttribute(CORO_PRESPLIT_ATTR) && + F.getFnAttribute(CORO_PRESPLIT_ATTR).getValueAsString() == + "0" && + "A frontend using the Switch-Resumed ABI should emit " + "\"coroutine.presplit\" attribute with value \"0\" for the " + "coroutine."); setCannotDuplicate(CII); CII->setCoroutineSelf(); CoroId = cast(&I); Index: llvm/lib/Transforms/Coroutines/CoroInternal.h =================================================================== --- llvm/lib/Transforms/Coroutines/CoroInternal.h +++ llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -36,6 +36,10 @@ // adds coroutine subfunctions to the SCC to be processed by IPO pipeline. // Async lowering similarily triggers a restart of the pipeline after it has // split the coroutine. +// +// FIXME: Refactor these attributes as LLVM attributes instead of string +// attributes since these attributes are already used outside the LLVM's +// coroutine module. 
#define CORO_PRESPLIT_ATTR "coroutine.presplit" #define UNPREPARED_FOR_SPLIT "0" #define PREPARED_FOR_SPLIT "1" Index: llvm/test/Transforms/Coroutines/coro-debug-O2.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-debug-O2.ll +++ llvm/test/Transforms/Coroutines/coro-debug-O2.ll @@ -9,7 +9,7 @@ ; CHECK: ![[PROMISEVAR_RESUME]] = !DILocalVariable(name: "__promise" %promise_type = type { i32, i32, double } -define void @f() !dbg !8 { +define void @f() "coroutine.presplit"="0" !dbg !8 { entry: %__promise = alloca %promise_type, align 8 %0 = bitcast %promise_type* %__promise to i8* Index: llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll +++ llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll @@ -56,7 +56,7 @@ declare void @pi64(i64*) declare void @pdouble(double*) -define void @f(i32 %a, i32 %b, i64 %c, double %d) !dbg !8 { +define void @f(i32 %a, i32 %b, i64 %c, double %d) "coroutine.presplit"="0" !dbg !8 { entry: %__promise = alloca %promise_type, align 8 %0 = bitcast %promise_type* %__promise to i8* @@ -182,7 +182,7 @@ } -define void @bar(i32 %a, i64 %c, double %d) !dbg !19 { +define void @bar(i32 %a, i64 %c, double %d) "coroutine.presplit"="0" !dbg !19 { entry: %__promise = alloca %promise_type, align 8 %0 = bitcast %promise_type* %__promise to i8* Index: llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll +++ llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll @@ -13,7 +13,7 @@ source_filename = "../llvm/test/Transforms/Coroutines/coro-debug-dbg.values-O2.ll" -define void @f(i32 %i, i32 %j) !dbg !8 { +define void @f(i32 %i, i32 %j) "coroutine.presplit"="0" !dbg !8 { entry: 
%__promise = alloca i8, align 8 %x = alloca [10 x i32], align 16 Index: llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll +++ llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll @@ -19,7 +19,7 @@ source_filename = "../llvm/test/Transforms/Coroutines/coro-debug-dbg.values-O2.ll" declare void @consume(i32) -define void @f(i32 %i, i32 %j) !dbg !8 { +define void @f(i32 %i, i32 %j) "coroutine.presplit"="0" !dbg !8 { entry: %__promise = alloca i8, align 8 %x = alloca [10 x i32], align 16 Index: llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll +++ llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll @@ -62,7 +62,7 @@ ; CHECK: ![[IVAR_RESUME]] = !DILocalVariable(name: "i" ; CHECK: ![[JVAR_RESUME]] = !DILocalVariable(name: "j" ; CHECK: ![[JDBGLOC_RESUME]] = !DILocation(line: 32, column: 7, scope: ![[RESUME_SCOPE]]) -define void @f() { +define void @f() "coroutine.presplit"="0" { entry: %__promise = alloca i8, align 8 %i = alloca i32, align 4 Index: llvm/test/Transforms/Coroutines/coro-noalias-param.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-noalias-param.ll +++ llvm/test/Transforms/Coroutines/coro-noalias-param.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -S -passes=coro-early | FileCheck %s %struct.A = type <{ i64, i64, i32, [4 x i8] }> -define void @f(%struct.A* nocapture readonly noalias align 8 %a) { +define void @f(%struct.A* nocapture readonly noalias align 8 %a) "coroutine.presplit"="0" { %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() %alloc = call i8* @malloc(i32 %size) Index: llvm/test/Transforms/Coroutines/coro-split-01.ll 
=================================================================== --- llvm/test/Transforms/Coroutines/coro-split-01.ll +++ llvm/test/Transforms/Coroutines/coro-split-01.ll @@ -1,7 +1,7 @@ ; Tests that a coroutine is split, inlined into the caller and devirtualized. ; RUN: opt < %s -S -enable-coroutines -passes='default' | FileCheck %s -define i8* @f() { +define i8* @f() "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) Index: llvm/test/Transforms/Coroutines/coro-split-recursive.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-split-recursive.ll +++ llvm/test/Transforms/Coroutines/coro-split-recursive.ll @@ -13,7 +13,7 @@ ; CHECK: call void @foo() ; CHECK-LABEL: define {{.*}}void @foo.destroy( -define void @foo() { +define void @foo() "coroutine.presplit"="0" { entry: %__promise = alloca i32, align 8 %0 = bitcast i32* %__promise to i8* Index: llvm/test/Transforms/Coroutines/ex0.ll =================================================================== --- llvm/test/Transforms/Coroutines/ex0.ll +++ llvm/test/Transforms/Coroutines/ex0.ll @@ -1,7 +1,7 @@ ; First example from Doc/Coroutines.rst (two block loop) ; RUN: opt < %s -enable-coroutines -aa-pipeline=basic-aa -passes='default' -preserve-alignment-assumptions-during-inlining=false -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() Index: llvm/test/Transforms/Coroutines/ex1.ll =================================================================== --- llvm/test/Transforms/Coroutines/ex1.ll +++ llvm/test/Transforms/Coroutines/ex1.ll @@ -1,7 +1,7 @@ ; First example from Doc/Coroutines.rst (one block loop) ; RUN: opt < %s -aa-pipeline=basic-aa -passes='default' -enable-coroutines 
-preserve-alignment-assumptions-during-inlining=false -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() Index: llvm/test/Transforms/Coroutines/ex2.ll =================================================================== --- llvm/test/Transforms/Coroutines/ex2.ll +++ llvm/test/Transforms/Coroutines/ex2.ll @@ -1,7 +1,7 @@ ; Second example from Doc/Coroutines.rst (custom alloc and free functions) ; RUN: opt < %s -passes='default' -enable-coroutines -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) Index: llvm/test/Transforms/Coroutines/ex3.ll =================================================================== --- llvm/test/Transforms/Coroutines/ex3.ll +++ llvm/test/Transforms/Coroutines/ex3.ll @@ -1,7 +1,7 @@ ; Third example from Doc/Coroutines.rst (two suspend points) ; RUN: opt < %s -aa-pipeline=basic-aa -passes='default' -enable-coroutines -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() Index: llvm/test/Transforms/Coroutines/ex4.ll =================================================================== --- llvm/test/Transforms/Coroutines/ex4.ll +++ llvm/test/Transforms/Coroutines/ex4.ll @@ -1,7 +1,7 @@ ; Fourth example from Doc/Coroutines.rst (coroutine promise) ; RUN: opt < %s -passes='default' -enable-coroutines -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %promise = alloca i32 %pv = bitcast i32* %promise to i8* Index: llvm/test/Transforms/Coroutines/ex5.ll =================================================================== --- 
llvm/test/Transforms/Coroutines/ex5.ll +++ llvm/test/Transforms/Coroutines/ex5.ll @@ -1,7 +1,7 @@ ; Fifth example from Doc/Coroutines.rst (final suspend) ; RUN: opt < %s -aa-pipeline=basic-aa -passes='default' -enable-coroutines -preserve-alignment-assumptions-during-inlining=false -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() Index: llvm/test/Transforms/Coroutines/phi-coro-end.ll =================================================================== --- llvm/test/Transforms/Coroutines/phi-coro-end.ll +++ llvm/test/Transforms/Coroutines/phi-coro-end.ll @@ -1,7 +1,7 @@ ; Verify that we correctly handle suspend when the coro.end block contains phi ; RUN: opt < %s -aa-pipeline=basic-aa -passes='default' -enable-coroutines -S | FileCheck %s -define i8* @f(i32 %n) { +define i8* @f(i32 %n) "coroutine.presplit"="0" { entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() Index: llvm/test/Transforms/Coroutines/restart-trigger.ll =================================================================== --- llvm/test/Transforms/Coroutines/restart-trigger.ll +++ llvm/test/Transforms/Coroutines/restart-trigger.ll @@ -10,7 +10,7 @@ ; CHECK-NEWPM-NOT: CoroSplit: Processing coroutine 'f' state: 1 -define void @f() { +define void @f() "coroutine.presplit"="0" { %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() %alloc = call i8* @malloc(i32 %size) Index: mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp =================================================================== --- mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp +++ mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp @@ -339,7 +339,7 @@ auto nullPtr = rewriter.create(loc, i8Ptr); // Get coroutine id: @llvm.coro.id. 
- rewriter.replaceOpWithNewOp( + const auto &Id = rewriter.replaceOpWithNewOp( op, token, ValueRange({constZero, nullPtr, nullPtr, nullPtr})); return success(); Index: mlir/lib/Dialect/Async/Transforms/AsyncToAsyncRuntime.cpp =================================================================== --- mlir/lib/Dialect/Async/Transforms/AsyncToAsyncRuntime.cpp +++ mlir/lib/Dialect/Async/Transforms/AsyncToAsyncRuntime.cpp @@ -190,6 +190,15 @@ } } + // A coroutine using the switch-resumed ABI should be marked with the + // "coroutine.presplit" attribute with value "0" to mark the function as a + // coroutine. + func->setAttr( + "passthrough", + ArrayAttr::get( + ctx, {ArrayAttr::get(ctx, {StringAttr::get(ctx, "coroutine.presplit"), + StringAttr::get(ctx, "0")})})); + CoroMachinery machinery; machinery.func = func; machinery.asyncToken = retToken; Index: mlir/test/mlir-opt/async.mlir =================================================================== --- /dev/null +++ mlir/test/mlir-opt/async.mlir @@ -0,0 +1,80 @@ +// Check that MLIR marks the corresponding functions with the required coroutine attribute. 
+// +// RUN: mlir-opt %s -async-to-async-runtime \ +// RUN: -async-runtime-ref-counting \ +// RUN: -async-runtime-ref-counting-opt \ +// RUN: -convert-async-to-llvm \ +// RUN: -convert-linalg-to-loops \ +// RUN: -convert-scf-to-std \ +// RUN: -convert-linalg-to-llvm \ +// RUN: -convert-memref-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-std-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | FileCheck %s + +// CHECK: llvm.func @async_execute_fn{{.*}}attributes{{.*}}"coroutine.presplit", "0" +// CHECK: llvm.func @async_execute_fn_0{{.*}}attributes{{.*}}"coroutine.presplit", "0" +// CHECK: llvm.func @async_execute_fn_1{{.*}}attributes{{.*}}"coroutine.presplit", "0" + +func @main() { + %i0 = arith.constant 0 : index + %i1 = arith.constant 1 : index + %i2 = arith.constant 2 : index + %i3 = arith.constant 3 : index + + %c0 = arith.constant 0.0 : f32 + %c1 = arith.constant 1.0 : f32 + %c2 = arith.constant 2.0 : f32 + %c3 = arith.constant 3.0 : f32 + %c4 = arith.constant 4.0 : f32 + + %A = memref.alloc() : memref<4xf32> + linalg.fill(%c0, %A) : f32, memref<4xf32> + + %U = memref.cast %A : memref<4xf32> to memref<*xf32> + call @print_memref_f32(%U): (memref<*xf32>) -> () + + memref.store %c1, %A[%i0]: memref<4xf32> + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call @print_memref_f32(%U): (memref<*xf32>) -> () + + %outer = async.execute { + memref.store %c2, %A[%i1]: memref<4xf32> + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call @print_memref_f32(%U): (memref<*xf32>) -> () + + // No op async region to create a token for testing async dependency. 
+ %noop = async.execute { + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + async.yield + } + + %inner = async.execute [%noop] { + memref.store %c3, %A[%i2]: memref<4xf32> + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call @print_memref_f32(%U): (memref<*xf32>) -> () + + async.yield + } + async.await %inner : !async.token + + memref.store %c4, %A[%i3]: memref<4xf32> + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call @print_memref_f32(%U): (memref<*xf32>) -> () + + async.yield + } + async.await %outer : !async.token + + call @mlirAsyncRuntimePrintCurrentThreadId(): () -> () + call @print_memref_f32(%U): (memref<*xf32>) -> () + + memref.dealloc %A : memref<4xf32> + + return +} + +func private @mlirAsyncRuntimePrintCurrentThreadId() -> () + +func private @print_memref_f32(memref<*xf32>) attributes { llvm.emit_c_interface }