Index: llvm/docs/Coroutines.rst =================================================================== --- llvm/docs/Coroutines.rst +++ llvm/docs/Coroutines.rst @@ -1692,6 +1692,38 @@ In a yield-once coroutine, it is undefined behavior if the coroutine executes a call to ``llvm.coro.suspend.retcon`` after resuming in any way. +Coroutine Helper Intrinsics +------------------------------ +Intrinsics described in this section are helpers that make explicit the +properties that changed once coroutines were introduced. + +.. _coro.tls.wrapper: + +'llvm.coro.tls.wrapper' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:: + + declare ptr @llvm.coro.tls.wrapper(ptr) + +Overview: +""""""""" + +The '``llvm.coro.tls.wrapper``' intrinsic yields the address of a TLS +variable. The address of a TLS variable is normally assumed to be constant +within a function, but this does not hold in coroutines because a coroutine may +resume on another thread. + +Arguments: +"""""""""" + +The address of the TLS variable. + +Semantics: +"""""""""" + +The ``llvm.coro.tls.wrapper`` intrinsic is replaced with its argument +after all coroutines have been lowered. + Coroutine Transformation Passes =============================== CoroEarly Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -1336,6 +1336,12 @@ ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>; +// A wrapper for TLS variable addresses that prevents incorrect merging in +// coroutines, since a coroutine may resume on another thread.
+def int_coro_tls_wrapper : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], + [IntrNoMem, IntrSpeculatable, + IntrWillReturn]>; + ///===-------------------------- Other Intrinsics --------------------------===// // def int_trap : Intrinsic<[], [], [IntrNoReturn, IntrCold]>, Index: llvm/lib/Transforms/Coroutines/CoroCleanup.cpp =================================================================== --- llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -82,6 +82,9 @@ } else continue; break; + case Intrinsic::coro_tls_wrapper: // Forward the wrapped TLS address. + II->replaceAllUsesWith(II->getArgOperand(0)); + break; case Intrinsic::coro_async_size_replace: auto *Target = cast<ConstantStruct>( cast<GlobalVariable>(II->getArgOperand(0)->stripPointerCasts()) @@ -113,7 +116,8 @@ M, {"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.subfn.addr", "llvm.coro.free", "llvm.coro.id", "llvm.coro.id.retcon", "llvm.coro.id.async", "llvm.coro.id.retcon.once", - "llvm.coro.async.size.replace", "llvm.coro.async.resume"}); + "llvm.coro.async.size.replace", "llvm.coro.async.resume", + "llvm.coro.tls.wrapper"}); } PreservedAnalyses CoroCleanupPass::run(Module &M, Index: llvm/lib/Transforms/Coroutines/CoroEarly.cpp =================================================================== --- llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -242,6 +242,35 @@ for (auto &F : M) L.lowerEarlyIntrinsics(F); + // Wrap every use of a TLS variable in @llvm.coro.tls.wrapper to avoid + // misoptimization. + // + // Previously, the optimizer could assume that the address of a TLS variable + // is the same throughout a function. That assumption no longer holds now + // that coroutines exist. + // + // Note that we cannot mark only the uses inside coroutines because of + // potential inlining. It is too expensive to do reachability analysis here + // to mark the uses in reachable functions only.
TLS variables in normal functions + // should still be optimized correctly because @llvm.coro.tls.wrapper is + // marked as readnone. + auto *WrapperFn = + Intrinsic::getDeclaration(&M, Intrinsic::coro_tls_wrapper); + for (auto &GV : M.globals()) + if (GV.isThreadLocal()) + for (auto &Use : llvm::make_early_inc_range(GV.uses())) { + auto *UserInst = dyn_cast<Instruction>(Use.getUser()); + if (!UserInst) + continue; + // A PHI use must be materialized at the end of its incoming block. + if (auto *PN = dyn_cast<PHINode>(UserInst)) { + auto *IncomingBB = PN->getIncomingBlock(Use); + UserInst = IncomingBB->getTerminator(); + } + // Route the use through a wrapper call inserted before UserInst. + Use.set(CallInst::Create(WrapperFn, {&GV}, GV.getName(), UserInst)); + } + PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); return PA; Index: llvm/lib/Transforms/Coroutines/Coroutines.cpp =================================================================== --- llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -94,6 +94,7 @@ "llvm.coro.suspend", "llvm.coro.suspend.async", "llvm.coro.suspend.retcon", + "llvm.coro.tls.wrapper", }; #ifndef NDEBUG Index: llvm/test/Transforms/Coroutines/coro-TLS-01.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/Coroutines/coro-TLS-01.ll @@ -0,0 +1,67 @@ +; Tests that TLS variables which cross suspend points are not misoptimized.
+; RUN: opt < %s -S -passes=coro-early,sroa,early-cse,coro-split,coro-cleanup,simplifycfg | FileCheck %s +; RUN: opt < %s -S -passes='default' | FileCheck %s + +@tls_variable = thread_local global i32 0 + +define ptr @f() presplitcoroutine { +entry: + %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call ptr @malloc(i32 %size) + %i = alloca ptr + %j = alloca ptr + %hdl = call ptr @llvm.coro.begin(token %id, ptr %alloc) + store ptr @tls_variable, ptr %i ; TLS address taken before the suspend point + %sus_result = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sus_result, label %suspend [i8 0, label %resume + i8 1, label %cleanup] +resume: + store ptr @tls_variable, ptr %j ; TLS address taken again after resumption + %i_value = load ptr, ptr %i + %j_value = load ptr, ptr %j + %cmp = icmp eq ptr %i_value, %j_value + br i1 %cmp, label %same, label %diff + +same: + call void @print_same() + br label %cleanup + +diff: + call void @print_diff() + br label %cleanup + +cleanup: + %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) + call void @free(ptr %mem) + br label %suspend + +suspend: + call i1 @llvm.coro.end(ptr %hdl, i1 0) + ret ptr %hdl +} + +; CHECK-LABEL: f.resume( +; CHECK: br i1 %cmp, label %same, label %diff +; CHECK-EMPTY: +; CHECK-NEXT: same: +; CHECK-NEXT: call void @print_same() +; CHECK-NEXT: br label +; CHECK-EMPTY: +; CHECK-NEXT: diff: +; CHECK-NEXT: call void @print_diff() +; CHECK-NEXT: br label + +declare void @print_same() +declare void @print_diff() +declare ptr @llvm.coro.free(token, ptr) +declare i32 @llvm.coro.size.i32() +declare i8 @llvm.coro.suspend(token, i1) + +declare token @llvm.coro.id(i32, ptr, ptr, ptr) +declare i1 @llvm.coro.alloc(token) +declare ptr @llvm.coro.begin(token, ptr) +declare i1 @llvm.coro.end(ptr, i1) + +declare noalias ptr @malloc(i32) +declare void @free(ptr) Index: llvm/test/Transforms/Coroutines/coro-TLS-02.ll =================================================================== --- /dev/null +++
llvm/test/Transforms/Coroutines/coro-TLS-02.ll @@ -0,0 +1,91 @@ +; Tests that TLS variables which don't cross suspend points are still optimized correctly. +; RUN: opt < %s -S -passes=coro-early,coro-split,coro-cleanup,sroa,early-cse,simplifycfg | FileCheck %s +; RUN: opt < %s -S -passes='default' | FileCheck %s + +@tls_variable = thread_local global i32 0 + +define ptr @f() presplitcoroutine { +entry: + %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call ptr @malloc(i32 %size) + %i = alloca ptr + %j = alloca ptr + %hdl = call ptr @llvm.coro.begin(token %id, ptr %alloc) + %sus_result = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sus_result, label %suspend [i8 0, label %resume + i8 1, label %cleanup] +resume: + store ptr @tls_variable, ptr %i + store ptr @tls_variable, ptr %j ; both addresses are taken after resumption, with no suspend point in between + %i_value = load ptr, ptr %i + %j_value = load ptr, ptr %j + %cmp = icmp eq ptr %i_value, %j_value + br i1 %cmp, label %same, label %diff + +same: + call void @print_same() + br label %cleanup + +diff: + call void @print_diff() + br label %cleanup + +cleanup: + %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) + call void @free(ptr %mem) + br label %suspend + +suspend: + call i1 @llvm.coro.end(ptr %hdl, i1 0) + ret ptr %hdl +} + +; Tests that normal functions are still optimized as expected.
+define void @normal_func() { +entry: + %i = alloca ptr + %j = alloca ptr + store ptr @tls_variable, ptr %i + store ptr @tls_variable, ptr %j + %i_value = load ptr, ptr %i + %j_value = load ptr, ptr %j + %cmp = icmp eq ptr %i_value, %j_value ; same TLS address on both sides, no suspend point in this function + br i1 %cmp, label %same, label %diff + +same: + call void @print_same() + br label %ret + +diff: + call void @print_diff() + br label %ret + +ret: + ret void +} + +; CHECK: void @normal_func +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @print_same() +; CHECK-NEXT: ret void + +; CHECK: void @f.resume +; CHECK-NEXT: [[LABELNAME:.*]]: +; CHECK-NEXT: call void @print_same() +; CHECK-NEXT: call void @free(ptr{{.*}}%hdl) +; CHECK-NEXT: ret void + +declare void @print_same() +declare void @print_diff() +declare ptr @llvm.coro.free(token, ptr) +declare i32 @llvm.coro.size.i32() +declare i8 @llvm.coro.suspend(token, i1) + +declare token @llvm.coro.id(i32, ptr, ptr, ptr) +declare i1 @llvm.coro.alloc(token) +declare ptr @llvm.coro.begin(token, ptr) +declare i1 @llvm.coro.end(ptr, i1) + +declare noalias ptr @malloc(i32) +declare void @free(ptr) Index: llvm/test/Transforms/Coroutines/coro-TLS-cleanup.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/Coroutines/coro-TLS-cleanup.ll @@ -0,0 +1,73 @@ +; Tests that coro-cleanup lowers llvm.coro.tls.wrapper intrinsics +; correctly.
+; RUN: opt < %s -S -passes=coro-cleanup | FileCheck %s + +%f.Frame = type { ptr, ptr, ptr, ptr, i1 } ; fields, per the GEPs below: resume fn, destroy fn, %i slot, %j slot, index + +@tls_variable = thread_local global i32 0 +@f.resumers = private constant [3 x ptr] [ptr @f.resume, ptr @f.destroy, ptr @f.cleanup] + +define ptr @f() { +entry: + %id = call token @llvm.coro.id(i32 0, ptr null, ptr @f, ptr @f.resumers) + %alloc = call ptr @malloc(i32 40) + %i = alloca ptr, align 8 + %j = alloca ptr, align 8 + %hdl = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc) + %resume.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 0 + store ptr @f.resume, ptr %resume.addr, align 8 + %destroy.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 1 + store ptr @f.destroy, ptr %destroy.addr, align 8 + %i.reload.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 2 + %j.reload.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 3 + ; CHECK: store ptr @tls_variable, ptr %i.reload.addr, align 8 + %tls_variable1 = call ptr @llvm.coro.tls.wrapper(ptr @tls_variable) ; expected to be replaced by @tls_variable + store ptr %tls_variable1, ptr %i.reload.addr, align 8 + %index.addr2 = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 4 + store i1 false, ptr %index.addr2, align 1 + ret ptr %hdl +} + +define internal fastcc void @f.resume(ptr noalias nonnull align 8 dereferenceable(40) %hdl) { +entry.resume: + %i.reload.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 2 + %j.reload.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 3 + %tls_variable = call ptr @llvm.coro.tls.wrapper(ptr @tls_variable) ; expected to be replaced by @tls_variable + ; CHECK: store ptr @tls_variable, ptr %j.reload.addr, align 8 + store ptr %tls_variable, ptr %j.reload.addr, align 8 + call void @consume(ptr %i.reload.addr) + call void @consume(ptr %j.reload.addr) + call void @free(ptr %hdl) + ret void +} + +define internal fastcc void @f.destroy(ptr noalias nonnull align 8 dereferenceable(40) %hdl) { +entry.destroy: + %i.reload.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 2 +
%j.reload.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 3 + call void @free(ptr %hdl) + ret void +} + +define internal fastcc void @f.cleanup(ptr noalias nonnull align 8 dereferenceable(40) %hdl) { +entry.cleanup: + %i.reload.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 2 + %j.reload.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 3 + call void @free(ptr null) + ret void +} + +declare void @consume(ptr) +declare ptr @llvm.coro.free(token, ptr) +declare i32 @llvm.coro.size.i32() +declare i8 @llvm.coro.suspend(token, i1) + +declare token @llvm.coro.id(i32, ptr, ptr, ptr) +declare i1 @llvm.coro.alloc(token) +declare ptr @llvm.coro.begin(token, ptr) +declare i1 @llvm.coro.end(ptr, i1) + +declare noalias ptr @malloc(i32) +declare void @free(ptr) + +declare ptr @llvm.coro.tls.wrapper(ptr) ; the intrinsic this test expects coro-cleanup to remove from @f and @f.resume Index: llvm/test/Transforms/Coroutines/coro-TLS-early.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/Coroutines/coro-TLS-early.ll @@ -0,0 +1,59 @@ +; Tests that coro-early generates llvm.coro.tls.wrapper intrinsics for uses of TLS +; variables correctly.
+; RUN: opt < %s -S -passes=coro-early | FileCheck %s + +@tls_variable = thread_local global i32 0 + +define ptr @f() presplitcoroutine { +entry: + %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call ptr @malloc(i32 %size) + %i = alloca ptr + %j = alloca ptr + %hdl = call ptr @llvm.coro.begin(token %id, ptr %alloc) + ; CHECK: %[[TLS_VARIABLE:.*]] = call ptr @llvm.coro.tls.wrapper(ptr @tls_variable) + ; CHECK-NEXT: store ptr %[[TLS_VARIABLE]], ptr %i + store ptr @tls_variable, ptr %i + %sus_result = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sus_result, label %suspend [i8 0, label %resume + i8 1, label %cleanup] +resume: + ; CHECK: %[[TLS_VARIABLE2:.*]] = call ptr @llvm.coro.tls.wrapper(ptr @tls_variable) + ; CHECK-NEXT: store ptr %[[TLS_VARIABLE2]], ptr %j + ; CHECK-NEXT: call void @consume(ptr %i) + ; CHECK-NEXT: call void @consume(ptr %j) + ; CHECK-NEXT: %[[PHI_TLS:.*]] = call ptr @llvm.coro.tls.wrapper(ptr @tls_variable) + ; CHECK-NEXT: br label %cleanup + ; CHECK-EMPTY: + ; CHECK-NEXT: cleanup: + ; CHECK-NEXT: %phi = phi ptr [ %[[PHI_TLS]], %resume ], [ null, %entry ] + store ptr @tls_variable, ptr %j + call void @consume(ptr %i) + call void @consume(ptr %j) + br label %cleanup + +cleanup: + %phi = phi ptr [@tls_variable, %resume], [null, %entry] ; PHI use: the wrapper call is expected at the end of %resume + call void @consume(ptr %phi) + %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) + call void @free(ptr %mem) + br label %suspend + +suspend: + call i1 @llvm.coro.end(ptr %hdl, i1 0) + ret ptr %hdl +} + +declare void @consume(ptr) +declare ptr @llvm.coro.free(token, ptr) +declare i32 @llvm.coro.size.i32() +declare i8 @llvm.coro.suspend(token, i1) + +declare token @llvm.coro.id(i32, ptr, ptr, ptr) +declare i1 @llvm.coro.alloc(token) +declare ptr @llvm.coro.begin(token, ptr) +declare i1 @llvm.coro.end(ptr, i1) + +declare noalias ptr @malloc(i32) +declare void @free(ptr)