Index: llvm/include/llvm/Transforms/Coroutines/CoroCleanup.h =================================================================== --- llvm/include/llvm/Transforms/Coroutines/CoroCleanup.h +++ llvm/include/llvm/Transforms/Coroutines/CoroCleanup.h @@ -21,8 +21,12 @@ class Module; struct CoroCleanupPass : PassInfoMixin { + CoroCleanupPass(bool OptimizationEnabled = false) : OptimizationEnabled(OptimizationEnabled) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); static bool isRequired() { return true; } + +private: + bool OptimizationEnabled; }; } // end namespace llvm Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -606,6 +606,10 @@ return parseSinglePassOption(Params, "minimal", "LowerMatrixIntrinsics"); } +Expected parseCoroCleanupPassOptions(StringRef Params) { + return parseSinglePassOption(Params, "opt", "CoroCleanup"); +} + Expected parseASanPassOptions(StringRef Params) { AddressSanitizerOptions Result; while (!Params.empty()) { Index: llvm/lib/Passes/PassBuilderPipelines.cpp =================================================================== --- llvm/lib/Passes/PassBuilderPipelines.cpp +++ llvm/lib/Passes/PassBuilderPipelines.cpp @@ -990,7 +990,7 @@ else MPM.addPass(buildInlinerPipeline(Level, Phase)); - MPM.addPass(CoroCleanupPass()); + MPM.addPass(CoroCleanupPass(/*OptimizationEnabled*/ true)); if (EnableMemProfiler && Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass())); Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -51,7 +51,6 @@ MODULE_PASS("check-debugify", NewPMCheckDebugifyPass()) MODULE_PASS("constmerge", ConstantMergePass()) MODULE_PASS("coro-early", CoroEarlyPass()) -MODULE_PASS("coro-cleanup", 
CoroCleanupPass()) MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass()) MODULE_PASS("deadargelim", DeadArgumentEliminationPass()) MODULE_PASS("debugify", NewPMDebugifyPass()) @@ -154,6 +153,13 @@ }, parseASanPassOptions, "kernel") +MODULE_PASS_WITH_PARAMS("coro-cleanup", + "CoroCleanupPass", + [](bool OptimizationEnabled) { + return CoroCleanupPass(OptimizationEnabled); + }, + parseCoroCleanupPassOptions, + "opt") #undef MODULE_PASS_WITH_PARAMS #ifndef CGSCC_ANALYSIS Index: llvm/lib/Transforms/Coroutines/CoroCleanup.cpp =================================================================== --- llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -12,6 +12,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" using namespace llvm; @@ -67,6 +68,8 @@ CB->replaceAllUsesWith(NewCB); CB->eraseFromParent(); CB = NewCB; + + Changed = true; } if (auto *II = dyn_cast(CB)) { @@ -145,11 +148,12 @@ MAM.getResult(M).getManager(); FunctionPassManager FPM; + FPM.addPass(EarlyCSEPass()); FPM.addPass(SimplifyCFGPass()); Lowerer L(M); for (auto &F : M) - if (L.lower(F)) + if (L.lower(F) && OptimizationEnabled) FPM.run(F, FAM); return PreservedAnalyses::none(); Index: llvm/test/Transforms/Coroutines/coro-async.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-async.ll +++ llvm/test/Transforms/Coroutines/coro-async.ll @@ -169,6 +169,8 @@ ; CHECK: [[CALLER_CONTEXT_ADDR:%.*]] = bitcast i8* %0 to i8** ; CHECK: [[CALLER_CONTEXT:%.*]] = load i8*, i8** [[CALLER_CONTEXT_ADDR]] ; CHECK: [[FRAME_PTR:%.*]] = getelementptr inbounds i8, i8* [[CALLER_CONTEXT]], i64 128 +; CHECK-O0: [[VECTOR_SPILL_SPILL_ADDR:%.*]] = getelementptr inbounds %my_async_function.Frame, %my_async_function.Frame* {{.*}}, i32 0, i32 1 +; CHECK-O0: store <4 x double> %vector_spill, <4 x 
double>* [[VECTOR_SPILL_SPILL_ADDR]] ; CHECK-O0: [[VECTOR_SPILL_ADDR:%.*]] = getelementptr inbounds %my_async_function.Frame, %my_async_function.Frame* {{.*}}, i32 0, i32 1 ; CHECK-O0: load <4 x double>, <4 x double>* [[VECTOR_SPILL_ADDR]], align 16 ; CHECK: [[CALLEE_CTXT_SPILL_ADDR:%.*]] = getelementptr inbounds i8, i8* [[CALLER_CONTEXT]], i64 160 Index: llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll +++ llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes='default' -S | FileCheck %s +; RUN: opt < %s -passes='default',simplifycfg -S | FileCheck %s ; Define a function 'f' that resembles the Clang frontend's output for the ; following C++ coroutine: Index: llvm/test/Transforms/Coroutines/coro-readnone-06.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/Coroutines/coro-readnone-06.ll @@ -0,0 +1,58 @@ +; Tests that calls to readnone functions which don't cross suspend points are optimized as expected. 
+; RUN: opt < %s -S -passes='default' -opaque-pointers | FileCheck %s + +define ptr @f() "coroutine.presplit" { +entry: + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call i8* @malloc(i32 %size) + %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) + %sus_result = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sus_result, label %suspend [i8 0, label %resume + i8 1, label %cleanup] +resume: + %i = call i32 @readnone_func() readnone + %j = call i32 @readnone_func() readnone + %cmp = icmp eq i32 %i, %j + br i1 %cmp, label %same, label %diff + +same: + call void @print_same() + br label %cleanup + +diff: + call void @print_diff() + br label %cleanup + +cleanup: + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) + call void @free(i8* %mem) + br label %suspend + +suspend: + call i1 @llvm.coro.end(i8* %hdl, i1 0) + ret i8* %hdl +} + +; CHECK-LABEL: f.resume( +; CHECK-NEXT: entry +; CHECK-NEXT: call i32 @readnone_func() +; CHECK-NEXT: call void @print_same() +; CHECK-NEXT: call void @free +; CHECK-NEXT: ret void + +declare i32 @readnone_func() readnone + +declare void @print_same() +declare void @print_diff() +declare i8* @llvm.coro.free(token, i8*) +declare i32 @llvm.coro.size.i32() +declare i8 @llvm.coro.suspend(token, i1) + +declare token @llvm.coro.id(i32, i8*, i8*, i8*) +declare i1 @llvm.coro.alloc(token) +declare i8* @llvm.coro.begin(token, i8*) +declare i1 @llvm.coro.end(i8*, i1) + +declare noalias i8* @malloc(i32) +declare void @free(i8*) Index: llvm/test/Transforms/Coroutines/coro-readnone-07.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/Coroutines/coro-readnone-07.ll @@ -0,0 +1,43 @@ +; Tests that the calls with "coro_readnone" operand bundles could be optimized +; correctly by CoroCleanup pass. 
+; RUN: opt < %s -S -passes='module(coro-cleanup)' -opaque-pointers | FileCheck %s + +define i1 @f() { +entry: + %i = call i32 @readnone_func() [ "coro_readnone"() ] + %j = call i32 @readnone_func() [ "coro_readnone"() ] + %cmp = icmp eq i32 %i, %j + br i1 %cmp, label %same, label %diff + +same: + call void @print_same() + ret i1 true + +diff: + call void @print_diff() + ret i1 false +} + +; CHECK-LABEL: @f( +; CHECK-NEXT: entry +; CHECK-NEXT: call i32 @readnone_func() +; CHECK-NEXT: call void @print_same() +; CHECK-NEXT: ret i1 true + +declare void @f.resume() +declare void @f.destroy() +declare void @f.cleanup() + +declare i32 @readnone_func() readnone +declare void @print_same() +declare void @print_diff() + +declare ptr @llvm.coro.free(token, ptr nocapture readonly) +declare i32 @llvm.coro.size.i32() +declare i8 @llvm.coro.suspend(token, i1) +declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) +declare i1 @llvm.coro.alloc(token) +declare ptr @llvm.coro.begin(token, ptr writeonly) +declare i1 @llvm.coro.end(ptr, i1) +declare noalias ptr @malloc(i32) +declare void @free(ptr)