Index: docs/Coroutines.rst =================================================================== --- docs/Coroutines.rst +++ docs/Coroutines.rst @@ -93,7 +93,7 @@ define i8* @f(i32 %n) { entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() %alloc = call i8* @malloc(i32 %size) %hdl = call noalias i8* @llvm.coro.begin(token %id, i8* %alloc) @@ -106,7 +106,7 @@ switch i8 %0, label %suspend [i8 0, label %loop i8 1, label %cleanup] cleanup: - %mem = call i8* @llvm.coro.free(i8* %hdl) + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) call void @free(i8* %mem) br label %suspend suspend: @@ -168,7 +168,7 @@ define i8* @f(i32 %n) { entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %alloc = call noalias i8* @malloc(i32 24) %0 = call noalias i8* @llvm.coro.begin(token %id, i8* %alloc) %frame = bitcast i8* %0 to %f.frame* @@ -227,7 +227,7 @@ .. code-block:: none entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) br i1 %need.dyn.alloc, label %dyn.alloc, label %coro.begin dyn.alloc: @@ -245,7 +245,7 @@ .. code-block:: llvm cleanup: - %mem = call i8* @llvm.coro.free(i8* %hdl) + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) %need.dyn.free = icmp ne i8* %mem, null br i1 %need.dyn.free, label %dyn.free, label %if.end dyn.free: @@ -417,7 +417,7 @@ entry: %promise = alloca i32 %pv = bitcast i32* %promise to i8* - %id = call token @llvm.coro.id(i32 0, i8* %pv, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* %pv, i8* null, i8* null) %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) br i1 %need.dyn.alloc, label %dyn.alloc, label %coro.begin dyn.alloc: @@ -436,7 +436,7 @@ switch i8 %0, label %suspend [i8 0, label %loop i8 1, label %cleanup] cleanup: - %mem = call i8* @llvm.coro.free(i8* %hdl) + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) call void @free(i8* %mem) br label %suspend suspend: @@ -699,7 +699,7 @@ %promise = alloca i32 %pv = bitcast i32* %promise to i8* ; the second argument to coro.id points to the coroutine promise. - %id = call token @llvm.coro.id(i32 0, i8* %pv, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* %pv, i8* null, i8* null) ... %hdl = call noalias i8* @llvm.coro.begin(token %id, i8* %alloc) ... @@ -791,7 +791,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: - declare i8* @llvm.coro.free(i8* ) + declare i8* @llvm.coro.free(token %id, i8* ) Overview: """"""""" @@ -803,8 +803,11 @@ Arguments: """""""""" -A pointer to the coroutine frame. This should be the same pointer that was -returned by prior `coro.begin` call. +The first argument is a token returned by a call to '``llvm.coro.id``' +identifying the coroutine. + +The second argument is a pointer to the coroutine frame. This should be the same +pointer that was returned by prior `coro.begin` call. Example (custom deallocation function): """"""""""""""""""""""""""""""""""""""" @@ -812,7 +815,7 @@ .. code-block:: llvm cleanup: - %mem = call i8* @llvm.coro.free(i8* %frame) + %mem = call i8* @llvm.coro.free(token %id, i8* %frame) %mem_not_null = icmp ne i8* %mem, null br i1 %mem_not_null, label %if.then, label %if.end if.then: @@ -827,7 +830,7 @@ .. code-block:: llvm cleanup: - %mem = call i8* @llvm.coro.free(i8* %frame) + %mem = call i8* @llvm.coro.free(token %id, i8* %frame) call void @free(i8* %mem) ret void @@ -864,7 +867,7 @@ .. code-block:: text entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %dyn.alloc.required = call i1 @llvm.coro.alloc(token %id) br i1 %dyn.alloc.required, label %coro.alloc, label %coro.begin @@ -909,7 +912,8 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: - declare token @llvm.coro.id(i32 , i8* , i8* ) + declare token @llvm.coro.id(i32 , i8* , i8* , + i8* ) Overview: """"""""" @@ -927,7 +931,10 @@ The second argument, if not `null`, designates a particular alloca instruction to be a `coroutine promise`_. -The third argument is `null` before coroutine is split, and later is replaced +The third argument is `null` coming out of the frontend. The CoroEarly pass sets +this argument to point to the function this coro.id belongs to. + +The fourth argument is `null` before coroutine is split, and later is replaced to point to a private global constant array containing function pointers to outlined resume and destroy parts of the coroutine. @@ -1210,19 +1217,6 @@ This pass runs late to lower all coroutine related intrinsics not replaced by earlier passes. -Upstreaming sequence (rough plan) -================================= -#. Add documentation. -#. Add coroutine intrinsics. -#. Add empty coroutine passes. -#. Add coroutine devirtualization + tests. -#. Add CGSCC restart trigger + tests. -#. Add coroutine heap elision + tests. -#. Add custom allocation heap elision + tests. <== we are here -#. Add coroutine splitting logic + tests. -#. Add simple coroutine frame builder + tests. -#. Add the rest of the logic + tests. (Maybe split further as needed). - Areas Requiring Attention ========================= #. A coroutine frame is bigger than it could be. Adding stack packing and stack Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -603,15 +603,16 @@ // Coroutine Structure Intrinsics. def int_coro_id : Intrinsic<[llvm_token_ty], [llvm_i32_ty, llvm_ptr_ty, - llvm_ptr_ty], + llvm_ptr_ty, llvm_ptr_ty], [IntrArgMemOnly, IntrReadMem, ReadNone<1>, ReadOnly<2>, NoCapture<2>]>; def int_coro_alloc : Intrinsic<[llvm_i1_ty], [llvm_token_ty], []>; def int_coro_begin : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty], [WriteOnly<1>]>; -def int_coro_free : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], - [IntrArgMemOnly, ReadOnly<0>, NoCapture<0>]>; +def int_coro_free : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly, ReadOnly<1>, + NoCapture<1>]>; def int_coro_end : Intrinsic<[], [llvm_ptr_ty, llvm_i1_ty], []>; def int_coro_frame : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -3873,7 +3873,7 @@ default: break; case Intrinsic::coro_id: { - auto *InfoArg = CS.getArgOperand(2)->stripPointerCasts(); + auto *InfoArg = CS.getArgOperand(3)->stripPointerCasts(); if (isa(InfoArg)) break; auto *GV = dyn_cast(InfoArg); Index: lib/Transforms/Coroutines/CoroEarly.cpp =================================================================== --- lib/Transforms/Coroutines/CoroEarly.cpp +++ lib/Transforms/Coroutines/CoroEarly.cpp @@ -81,6 +81,7 @@ if (CII->getInfo().isPreSplit()) { F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT); setCannotDuplicate(CII); + CII->setCoroutineSelf(); } } break; Index: lib/Transforms/Coroutines/CoroElide.cpp =================================================================== --- lib/Transforms/Coroutines/CoroElide.cpp +++ lib/Transforms/Coroutines/CoroElide.cpp @@ -14,6 +14,8 @@ #include "CoroInternal.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" @@ -35,7 +37,10 @@ Lowerer(Module &M) : LowererBase(M) {} void elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA); - bool processCoroId(CoroIdInst *, AAResults &AA); + bool shouldElide(Function *F, DominatorTree &DT, + PostDominatorTree &PDT) const; + bool processCoroId(CoroIdInst *, AAResults &AA, DominatorTree &DT, + PostDominatorTree &PDT); }; } // end anonymous namespace @@ -122,14 +127,6 @@ CA->eraseFromParent(); } - // To suppress deallocation code, we replace all llvm.coro.free intrinsics - // associated with this coro.begin with null constant. - auto *NullPtr = ConstantPointerNull::get(Type::getInt8PtrTy(C)); - for (auto *CF : CoroFrees) { - CF->replaceAllUsesWith(NullPtr); - CF->eraseFromParent(); - } - // FIXME: Design how to transmit alignment information for every alloca that // is spilled into the coroutine frame and recreate the alignment information // here. Possibly we will need to do a mini SROA here and break the coroutine @@ -148,9 +145,65 @@ removeTailCallAttribute(Frame, AA); } -bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA) { +// This function is used to check whether a terminator causes control flow to +// leaves the function in some way. +static bool returnsOrUnwindsToCaller(TerminatorInst *T) { + if (isa(T) || isa(T)) + return true; + + if (auto *CR = dyn_cast(T)) + return CR->unwindsToCaller(); + if (auto *CS = dyn_cast(T)) + return CS->unwindsToCaller(); + + return false; +} + +// Check that on every code path leaving the function dominated by a coro.begin, +// there is either a coro.free or coro.destroy postdominating a coro.begin. +bool Lowerer::shouldElide(Function *F, DominatorTree &DT, + PostDominatorTree &PDT) const { + // If no CoroAllocs, we cannot suppress allocation, so elision is not + // possible. + if (CoroAllocs.empty()) + return false; + + for (BasicBlock &BB : *F) { + auto *T = BB.getTerminator(); + if (!returnsOrUnwindsToCaller(T)) + continue; + + for (CoroBeginInst *CB : CoroBegins) { + if (DT.dominates(CB, T)) { + bool PostDominated = false; + for (CoroFreeInst *CF : CoroFrees) { + if (PDT.dominates(&BB, CF->getParent())) { + PostDominated = true; + break; + } + } + if (PostDominated) + continue; + + for (CoroSubFnInst *SF : DestroyAddr) { + if (PDT.dominates(&BB, SF->getParent())) { + PostDominated = true; + break; + } + } + if (!PostDominated) + return false; + } + } + } + return true; +} + +bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA, + DominatorTree &DT, PostDominatorTree &PDT) { CoroBegins.clear(); CoroAllocs.clear(); + CoroFrees.clear(); ResumeAddr.clear(); DestroyAddr.clear(); @@ -160,6 +213,8 @@ CoroBegins.push_back(CB); else if (auto *CA = dyn_cast(U)) CoroAllocs.push_back(CA); + else if (auto *CF = dyn_cast(U)) + CoroFrees.push_back(CF); } // Collect all coro.subfn.addrs associated with coro.begin. @@ -191,27 +246,20 @@ replaceWithConstant(ResumeAddrConstant, ResumeAddr); - if (DestroyAddr.empty()) - return true; + bool ShouldElide = shouldElide(CoroId->getFunction(), DT, PDT); - auto *DestroyAddrConstant = - ConstantExpr::getExtractValue(Resumers, CoroSubFnInst::DestroyIndex); + auto *DestroyAddrConstant = ConstantExpr::getExtractValue( + Resumers, + ShouldElide ? CoroSubFnInst::CleanupIndex : CoroSubFnInst::DestroyIndex); replaceWithConstant(DestroyAddrConstant, DestroyAddr); - // If there is a coro.alloc that llvm.coro.id refers to, we have the ability - // to suppress dynamic allocation. - if (!CoroAllocs.empty()) { - // FIXME: The check above is overly lax. It only checks for whether we have - // an ability to elide heap allocations, not whether it is safe to do so. - // We need to do something like: - // If for every exit from the function where coro.begin is - // live, there is a coro.free or coro.destroy dominating that exit block, - // then it is safe to elide heap allocation, since the lifetime of coroutine - // is fully enclosed in its caller. + if (ShouldElide) { auto *FrameTy = getFrameType(cast(ResumeAddrConstant)); elideHeapAllocations(CoroId->getFunction(), FrameTy, AA); + coro::replaceCoroFree(CoroId, /*Elide=*/true); } + return true; } @@ -262,29 +310,33 @@ Changed = replaceDevirtTrigger(F); L->CoroIds.clear(); - L->CoroFrees.clear(); - // Collect all PostSplit coro.ids and all coro.free. + // Collect all PostSplit coro.ids. for (auto &I : instructions(F)) - if (auto *CF = dyn_cast(&I)) - L->CoroFrees.push_back(CF); - else if (auto *CII = dyn_cast(&I)) + if (auto *CII = dyn_cast(&I)) if (CII->getInfo().isPostSplit()) - L->CoroIds.push_back(CII); + // If it is the coroutine itself, don't touch it. + if (CII->getCoroutine() != CII->getFunction()) + L->CoroIds.push_back(CII); // If we did not find any coro.id, there is nothing to do. if (L->CoroIds.empty()) return Changed; AAResults &AA = getAnalysis().getAAResults(); + DominatorTree &DT = getAnalysis().getDomTree(); + PostDominatorTree &PDT = + getAnalysis().getPostDomTree(); + for (auto *CII : L->CoroIds) - Changed |= L->processCoroId(CII, AA); + Changed |= L->processCoroId(CII, AA, DT, PDT); return Changed; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); } }; } @@ -295,6 +347,8 @@ "Coroutine frame allocation elision and indirect calls replacement", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_END( CoroElide, "coro-elide", "Coroutine frame allocation elision and indirect calls replacement", false, Index: lib/Transforms/Coroutines/CoroFrame.cpp =================================================================== --- lib/Transforms/Coroutines/CoroFrame.cpp +++ lib/Transforms/Coroutines/CoroFrame.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/circular_raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -182,8 +183,9 @@ } // Iterate propagating consumes and kills until they stop changing - int Iteration = 0; (void)Iteration; - + int Iteration = 0; + (void)Iteration; + bool Changed; do { DEBUG(dbgs() << "iteration " << ++Iteration); @@ -307,10 +309,10 @@ /*IsVarArgs=*/false); auto *FnPtrTy = FnTy->getPointerTo(); - if (Shape.CoroSuspends.size() > UINT32_MAX) - report_fatal_error("Cannot handle coroutine with this many suspend points"); + // Figure out how wide should be an integer type storing the suspend index. + unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); - SmallVector Types{FnPtrTy, FnPtrTy, Type::getInt32Ty(C)}; + SmallVector Types{FnPtrTy, FnPtrTy, Type::getIntNTy(C, IndexBits)}; Value *CurrentDef = nullptr; // Create an entry for every spilled value. @@ -333,13 +335,6 @@ return FrameTy; } -// Returns the index of the last non-spill field in the coroutine frame. -// 2 - if there is no coroutine promise specified or 3, if there is. -static unsigned getLastNonSpillIndex(coro::Shape &Shape) { - // TODO: Add support for coroutine promise. - return 2; -} - // Replace all alloca and SSA values that are accessed across suspend points // with GetElementPointer from coroutine frame + loads and stores. Create an // AllocaSpillBB that will become the new entry block for the resume parts of @@ -373,7 +368,7 @@ Value *CurrentValue = nullptr; BasicBlock *CurrentBlock = nullptr; Value *CurrentReload = nullptr; - unsigned Index = getLastNonSpillIndex(Shape); + unsigned Index = coro::Shape::LastKnownField; // We need to keep track of any allocas that need "spilling" // since they will live in the coroutine frame now, all access to them @@ -622,6 +617,10 @@ // frame. if (isa(&I)) continue; + // A token returned CoroIdInst is used to tie together structural intrinsics + // in a coroutine. It should not be saved to the coroutine frame. + if (isa(&I)) + continue; for (User *U : I.users()) if (Checker.isDefinitionAcrossSuspend(I, U)) { Index: lib/Transforms/Coroutines/CoroInstr.h =================================================================== --- lib/Transforms/Coroutines/CoroInstr.h +++ lib/Transforms/Coroutines/CoroInstr.h @@ -11,7 +11,7 @@ // allows you to do things like: // // if (auto *SF = dyn_cast(Inst)) -// ... SF->getFrame() ... +// ... SF->getFrame() ... // // All intrinsic function calls are instances of the call instruction, so these // are all subclasses of the CallInst class. Note that none of these classes @@ -37,6 +37,7 @@ RestartTrigger = -1, ResumeIndex, DestroyIndex, + CleanupIndex, IndexLast, IndexFirst = RestartTrigger }; @@ -76,7 +77,8 @@ /// This represents the llvm.coro.alloc instruction. class LLVM_LIBRARY_VISIBILITY CoroIdInst : public IntrinsicInst { - enum { AlignArg, PromiseArg, InfoArg }; + enum { AlignArg, PromiseArg, CoroutineArg, InfoArg }; + public: // Info argument of coro.id is // fresh out of the frontend: null ; @@ -118,6 +120,16 @@ void setInfo(Constant *C) { setArgOperand(InfoArg, C); } + Function *getCoroutine() const { + return cast(getArgOperand(CoroutineArg)->stripPointerCasts()); + } + void setCoroutineSelf() { + assert(isa(getArgOperand(CoroutineArg)) && + "Coroutine argument is already assigned"); + auto *const Int8PtrTy = Type::getInt8PtrTy(getContext()); + setArgOperand(CoroutineArg, + ConstantExpr::getBitCast(getFunction(), Int8PtrTy)); + } // Methods to support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const IntrinsicInst *I) { @@ -142,7 +154,11 @@ /// This represents the llvm.coro.free instruction. class LLVM_LIBRARY_VISIBILITY CoroFreeInst : public IntrinsicInst { + enum { IdArg, FrameArg }; + public: + Value *getFrame() const { return getArgOperand(FrameArg); } + // Methods to support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_free; @@ -157,9 +173,7 @@ enum { IdArg, MemArg }; public: - CoroIdInst *getId() const { - return cast(getArgOperand(IdArg)); - } + CoroIdInst *getId() const { return cast(getArgOperand(IdArg)); } Value *getMem() const { return getArgOperand(MemArg); } Index: lib/Transforms/Coroutines/CoroInternal.h =================================================================== --- lib/Transforms/Coroutines/CoroInternal.h +++ lib/Transforms/Coroutines/CoroInternal.h @@ -46,6 +46,7 @@ bool declaresIntrinsics(Module &M, std::initializer_list); void replaceAllCoroAllocs(CoroBeginInst *CB, bool Replacement); void replaceAllCoroFrees(CoroBeginInst *CB, Value *Replacement); +void replaceCoroFree(CoroIdInst *CoroId, bool Elide); void updateCallGraph(Function &Caller, ArrayRef Funcs, CallGraph &CG, CallGraphSCC &SCC); @@ -71,9 +72,10 @@ // Field Indexes for known coroutine frame fields. enum { - ResumeField = 0, - DestroyField = 1, - IndexField = 2, + ResumeField, + DestroyField, + IndexField, + LastKnownField = IndexField }; StructType *FrameTy; @@ -82,6 +84,14 @@ SwitchInst* ResumeSwitch; bool HasFinalSuspend; + IntegerType* getIndexType() const { + assert(FrameTy && "frame type not assigned"); + return cast(FrameTy->getElementType(IndexField)); + } + ConstantInt *getIndex(uint64_t Value) const { + return ConstantInt::get(getIndexType(), Value); + } + Shape() = default; explicit Shape(Function &F) { buildFrom(F); } void buildFrom(Function &F); Index: lib/Transforms/Coroutines/CoroSplit.cpp =================================================================== --- lib/Transforms/Coroutines/CoroSplit.cpp +++ lib/Transforms/Coroutines/CoroSplit.cpp @@ -64,7 +64,7 @@ uint32_t SuspendIndex = 0; for (auto S : Shape.CoroSuspends) { - ConstantInt *IndexVal = Builder.getInt32(SuspendIndex); + ConstantInt *IndexVal = Shape.getIndex(SuspendIndex); // Replace CoroSave with a store to Index: // %index.addr = getelementptr %f.frame... (index field number) @@ -140,7 +140,6 @@ // resume or cleanup pass for every suspend point. static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, BasicBlock *ResumeEntry, int8_t FnIndex) { - Module *M = F.getParent(); auto *FrameTy = Shape.FrameTy; auto *FnPtrTy = cast(FrameTy->getElementType(0)); @@ -207,7 +206,11 @@ OldVFrame->replaceAllUsesWith(NewVFrame); // Replace coro suspend with the appropriate resume index. - auto *NewValue = Builder.getInt8(FnIndex); + // Replacing coro.suspend with (0) will result in control flow proceeding to + // a resume label associated with a suspend point, replacing it with (1) will + // result in control flow proceeding to a cleanup label associated with this + // suspend point. + auto *NewValue = Builder.getInt8(FnIndex ? 1 : 0); for (CoroSuspendInst *CS : Shape.CoroSuspends) { auto *MappedCS = cast(VMap[CS]); MappedCS->replaceAllUsesWith(NewValue); @@ -219,11 +222,23 @@ // FIXME: coming in upcoming patches: // replaceUnwindCoroEnds(Shape.CoroEnds, VMap); - // Store the address of this clone in the coroutine frame. - Builder.SetInsertPoint(Shape.FramePtr->getNextNode()); - auto *G = Builder.CreateConstInBoundsGEP2_32(Shape.FrameTy, Shape.FramePtr, 0, - FnIndex, "fn.addr"); - Builder.CreateStore(NewF, G); + // We only store resume(0) and destroy(1) addresses in the coroutine frame. + // The cleanup(2) clone is only used during devirtualization when coroutine is + // eligible for heap elision and thus does not participate in indirect calls + // and does not need its address to be stored in the coroutine frame. + if (FnIndex < 2) { + // Store the address of this clone in the coroutine frame. + Builder.SetInsertPoint(Shape.FramePtr->getNextNode()); + auto *G = Builder.CreateConstInBoundsGEP2_32(Shape.FrameTy, Shape.FramePtr, + 0, FnIndex, "fn.addr"); + Builder.CreateStore(NewF, G); + } + + // Eliminate coro.free from the clones, replacing it with 'null' in cleanup, + // to suppress deallocation code. + coro::replaceCoroFree(cast(VMap[Shape.CoroBegin->getId()]), + /*Elide=*/FnIndex == 2); + NewF->setCallingConv(CallingConv::Fast); return NewF; @@ -303,10 +318,12 @@ return; buildCoroutineFrame(F, Shape); + replaceFrameSize(Shape); auto *ResumeEntry = createResumeEntryBlock(F, Shape); - auto *ResumeClone = createClone(F, ".resume", Shape, ResumeEntry, 0); - auto *DestroyClone = createClone(F, ".destroy", Shape, ResumeEntry, 1); + auto ResumeClone = createClone(F, ".resume", Shape, ResumeEntry, 0); + auto DestroyClone = createClone(F, ".destroy", Shape, ResumeEntry, 1); + auto CleanupClone = createClone(F, ".cleanup", Shape, ResumeEntry, 2); // We no longer need coro.end in F. removeCoroEnds(Shape); @@ -314,11 +331,10 @@ postSplitCleanup(F); postSplitCleanup(*ResumeClone); postSplitCleanup(*DestroyClone); + postSplitCleanup(*CleanupClone); - replaceFrameSize(Shape); - - setCoroInfo(F, Shape.CoroBegin, {ResumeClone, DestroyClone}); - coro::updateCallGraph(F, {ResumeClone, DestroyClone}, CG, SCC); + setCoroInfo(F, Shape.CoroBegin, {ResumeClone, DestroyClone, CleanupClone}); + coro::updateCallGraph(F, {ResumeClone, DestroyClone, CleanupClone}, CG, SCC); } // When we see the coroutine the first time, we insert an indirect call to a Index: lib/Transforms/Coroutines/Coroutines.cpp =================================================================== --- lib/Transforms/Coroutines/Coroutines.cpp +++ lib/Transforms/Coroutines/Coroutines.cpp @@ -127,6 +127,27 @@ return false; } +// Replace all coro.frees associated with the provided CoroId either with 'null' +// if Elide is true and with its frame parameter otherwise. +void coro::replaceCoroFree(CoroIdInst *CoroId, bool Elide) { + SmallVector CoroFrees; + for (User *U : CoroId->users()) + if (auto CF = dyn_cast(U)) + CoroFrees.push_back(CF); + + if (CoroFrees.empty()) + return; + + Value *Replacement = + Elide ? ConstantPointerNull::get(Type::getInt8PtrTy(CoroId->getContext())) + : CoroFrees.front()->getFrame(); + + for (CoroFreeInst *CF : CoroFrees) { + CF->replaceAllUsesWith(Replacement); + CF->eraseFromParent(); + } +} + // FIXME: This code is stolen from CallGraph::addToCallGraph(Function *F), which // happens to be private. It is better for this functionality exposed by the // CallGraph. @@ -286,5 +307,5 @@ // Canonicalize coro.suspend by inserting a coro.save if needed. for (CoroSuspendInst *CS : CoroSuspends) if (!CS->getCoroSave()) - createCoroSave(CoroBegin, CoroSuspends.back()); + createCoroSave(CoroBegin, CS); } Index: test/Transforms/Coroutines/coro-elide.ll =================================================================== --- test/Transforms/Coroutines/coro-elide.ll +++ test/Transforms/Coroutines/coro-elide.ll @@ -23,6 +23,7 @@ define i8* @f() { entry: %id = call token @llvm.coro.id(i32 0, i8* null, + i8* bitcast (i8*()* @f to i8*), i8* bitcast ([2 x void (i8*)*]* @f.resumers to i8*)) %hdl = call i8* @llvm.coro.begin(token %id, i8* null) ret i8* %hdl @@ -31,10 +32,9 @@ ; CHECK-LABEL: @callResume( define void @callResume() { entry: -; CHECK: call i8* @llvm.coro.begin %hdl = call i8* @f() -; CHECK-NEXT: call void @print(i32 0) +; CHECK: call void @print(i32 0) %0 = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 0) %1 = bitcast i8* %0 to void (i8*)* call fastcc void %1(i8* %hdl) @@ -51,10 +51,9 @@ ; CHECK-LABEL: @eh( define void @eh() personality i8* null { entry: -; CHECK: call i8* @llvm.coro.begin %hdl = call i8* @f() -; CHECK-NEXT: call void @print(i32 0) +; CHECK: call void @print(i32 0) %0 = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 0) %1 = bitcast i8* %0 to void (i8*)* invoke void %1(i8* %hdl) @@ -71,7 +70,7 @@ ; no devirtualization here, since coro.begin info parameter is null define void @no_devirt_info_null() { entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %hdl = call i8* @llvm.coro.begin(token %id, i8* null) ; CHECK: call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 0) @@ -107,7 +106,7 @@ ret void } -declare token @llvm.coro.id(i32, i8*, i8*) +declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i8* @llvm.coro.begin(token, i8*) declare i8* @llvm.coro.frame() declare i8* @llvm.coro.subfn.addr(i8*, i8) Index: test/Transforms/Coroutines/coro-heap-elide.ll =================================================================== --- test/Transforms/Coroutines/coro-heap-elide.ll +++ test/Transforms/Coroutines/coro-heap-elide.ll @@ -11,19 +11,21 @@ declare fastcc void @f.resume(%f.frame*) declare fastcc void @f.destroy(%f.frame*) +declare fastcc void @f.cleanup(%f.frame*) declare void @may_throw() declare i8* @CustomAlloc(i32) declare void @CustomFree(i8*) -@f.resumers = internal constant - [2 x void (%f.frame*)*] [void (%f.frame*)* @f.resume, void (%f.frame*)* @f.destroy] +@f.resumers = internal constant [3 x void (%f.frame*)*] + [void (%f.frame*)* @f.resume, void (%f.frame*)* @f.destroy, void (%f.frame*)* @f.cleanup] ; a coroutine start function define i8* @f() personality i8* null { entry: %id = call token @llvm.coro.id(i32 0, i8* null, - i8* bitcast ([2 x void (%f.frame*)*]* @f.resumers to i8*)) + i8* bitcast (i8*()* @f to i8*), + i8* bitcast ([3 x void (%f.frame*)*]* @f.resumers to i8*)) %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) br i1 %need.dyn.alloc, label %dyn.alloc, label %coro.begin dyn.alloc: @@ -39,7 +41,7 @@ ehcleanup: %tok = cleanuppad within none [] - %mem = call i8* @llvm.coro.free(i8* %hdl) + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) %need.dyn.free = icmp ne i8* %mem, null br i1 %need.dyn.free, label %dyn.free, label %if.end dyn.free: @@ -62,7 +64,7 @@ ; CHECK-NOT: tail call void @bar( ; CHECK: call void @bar( tail call void @bar(i8* %hdl) -; CHECK: tail call void @bar( +; CHECK: tail call void @bar( tail call void @bar(i8* null) ; CHECK-NEXT: call fastcc void bitcast (void (%f.frame*)* @f.resume to void (i8*)*)(i8* %vFrame) @@ -70,7 +72,7 @@ %1 = bitcast i8* %0 to void (i8*)* call fastcc void %1(i8* %hdl) -; CHECK-NEXT: call fastcc void bitcast (void (%f.frame*)* @f.destroy to void (i8*)*)(i8* %vFrame) +; CHECK-NEXT: call fastcc void bitcast (void (%f.frame*)* @f.cleanup to void (i8*)*)(i8* %vFrame) %2 = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 1) %3 = bitcast i8* %2 to void (i8*)* call fastcc void %3(i8* %hdl) @@ -84,7 +86,8 @@ define i8* @f_no_elision() personality i8* null { entry: %id = call token @llvm.coro.id(i32 0, i8* null, - i8* bitcast ([2 x void (%f.frame*)*]* @f.resumers to i8*)) + i8* bitcast (i8*()* @f_no_elision to i8*), + i8* bitcast ([3 x void (%f.frame*)*]* @f.resumers to i8*)) %alloc = call i8* @CustomAlloc(i32 4) %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) ret i8* %hdl @@ -116,9 +119,9 @@ ret void } -declare token @llvm.coro.id(i32, i8*, i8*) +declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i1 @llvm.coro.alloc(token) -declare i8* @llvm.coro.free(i8*) +declare i8* @llvm.coro.free(token, i8*) declare i8* @llvm.coro.begin(token, i8*) declare i8* @llvm.coro.frame(token) declare i8* @llvm.coro.subfn.addr(i8*, i8) Index: test/Transforms/Coroutines/coro-split-00.ll =================================================================== --- test/Transforms/Coroutines/coro-split-00.ll +++ test/Transforms/Coroutines/coro-split-00.ll @@ -3,7 +3,7 @@ define i8* @f() "coroutine.presplit"="1" { entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() %alloc = call i8* @malloc(i32 %size) %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) @@ -16,7 +16,7 @@ br label %cleanup cleanup: - %mem = call i8* @llvm.coro.free(i8* %hdl) + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) call void @free(i8* %mem) br label %suspend suspend: @@ -45,13 +45,13 @@ ; CHECK: call void @free( ; CHECK: ret void -declare i8* @llvm.coro.free(i8*) +declare i8* @llvm.coro.free(token, i8*) declare i32 @llvm.coro.size.i32() declare i8 @llvm.coro.suspend(token, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) -declare token @llvm.coro.id(i32, i8*, i8*) +declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i8* @llvm.coro.alloc(token) declare i8* @llvm.coro.begin(token, i8*) declare void @llvm.coro.end(i8*, i1) Index: test/Transforms/Coroutines/coro-split-01.ll =================================================================== --- test/Transforms/Coroutines/coro-split-01.ll +++ test/Transforms/Coroutines/coro-split-01.ll @@ -3,7 +3,7 @@ define i8* @f() { entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) br i1 %need.dyn.alloc, label %dyn.alloc, label %coro.begin dyn.alloc: @@ -22,7 +22,7 @@ br label %cleanup cleanup: - %mem = call i8* @llvm.coro.free(i8* %hdl) + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) call void @free(i8* %mem) br label %suspend suspend: @@ -35,22 +35,18 @@ call void @llvm.coro.resume(i8* %hdl) ret i32 0 ; CHECK-LABEL: @main( -; CHECK: call i8* @malloc -; CHECK-NOT: call void @free ; CHECK: call void @print(i32 0) -; CHECK-NOT: call void @free ; CHECK: call void @print(i32 1) -; CHECK: call void @free ; CHECK: ret i32 0 } -declare i8* @llvm.coro.free(i8*) +declare i8* @llvm.coro.free(token, i8*) declare i32 @llvm.coro.size.i32() declare i8 @llvm.coro.suspend(token, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) -declare token @llvm.coro.id(i32, i8*, i8*) +declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i1 @llvm.coro.alloc(token) declare i8* @llvm.coro.begin(token, i8*) declare void @llvm.coro.end(i8*, i1) Index: test/Transforms/Coroutines/ex0.ll =================================================================== --- test/Transforms/Coroutines/ex0.ll +++ test/Transforms/Coroutines/ex0.ll @@ -3,7 +3,7 @@ define i8* @f(i32 %n) { entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() %alloc = call i8* @malloc(i32 %size) %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) @@ -20,7 +20,7 @@ br label %loop cleanup: - %mem = call i8* @llvm.coro.free(i8* %hdl) + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) call void @free(i8* %mem) br label %suspend suspend: @@ -43,9 +43,9 @@ ; CHECK: ret i32 0 } -declare token @llvm.coro.id(i32, i8*, i8*) +declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i8* @llvm.coro.alloc(token) -declare i8* @llvm.coro.free(i8*) +declare i8* @llvm.coro.free(token, i8*) declare i32 @llvm.coro.size.i32() declare i8 @llvm.coro.suspend(token, i1) declare void @llvm.coro.resume(i8*) Index: test/Transforms/Coroutines/ex1.ll =================================================================== --- test/Transforms/Coroutines/ex1.ll +++ test/Transforms/Coroutines/ex1.ll @@ -3,7 +3,7 @@ define i8* @f(i32 %n) { entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() %alloc = call i8* @malloc(i32 %size) %hdl = call noalias i8* @llvm.coro.begin(token %id, i8* %alloc) @@ -16,7 +16,7 @@ switch i8 %0, label %suspend [i8 0, label %loop i8 1, label %cleanup] cleanup: - %mem = call i8* @llvm.coro.free(i8* %hdl) + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) call void @free(i8* %mem) br label %suspend suspend: @@ -43,11 +43,11 @@ declare void @free(i8*) declare void @print(i32) -declare token @llvm.coro.id(i32, i8*, i8*) +declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i32 @llvm.coro.size.i32() declare i8* @llvm.coro.begin(token, i8*) declare i8 @llvm.coro.suspend(token, i1) -declare i8* @llvm.coro.free(i8*) +declare i8* @llvm.coro.free(token, i8*) declare void @llvm.coro.end(i8*, i1) declare void @llvm.coro.resume(i8*) Index: test/Transforms/Coroutines/ex2.ll =================================================================== --- /dev/null +++ test/Transforms/Coroutines/ex2.ll @@ -0,0 +1,63 @@ +; Second example from Doc/Coroutines.rst (custom alloc and free functions) +; RUN: opt < %s -O2 -enable-coroutines -S | FileCheck %s + +define i8* @f(i32 %n) { +entry: + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) + %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) + br i1 %need.dyn.alloc, label %dyn.alloc, label %coro.begin +dyn.alloc: + %size = call i32 @llvm.coro.size.i32() + %alloc = call i8* @CustomAlloc(i32 %size) + br label %coro.begin +coro.begin: + %phi = phi i8* [ null, %entry ], [ %alloc, %dyn.alloc ] + %hdl = call noalias i8* @llvm.coro.begin(token %id, i8* %phi) + br label %loop +loop: + %n.val = phi i32 [ %n, %coro.begin ], [ %inc, %loop ] + %inc = add nsw i32 %n.val, 1 + call void @print(i32 %n.val) + %0 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %0, label %suspend [i8 0, label %loop + i8 1, label %cleanup] +cleanup: + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) + %need.dyn.free = icmp ne i8* %mem, null + br i1 %need.dyn.free, label %dyn.free, label %suspend +dyn.free: + call void @CustomFree(i8* %mem) + br label %suspend +suspend: + call void @llvm.coro.end(i8* %hdl, i1 false) + ret i8* %hdl +} + +; CHECK-LABEL: @main +define i32 @main() { +entry: + %hdl = call i8* @f(i32 4) + call void @llvm.coro.resume(i8* %hdl) + call void @llvm.coro.resume(i8* %hdl) + call void @llvm.coro.destroy(i8* %hdl) + ret i32 0 +; CHECK: call void @print(i32 4) +; CHECK-NEXT: call void @print(i32 5) +; CHECK-NEXT: call void @print(i32 6) +; CHECK-NEXT: ret i32 0 +} + +declare i8* @CustomAlloc(i32) +declare void @CustomFree(i8*) +declare void @print(i32) + +declare token @llvm.coro.id(i32, i8*, i8*, i8*) +declare i1 @llvm.coro.alloc(token) +declare i32 @llvm.coro.size.i32() +declare i8* @llvm.coro.begin(token, i8*) +declare i8 @llvm.coro.suspend(token, i1) +declare i8* @llvm.coro.free(token, i8*) +declare void @llvm.coro.end(i8*, i1) + +declare void @llvm.coro.resume(i8*) +declare void @llvm.coro.destroy(i8*) Index: test/Transforms/Coroutines/ex3.ll =================================================================== --- /dev/null +++ test/Transforms/Coroutines/ex3.ll @@ -0,0 +1,60 @@ +; Third example from Doc/Coroutines.rst (two suspend points) +; RUN: opt < %s -O2 -enable-coroutines -S | FileCheck %s + +define i8* @f(i32 %n) { +entry: + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call i8* @malloc(i32 %size) + %hdl = call noalias i8* @llvm.coro.begin(token %id, i8* %alloc) + br label %loop +loop: + %n.val = phi i32 [ %n, %entry ], [ %inc, %loop.resume ] + call void @print(i32 %n.val) #4 + %0 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %0, label %suspend [i8 0, label %loop.resume + i8 1, label %cleanup] +loop.resume: + %inc = add nsw i32 %n.val, 1 + %sub = xor i32 %n.val, -1 + call void @print(i32 %sub) + %1 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %1, label %suspend [i8 0, label %loop + i8 1, label %cleanup] +cleanup: + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) + call void @free(i8* %mem) + br label %suspend +suspend: + call void @llvm.coro.end(i8* %hdl, i1 false) + ret i8* %hdl +} + +; CHECK-LABEL: @main +define i32 @main() { +entry: + %hdl = call i8* @f(i32 4) + call void @llvm.coro.resume(i8* %hdl) + call void @llvm.coro.resume(i8* %hdl) + call void @llvm.coro.destroy(i8* %hdl) + ret i32 0 +; CHECK: call void @print(i32 4) +; CHECK-NEXT: call void @print(i32 -5) +; CHECK-NEXT: call void @print(i32 5) +; CHECK: ret i32 0 +} + +declare i8* @malloc(i32) +declare void @free(i8*) +declare void @print(i32) + +declare token @llvm.coro.id(i32, i8*, i8*, i8*) +declare i1 @llvm.coro.alloc(token) +declare i32 @llvm.coro.size.i32() +declare i8* @llvm.coro.begin(token, i8*) +declare i8 @llvm.coro.suspend(token, i1) +declare i8* @llvm.coro.free(token, i8*) +declare void @llvm.coro.end(i8*, i1) + +declare void @llvm.coro.resume(i8*) +declare void @llvm.coro.destroy(i8*) Index: test/Transforms/Coroutines/restart-trigger.ll =================================================================== --- test/Transforms/Coroutines/restart-trigger.ll +++ test/Transforms/Coroutines/restart-trigger.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: CoroSplit: Processing coroutine 'f' state: 1 define void @f() { - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null) + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() %alloc = call i8* @malloc(i32 %size) %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) @@ -21,7 +21,7 @@ br label %cleanup cleanup: - %mem = call i8* @llvm.coro.free(i8* %hdl) + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) call void @free(i8* %mem) br label %suspend suspend: @@ -29,9 +29,9 @@ ret void } -declare token @llvm.coro.id(i32, i8*, i8*) +declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i8* @llvm.coro.begin(token, i8*) -declare i8* @llvm.coro.free(i8*) +declare i8* @llvm.coro.free(token, i8*) declare i32 @llvm.coro.size.i32() declare i8 @llvm.coro.suspend(token, i1) declare void @llvm.coro.resume(i8*)