diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -622,61 +622,6 @@ return FrameTy; } -// We use a pointer use visitor to discover if there are any writes into an -// alloca that dominates CoroBegin. If that is the case, insertSpills will copy -// the value from the alloca into the coroutine frame spill slot corresponding -// to that alloca. -namespace { -struct AllocaUseVisitor : PtrUseVisitor { - using Base = PtrUseVisitor; - AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, - const CoroBeginInst &CB) - : PtrUseVisitor(DL), DT(DT), CoroBegin(CB) {} - - // We are only interested in uses that dominate coro.begin. - void visit(Instruction &I) { - if (DT.dominates(&I, &CoroBegin)) - Base::visit(I); - } - // We need to provide this overload as PtrUseVisitor uses a pointer based - // visiting function. - void visit(Instruction *I) { return visit(*I); } - - void visitLoadInst(LoadInst &) {} // Good. Nothing to do. - - // If the use is an operand, the pointer escaped and anything can write into - // that memory. If the use is the pointer, we are definitely writing into the - // alloca and therefore we need to copy. - void visitStoreInst(StoreInst &SI) { PI.setAborted(&SI); } - - // Any other instruction that is not filtered out by PtrUseVisitor, will - // result in the copy. - void visitInstruction(Instruction &I) { PI.setAborted(&I); } - -private: - const DominatorTree &DT; - const CoroBeginInst &CoroBegin; -}; -} // namespace -static bool mightWriteIntoAllocaPtr(AllocaInst &A, const DominatorTree &DT, - const CoroBeginInst &CB) { - const DataLayout &DL = A.getModule()->getDataLayout(); - AllocaUseVisitor Visitor(DL, DT, CB); - auto PtrI = Visitor.visitPtr(A); - if (PtrI.isEscaped() || PtrI.isAborted()) { - auto *PointerEscapingInstr = PtrI.getEscapingInst() - ? PtrI.getEscapingInst() - : PtrI.getAbortingInst(); - if (PointerEscapingInstr) { - LLVM_DEBUG( - dbgs() << "AllocaInst copy was triggered by instruction: " - << *PointerEscapingInstr << "\n"); - } - return true; - } - return false; -} - // We need to make room to insert a spill after initial PHIs, but before // catchswitch instruction. Placing it before violates the requirement that // catchswitch, like all other EHPads must be the first nonPHI in a block. @@ -698,6 +643,30 @@ return CleanupRet; } +// Find the nearest Terminator instruction that dominates all CoroSave +// instructions. +static Instruction *findDominatorOfAllCoroSaves(coro::Shape &Shape) { + Instruction *Dom{nullptr}; + DominatorTree DT(*Shape.CoroBegin->getFunction()); + for (const AnyCoroSuspendInst *I : Shape.CoroSuspends) { + CoroSaveInst *CSI = I->getCoroSave(); + if (!Dom) { + // Each CoroSaveInst should be in a separate BB due to earlier + // simplifications, hence it's guaranteed to have a single + // predecessor. + Dom = CSI->getParent()->getSinglePredecessor()->getTerminator(); + continue; + } + BasicBlock *BB = + DT.findNearestCommonDominator(Dom->getParent(), CSI->getParent()); + if (BB == CSI->getParent()) + Dom = CSI->getParent()->getSinglePredecessor()->getTerminator(); + else if (BB != Dom->getParent()) + Dom = BB->getTerminator(); + } + return Dom; +} + // Replace all alloca and SSA values that are accessed across suspend points // with GetElementPointer from coroutine frame + loads and stores. Create an // AllocaSpillBB that will become the new entry block for the resume parts of @@ -916,14 +885,32 @@ return FramePtr; } - // If we found any alloca, replace all of their remaining uses with GEP - // instructions. Because new dbg.declare have been created for these alloca, - // we also delete the original dbg.declare and replace other uses with undef. - // Note: We cannot replace the alloca with GEP instructions indiscriminately, - // as some of the uses may not be dominated by CoroBegin. - bool MightNeedToCopy = false; - Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); - SmallVector UsersToUpdate; + // We need a new Checker here to account for the spill blocks just created. + SuspendCrossingInfo Checker(*CB->getFunction(), Shape); + + // The following code deals with allocas. For each alloca, there are 3 types + // of uses, uses before CoroBegin, uses after CoroBegin but before any + // CoroSave, and uses across suspend points. + // Uses across suspend points will always be replaced with GEP instructions + // that fetch the data from the frame. + // If there are no uses before CoroBegin, we can safely replace every use of + // the alloca to the GEP instructions. + // If there are uses before CoroBegin, we take a conservative appproach here + // to assume that it will lead to changes to the data at some point either + // directly or through aliases. In this case, we leave the usese of these + // allocas untouched, but before the first CoroSave, we copy the data from + // stack to the frame to make sure they stay in sync. + // TODO: Optimize simple cases where we can tell the data is never modified. + // TODO: Sink uses before CoroBegin that can be safely moved to after. + SmallVector UsersAfterSuspend; + SmallVector UsersAfterCoroBegin; + auto UpdateUsersWithGEP = [&](auto &P, auto &UsersToUpdate) { + Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); + auto *G = GetFramePointer(P.second, P.first); + G->takeName(P.first); + for (Instruction *I : UsersToUpdate) + I->replaceUsesOfWith(P.first, G); + }; for (auto &P : Allocas) { AllocaInst *const A = P.first; @@ -931,41 +918,36 @@ DI->eraseFromParent(); replaceDbgUsesWithUndef(A); - UsersToUpdate.clear(); + UsersAfterSuspend.clear(); + UsersAfterCoroBegin.clear(); + bool HasUsersBeforeCoroBegin = false; for (User *U : A->users()) { auto *I = cast(U); - if (DT.dominates(CB, I)) - UsersToUpdate.push_back(I); + if (Checker.isDefinitionAcrossSuspend(*P.first, U)) + UsersAfterSuspend.push_back(I); + else if (DT.dominates(CB, I)) + UsersAfterCoroBegin.push_back(I); else - MightNeedToCopy = true; + HasUsersBeforeCoroBegin = true; } - if (!UsersToUpdate.empty()) { - auto *G = GetFramePointer(P.second, A); - G->takeName(A); - for (Instruction *I : UsersToUpdate) - I->replaceUsesOfWith(A, G); + if (!UsersAfterSuspend.empty()) + UpdateUsersWithGEP(P, UsersAfterSuspend); + if (!HasUsersBeforeCoroBegin) { + UpdateUsersWithGEP(P, UsersAfterCoroBegin); + continue; } - } - // If we discovered such uses not dominated by CoroBegin, see if any of them - // preceed coro begin and have instructions that can modify the - // value of the alloca and therefore would require a copying the value into - // the spill slot in the coroutine frame. - if (MightNeedToCopy) { - Builder.SetInsertPoint(FramePtr->getNextNode()); - for (auto &P : Allocas) { - AllocaInst *const A = P.first; - if (mightWriteIntoAllocaPtr(*A, DT, *CB)) { - if (A->isArrayAllocation()) - report_fatal_error( - "Coroutines cannot handle copying of array allocas yet"); + Instruction *Dom = findDominatorOfAllCoroSaves(Shape); + Builder.SetInsertPoint(Dom); + if (A->isArrayAllocation()) + report_fatal_error( + "Coroutines cannot handle copying of array allocas yet"); - auto *G = GetFramePointer(P.second, A); - auto *Value = Builder.CreateLoad(A->getAllocatedType(), A); - Builder.CreateStore(Value, G); - } - } + auto *G = GetFramePointer(P.second, A); + auto *Value = Builder.CreateLoad(A->getAllocatedType(), A); + Builder.CreateStore(Value, G); } + return FramePtr; } diff --git a/llvm/test/Transforms/Coroutines/ArgAddr.ll b/llvm/test/Transforms/Coroutines/ArgAddr.ll --- a/llvm/test/Transforms/Coroutines/ArgAddr.ll +++ b/llvm/test/Transforms/Coroutines/ArgAddr.ll @@ -7,7 +7,7 @@ entry: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null); %n.addr = alloca i32 - store i32 %n, i32* %n.addr ; this needs to go after coro.begin + store i32 %n, i32* %n.addr %0 = tail call i32 @llvm.coro.size.i32() %call = tail call i8* @malloc(i32 %0) %1 = tail call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %call) @@ -47,6 +47,7 @@ ret i32 0 ; CHECK: call void @ctor ; CHECK-NEXT: call void @print(i32 4) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) ; CHECK-NEXT: call void @print(i32 3) ; CHECK-NEXT: call void @print(i32 2) ; CHECK: ret i32 0 diff --git a/llvm/test/Transforms/Coroutines/coro-frame-arrayalloca.ll b/llvm/test/Transforms/Coroutines/coro-frame-arrayalloca.ll --- a/llvm/test/Transforms/Coroutines/coro-frame-arrayalloca.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame-arrayalloca.ll @@ -40,9 +40,9 @@ ; See if we used correct index to access prefix, data, suffix (@f) ; CHECK-LABEL: @f( -; CHECK: %prefix = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 2 -; CHECK-NEXT: %data = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4 -; CHECK-NEXT: %suffix = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 3 +; CHECK: %suffix = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 3 +; CHECK-NEXT: %data = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4, i32 0 +; CHECK-NEXT: %prefix = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 2 ; CHECK-NEXT: call void @consume.double.ptr(double* %prefix) ; CHECK-NEXT: call void @consume.i32.ptr(i32* %data) ; CHECK-NEXT: call void @consume.double.ptr(double* %suffix) diff --git a/llvm/test/Transforms/Coroutines/coro-param-copy.ll b/llvm/test/Transforms/Coroutines/coro-param-copy.ll --- a/llvm/test/Transforms/Coroutines/coro-param-copy.ll +++ b/llvm/test/Transforms/Coroutines/coro-param-copy.ll @@ -5,22 +5,39 @@ define i8* @f() "coroutine.presplit"="1" { entry: + %a.addr = alloca i64 ; used only after coro.begin. + %x.addr = alloca i64 - call void @use(i64* %x.addr) ; might write to %x + call void @use(i64* %x.addr) ; uses %x.addr before coro.begin. + %y.addr = alloca i64 - %y = load i64, i64* %y.addr ; cannot modify the value, don't need to copy - call void @print(i64 %y) + %y.cast = bitcast i64* %y.addr to i8* ; alias created and used after coro.begin. + + %z.addr = alloca i64 + %flag = call i1 @check() + br i1 %flag, label %flag_true, label %flag_merge +flag_true: + call void @use(i64* %z.addr) ; conditionally used %z.addr. + br label %flag_merge + +flag_merge: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() - %alloc = call i8* @myAlloc(i64 %y, i32 %size) + %alloc = call i8* @myAlloc(i64* %y.addr, i32 %size) %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) + call void @use(i64* %a.addr) + call void @use(i64* %x.addr) + call void @llvm.memset.p0i8.i32(i8* %y.cast, i8 1, i32 4, i1 false) + call void @use(i64* %z.addr) %0 = call i8 @llvm.coro.suspend(token none, i1 false) switch i8 %0, label %suspend [i8 0, label %resume i8 1, label %cleanup] resume: + call void @use(i64* %a.addr) call void @use(i64* %x.addr) call void @use(i64* %y.addr) + call void @use(i64* %z.addr) br label %cleanup cleanup: @@ -32,26 +49,39 @@ ret i8* %hdl } -; See that we added both x and y to the frame. -; CHECK: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i64, i64, i1 } +; See that we added a, x, y and z to the frame. +; CHECK: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i64, i64, i64, i64, i1 } ; See that all of the uses prior to coro-begin stays put. ; CHECK-LABEL: define i8* @f() { ; CHECK-NEXT: entry: -; CHECK-NEXT: %x.addr = alloca i64 +; CHECK-NEXT: %x.addr = alloca i64, align 8 ; CHECK-NEXT: call void @use(i64* %x.addr) -; CHECK-NEXT: %y.addr = alloca i64 -; CHECK-NEXT: %y = load i64, i64* %y.addr -; CHECK-NEXT: call void @print(i64 %y) +; CHECK-NEXT: %y.addr = alloca i64, align 8 +; CHECK-NEXT: %y.cast = bitcast i64* %y.addr to i8* +; CHECK-NEXT: %z.addr = alloca i64, align 8 +; CHECK-NEXT: %flag = call i1 @check() +; CHECK-NEXT: br i1 %flag, label %flag_true, label %flag_merge -; See that we only copy the x as y was not modified prior to coro.begin. -; CHECK: store void (%f.Frame*)* @f.destroy, void (%f.Frame*)** %destroy.addr -; CHECK-NEXT: %0 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 2 -; CHECK-NEXT: %1 = load i64, i64* %x.addr -; CHECK-NEXT: store i64 %1, i64* %0 -; CHECK-NEXT: %index.addr1 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4 -; CHECK-NEXT: store i1 false, i1* %index.addr1 -; CHECK-NEXT: ret i8* %hdl +; CHECK: call void @use(i64* %z.addr) +; CHECK-NEXT: br label %flag_merge + +; See that all uses of x, y and z will stay put, but in the end they are all copied into the frame. +; a will not be copied since it wasn't touched before coro.begin. +; CHECK: %a.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 2 +; CHECK-NEXT: call void @use(i64* %a.addr) +; CHECK-NEXT: call void @use(i64* %x.addr) +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %y.cast, i8 1, i32 4, i1 false) +; CHECK-NEXT: call void @use(i64* %z.addr) +; CHECK-NEXT: %0 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 3 +; CHECK-NEXT: %1 = load i64, i64* %x.addr, align 4 +; CHECK-NEXT: store i64 %1, i64* %0, align 4 +; CHECK-NEXT: %2 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4 +; CHECK-NEXT: %3 = load i64, i64* %y.addr, align 4 +; CHECK-NEXT: store i64 %3, i64* %2, align 4 +; CHECK-NEXT: %4 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 5 +; CHECK-NEXT: %5 = load i64, i64* %z.addr, align 4 +; CHECK-NEXT: store i64 %5, i64* %4, align 4 declare i8* @llvm.coro.free(token, i8*) declare i32 @llvm.coro.size.i32() @@ -64,7 +94,9 @@ declare i8* @llvm.coro.begin(token, i8*) declare i1 @llvm.coro.end(i8*, i1) -declare noalias i8* @myAlloc(i64, i32) -declare void @print(i64) +declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i1) + +declare noalias i8* @myAlloc(i64*, i32) declare void @use(i64*) declare void @free(i8*) +declare i1 @check()