Index: lib/Transforms/Coroutines/CoroEarly.cpp
===================================================================
--- lib/Transforms/Coroutines/CoroEarly.cpp
+++ lib/Transforms/Coroutines/CoroEarly.cpp
@@ -13,10 +13,12 @@
 #include "CoroInternal.h"
 #include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
 
 using namespace llvm;
 
@@ -38,7 +40,7 @@
         AnyResumeFnPtrTy(FunctionType::get(Type::getVoidTy(Context), Int8Ptr,
                                            /*isVarArg=*/false)
                              ->getPointerTo()) {}
-  bool lowerEarlyIntrinsics(Function &F);
+  bool lowerEarlyIntrinsics(Function &F, DominatorTree &DT);
 };
 }
 
@@ -103,6 +105,43 @@
   II->eraseFromParent();
 }
 
+static bool allocaOnlyStoresConstantInts(AllocaInst *AI) {
+  for (User *U : AI->users())
+    if (auto *SI = dyn_cast<StoreInst>(U))
+      if (!isa<ConstantInt>(SI->getValueOperand()))
+        return false;
+
+  return true;
+}
+
+// Allocas that follow the pattern of cleanup.dest.slot, namely, they are
+// integer allocas that don't escape and only store constants. We don't want
+// them to be saved into the coroutine frame (as some of the cleanup code will
+// access them after coroutine frame destroyed).
+//
+// PromiseAlloca should be always stored in the coroutine frame even if its
+// usage pattern only consists of constant stores.
+static void
+promoteCleanupSlotLikeAllocasToRegisters(Function &F, DominatorTree& DT,
+                                         AllocaInst *PromiseAlloca) {
+  SmallVector<AllocaInst *, 4> Allocas;
+  BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
+
+  // Find allocas that are safe to promote, by looking at all non-terminator
+  // instructions in the entry node.
+  for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+    if (auto *AI = dyn_cast<AllocaInst>(I))
+      if (AI != PromiseAlloca && AI->getAllocatedType()->isIntegerTy() &&
+          isAllocaPromotable(AI) && allocaOnlyStoresConstantInts(AI))
+        Allocas.push_back(AI);
+
+  // None found, nothing to do.
+  if (Allocas.empty())
+    return;
+
+  PromoteMemToReg(Allocas, DT);
+}
+
 // Prior to CoroSplit, calls to coro.begin needs to be marked as NoDuplicate,
 // as CoroSplit assumes there is exactly one coro.begin. After CoroSplit,
 // NoDuplicate attribute will be removed from coro.begin otherwise, it will
@@ -113,7 +152,7 @@
       CB->setCannotDuplicate();
 }
 
-bool Lowerer::lowerEarlyIntrinsics(Function &F) {
+bool Lowerer::lowerEarlyIntrinsics(Function &F, DominatorTree &DT) {
   bool Changed = false;
   CoroIdInst *CoroId = nullptr;
   SmallVector<CoroFreeInst *, 4> CoroFrees;
@@ -147,6 +186,11 @@
             setCannotDuplicate(CII);
             CII->setCoroutineSelf();
             CoroId = cast<CoroIdInst>(&I);
+            // Eliminate cleanup.slot-like allocas. Required for O0, since
+            // cleanup slot allocas are accessed (potentially) after coroutine
+            // frame is destroyed.
+            promoteCleanupSlotLikeAllocasToRegisters(F, DT,
+                                                     CoroId->getPromise());
           }
         }
         break;
@@ -204,11 +248,13 @@
     if (!L)
       return false;
 
-    return L->lowerEarlyIntrinsics(F);
+    auto &DTWP = getAnalysis<DominatorTreeWrapperPass>();
+    return L->lowerEarlyIntrinsics(F, DTWP.getDomTree());
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
   }
   StringRef getPassName() const override {
     return "Lower early coroutine intrinsics";
@@ -217,7 +263,10 @@
 }
 
 char CoroEarly::ID = 0;
-INITIALIZE_PASS(CoroEarly, "coro-early", "Lower early coroutine intrinsics",
-                false, false)
+INITIALIZE_PASS_BEGIN(CoroEarly, "coro-early",
+                      "Lower early coroutine intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(CoroEarly, "coro-early", "Lower early coroutine intrinsics",
+                    false, false)
 
 Pass *llvm::createCoroEarlyPass() { return new CoroEarly(); }
Index: test/Transforms/Coroutines/coro-cleanup-dest-promote.ll
===================================================================
--- /dev/null
+++ test/Transforms/Coroutines/coro-cleanup-dest-promote.ll
@@ -0,0 +1,119 @@
+; Check that we do not spill cleanup-slot like allocas.
+; RUN: opt < %s -coro-early -S | FileCheck %s
+
+define i8* @happy.case() {
+entry:
+  %slot = alloca i32
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+
+  store i32 1, i32* %slot
+
+  %tok = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %tok, label %suspend [i8 0, label %resume
+                                  i8 1, label %cleanup]
+resume:
+  store i32 2, i32* %slot
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  %x = load i32, i32* %slot
+  call void @print.i32(i32 %x)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  ret i8* %hdl
+}
+
+; See that we got a PHI with constants in the happy case
+; CHECK-LABEL: define i8* @happy.case(
+; CHECK: %[[SLOT:.+]] = phi i32 [ 1, %entry ], [ 2, %resume ]
+; CHECK: call void @free(i8* {{.+}})
+; CHECK: call void @print.i32(i32 %[[SLOT]])
+; CHECK: ret i8*
+
+define i8* @escaped.case() {
+entry:
+  %slot = alloca i32
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+
+  store i32 1, i32* %slot
+  call void @escape(i32* %slot)
+
+  %tok = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %tok, label %suspend [i8 0, label %resume
+                                  i8 1, label %cleanup]
+resume:
+  store i32 2, i32* %slot
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  %x = load i32, i32* %slot
+  call void @print.i32(i32 %x)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  ret i8* %hdl
+}
+
+; See that we kept the load when the alloca escaped
+; CHECK-LABEL: define i8* @escaped.case(
+; CHECK: %[[SLOT:.+]] = load i32, i32* %{{.+}}
+; CHECK: call void @print.i32(i32 %[[SLOT]])
+; CHECK: ret i8*
+
+define i8* @promise.case() {
+entry:
+  %slot = alloca i32
+  %pv = bitcast i32* %slot to i8*
+  %id = call token @llvm.coro.id(i32 0, i8* %pv, i8* null, i8* null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+
+  %tok = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %tok, label %suspend [i8 0, label %resume
+                                  i8 1, label %cleanup]
+resume:
+  store i32 2, i32* %slot
+  br label %cleanup
+
+cleanup:
+  %x = load i32, i32* %slot
+  call void @print.i32(i32 %x)
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  ret i8* %hdl
+}
+
+; See that we kept the load when the alloca is from promise
+; CHECK-LABEL: define i8* @promise.case(
+; CHECK: %[[SLOT:.+]] = load i32, i32* %{{.+}}
+; CHECK: call void @print.i32(i32 %[[SLOT]])
+; CHECK: ret i8*
+
+declare i8* @llvm.coro.free(token, i8*)
+declare i32 @llvm.coro.size.i32()
+declare i8 @llvm.coro.suspend(token, i1)
+
+declare token @llvm.coro.id(i32, i8*, i8*, i8*)
+declare i1 @llvm.coro.alloc(token)
+declare i8* @llvm.coro.begin(token, i8*)
+declare i1 @llvm.coro.end(i8*, i1)
+
+declare noalias i8* @malloc(i32)
+declare void @print.i32(i32)
+declare void @escape(i32*)
+declare void @free(i8*)