diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -25,7 +25,6 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -44,7 +43,9 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
+#include "llvm/IR/ValueMap.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
 #include <cassert>
@@ -61,13 +62,32 @@
 STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
 
+static bool onlyUsedByOptimizableCall(const Use &U) {
+  const Instruction *I = dyn_cast<Instruction>(U.getUser());
+  if (const CallBase *CB = dyn_cast_or_null<CallBase>(I)) {
+    if (!CB->isArgOperand(&U))
+      return false;
+    unsigned ArgNo = CB->getArgOperandNo(&U);
+    // The argument must not be captured or accessed by the callee.
+    if (CB->paramHasAttr(ArgNo, Attribute::NoCapture) &&
+        (CB->hasFnAttr(Attribute::InaccessibleMemOnly) ||
+         CB->hasFnAttr(Attribute::ReadNone) ||
+         CB->paramHasAttr(ArgNo, Attribute::ReadNone)))
+      return true;
+  }
+
+  return false;
+}
+
 bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+  bool usedByOtherInsts = !AI->getNumUses();
   // FIXME: If the memory unit is of pointer or integer type, we can permit
   // assignments to subsections of the memory unit.
   unsigned AS = AI->getType()->getAddressSpace();
 
   // Only allow direct and non-volatile loads and stores...
-  for (const User *U : AI->users()) {
+  for (const Use &UU : AI->uses()) {
+    const User *U = UU.getUser();
     if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
       // Note that atomic loads can be transformed; atomic semantics do
       // not have any meaning for a local alloca.
@@ -83,7 +103,11 @@
     } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
       if (!II->isLifetimeStartOrEnd())
         return false;
+    } else if (isa<CallBase>(U) && onlyUsedByOptimizableCall(UU)) {
+      continue;
     } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      if (all_of(BCI->uses(), onlyUsedByOptimizableCall))
+        continue;
       if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
         return false;
       if (!onlyUsedByLifetimeMarkers(BCI))
@@ -95,12 +119,14 @@
         return false;
       if (!onlyUsedByLifetimeMarkers(GEPI))
         return false;
-    } else {
+    } else
       return false;
-    }
+    usedByOtherInsts = true;
   }
 
-  return true;
+  // Not promotable if this is a fresh alloca, i.e. one used only by bitcasts
+  // and/or as a non-accessed, non-captured function call operand.
+  return usedByOtherInsts;
 }
 
 namespace {
@@ -186,7 +212,6 @@
   DenseMap<const Instruction *, unsigned> InstNumbers;
 
 public:
-
   /// This code only looks at accesses to allocas.
   static bool isInterestingInstruction(const Instruction *I) {
     return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
@@ -312,27 +337,54 @@
   AC->registerAssumption(CI);
 }
 
-static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+static void removeNonLoadStoreUsers(AllocaInst *AI) {
+  SmallVector<Instruction *, 8> Roots;
+  ValueMap<Instruction *, Instruction *> VisitedMap;
+  AllocaInst *NewAlloca = nullptr;
+
   // Knowing that this alloca is promotable, we know that it's safe to kill all
   // instructions except for load and store.
-
-  for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) {
-    Instruction *I = cast<Instruction>(*UI);
-    ++UI;
-    if (isa<LoadInst>(I) || isa<StoreInst>(I))
-      continue;
-
-    if (!I->getType()->isVoidTy()) {
-      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
-      // Follow the use/def chain to erase them now instead of leaving it for
-      // dead code elimination later.
-      for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) {
-        Instruction *Inst = cast<Instruction>(*UUI);
-        ++UUI;
-        Inst->eraseFromParent();
-      }
-    }
-    I->eraseFromParent();
-  }
+  Roots.push_back(AI);
+  while (Roots.size()) {
+    Instruction *V = Roots.pop_back_val();
+    auto it = VisitedMap.insert({V, nullptr}).first;
+    for (auto UI = V->user_begin(), UE = V->user_end(); UI != UE;) {
+      Instruction *I = cast<Instruction>(*UI);
+      ++UI;
+      if (isa<LoadInst>(I) || isa<StoreInst>(I))
+        continue;
+      else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I))
+        Roots.push_back(I);
+      else if (isa<CallBase>(I) && !isa<IntrinsicInst>(I)) {
+        // Introduce a fresh alloca for this readnone operand, because it
+        // still needs some stack variable, but doesn't have to be this one.
+        if (!NewAlloca) {
+          NewAlloca = cast<AllocaInst>(AI->clone());
+          NewAlloca->insertBefore(AI);
+        }
+        // Replace the root with either the fresh alloca or a bitcast of it.
+        if (!it->second)
+          it->second = (NewAlloca->getType() == V->getType())
+                           ? cast<Instruction>(NewAlloca)
+                           : new BitCastInst(NewAlloca, V->getType(), "", AI);
+      } else
+        I->eraseFromParent();
+    }
+  }
+
+  // Remove original roots except AI, and change uses to the fresh alloca.
+  for (auto P : VisitedMap) {
+    Instruction *R = P.first, *V = P.second;
+    if (V) {
+      // Only bitcasts and qualifying calls should refer to the fresh alloca.
+      R->replaceUsesWithIf(V, [](Use &U) {
+        Instruction *I = cast<Instruction>(U.getUser());
+        return isa<BitCastInst>(I) || isa<CallBase>(I);
+      });
+    } else if (R != AI)
+      R->dropAllReferences();
+    if (R != AI && !R->getNumUses())
+      R->eraseFromParent();
+  }
 }
 
@@ -544,7 +596,7 @@
     assert(AI->getParent()->getParent() == &F &&
            "All allocas should be in the same function, which is same as DF!");
 
-    removeLifetimeIntrinsicUsers(AI);
+    removeNonLoadStoreUsers(AI);
 
     if (AI->use_empty()) {
       // If there are no uses of the alloca, just delete it now.
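
Note (illustrative, not part of the patch): the intended net effect of
removeNonLoadStoreUsers is to give qualifying calls a stack slot of their own
so the original alloca can be promoted. Assuming a callee @g whose pointer
parameter is readnone nocapture (@g and the value names below are made up):

  ; before -mem2reg
  %p = alloca i32
  store i32 1, i32* %p
  call void @g(i32* %p)
  %v = load i32, i32* %p

  ; after -mem2reg (sketch)
  %p1 = alloca i32          ; fresh clone kept alive only for the call
  call void @g(i32* %p1)
  ; the store/load pair is promoted; uses of %v become the constant 1
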
diff --git a/llvm/test/Transforms/Mem2Reg/fnattr.ll b/llvm/test/Transforms/Mem2Reg/fnattr.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Mem2Reg/fnattr.ll
@@ -0,0 +1,111 @@
+; RUN: opt -S < %s -mem2reg | FileCheck %s
+
+declare void @foo1a(i8* readnone nocapture, i8) local_unnamed_addr
+declare void @foo1b(i8* nocapture, i8) local_unnamed_addr readnone
+declare void @foo1c(i8* nocapture, i8) local_unnamed_addr inaccessiblememonly
+
+; The callees neither read nor capture the pointer argument; promotable.
+define i32 @a() norecurse {
+; CHECK-LABEL: @a
+; CHECK-NOT: %g1
+; CHECK: alloca
+; CHECK-NEXT: bitcast
+; CHECK: ret i32 42
+; CHECK: }
+  %g1 = alloca i32
+  store i32 42, i32* %g1
+  %p = bitcast i32* %g1 to i8*
+  call void @foo1a(i8* %p, i8 0)
+  call void @foo1b(i8* %p, i8 0)
+  call void @foo1c(i8* %p, i8 0)
+  %a = load i32, i32* %g1
+  ret i32 %a
+}
+
+declare void @foo2a(i8*, i8) local_unnamed_addr readnone
+declare void @foo2b(i8* nocapture, i8) local_unnamed_addr
+
+; The pointer may be captured (foo2a) or read (foo2b); not promotable.
+define i32 @b() norecurse {
+; CHECK-LABEL: @b
+; CHECK: %g2 = alloca
+; CHECK: store
+; CHECK: load
+; CHECK: }
+  %g2 = alloca i32
+  store i32 42, i32* %g2
+  %p = bitcast i32* %g2 to i8*
+  call void @foo2a(i8* %p, i8 0)
+  call void @foo2b(i8* %p, i8 0)
+  %b = load i32, i32* %g2
+  ret i32 %b
+}
+
+define i1 @is_aligned(i32* readnone nocapture) norecurse {
+  %int = ptrtoint i32* %0 to i64
+  %and = and i64 %int, 31
+  %cmp = icmp ne i64 %and, 0
+  ret i1 %cmp
+}
+
+; Only a qualifying-call use, so there is nothing to promote; not promotable.
+define i1 @is_stack_aligned() norecurse {
+; CHECK-LABEL: @is_stack_aligned
+; CHECK: alloca
+; CHECK: call
+; CHECK: }
+  %var = alloca i32, align 4
+  %ret = call zeroext i1 @is_aligned(i32* nonnull %var)
+  ret i1 %ret
+}
+
+define i1 @is_same(i32* readnone, i16* readnone) norecurse {
+  %cast = bitcast i16* %1 to i32*
+  %cmp = icmp eq i32* %cast, %0
+  ret i1 %cmp
+}
+
+; Only bitcast and qualifying-call uses, nothing to promote; not promotable.
+define i1 @return_true() norecurse {
+; CHECK-LABEL: @return_true
+; CHECK: alloca
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: call
+; CHECK: }
+  %var = alloca i32, align 4
+  %cast = bitcast i32* %var to i16*
+  %ret = call zeroext i1 @is_same(i32* nonnull %var, i16* nonnull %cast)
+  ret i1 %ret
+}
+
+; The load and store go through a bitcast of the alloca; not promotable.
+define i32 @c() norecurse {
+; CHECK-LABEL: @c
+; CHECK: alloca
+; CHECK: store
+; CHECK: load
+; CHECK: }
+  %var = alloca i32, align 4
+  %cast = bitcast i32* %var to i32*
+  store i32 42, i32* %cast
+  %cast2 = bitcast i32* %var to i16*
+  %ret = call zeroext i1 @is_same(i32* nonnull %var, i16* nonnull %cast2)
+  %c = load i32, i32* %cast
+  ret i32 %c
+}
+
+; The bitcast feeds only a qualifying call; promotable.
+define i32 @d() norecurse {
+; CHECK-LABEL: @d
+; CHECK: alloca
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: call
+; CHECK: ret i32 42
+; CHECK: }
+  %var = alloca i32, align 4
+  store i32 42, i32* %var
+  %cast = bitcast i32* %var to i16*
+  %ret = call zeroext i1 @is_same(i32* nonnull %var, i16* nonnull %cast)
+  %d = load i32, i32* %var
+  ret i32 %d
+}
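
Note (illustrative, not part of the patch): for @d above, the output described
by the CHECK lines would look roughly as follows; the exact value names are
made up:

  define i32 @d() norecurse {
    %var1 = alloca i32, align 4    ; fresh alloca introduced for the call
    %1 = bitcast i32* %var1 to i16*
    %ret = call zeroext i1 @is_same(i32* nonnull %var1, i16* nonnull %1)
    ret i32 42                     ; the store/load pair was promoted
  }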