diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -25,7 +25,6 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -44,7 +43,9 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
+#include "llvm/IR/ValueMap.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
 #include <cassert>
@@ -61,13 +62,32 @@
 STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
 
+static bool onlyUsedByOptimizableCall(const Use &U) {
+  const Instruction *I = dyn_cast<Instruction>(U.getUser());
+  if (const CallBase *CB = dyn_cast_or_null<CallBase>(I)) {
+    if (!CB->isArgOperand(&U))
+      return false;
+    unsigned ArgNo = CB->getArgOperandNo(&U);
+    // Argument must not be captured or accessed.
+    if (CB->paramHasAttr(ArgNo, Attribute::NoCapture) &&
+        (CB->hasFnAttr(Attribute::InaccessibleMemOnly) ||
+         CB->hasFnAttr(Attribute::ReadNone) ||
+         CB->paramHasAttr(ArgNo, Attribute::ReadNone)))
+      return true;
+  }
+
+  return false;
+}
+
 bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+  bool usedByOtherInsts = !AI->getNumUses();
   // FIXME: If the memory unit is of pointer or integer type, we can permit
   //        assignments to subsections of the memory unit.
   unsigned AS = AI->getType()->getAddressSpace();
 
   // Only allow direct and non-volatile loads and stores...
-  for (const User *U : AI->users()) {
+  for (const Use &UU : AI->uses()) {
+    const User *U = UU.getUser();
     if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
       // Note that atomic loads can be transformed; atomic semantics do
       // not have any meaning for a local alloca.
@@ -83,7 +103,15 @@
     } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
       if (!II->isLifetimeStartOrEnd())
         return false;
+    } else if (isa<CallBase>(U) && onlyUsedByOptimizableCall(UU)) {
+      continue;
     } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      bool onlyOptBCI = true;
+      for (auto BI = BCI->use_begin(), BE = BCI->use_end();
+           onlyOptBCI && BI != BE; ++BI)
+        onlyOptBCI = onlyOptBCI && onlyUsedByOptimizableCall(*BI);
+      if (onlyOptBCI)
+        continue;
       if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
         return false;
       if (!onlyUsedByLifetimeMarkers(BCI))
@@ -95,12 +123,14 @@
         return false;
       if (!onlyUsedByLifetimeMarkers(GEPI))
         return false;
-    } else {
+    } else
       return false;
-    }
+    usedByOtherInsts = true;
   }
 
-  return true;
+  // Not promotable if only used by bitcasts and/or as a non-accessed and
+  // non-capturing function call operand (it would remain a fresh alloca).
+  return usedByOtherInsts;
 }
 
 namespace {
@@ -186,7 +216,6 @@
   DenseMap<const Instruction *, unsigned> InstNumbers;
 
 public:
 
-  /// This code only looks at accesses to allocas.
   static bool isInterestingInstruction(const Instruction *I) {
     return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
@@ -312,27 +341,56 @@
   AC->registerAssumption(CI);
 }
 
-static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
-  // Knowing that this alloca is promotable, we know that it's safe to kill all
-  // instructions except for load and store.
+static void removeNonLoadStoreUsers(AllocaInst *AI) {
+  SmallVector<Instruction *, 4> Roots;
+  ValueMap<Instruction *, Instruction *> VisitedMap;
+  AllocaInst *NewAlloca = nullptr;
 
-  for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) {
-    Instruction *I = cast<Instruction>(*UI);
-    ++UI;
-    if (isa<LoadInst>(I) || isa<StoreInst>(I))
-      continue;
-
-    if (!I->getType()->isVoidTy()) {
-      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
-      // Follow the use/def chain to erase them now instead of leaving it for
-      // dead code elimination later.
-      for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) {
-        Instruction *Inst = cast<Instruction>(*UUI);
-        ++UUI;
-        Inst->eraseFromParent();
-      }
+  // Given that this alloca is promotable, remove all of its users other than
+  // loads and stores, as the remainder of this pass assumes. In the event AI
+  // is used as an uncaptured and unread argument to a function call, create a
+  // fresh AllocaInst and redirect those uses to it.
+  Roots.push_back(AI);
+  while (Roots.size()) {
+    Instruction *V = Roots.pop_back_val();
+    auto it = VisitedMap.insert({V, nullptr}).first;
+    for (auto UI = V->user_begin(), UE = V->user_end(); UI != UE;) {
+      Instruction *I = cast<Instruction>(*UI);
+      ++UI;
+      if (isa<LoadInst>(I) || isa<StoreInst>(I))
+        continue;
+      else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I))
+        Roots.push_back(I);
+      else if (isa<CallBase>(I) && !isa<IntrinsicInst>(I)) {
+        // Introduce a fresh alloca: the call still needs some stack variable,
+        // but aliasing with this one would inhibit load/store optimizations.
+        if (!NewAlloca) {
+          NewAlloca = cast<AllocaInst>(AI->clone());
+          NewAlloca->insertBefore(AI);
+        }
+        // Replace the root with either the fresh alloca or a bitcast of it.
+        if (!it->second)
+          it->second = (NewAlloca->getType() == V->getType())
+                           ? cast<Instruction>(NewAlloca)
+                           : new BitCastInst(NewAlloca, V->getType(), "", AI);
+      } else
+        I->eraseFromParent();
     }
-    I->eraseFromParent();
+  }
+
+  // Remove original roots except AI, and change uses to the fresh alloca.
+  for (auto P : VisitedMap) {
+    Instruction *R = P.first, *V = P.second;
+    if (V) {
+      // Only bitcasts and qualifying calls should refer to the fresh alloca.
+      R->replaceUsesWithIf(V, [](Use &U) {
+        Instruction *I = cast<Instruction>(U.getUser());
+        return isa<BitCastInst>(I) || isa<CallBase>(I);
+      });
+    } else if (R != AI)
+      R->dropAllReferences();
+    if (R != AI && !R->getNumUses())
+      R->eraseFromParent();
   }
 }
 
@@ -544,7 +602,7 @@
     assert(AI->getParent()->getParent() == &F &&
            "All allocas should be in the same function, which is same as DF!");
 
-    removeLifetimeIntrinsicUsers(AI);
+    removeNonLoadStoreUsers(AI);
 
     if (AI->use_empty()) {
       // If there are no uses of the alloca, just delete it now.
diff --git a/llvm/test/Transforms/Mem2Reg/fnattr.ll b/llvm/test/Transforms/Mem2Reg/fnattr.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Mem2Reg/fnattr.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -mem2reg | FileCheck %s
+
+declare void @foo1a(i8* readnone nocapture, i8) local_unnamed_addr
+declare void @foo1b(i8* nocapture, i8) local_unnamed_addr readnone
+declare void @foo1c(i8* nocapture, i8) local_unnamed_addr inaccessiblememonly
+
+; Doesn't read from pointer argument, promotable
+define i32 @a() norecurse {
+; CHECK-LABEL: define {{[^@]+}}@a()
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca i32
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEXT:    call void @foo1a(i8* [[TMP2]], i8 0)
+; CHECK-NEXT:    call void @foo1b(i8* [[TMP2]], i8 0)
+; CHECK-NEXT:    call void @foo1c(i8* [[TMP2]], i8 0)
+; CHECK-NEXT:    ret i32 42
+;
+  %g1 = alloca i32
+  store i32 42, i32* %g1
+  %p = bitcast i32* %g1 to i8*
+  call void @foo1a(i8* %p, i8 0)
+  call void @foo1b(i8* %p, i8 0)
+  call void @foo1c(i8* %p, i8 0)
+  %a = load i32, i32* %g1
+  ret i32 %a
+}
+
+declare void @foo2a(i8*, i8) local_unnamed_addr readnone
+declare void @foo2b(i8* nocapture, i8) local_unnamed_addr
+
+; May-capture and may-read, not promotable
+define i32 @b() norecurse {
+; CHECK-LABEL: define {{[^@]+}}@b()
+; CHECK-NEXT:    [[G2:%.*]] = alloca i32
+; CHECK-NEXT:    store i32 42, i32* [[G2]]
+; CHECK-NEXT:    [[P:%.*]] = bitcast i32* [[G2]] to i8*
+; CHECK-NEXT:    call void @foo2a(i8* [[P]], i8 0)
+; CHECK-NEXT:    call void @foo2b(i8* [[P]], i8 0)
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[G2]]
+; CHECK-NEXT:    ret i32 [[B]]
+;
+  %g2 = alloca i32
+  store i32 42, i32* %g2
+  %p = bitcast i32* %g2 to i8*
+  call void @foo2a(i8* %p, i8 0)
+  call void @foo2b(i8* %p, i8 0)
+  %b = load i32, i32* %g2
+  ret i32 %b
+}
+
+define i1 @is_aligned(i32* nocapture readnone) norecurse {
+; CHECK-LABEL: define {{[^@]+}}@is_aligned
+; CHECK-SAME: (i32* nocapture readnone [[TMP0:%.*]])
+; CHECK-NEXT:    [[INT:%.*]] = ptrtoint i32* [[TMP0]] to i64
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[INT]], 31
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i64 [[AND]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %int = ptrtoint i32* %0 to i64
+  %and = and i64 %int, 31
+  %cmp = icmp ne i64 %and, 0
+  ret i1 %cmp
+}
+
+; No non-bitcasts/qualifying calls, not promotable
+define i1 @is_stack_aligned() norecurse {
+; CHECK-LABEL: define {{[^@]+}}@is_stack_aligned()
+; CHECK-NEXT:    [[VAR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RET:%.*]] = call zeroext i1 @is_aligned(i32* nonnull [[VAR]])
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %var = alloca i32, align 4
+  %ret = call zeroext i1 @is_aligned(i32* nonnull %var)
+  ret i1 %ret
+}
+
+define i1 @is_same(i32* nocapture readnone, i16* nocapture readnone) norecurse {
+; CHECK-LABEL: define {{[^@]+}}@is_same
+; CHECK-SAME: (i32* nocapture readnone [[TMP0:%.*]], i16* nocapture readnone [[TMP1:%.*]])
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i16* [[TMP1]] to i32*
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32* [[CAST]], [[TMP0]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %cast = bitcast i16* %1 to i32*
+  %cmp = icmp eq i32* %cast, %0
+  ret i1 %cmp
+}
+
+; No non-bitcasts/qualifying calls, not promotable
+define i1 @return_true() norecurse {
+; CHECK-LABEL: define {{[^@]+}}@return_true()
+; CHECK-NEXT:    [[VAR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i32* [[VAR]] to i16*
+; CHECK-NEXT:    [[RET:%.*]] = call zeroext i1 @is_same(i32* nonnull [[VAR]], i16* nonnull [[CAST]])
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %var = alloca i32, align 4
+  %cast = bitcast i32* %var to i16*
+  %ret = call zeroext i1 @is_same(i32* nonnull %var, i16* nonnull %cast)
+  ret i1 %ret
+}
+
+declare void @llvm.trap()
+
+; Bitcast dominates loads/stores, not promotable
+define i32 @c() norecurse {
+; CHECK-LABEL: define {{[^@]+}}@c()
+; CHECK-NEXT:    [[VAR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i32* [[VAR]] to i32*
+; CHECK-NEXT:    store i32 42, i32* [[CAST]]
+; CHECK-NEXT:    [[CAST2:%.*]] = bitcast i32* [[VAR]] to i16*
+; CHECK-NEXT:    [[RET:%.*]] = call zeroext i1 @is_same(i32* nonnull [[VAR]], i16* nonnull [[CAST2]])
+; CHECK-NEXT:    br i1 [[RET]], label [[CONT:%.*]], label [[TRAP:%.*]]
+; CHECK:       trap:
+; CHECK-NEXT:    call void @llvm.trap()
+; CHECK-NEXT:    unreachable
+; CHECK:       cont:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[CAST]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %var = alloca i32, align 4
+  %cast = bitcast i32* %var to i32*
+  store i32 42, i32* %cast
+  %cast2 = bitcast i32* %var to i16*
+  %ret = call zeroext i1 @is_same(i32* nonnull %var, i16* nonnull %cast2)
+  br i1 %ret, label %cont, label %trap
+trap:
+  call void @llvm.trap()
+  unreachable
+cont:
+  %c = load i32, i32* %cast
+  ret i32 %c
+}
+
+; Bitcast only dominates qualifying calls, promotable
+define i32 @d() norecurse {
+; CHECK-LABEL: define {{[^@]+}}@d()
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i16*
+; CHECK-NEXT:    [[RET:%.*]] = call zeroext i1 @is_same(i32* nonnull [[TMP1]], i16* nonnull [[TMP2]])
+; CHECK-NEXT:    br i1 [[RET]], label [[CONT:%.*]], label [[TRAP:%.*]]
+; CHECK:       trap:
+; CHECK-NEXT:    call void @llvm.trap()
+; CHECK-NEXT:    unreachable
+; CHECK:       cont:
+; CHECK-NEXT:    ret i32 42
+;
+  %var = alloca i32, align 4
+  store i32 42, i32* %var
+  %cast = bitcast i32* %var to i16*
+  %ret = call zeroext i1 @is_same(i32* nonnull %var, i16* nonnull %cast)
+  br i1 %ret, label %cont, label %trap
+trap:
+  call void @llvm.trap()
+  unreachable
+cont:
+  %d = load i32, i32* %var
+  ret i32 %d
+}
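
For illustration only, a sketch of the intended effect, derived from the @a test and its CHECK lines above (not itself part of the patch; only the @foo1a call is shown). With the callee's pointer argument being readnone and nocapture, -mem2reg can now promote the store/load pair, while the call is redirected to a fresh alloca that no longer aliases the promoted memory.

; before -mem2reg
define i32 @a() norecurse {
  %g1 = alloca i32
  store i32 42, i32* %g1
  %p = bitcast i32* %g1 to i8*
  call void @foo1a(i8* %p, i8 0)
  %a = load i32, i32* %g1
  ret i32 %a
}

; after -mem2reg with this patch (per the CHECK lines): the load and store are
; gone, and the call uses a fresh, non-aliasing alloca
define i32 @a() norecurse {
  %1 = alloca i32
  %2 = bitcast i32* %1 to i8*
  call void @foo1a(i8* %2, i8 0)
  ret i32 42
}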