diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h
--- a/llvm/include/llvm/Analysis/CaptureTracking.h
+++ b/llvm/include/llvm/Analysis/CaptureTracking.h
@@ -23,6 +23,7 @@
   class Instruction;
   class DominatorTree;
   class LoopInfo;
+  class Function;
 
   /// getDefaultMaxUsesToExploreForCaptureTracking - Return default value of
   /// the maximal number of uses to explore before giving up. It is used by
@@ -61,6 +62,11 @@
                                   const Instruction *I, const DominatorTree *DT,
                                   bool IncludeI = false, unsigned MaxUsesToExplore = 0,
                                   const LoopInfo *LI = nullptr);
+
+  Instruction *FindEarliestCapture(const Value *V, Function &F,
+                                   bool ReturnCaptures, bool StoreCaptures,
+                                   const DominatorTree &DT,
+                                   unsigned MaxUsesToExplore = 0);
 
   /// This callback is used in conjunction with PointerMayBeCaptured. In
   /// addition to the interface here, you'll need to provide your own getters
   /// to see whether anything was captured.
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -143,6 +143,88 @@
 
     const LoopInfo *LI;
   };
+
+  /// Find the 'earliest' instruction before which the pointer is known not to
+  /// be captured. Here an instruction A is considered earlier than an
+  /// instruction B if A dominates B. If two escapes do not dominate each
+  /// other, the terminator of the common dominator is chosen. If not all uses
+  /// can be analyzed, the earliest escape is set to the first instruction in
+  /// the function entry block.
+  // NOTE: Users have to make sure instructions compared against the earliest
+  // escape are not in a cycle.
+  struct EarliestCaptures : public CaptureTracker {
+
+    EarliestCaptures(bool ReturnCaptures, Function &F, const DominatorTree &DT)
+        : DT(DT), ReturnCaptures(ReturnCaptures), Captured(false), F(F) {}
+
+    void tooManyUses() override {
+      Captured = true;
+      EarliestCapture = &*F.getEntryBlock().begin();
+    }
+
+    bool isSafeToPrune(Instruction *I) {
+      if (!EarliestCapture)
+        return false;
+
+      if (EarliestCapture == I)
+        return true;
+
+      // We explore this usage only if the usage can reach "EarliestCapture".
+      // If the use is not reachable from entry, there is no need to explore.
+      if (!DT.isReachableFromEntry(I->getParent()))
+        return true;
+
+      // Check whether there is a path from I to EarliestCapture.
+      return !isPotentiallyReachable(I, EarliestCapture, nullptr, &DT);
+    }
+
+    bool captured(const Use *U) override {
+      Instruction *I = cast<Instruction>(U->getUser());
+      if (isa<ReturnInst>(I) && !ReturnCaptures)
+        return false;
+
+      // Check isSafeToPrune() here rather than in shouldExplore() to avoid
+      // an expensive reachability query for every instruction we look at.
+      // Instead we only do one for actual capturing candidates.
+      if (isSafeToPrune(I))
+        return false;
+
+      if (!EarliestCapture) {
+        EarliestCapture = I;
+      } else if (EarliestCapture->getParent() == I->getParent()) {
+        if (I->comesBefore(EarliestCapture))
+          EarliestCapture = I;
+      } else {
+        BasicBlock *CurrentBB = I->getParent();
+        BasicBlock *EarliestBB = EarliestCapture->getParent();
+        if (DT.dominates(EarliestBB, CurrentBB)) {
+          // EarliestCapture already comes before the current use.
+        } else if (DT.dominates(CurrentBB, EarliestBB)) {
+          EarliestCapture = I;
+        } else {
+          // Otherwise find the nearest common dominator and use its
+          // terminator.
+          auto *NearestCommonDom =
+              DT.findNearestCommonDominator(CurrentBB, EarliestBB);
+          EarliestCapture = NearestCommonDom->getTerminator();
+        }
+      }
+      Captured = true;
+
+      // Return false to continue analysis; we need to see all potential
+      // captures.
+      return false;
+    }
+
+    Instruction *EarliestCapture = nullptr;
+
+    const DominatorTree &DT;
+
+    bool ReturnCaptures;
+
+    bool Captured;
+
+    Function &F;
+  };
 }
 
 /// PointerMayBeCaptured - Return true if this pointer value may be captured
@@ -205,6 +287,22 @@
   return CB.Captured;
 }
 
+Instruction *llvm::FindEarliestCapture(const Value *V, Function &F,
+                                       bool ReturnCaptures, bool StoreCaptures,
+                                       const DominatorTree &DT,
+                                       unsigned MaxUsesToExplore) {
+  assert(!isa<GlobalValue>(V) &&
+         "It doesn't make sense to ask whether a global is captured.");
+
+  EarliestCaptures CB(ReturnCaptures, F, DT);
+  PointerMayBeCaptured(V, &CB, MaxUsesToExplore);
+  if (CB.Captured)
+    ++NumCapturedBefore;
+  else
+    ++NumNotCapturedBefore;
+  return CB.EarliestCapture;
+}
+
 void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker,
                                 unsigned MaxUsesToExplore) {
   assert(V->getType()->isPointerTy() && "Capture is for pointers only!");
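For illustration, a minimal client of the new API could look like the sketch below. This is not part of the patch: the helper name accessExecutesBeforeAnyCapture and its surrounding setup are hypothetical; only the FindEarliestCapture signature is taken from the header change above. Per the NOTE on EarliestCaptures, the queried instruction must not sit inside a CFG cycle, because the result is interpreted purely in terms of dominance.

// Hypothetical usage sketch (not part of the patch).
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns true if 'Access' is known to execute before 'Alloca' can be
// captured. 'Access' must not be part of a cycle (see NOTE above).
static bool accessExecutesBeforeAnyCapture(const AllocaInst *Alloca,
                                           Instruction *Access, Function &F,
                                           const DominatorTree &DT) {
  Instruction *EarliestCapture = FindEarliestCapture(
      Alloca, F, /*ReturnCaptures=*/false, /*StoreCaptures=*/true, DT);
  // A null result means the pointer is never captured at all.
  if (!EarliestCapture)
    return true;
  // Every capture is dominated by (or equal to) EarliestCapture, so an access
  // that strictly dominates it executes before any capture.
  return Access != EarliestCapture && DT.dominates(Access, EarliestCapture);
}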
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -38,6 +38,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -902,6 +903,9 @@
   /// basic block.
   DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
 
+  DenseMap<const Value *, Instruction *> EarliestEscapes;
+  DenseMap<Instruction *, TinyPtrVector<const Value *>> Inst2Obj;
+
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
            const LoopInfo &LI)
@@ -1263,6 +1267,22 @@
                        InstWriteOffset) == OW_Complete;
   }
 
+  /// Returns true if \p Object may be captured before (or by) \p I.
+  bool MaybeCapturedBefore(const Value *Object, Instruction *I) {
+    if (!isa<AllocaInst>(Object))
+      return true;
+
+    auto Iter = EarliestEscapes.insert({Object, nullptr});
+    if (Iter.second) {
+      Iter.first->second = FindEarliestCapture(Object, F, false, true, DT);
+      auto Ins = Inst2Obj.insert({Iter.first->second, {}});
+      Ins.first->second.push_back(Object);
+    }
+
+    // Even if I dominates the earliest capture, the capture may already have
+    // executed on an earlier iteration when I is inside a cycle; the
+    // reachability check below treats that case conservatively.
+    return !Iter.first->second || !DT.dominates(I, Iter.first->second) ||
+           I == Iter.first->second ||
+           isPotentiallyReachable(I->getNextNode(), I, nullptr, &DT, &LI);
+  }
+
   // Returns true if \p Use may read from \p DefLoc.
   bool isReadClobber(const MemoryLocation &DefLoc, Instruction *UseInst) {
     if (isNoopIntrinsic(UseInst))
       return false;
@@ -1280,6 +1300,25 @@
       if (CB->onlyAccessesInaccessibleMemory())
        return false;
 
+    // BasicAA does not spend linear time to check whether local objects escape
+    // before potentially aliasing accesses. To improve DSE results, compute
+    // and cache escape info for local objects in certain circumstances.
+    if (auto *LI = dyn_cast<LoadInst>(UseInst)) {
+      // If the load reads through a pointer that was itself loaded from
+      // memory, the load cannot alias DefLoc if DefUO is a local object that
+      // has not escaped before the load.
+      auto *ReadUO = getUnderlyingObject(LI->getPointerOperand());
+      auto *DefUO = getUnderlyingObject(DefLoc.Ptr);
+      if (DefUO && isa<AllocaInst>(DefUO) && ReadUO && isa<LoadInst>(ReadUO) &&
+          !MaybeCapturedBefore(DefUO, UseInst)) {
+        assert(
+            !PointerMayBeCapturedBefore(DefLoc.Ptr, false, true, UseInst, &DT,
+                                        false, 0, &this->LI) &&
+            "cached analysis disagrees with fresh PointerMayBeCapturedBefore");
+        return false;
+      }
+    }
+
     // NOTE: For calls, the number of stores removed could be slightly improved
     // by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that showed to
     // be expensive compared to the benefits in practice. For now, avoid more
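At the source level, the new case corresponds roughly to the pattern below, a hypothetical C++ reduction of @test_not_captured_before_load_same_bb from the test file further down (escape is a stand-in for any call that may capture and clobber its argument):

// Hypothetical C++ reduction of the IR test case; not part of the patch.
extern void escape(int *); // may capture and clobber its argument

int f(int **in) {
  int a = 55;   // this store can now be eliminated ...
  int x = **in; // ... because the inner dereference goes through a pointer
                // loaded from memory, and &a has not escaped at this point,
  a = 99;       // ... and the slot is fully overwritten here anyway.
  escape(&a);   // first point at which &a may be captured
  return x + a;
}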
@@ -1704,7 +1743,15 @@
     if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
       if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
         SkipStores.insert(MD);
+
+        // Clear any cached escape info for objects associated with the
+        // removed instruction.
+        auto Iter = Inst2Obj.find(DeadInst);
+        if (Iter != Inst2Obj.end())
+          for (const Value *Obj : Iter->second)
+            EarliestEscapes.erase(Obj);
       }
+
       Updater.removeMemoryAccess(MA);
     }
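The reason for the Inst2Obj reverse map is cache invalidation: EarliestEscapes stores a raw Instruction pointer, so once DSE deletes that instruction the cached entry must be dropped, or a later query could compare against a dangling (or recycled) pointer. A self-contained toy model of the two-map scheme, using hypothetical names and standard containers rather than the LLVM ADTs used in the patch:

// Toy model of the EarliestEscapes / Inst2Obj pairing (not part of the patch).
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

using Object = std::string; // stand-in for 'const llvm::Value *'
using Inst = std::string;   // stand-in for 'llvm::Instruction'

struct EscapeCache {
  std::unordered_map<Object, const Inst *> EarliestEscapes; // object -> capture
  std::unordered_map<const Inst *, std::vector<Object>> Inst2Obj; // reverse map

  void record(const Object &O, const Inst *Capture) {
    EarliestEscapes[O] = Capture;
    if (Capture)
      Inst2Obj[Capture].push_back(O);
  }

  // Mirrors the deleteDeadInstruction() change above: when an instruction is
  // erased, forget every cached answer that referred to it.
  void onInstructionErased(const Inst *I) {
    auto It = Inst2Obj.find(I);
    if (It == Inst2Obj.end())
      return;
    for (const Object &O : It->second)
      EarliestEscapes.erase(O);
    Inst2Obj.erase(It);
  }
};

int main() {
  Inst Call = "call void @escape(i32* %a)";
  EscapeCache Cache;
  Cache.record("%a", &Call);
  Cache.onInstructionErased(&Call); // DSE deletes the capturing call
  std::printf("cached entries left: %zu\n", Cache.EarliestEscapes.size()); // 0
}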
diff --git a/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll b/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll
--- a/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll
@@ -8,7 +8,6 @@
 define i32 @test_not_captured_before_load_same_bb(i32** %in.ptr) {
 ; CHECK-LABEL: @test_not_captured_before_load_same_bb(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -27,7 +26,6 @@
 define i32 @test_not_captured_before_load_same_bb_escape_unreachable_block(i32** %in.ptr) {
 ; CHECK-LABEL: @test_not_captured_before_load_same_bb_escape_unreachable_block(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -74,7 +72,6 @@
 define i32 @test_captured_after_load_same_bb_2_clobbered_later(i32** %in.ptr) {
 ; CHECK-LABEL: @test_captured_after_load_same_bb_2_clobbered_later(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    call void @escape_writeonly(i32* [[A]])
@@ -381,7 +378,6 @@
 define i32 @test_not_captured_before_load_other_blocks_1(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_1(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -415,7 +411,6 @@
 define i32 @test_not_captured_before_load_other_blocks_2(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_2(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -451,7 +446,6 @@
 define i32 @test_not_captured_before_load_other_blocks_3(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_3(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -485,7 +479,6 @@
 define i32 @test_not_captured_before_load_other_blocks_4(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_4(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
@@ -524,7 +517,6 @@
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
@@ -559,7 +551,6 @@
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_6(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
@@ -596,7 +587,6 @@
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_7(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    call void @escape_writeonly(i32* [[A]])
@@ -652,6 +642,31 @@
   ret i32 %res
 }
 
+define i32 @test_not_captured_before_load_may_alias_same_bb_but_read(i32** %in.ptr, i32* %b, i1 %c) {
+; CHECK-LABEL: @test_not_captured_before_load_may_alias_same_bb_but_read(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
+; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
+; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
+; CHECK-NEXT:    [[PTR:%.*]] = select i1 [[C:%.*]], i32* [[A]], i32* [[B:%.*]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[PTR]], align 4
+; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
+; CHECK-NEXT:    call void @escape_and_clobber(i32* [[A]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[IN_LV_2]], [[LV]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %a = alloca i32, align 4
+  store i32 55, i32* %a
+  %in.lv.1 = load i32* , i32** %in.ptr, align 2
+  %in.lv.2 = load i32 , i32* %in.lv.1, align 2
+  %ptr = select i1 %c, i32* %a, i32* %b
+  %lv = load i32, i32* %ptr
+  store i32 99, i32* %a, align 4
+  call void @escape_and_clobber(i32* %a)
+  %res = add i32 %in.lv.2, %lv
+  ret i32 %res
+}
+
 define i32 @test_captured_after_loop(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_captured_after_loop(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
@@ -719,9 +734,6 @@
 define void @test_memset_not_captured_before_load() {
 ; CHECK-LABEL: @test_memset_not_captured_before_load(
 ; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 4
-; CHECK-NEXT:    [[CAST_A:%.*]] = bitcast [2 x i32]* [[A]] to i8*
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[CAST_A]], i32 4
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 1 [[TMP1]], i8 0, i32 4, i1 false)
 ; CHECK-NEXT:    [[LV_1:%.*]] = load [10 x i16]*, [10 x i16]** @global, align 8
 ; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i32 0, i32 0
 ; CHECK-NEXT:    store i32 1, i32* [[GEP_A_0]], align 4
@@ -753,7 +765,8 @@
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 4
 ; CHECK-NEXT:    [[CAST_A:%.*]] = bitcast [2 x i32]* [[A]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* [[CAST_A]], i8 0, i32 8, i1 false)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CAST_A]], i32 4
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 1 [[TMP0]], i8 0, i32 4, i1 false)
 ; CHECK-NEXT:    [[LV_1:%.*]] = load [10 x i16]*, [10 x i16]** @global, align 8
 ; CHECK-NEXT:    [[GEP_LV:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* [[LV_1]], i64 0, i32 1
 ; CHECK-NEXT:    [[LV_2:%.*]] = load i16, i16* [[GEP_LV]], align 2