diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h
--- a/llvm/include/llvm/Analysis/CaptureTracking.h
+++ b/llvm/include/llvm/Analysis/CaptureTracking.h
@@ -23,6 +23,7 @@
 class Instruction;
 class DominatorTree;
 class LoopInfo;
+class Function;
 
 /// getDefaultMaxUsesToExploreForCaptureTracking - Return default value of
 /// the maximal number of uses to explore before giving up. It is used by
@@ -61,6 +62,19 @@
                                   const Instruction *I, const DominatorTree *DT,
                                   bool IncludeI = false, unsigned MaxUsesToExplore = 0,
                                   const LoopInfo *LI = nullptr);
+  // Returns the 'earliest' instruction that captures \p V in \p F. An
+  // instruction A is considered earlier than instruction B, if A dominates B.
+  // If two escapes do not dominate each other, the terminator of the common
+  // dominator is chosen. If not all uses can be analyzed, the earliest escape
+  // is set to the first instruction in the function entry block. If \p V does
+  // not escape, nullptr is returned. Note that the caller of the function has
+  // to ensure that the instruction the result value is compared against is
+  // not in a cycle.
+  Instruction *FindEarliestCapture(const Value *V, Function &F,
+                                   bool ReturnCaptures, bool StoreCaptures,
+                                   const DominatorTree &DT,
+                                   unsigned MaxUsesToExplore = 0);
+
 /// This callback is used in conjunction with PointerMayBeCaptured. In
 /// addition to the interface here, you'll need to provide your own getters
 /// to see whether anything was captured.
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -143,6 +143,88 @@
     const LoopInfo *LI;
   };
 
+
+  /// Find the 'earliest' instruction before which the pointer is known not to
+  /// be captured. Here an instruction A is considered earlier than instruction
+  /// B, if A dominates B. If two escapes do not dominate each other, the
+  /// terminator of the common dominator is chosen. If not all uses can be
+  /// analyzed, the earliest escape is set to the first instruction in the
+  /// function entry block.
+  // NOTE: Users have to make sure instructions compared against the earliest
+  // escape are not in a cycle.
+  struct EarliestCaptures : public CaptureTracker {
+
+    EarliestCaptures(bool ReturnCaptures, Function &F, const DominatorTree &DT)
+        : DT(DT), ReturnCaptures(ReturnCaptures), Captured(false), F(F) {}
+
+    void tooManyUses() override {
+      Captured = true;
+      EarliestCapture = &*F.getEntryBlock().begin();
+    }
+
+    bool isSafeToPrune(Instruction *I) {
+      if (!EarliestCapture)
+        return false;
+
+      if (EarliestCapture == I)
+        return true;
+
+      // We explore this use only if it may reach "EarliestCapture". If the
+      // use is not reachable from the entry block, there is no need to explore.
+      if (!DT.isReachableFromEntry(I->getParent()))
+        return true;
+
+      // Check whether there is a path from I to EarliestCapture.
+      return !isPotentiallyReachable(I, EarliestCapture, nullptr, &DT);
+    }
+
+    bool captured(const Use *U) override {
+      Instruction *I = cast<Instruction>(U->getUser());
+      if (isa<ReturnInst>(I) && !ReturnCaptures)
+        return false;
+
+      // Check isSafeToPrune() here rather than in shouldExplore() to avoid
+      // an expensive reachability query for every instruction we look at.
+      // Instead we only do one for actual capturing candidates.
+      if (isSafeToPrune(I))
+        return false;
+
+      if (!EarliestCapture) {
+        EarliestCapture = I;
+      } else if (EarliestCapture->getParent() == I->getParent()) {
+        if (I->comesBefore(EarliestCapture))
+          EarliestCapture = I;
+      } else {
+        BasicBlock *CurrentBB = I->getParent();
+        BasicBlock *EarliestBB = EarliestCapture->getParent();
+        if (DT.dominates(EarliestBB, CurrentBB)) {
+          // EarliestCapture already comes before the current use.
+        } else if (DT.dominates(CurrentBB, EarliestBB)) {
+          EarliestCapture = I;
+        } else {
+          // Otherwise find the nearest common dominator and use its terminator.
+          auto *NearestCommonDom =
+              DT.findNearestCommonDominator(CurrentBB, EarliestBB);
+          EarliestCapture = NearestCommonDom->getTerminator();
+        }
+      }
+      Captured = true;
+
+      // Return false to continue analysis; we need to see all potential
+      // captures.
+      return false;
+    }
+
+    Instruction *EarliestCapture = nullptr;
+
+    const DominatorTree &DT;
+
+    bool ReturnCaptures;
+
+    bool Captured;
+
+    Function &F;
+  };
 }
 
 /// PointerMayBeCaptured - Return true if this pointer value may be captured
@@ -205,6 +287,22 @@
   return CB.Captured;
 }
 
+Instruction *llvm::FindEarliestCapture(const Value *V, Function &F,
+                                       bool ReturnCaptures, bool StoreCaptures,
+                                       const DominatorTree &DT,
+                                       unsigned MaxUsesToExplore) {
+  assert(!isa<GlobalValue>(V) &&
+         "It doesn't make sense to ask whether a global is captured.");
+
+  EarliestCaptures CB(ReturnCaptures, F, DT);
+  PointerMayBeCaptured(V, &CB, MaxUsesToExplore);
+  if (CB.Captured)
+    ++NumCapturedBefore;
+  else
+    ++NumNotCapturedBefore;
+  return CB.EarliestCapture;
+}
+
 void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker,
                                 unsigned MaxUsesToExplore) {
   assert(V->getType()->isPointerTy() && "Capture is for pointers only!");
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -38,6 +38,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -902,6 +903,9 @@
   /// basic block.
   DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
 
+  DenseMap<const Value *, Instruction *> EarliestEscapes;
+  DenseMap<Instruction *, TinyPtrVector<const Value *>> Inst2Obj;
+
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
            const LoopInfo &LI)
@@ -1263,6 +1267,26 @@
                                    InstWriteOffset) == OW_Complete;
   }
 
+  /// Returns true if \p Object is not captured before or by \p I.
+  bool notCapturedBefore(const Value *Object, Instruction *I) {
+    if (!isa<AllocaInst>(Object))
+      return false;
+
+    auto Iter = EarliestEscapes.insert({Object, nullptr});
+    if (Iter.second) {
+      Iter.first->second = FindEarliestCapture(Object, F, false, true, DT);
+      auto Ins = Inst2Obj.insert({Iter.first->second, {}});
+      Ins.first->second.push_back(Object);
+    }
+
+    // No capturing instruction.
+    if (!Iter.first->second)
+      return true;
+
+    return DT.dominates(I, Iter.first->second) && I != Iter.first->second &&
+           !isPotentiallyReachable(I->getNextNode(), I, nullptr, &DT, &LI);
+  }
+
   // Returns true if \p Use may read from \p DefLoc.
   bool isReadClobber(const MemoryLocation &DefLoc, Instruction *UseInst) {
     if (isNoopIntrinsic(UseInst))
       return false;
@@ -1280,6 +1304,25 @@
       if (CB->onlyAccessesInaccessibleMemory())
         return false;
 
+    // BasicAA does not spend linear time to check whether local objects escape
+    // before potentially aliasing accesses. To improve DSE results, compute
+    // and cache escape info for local objects in certain circumstances.
+    if (auto *LI = dyn_cast<LoadInst>(UseInst)) {
+      // If the load reads through a pointer that was itself loaded (ReadUO),
+      // the load cannot alias DefLoc, provided DefUO is a local object that
+      // has not escaped before the load.
+      auto *ReadUO = getUnderlyingObject(LI->getPointerOperand());
+      auto *DefUO = getUnderlyingObject(DefLoc.Ptr);
+      if (DefUO && isa<AllocaInst>(DefUO) && ReadUO && isa<LoadInst>(ReadUO) &&
+          notCapturedBefore(DefUO, UseInst)) {
+        assert(
+            !PointerMayBeCapturedBefore(DefLoc.Ptr, false, true, UseInst, &DT,
+                                        false, 0, &this->LI) &&
+            "cached analysis disagrees with fresh PointerMayBeCapturedBefore");
+        return false;
+      }
+    }
+
     // NOTE: For calls, the number of stores removed could be slightly improved
     // by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that showed to
     // be expensive compared to the benefits in practice. For now, avoid more
@@ -1703,7 +1746,17 @@
     if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
       if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
         SkipStores.insert(MD);
+
+        // Clear any cached escape info for objects associated with the
+        // removed instruction.
+        auto Iter = Inst2Obj.find(DeadInst);
+        if (Iter != Inst2Obj.end()) {
+          for (const Value *Obj : Iter->second)
+            EarliestEscapes.erase(Obj);
+          Inst2Obj.erase(DeadInst);
+        }
       }
+
       Updater.removeMemoryAccess(MA);
     }
diff --git a/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll b/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll
--- a/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll
@@ -8,7 +8,6 @@
 define i32 @test_not_captured_before_load_same_bb(i32** %in.ptr) {
 ; CHECK-LABEL: @test_not_captured_before_load_same_bb(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -27,7 +26,6 @@
 define i32 @test_not_captured_before_load_same_bb_escape_unreachable_block(i32** %in.ptr) {
 ; CHECK-LABEL: @test_not_captured_before_load_same_bb_escape_unreachable_block(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -74,7 +72,6 @@
 define i32 @test_captured_after_load_same_bb_2_clobbered_later(i32** %in.ptr) {
 ; CHECK-LABEL: @test_captured_after_load_same_bb_2_clobbered_later(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    call void @escape_writeonly(i32* [[A]])
@@ -381,7 +378,6 @@
 define i32 @test_not_captured_before_load_other_blocks_1(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_1(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -415,7 +411,6 @@
 define i32 @test_not_captured_before_load_other_blocks_2(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_2(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -451,7 +446,6 @@
 define i32 @test_not_captured_before_load_other_blocks_3(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_3(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -485,7 +479,6 @@
 define i32 @test_not_captured_before_load_other_blocks_4(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_4(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
@@ -524,7 +517,6 @@
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
@@ -559,7 +551,6 @@
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_6(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
@@ -596,7 +587,6 @@
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_7(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    call void @escape_writeonly(i32* [[A]])
@@ -652,6 +642,31 @@
   ret i32 %res
 }
 
+define i32 @test_not_captured_before_load_may_alias_same_bb_but_read(i32** %in.ptr, i32* %b, i1 %c) {
+; CHECK-LABEL: @test_not_captured_before_load_may_alias_same_bb_but_read(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
+; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
+; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
+; CHECK-NEXT:    [[PTR:%.*]] = select i1 [[C:%.*]], i32* [[A]], i32* [[B:%.*]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[PTR]], align 4
+; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
+; CHECK-NEXT:    call void @escape_and_clobber(i32* [[A]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[IN_LV_2]], [[LV]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %a = alloca i32, align 4
+  store i32 55, i32* %a
+  %in.lv.1 = load i32* , i32** %in.ptr, align 2
+  %in.lv.2 = load i32 , i32* %in.lv.1, align 2
+  %ptr = select i1 %c, i32* %a, i32* %b
+  %lv = load i32, i32* %ptr
+  store i32 99, i32* %a, align 4
+  call void @escape_and_clobber(i32* %a)
+  %res = add i32 %in.lv.2, %lv
+  ret i32 %res
+}
+
 define i32 @test_captured_after_loop(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_captured_after_loop(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
@@ -719,9 +734,6 @@
 define void @test_memset_not_captured_before_load() {
 ; CHECK-LABEL: @test_memset_not_captured_before_load(
 ; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 4
-; CHECK-NEXT:    [[CAST_A:%.*]] = bitcast [2 x i32]* [[A]] to i8*
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[CAST_A]], i32 4
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 1 [[TMP1]], i8 0, i32 4, i1 false)
 ; CHECK-NEXT:    [[LV_1:%.*]] = load [10 x i16]*, [10 x i16]** @global, align 8
 ; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i32 0, i32 0
 ; CHECK-NEXT:    store i32 1, i32* [[GEP_A_0]], align 4
@@ -753,7 +765,8 @@
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 4
 ; CHECK-NEXT:    [[CAST_A:%.*]] = bitcast [2 x i32]* [[A]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* [[CAST_A]], i8 0, i32 8, i1 false)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CAST_A]], i32 4
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 1 [[TMP0]], i8 0, i32 4, i1 false)
 ; CHECK-NEXT:    [[LV_1:%.*]] = load [10 x i16]*, [10 x i16]** @global, align 8
 ; CHECK-NEXT:    [[GEP_LV:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* [[LV_1]], i64 0, i32 1
 ; CHECK-NEXT:    [[LV_2:%.*]] = load i16, i16* [[GEP_LV]], align 2
@@ -797,3 +810,50 @@
   call void @escape_and_clobber(i32* %gep.a.0)
   ret void
 }
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #0
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1
+
+declare void @use.i64(i64)
+
+define i64 @test_a_not_captured_at_all(i64** %ptr, i64** %ptr.2, i1 %c) {
+; CHECK-LABEL: @test_a_not_captured_at_all(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    [[B:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    store i64* [[B]], i64** [[PTR:%.*]], align 8
+; CHECK-NEXT:    [[LV_1:%.*]] = load i64*, i64** [[PTR_2:%.*]], align 8
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[EXIT:%.*]], label [[THEN:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[LV_2:%.*]] = load i64, i64* [[LV_1]], align 4
+; CHECK-NEXT:    call void @use.i64(i64 [[LV_2]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[A_CAST:%.*]] = bitcast i64* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 8, i8* [[A_CAST]])
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[A_CAST]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    [[L:%.*]] = load i64, i64* [[A]], align 4
+; CHECK-NEXT:    ret i64 [[L]]
+;
+entry:
+  %a = alloca i64, align 8
+  %b = alloca i64, align 8
+  store i64* %b, i64** %ptr, align 8
+  %lv.1 = load i64*, i64** %ptr.2, align 8
+  br i1 %c, label %exit, label %then
+
+then:
+  %lv.2 = load i64, i64* %lv.1
+  call void @use.i64(i64 %lv.2)
+  br label %exit
+
+exit:
+  %a.cast = bitcast i64* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %a.cast)
+  store i64 99, i64* %a
+  call void @clobber()
+  call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 0, i64 8, i1 false)
+  %l = load i64, i64* %a
+  ret i64 %l
+}
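
For reference, a minimal sketch of how a caller outside DSE might use the new FindEarliestCapture API. The helper name objectNotCapturedBefore and the surrounding boilerplate are illustrative only; the sketch mirrors the patch's notCapturedBefore but omits its EarliestEscapes/Inst2Obj caching and its isPotentiallyReachable cycle check, so the caller must ensure the queried instruction is not part of a cycle.

#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative helper: returns true if the local object 'Obj' is known not to
// be captured before instruction 'I' executes.
static bool objectNotCapturedBefore(const Value *Obj, Instruction *I,
                                    Function &F, const DominatorTree &DT) {
  // Only stack objects are interesting; anything else is conservatively
  // treated as potentially captured.
  if (!isa<AllocaInst>(Obj))
    return false;

  // Earliest instruction at which 'Obj' may be captured; nullptr means the
  // pointer never escapes the function.
  Instruction *EarliestCapture = FindEarliestCapture(
      Obj, F, /*ReturnCaptures=*/false, /*StoreCaptures=*/true, DT);
  if (!EarliestCapture)
    return true;

  // 'I' is only known to run before the capture if it strictly dominates it;
  // as noted in the header comment, 'I' must not be re-executed in a cycle.
  return I != EarliestCapture && DT.dominates(I, EarliestCapture);
}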