diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h
--- a/llvm/include/llvm/Analysis/CaptureTracking.h
+++ b/llvm/include/llvm/Analysis/CaptureTracking.h
@@ -23,6 +23,7 @@
 class Instruction;
 class DominatorTree;
 class LoopInfo;
+class Function;
 
 /// getDefaultMaxUsesToExploreForCaptureTracking - Return default value of
 /// the maximal number of uses to explore before giving up. It is used by
@@ -63,6 +64,19 @@
                                 unsigned MaxUsesToExplore = 0,
                                 const LoopInfo *LI = nullptr);
 
+// Returns the 'earliest' instruction that captures \p V in \p F. An
+// instruction A is considered earlier than instruction B, if A dominates B.
+// If 2 escapes do not dominate each other, the terminator of the common
+// dominator is chosen. If not all uses can be analyzed, the earliest escape
+// is set to the first instruction in the function entry block. If \p V does
+// not escape, nullptr is returned. Note that the caller of the function has
+// to ensure that the instruction the result value is compared against is
+// not in a cycle.
+Instruction *FindEarliestCapture(const Value *V, Function &F,
+                                 bool ReturnCaptures, bool StoreCaptures,
+                                 const DominatorTree &DT,
+                                 unsigned MaxUsesToExplore = 0);
+
 /// This callback is used in conjunction with PointerMayBeCaptured. In
 /// addition to the interface here, you'll need to provide your own getters
 /// to see whether anything was captured.
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -143,6 +143,66 @@
     const LoopInfo *LI;
   };
+
+  /// Find the 'earliest' instruction before which the pointer is known not to
+  /// be captured. Here an instruction A is considered earlier than instruction
+  /// B, if A dominates B. If 2 escapes do not dominate each other, the
+  /// terminator of the common dominator is chosen. If not all uses can be
+  /// analyzed, the earliest escape is set to the first instruction in the
+  /// function entry block.
+  // NOTE: Users have to make sure instructions compared against the earliest
+  // escape are not in a cycle.
+  struct EarliestCaptures : public CaptureTracker {
+
+    EarliestCaptures(bool ReturnCaptures, Function &F, const DominatorTree &DT)
+        : DT(DT), ReturnCaptures(ReturnCaptures), Captured(false), F(F) {}
+
+    void tooManyUses() override {
+      Captured = true;
+      EarliestCapture = &*F.getEntryBlock().begin();
+    }
+
+    bool captured(const Use *U) override {
+      Instruction *I = cast<Instruction>(U->getUser());
+      if (isa<ReturnInst>(I) && !ReturnCaptures)
+        return false;
+
+      if (!EarliestCapture) {
+        EarliestCapture = I;
+      } else if (EarliestCapture->getParent() == I->getParent()) {
+        if (I->comesBefore(EarliestCapture))
+          EarliestCapture = I;
+      } else {
+        BasicBlock *CurrentBB = I->getParent();
+        BasicBlock *EarliestBB = EarliestCapture->getParent();
+        if (DT.dominates(EarliestBB, CurrentBB)) {
+          // EarliestCapture already comes before the current use.
+        } else if (DT.dominates(CurrentBB, EarliestBB)) {
+          EarliestCapture = I;
+        } else {
+          // Otherwise find the nearest common dominator and use its terminator.
+          auto *NearestCommonDom =
+              DT.findNearestCommonDominator(CurrentBB, EarliestBB);
+          EarliestCapture = NearestCommonDom->getTerminator();
+        }
+      }
+      Captured = true;
+
+      // Return false to continue analysis; we need to see all potential
+      // captures.
+      return false;
+    }
+
+    Instruction *EarliestCapture = nullptr;
+
+    const DominatorTree &DT;
+
+    bool ReturnCaptures;
+
+    bool Captured;
+
+    Function &F;
+  };
 }
 
 /// PointerMayBeCaptured - Return true if this pointer value may be captured
@@ -206,6 +266,22 @@
   return CB.Captured;
 }
 
+Instruction *llvm::FindEarliestCapture(const Value *V, Function &F,
+                                       bool ReturnCaptures, bool StoreCaptures,
+                                       const DominatorTree &DT,
+                                       unsigned MaxUsesToExplore) {
+  assert(!isa<GlobalValue>(V) &&
+         "It doesn't make sense to ask whether a global is captured.");
+
+  EarliestCaptures CB(ReturnCaptures, F, DT);
+  PointerMayBeCaptured(V, &CB, MaxUsesToExplore);
+  if (CB.Captured)
+    ++NumCapturedBefore;
+  else
+    ++NumNotCapturedBefore;
+  return CB.EarliestCapture;
+}
+
 void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker,
                                 unsigned MaxUsesToExplore) {
   assert(V->getType()->isPointerTy() && "Capture is for pointers only!");
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -38,6 +38,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -897,6 +898,9 @@
   /// basic block.
   DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
 
+  DenseMap<const Value *, Instruction *> EarliestEscapes;
+  DenseMap<Instruction *, TinyPtrVector<const Value *>> Inst2Obj;
+
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
            const LoopInfo &LI)
@@ -1264,6 +1268,30 @@
            DepWriteOffset) == OW_Complete;
   }
 
+  /// Returns true if \p Object is not captured before or by \p I.
+  bool notCapturedBeforeOrAt(const Value *Object, Instruction *I) {
+    if (!isIdentifiedFunctionLocal(Object))
+      return false;
+
+    auto Iter = EarliestEscapes.insert({Object, nullptr});
+    if (Iter.second) {
+      Instruction *EarliestCapture = FindEarliestCapture(
+          Object, F, /*ReturnCaptures=*/false, /*StoreCaptures=*/true, DT);
+      if (EarliestCapture) {
+        auto Ins = Inst2Obj.insert({EarliestCapture, {}});
+        Ins.first->second.push_back(Object);
+      }
+      Iter.first->second = EarliestCapture;
+    }
+
+    // No capturing instruction.
+    if (!Iter.first->second)
+      return true;
+
+    return I != Iter.first->second &&
+           !isPotentiallyReachable(Iter.first->second, I, nullptr, &DT, &LI);
+  }
+
   // Returns true if \p Use may read from \p DefLoc.
   bool isReadClobber(const MemoryLocation &DefLoc, Instruction *UseInst) {
     if (isNoopIntrinsic(UseInst))
@@ -1281,6 +1309,25 @@
       if (CB->onlyAccessesInaccessibleMemory())
         return false;
 
+    // BasicAA does not spend linear time to check whether local objects escape
+    // before potentially aliasing accesses. To improve DSE results, compute and
+    // cache escape info for local objects in certain circumstances.
+    if (auto *LI = dyn_cast<LoadInst>(UseInst)) {
+      // If the load reads from a pointer whose underlying object was itself
+      // loaded from memory, the load cannot alias DefLoc if DefUO is a local
+      // object that has not escaped before the load.
+      auto *ReadUO = getUnderlyingObject(LI->getPointerOperand());
+      auto *DefUO = getUnderlyingObject(DefLoc.Ptr);
+      if (DefUO && ReadUO && isa<LoadInst>(ReadUO) &&
+          notCapturedBeforeOrAt(DefUO, UseInst)) {
+        assert(
+            !PointerMayBeCapturedBefore(DefLoc.Ptr, false, true, UseInst, &DT,
+                                        false, 0, &this->LI) &&
+            "cached analysis disagrees with fresh PointerMayBeCapturedBefore");
+        return false;
+      }
+    }
+
     // NOTE: For calls, the number of stores removed could be slightly improved
     // by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that showed to
     // be expensive compared to the benefits in practice. For now, avoid more
@@ -1706,7 +1753,17 @@
     if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
       if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
         SkipStores.insert(MD);
+
+        // Clear any cached escape info for objects associated with the
+        // removed instructions.
+        auto Iter = Inst2Obj.find(DeadInst);
+        if (Iter != Inst2Obj.end()) {
+          for (const Value *Obj : Iter->second)
+            EarliestEscapes.erase(Obj);
+          Inst2Obj.erase(DeadInst);
+        }
       }
+
       Updater.removeMemoryAccess(MA);
diff --git a/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll b/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll
--- a/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/captures-before-load.ll
@@ -8,7 +8,6 @@
 define i32 @test_not_captured_before_load_same_bb(i32** %in.ptr) {
 ; CHECK-LABEL: @test_not_captured_before_load_same_bb(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -27,7 +26,6 @@
 define i32 @test_not_captured_before_load_same_bb_escape_unreachable_block(i32** %in.ptr) {
 ; CHECK-LABEL: @test_not_captured_before_load_same_bb_escape_unreachable_block(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -74,7 +72,6 @@
 define i32 @test_captured_after_load_same_bb_2_clobbered_later(i32** %in.ptr) {
 ; CHECK-LABEL: @test_captured_after_load_same_bb_2_clobbered_later(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    call void @escape_writeonly(i32* [[A]])
@@ -138,8 +135,8 @@
 ; CHECK-LABEL: @test_captured_before_load_same_bb_2(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
+; CHECK-NEXT:    call void @escape_writeonly(i32* [[A]])
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
-; CHECK-NEXT:    call void @escape_and_clobber(i32* [[A]])
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
 ; CHECK-NEXT:    call void @clobber()
@@ -147,8 +144,8 @@
 ;
   %a = alloca i32, align 4
   store i32 55, i32* %a
+  call void @escape_writeonly(i32* %a)
   %in.lv.1 = load i32* , i32** %in.ptr, align 2
-  call void @escape_and_clobber(i32* %a)
   %in.lv.2 = load i32 , i32* %in.lv.1, align 2
   store i32 99, i32* %a, align 4
   call void @clobber()
@@ -200,7 +197,6 @@
 define i32 @test_captured_sibling_path_to_load_other_blocks_1(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_captured_sibling_path_to_load_other_blocks_1(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    call void @escape_writeonly(i32* [[A]])
@@ -238,7 +234,6 @@
 define i32 @test_only_captured_sibling_path_with_ret_to_load_other_blocks(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_only_captured_sibling_path_with_ret_to_load_other_blocks(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    call void @escape_writeonly(i32* [[A]])
@@ -417,7 +412,6 @@
 define i32 @test_not_captured_before_load_other_blocks_1(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_1(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -451,7 +445,6 @@
 define i32 @test_not_captured_before_load_other_blocks_2(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_2(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -487,7 +480,6 @@
 define i32 @test_not_captured_before_load_other_blocks_3(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_3(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -521,7 +513,6 @@
 define i32 @test_not_captured_before_load_other_blocks_4(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_4(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
@@ -560,7 +551,6 @@
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
@@ -632,7 +622,6 @@
 ; CHECK-LABEL: @test_not_captured_before_load_other_blocks_7(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    call void @escape_writeonly(i32* [[A]])
@@ -716,7 +705,6 @@
 define i32 @test_captured_after_loop(i32** %in.ptr, i1 %c.1) {
 ; CHECK-LABEL: @test_captured_after_loop(
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
@@ -780,9 +768,6 @@
 define void @test_memset_not_captured_before_load() {
 ; CHECK-LABEL: @test_memset_not_captured_before_load(
 ; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 4
-; CHECK-NEXT:    [[CAST_A:%.*]] = bitcast [2 x i32]* [[A]] to i8*
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[CAST_A]], i32 4
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 1 [[TMP1]], i8 0, i32 4, i1 false)
 ; CHECK-NEXT:    [[LV_1:%.*]] = load [10 x i16]*, [10 x i16]** @global, align 8
 ; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i32 0, i32 0
 ; CHECK-NEXT:    store i32 1, i32* [[GEP_A_0]], align 4
@@ -814,7 +799,8 @@
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 4
 ; CHECK-NEXT:    [[CAST_A:%.*]] = bitcast [2 x i32]* [[A]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* [[CAST_A]], i8 0, i32 8, i1 false)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CAST_A]], i32 4
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 1 [[TMP0]], i8 0, i32 4, i1 false)
 ; CHECK-NEXT:    [[LV_1:%.*]] = load [10 x i16]*, [10 x i16]** @global, align 8
 ; CHECK-NEXT:    [[GEP_LV:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* [[LV_1]], i64 0, i32 1
 ; CHECK-NEXT:    [[LV_2:%.*]] = load i16, i16* [[GEP_LV]], align 2
@@ -1038,7 +1024,6 @@
 define i32 @test_not_captured_before_load_same_bb_noalias_call(i32** %in.ptr) {
 ; CHECK-LABEL: @test_not_captured_before_load_same_bb_noalias_call(
 ; CHECK-NEXT:    [[A:%.*]] = call i32* @alloc()
-; CHECK-NEXT:    store i32 55, i32* [[A]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
 ; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
@@ -1056,10 +1041,9 @@
 define i32 @test_not_captured_before_load_same_bb_noalias_arg(i32** %in.ptr, i32* noalias %a) {
 ; CHECK-LABEL: @test_not_captured_before_load_same_bb_noalias_arg(
-; CHECK-NEXT:    store i32 55, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[IN_LV_1:%.*]] = load i32*, i32** [[IN_PTR:%.*]], align 2
 ; CHECK-NEXT:    [[IN_LV_2:%.*]] = load i32, i32* [[IN_LV_1]], align 2
-; CHECK-NEXT:    store i32 99, i32* [[A]], align 4
+; CHECK-NEXT:    store i32 99, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    call void @escape_and_clobber(i32* [[A]])
 ; CHECK-NEXT:    ret i32 [[IN_LV_2]]
 ;
@@ -1070,3 +1054,42 @@
   call void @escape_and_clobber(i32* %a)
   ret i32 %in.lv.2
 }
+
+define i32 @instruction_captures_multiple_objects(i32* %p.1, i32** %p.2, i32** %p.3, i1 %c) {
+; CHECK-LABEL: @instruction_captures_multiple_objects(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_1:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A_2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, i32* [[P_1:%.*]], align 8
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[LV_2:%.*]] = load i32*, i32** [[P_2:%.*]], align 8
+; CHECK-NEXT:    [[LV_2_2:%.*]] = load i32, i32* [[LV_2]], align 4
+; CHECK-NEXT:    ret i32 [[LV_2_2]]
+; CHECK:       else:
+; CHECK-NEXT:    [[LV_3:%.*]] = load i32*, i32** [[P_3:%.*]], align 8
+; CHECK-NEXT:    [[LV_3_2:%.*]] = load i32, i32* [[LV_3]], align 4
+; CHECK-NEXT:    call void @capture_and_clobber_multiple(i32* [[A_1]], i32* [[A_2]])
+; CHECK-NEXT:    ret i32 [[LV_3_2]]
+;
+entry:
+  %a.1 = alloca i32
+  %a.2 = alloca i32
+  store i32 0, i32* %p.1, align 8
+  br i1 %c, label %then, label %else
+
+then:
+  store i32 99, i32* %a.2, align 4
+  %lv.2 = load i32*, i32** %p.2
+  %lv.2.2 = load i32, i32* %lv.2
+  store i32 0, i32* %a.1, align 8
+  ret i32 %lv.2.2
+
+else:
+  %lv.3 = load i32*, i32** %p.3
+  %lv.3.2 = load i32, i32* %lv.3
+  call void @capture_and_clobber_multiple(i32* %a.1, i32* %a.2)
+  ret i32 %lv.3.2
+}
+
+declare void @capture_and_clobber_multiple(i32*, i32*)
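
Usage note: below is a minimal sketch, not part of the patch, of how a client could query the new FindEarliestCapture API. It mirrors DSEState::notCapturedBeforeOrAt from the diff above, but without the EarliestEscapes/Inst2Obj caching; the helper name isNotCapturedBefore and its standalone form are illustrative assumptions. DSE caches the per-object result because FindEarliestCapture walks all (transitive) uses of the object on every call.

// Illustrative sketch only; assumes the FindEarliestCapture signature added
// by this patch.
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Returns true if Object (an identified function-local object, e.g. an
// alloca) is not captured before instruction I executes.
static bool isNotCapturedBefore(const Value *Object, Instruction *I,
                                Function &F, const DominatorTree &DT,
                                const LoopInfo &LI) {
  if (!isIdentifiedFunctionLocal(Object))
    return false;

  // Earliest instruction that captures Object, or nullptr if it never
  // escapes. By construction it dominates (or is) every other capture.
  Instruction *EarliestCapture = FindEarliestCapture(
      Object, F, /*ReturnCaptures=*/false, /*StoreCaptures=*/true, DT);
  if (!EarliestCapture)
    return true;

  // Object is only captured at or after I if the capturing instruction is I
  // itself or can reach I along some CFG path.
  return I != EarliestCapture &&
         !isPotentiallyReachable(EarliestCapture, I, nullptr, &DT, &LI);
}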