diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -87,6 +87,8 @@ STATISTIC(NumCFGChecks, "Number of stores modified"); STATISTIC(NumCFGTries, "Number of stores modified"); STATISTIC(NumCFGSuccess, "Number of stores modified"); +STATISTIC(NumGetDomMemoryDefPassed, + "Number of times a valid candidate is returned from getDomMemoryDef"); STATISTIC(NumDomMemDefChecks, "Number iterations check for reads in getDomMemoryDef"); @@ -1792,7 +1794,8 @@ Optional getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current, MemoryLocation DefLoc, const Value *DefUO, CheckCache &Cache, - unsigned &ScanLimit, unsigned &WalkerStepLimit) { + unsigned &ScanLimit, unsigned &WalkerStepLimit, + bool IsMemTerm, unsigned &PartialLimit) { if (ScanLimit == 0 || WalkerStepLimit == 0) { LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n"); return None; @@ -1800,9 +1803,11 @@ MemoryAccess *DomAccess; MemoryAccess *StartAccess = Current; + Instruction *KillingI = KillingDef->getMemoryInst(); bool StepAgain; LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current << "\n"); + // Find the next clobbering Mod access for DefLoc, starting at Current. do { StepAgain = false; @@ -1829,13 +1834,86 @@ if (isa(DomAccess)) break; - // Check if we can skip DomDef for DSE. - MemoryDef *DomDef = dyn_cast(DomAccess); - if (DomDef && canSkipDef(DomDef, !isInvisibleToCallerBeforeRet(DefUO))) { + // Below, check if DomDef is a valid candidate to be eliminated by + // KillingDef. If it is not, check the next candidate. + MemoryDef *DomDef = cast(DomAccess); + Instruction *DomI = DomDef->getMemoryInst(); + + // Before we try to remove anything, check for any extra throwing + // instructions that block us from DSEing + if (mayThrowBetween(KillingI, DomI, DefUO)) { + LLVM_DEBUG(dbgs() << " ... skip, may throw!\n"); + return None; + } + + // Check for anything that looks like it will be a barrier to further + // removal + if (isDSEBarrier(DefUO, KillingI)) { + LLVM_DEBUG(dbgs() << " ... skip, barrier\n"); + return None; + } + + if (canSkipDef(DomDef, !isInvisibleToCallerBeforeRet(DefUO))) { StepAgain = true; Current = DomDef->getDefiningAccess(); + continue; } + // If DomAccess is known to be on path that reads DefLoc or is a read + // clobber, bail out, as the path is not profitable. We skip this check + // for intrinsic calls, because the code knows how to handle memcpy + // intrinsics. + if (!isa(DomI) && (Cache.KnownReads.contains(DomAccess) || + isReadClobber(DefLoc, DomI))) { + Cache.KnownReads.insert(DomAccess); + return None; + } + + // If DomAccess cannot be analyzed or is not removable, check the next + // candidate. + if (!hasAnalyzableMemoryWrite(DomI, TLI) || !isRemovable(DomI)) { + StepAgain = true; + Current = DomDef->getDefiningAccess(); + continue; + } + + auto DomLoc = getLocForWriteEx(DomI); + if (!DomLoc) + break; + + if (IsMemTerm) { + // If the killing def is a memory terminator (e.g. lifetime.end), check + // the next candidate if the current DomAccess does not write the same + // underlying object as the terminator. + const Value *NIUnd = getUnderlyingObject(DomLoc->Ptr); + if (DefUO != NIUnd) { + StepAgain = true; + Current = DomDef->getDefiningAccess(); + } + continue; + } else { + int64_t InstWriteOffset, DepWriteOffset; + auto OR = isOverwrite(DefLoc, *DomLoc, DL, TLI, DepWriteOffset, + InstWriteOffset, BatchAA, &F); + // If DomAccess does not write to the same object as KillingDef, check + // the next candidate. + if (OR == OW_Unknown) { + StepAgain = true; + Current = DomDef->getDefiningAccess(); + } else if (OR == OW_MaybePartial) { + // If KillingDef only partially overwrites DomAccess, check the next + // candidate if the partial step limit is exceeded. This aggressively + // limits the number of candidates for partial store elimination, + // which are less likely to be removable in the end. + if (PartialLimit <= 1) { + StepAgain = true; + Current = DomDef->getDefiningAccess(); + WalkerStepLimit -= 1; + continue; + } + PartialLimit -= 1; + } + } } while (StepAgain); // Accesses to objects accessible after the function returns can only be @@ -1843,7 +1921,7 @@ // the blocks with killing (=completely overwriting MemoryDefs) and check if // they cover all paths from DomAccess to any function exit. SmallPtrSet KillingDefs; - KillingDefs.insert(KillingDef->getMemoryInst()); + KillingDefs.insert(KillingI); Instruction *DomMemInst = isa(DomAccess) ? cast(DomAccess)->getMemoryInst() : nullptr; @@ -2257,7 +2335,11 @@ SetVector ToCheck; ToCheck.insert(KillingDef->getDefiningAccess()); + if (!SILocUnd) + continue; unsigned WalkerStepLimit = 50; + unsigned PartialLimit = 5; + bool IsMemTerm = State.isMemTerminatorInst(SI); DSEState::CheckCache Cache; // Check if MemoryAccesses in the worklist are killed by KillingDef. for (unsigned I = 0; I < ToCheck.size(); I++) { @@ -2265,9 +2347,9 @@ if (State.SkipStores.count(Current)) continue; - Optional Next = - State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, Cache, - ScanLimit, WalkerStepLimit); + Optional Next = State.getDomMemoryDef( + KillingDef, Current, SILoc, SILocUnd, Cache, ScanLimit, + WalkerStepLimit, IsMemTerm, PartialLimit); if (!Next) { LLVM_DEBUG(dbgs() << " finished walk\n"); @@ -2295,41 +2377,17 @@ MemoryDef *NextDef = dyn_cast(DomAccess); Instruction *NI = NextDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " (" << *NI << ")\n"); - - // Before we try to remove anything, check for any extra throwing - // instructions that block us from DSEing - if (State.mayThrowBetween(SI, NI, SILocUnd)) { - LLVM_DEBUG(dbgs() << " ... skip, may throw!\n"); - break; - } - - // Check for anything that looks like it will be a barrier to further - // removal - if (State.isDSEBarrier(SILocUnd, NI)) { - LLVM_DEBUG(dbgs() << " ... skip, barrier\n"); - continue; - } - ToCheck.insert(NextDef->getDefiningAccess()); - - if (!hasAnalyzableMemoryWrite(NI, TLI)) { - LLVM_DEBUG(dbgs() << " ... skip, cannot analyze def\n"); - continue; - } - - if (!isRemovable(NI)) { - LLVM_DEBUG(dbgs() << " ... skip, cannot remove def\n"); - continue; - } + NumGetDomMemoryDefPassed++; if (!DebugCounter::shouldExecute(MemorySSACounter)) continue; MemoryLocation NILoc = *State.getLocForWriteEx(NI); - if (State.isMemTerminatorInst(SI)) { + if (IsMemTerm) { const Value *NIUnd = getUnderlyingObject(NILoc.Ptr); - if (!SILocUnd || SILocUnd != NIUnd) + if (SILocUnd != NIUnd) continue; LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI << "\n KILLER: " << *SI << '\n'); diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll @@ -209,10 +209,15 @@ declare void @goFunc(%struct.foostruct*) declare i32 @fa(i8*, i8**, i32, i8, i8*) +; We miss this case, because of aggressive an aggressive limit of partial +; overlap analysis. define void @test4() { ; CHECK-LABEL: @test4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8 +; CHECK-NEXT: [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false) ; CHECK-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0 ; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8 ; CHECK-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1 diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; XFAIL: * - ; REQUIRES: asserts ; Eliminates store to %R in the entry block. diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; XFAIL: * - ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll @@ -7,6 +7,8 @@ declare void @may_throw() declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) +; We miss this case, because of aggressive an aggressive limit of partial +; overlap analysis. define void @overlap1(%struct.ham* %arg, i1 %cond) { ; CHECK-LABEL: @overlap1( ; CHECK-NEXT: bb: @@ -16,6 +18,9 @@ ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false) ; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]] ; CHECK: bb7: ; CHECK-NEXT: br label [[BB9:%.*]] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll @@ -483,9 +483,8 @@ ; it could unwind define void @test34(i32* noalias %p) { ; CHECK-LABEL: @test34( -; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 ; CHECK-NEXT: call void @unknown_func() -; CHECK-NEXT: store i32 0, i32* [[P]], align 4 +; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; store i32 1, i32* %p @@ -642,8 +641,8 @@ define void @test41(i32* noalias %P) { ; CHECK-LABEL: @test41( ; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8* -; CHECK-NEXT: store i32 1, i32* [[P]], align 4 ; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: store i32 2, i32* [[P]], align 4 ; CHECK-NEXT: call void @free(i8* [[P2]]) ; CHECK-NEXT: ret void ;