diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -87,6 +87,8 @@
 STATISTIC(NumCFGChecks, "Number of stores modified");
 STATISTIC(NumCFGTries, "Number of stores modified");
 STATISTIC(NumCFGSuccess, "Number of stores modified");
+STATISTIC(NumGetDomMemoryDefPassed,
+          "Number of times a valid candidate is returned from getDomMemoryDef");
 STATISTIC(NumDomMemDefChecks,
           "Number iterations check for reads in getDomMemoryDef");
@@ -116,6 +118,12 @@
     cl::desc("The maximum number of steps while walking upwards to find "
              "MemoryDefs that may be killed (default = 70)"));
 
+static cl::opt<unsigned> MemorySSAPartialStoreLimit(
+    "dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
+    cl::desc("The maximum number candidates that only partially overwrite the "
+             "killing MemoryDef to consider"
+             " (default = 5)"));
+
 static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
     "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
     cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
@@ -1464,12 +1472,12 @@
 // 2. Check that there are no reads between EarlierAccess and the StartDef by
 //    checking all uses starting at EarlierAccess and walking until we see
 //    StartDef.
-// 3. For each found EarlierDef, check that:
-//   1. There are no barrier instructions between EarlierDef and StartDef (like
+// 3. For each found CurrentDef, check that:
+//   1. There are no barrier instructions between CurrentDef and StartDef (like
 //      throws or stores with ordering constraints).
-//   2. StartDef is executed whenever EarlierDef is executed.
-//   3. StartDef completely overwrites EarlierDef.
-//   4. Erase EarlierDef from the function and MemorySSA.
+//   2. StartDef is executed whenever CurrentDef is executed.
+//   3. StartDef completely overwrites CurrentDef.
+//   4. Erase CurrentDef from the function and MemorySSA.
 
 // Returns true if \p M is an intrisnic that does not read or write memory.
 bool isNoopIntrinsic(MemoryUseOrDef *M) {
@@ -1801,26 +1809,29 @@
     return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
   }
 
-  // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no
-  // read access between them or on any other path to a function exit block if
-  // \p DefLoc is not accessible after the function returns. If there is no such
-  // MemoryDef, return None. The returned value may not (completely) overwrite
-  // \p DefLoc. Currently we bail out when we encounter an aliasing MemoryUse
-  // (read).
+  // Find a MemoryDef writing to \p DefLoc and dominating \p StartAccess, with
+  // no read access between them or on any other path to a function exit block
+  // if \p DefLoc is not accessible after the function returns. If there is no
+  // such MemoryDef, return None. The returned value may not (completely)
+  // overwrite \p DefLoc. Currently we bail out when we encounter an aliasing
+  // MemoryUse (read).
   Optional<MemoryAccess *>
-  getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
+  getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
                   MemoryLocation DefLoc, const Value *DefUO, CheckCache &Cache,
-                  unsigned &ScanLimit, unsigned &WalkerStepLimit) {
+                  unsigned &ScanLimit, unsigned &WalkerStepLimit,
+                  bool IsMemTerm, unsigned &PartialLimit) {
     if (ScanLimit == 0 || WalkerStepLimit == 0) {
       LLVM_DEBUG(dbgs() << "\n    ...  hit scan limit\n");
       return None;
     }
 
-    MemoryAccess *StartAccess = Current;
+    MemoryAccess *Current = StartAccess;
+    Instruction *KillingI = KillingDef->getMemoryInst();
     bool StepAgain;
-    LLVM_DEBUG(dbgs() << "  trying to get dominating access for " << *Current
-                      << "\n");
-    // Find the next clobbering Mod access for DefLoc, starting at Current.
+    LLVM_DEBUG(dbgs() << "  trying to get dominating access for "
+                      << *StartAccess << "\n");
+
+    // Find the next clobbering Mod access for DefLoc, starting at StartAccess.
     do {
       StepAgain = false;
       // Reached TOP.
@@ -1839,12 +1850,86 @@
       if (isa<MemoryPhi>(Current))
         break;
 
-      // Check if we can skip EarlierDef for DSE.
-      MemoryDef *CurrentDef = dyn_cast<MemoryDef>(Current);
-      if (CurrentDef &&
-          canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+      // Below, check if CurrentDef is a valid candidate to be eliminated by
+      // KillingDef. If it is not, check the next candidate.
+      MemoryDef *CurrentDef = cast<MemoryDef>(Current);
+      Instruction *CurrentI = CurrentDef->getMemoryInst();
+
+      if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+        StepAgain = true;
+        Current = CurrentDef->getDefiningAccess();
+        continue;
+      }
+
+      // Before we try to remove anything, check for any extra throwing
+      // instructions that block us from DSEing
+      if (mayThrowBetween(KillingI, CurrentI, DefUO)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
+        return None;
+      }
+
+      // Check for anything that looks like it will be a barrier to further
+      // removal
+      if (isDSEBarrier(DefUO, CurrentI)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
+        return None;
+      }
+
+      // If Current is known to be on path that reads DefLoc or is a read
+      // clobber, bail out, as the path is not profitable. We skip this check
+      // for intrinsic calls, because the code knows how to handle memcpy
+      // intrinsics.
+      if (!isa<IntrinsicInst>(CurrentI) &&
+          (Cache.KnownReads.contains(Current) ||
+           isReadClobber(DefLoc, CurrentI))) {
+        Cache.KnownReads.insert(Current);
+        return None;
+      }
+
+      // If Current cannot be analyzed or is not removable, check the next
+      // candidate.
+      if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) {
         StepAgain = true;
         Current = CurrentDef->getDefiningAccess();
+        continue;
+      }
+
+      auto CurrentLoc = getLocForWriteEx(CurrentI);
+      if (!CurrentLoc)
+        break;
+
+      if (IsMemTerm) {
+        // If the killing def is a memory terminator (e.g. lifetime.end), check
+        // the next candidate if the current Current does not write the same
+        // underlying object as the terminator.
+        const Value *NIUnd = getUnderlyingObject(CurrentLoc->Ptr);
+        if (DefUO != NIUnd) {
+          StepAgain = true;
+          Current = CurrentDef->getDefiningAccess();
+        }
+        continue;
+      } else {
+        int64_t InstWriteOffset, DepWriteOffset;
+        auto OR = isOverwrite(DefLoc, *CurrentLoc, DL, TLI, DepWriteOffset,
+                              InstWriteOffset, BatchAA, &F);
+        // If Current does not write to the same object as KillingDef, check
+        // the next candidate.
+        if (OR == OW_Unknown) {
+          StepAgain = true;
+          Current = CurrentDef->getDefiningAccess();
+        } else if (OR == OW_MaybePartial) {
+          // If KillingDef only partially overwrites Current, check the next
+          // candidate if the partial step limit is exceeded. This aggressively
+          // limits the number of candidates for partial store elimination,
+          // which are less likely to be removable in the end.
+          if (PartialLimit <= 1) {
+            StepAgain = true;
+            Current = CurrentDef->getDefiningAccess();
+            WalkerStepLimit -= 1;
+            continue;
+          }
+          PartialLimit -= 1;
+        }
       }
     } while (StepAgain);
@@ -2260,10 +2345,14 @@
 
       unsigned ScanLimit = MemorySSAScanLimit;
       unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
+      unsigned PartialLimit = MemorySSAPartialStoreLimit;
       // Worklist of MemoryAccesses that may be killed by KillingDef.
       SetVector<MemoryAccess *> ToCheck;
       ToCheck.insert(KillingDef->getDefiningAccess());
 
+      if (!SILocUnd)
+        continue;
+      bool IsMemTerm = State.isMemTerminatorInst(SI);
       DSEState::CheckCache Cache;
       // Check if MemoryAccesses in the worklist are killed by KillingDef.
       for (unsigned I = 0; I < ToCheck.size(); I++) {
@@ -2271,9 +2360,9 @@
         if (State.SkipStores.count(Current))
           continue;
 
-        Optional<MemoryAccess *> Next =
-            State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, Cache,
-                                  ScanLimit, WalkerStepLimit);
+        Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+            KillingDef, Current, SILoc, SILocUnd, Cache, ScanLimit,
+            WalkerStepLimit, IsMemTerm, PartialLimit);
 
         if (!Next) {
           LLVM_DEBUG(dbgs() << "  finished walk\n");
@@ -2301,41 +2390,17 @@
         MemoryDef *NextDef = dyn_cast<MemoryDef>(EarlierAccess);
         Instruction *NI = NextDef->getMemoryInst();
         LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
-
-        // Before we try to remove anything, check for any extra throwing
-        // instructions that block us from DSEing
-        if (State.mayThrowBetween(SI, NI, SILocUnd)) {
-          LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
-          break;
-        }
-
-        // Check for anything that looks like it will be a barrier to further
-        // removal
-        if (State.isDSEBarrier(SILocUnd, NI)) {
-          LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
-          continue;
-        }
-
         ToCheck.insert(NextDef->getDefiningAccess());
-
-        if (!hasAnalyzableMemoryWrite(NI, TLI)) {
-          LLVM_DEBUG(dbgs() << "  ... skip, cannot analyze def\n");
-          continue;
-        }
-
-        if (!isRemovable(NI)) {
-          LLVM_DEBUG(dbgs() << "  ... skip, cannot remove def\n");
-          continue;
-        }
+        NumGetDomMemoryDefPassed++;
 
         if (!DebugCounter::shouldExecute(MemorySSACounter))
           continue;
 
         MemoryLocation NILoc = *State.getLocForWriteEx(NI);
 
-        if (State.isMemTerminatorInst(SI)) {
+        if (IsMemTerm) {
           const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
-          if (!SILocUnd || SILocUnd != NIUnd)
+          if (SILocUnd != NIUnd)
             continue;
           LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *NI
                             << "\n  KILLER: " << *SI << '\n');
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false -dse-memoryssa-partial-store-limit=10 < %s | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
 
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux"
@@ -209,22 +210,43 @@
 declare void @goFunc(%struct.foostruct*)
 declare i32 @fa(i8*, i8**, i32, i8, i8*)
 
+; We miss this case, because of an aggressive limit of partial overlap analysis.
+; With a larger partial store limit, we remove the memset.
define void @test4() { -; CHECK-LABEL: @test4( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8 -; CHECK-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0 -; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8 -; CHECK-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1 -; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8 -; CHECK-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2 -; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8 -; CHECK-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3 -; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8 -; CHECK-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4 -; CHECK-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8 -; CHECK-NEXT: call void @goFunc(%struct.foostruct* [[BANG]]) -; CHECK-NEXT: ret void +; DEFAULT-LIMIT-LABEL: @test4( +; DEFAULT-LIMIT-NEXT: entry: +; DEFAULT-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8 +; DEFAULT-LIMIT-NEXT: [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8* +; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32 +; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false) +; DEFAULT-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0 +; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8 +; DEFAULT-LIMIT-NEXT: [[V3:%.*]] = getelementptr 
inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1 +; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8 +; DEFAULT-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2 +; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8 +; DEFAULT-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3 +; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8 +; DEFAULT-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4 +; DEFAULT-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8 +; DEFAULT-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]]) +; DEFAULT-LIMIT-NEXT: ret void +; +; LARGER-LIMIT-LABEL: @test4( +; LARGER-LIMIT-NEXT: entry: +; LARGER-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8 +; LARGER-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0 +; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8 +; LARGER-LIMIT-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1 +; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8 +; LARGER-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2 +; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8 +; LARGER-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3 +; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, 
i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8 +; LARGER-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4 +; LARGER-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8 +; LARGER-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]]) +; LARGER-LIMIT-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; XFAIL: * - ; REQUIRES: asserts ; Eliminates store to %R in the entry block. diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; XFAIL: * - ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -dse 
-enable-dse-memoryssa %s -S | FileCheck %s +; RUN: opt -dse -enable-dse-memoryssa %s -S | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s +; RUN: opt -dse -enable-dse-memoryssa -dse-memoryssa-partial-store-limit=10 %s -S | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s %struct.ham = type { [3 x double], [3 x double]} @@ -7,28 +8,55 @@ declare void @may_throw() declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) +; We miss this case, because of an aggressive limit of partial overlap analysis. +; With a larger partial store limit, we remove the memset. define void @overlap1(%struct.ham* %arg, i1 %cond) { -; CHECK-LABEL: @overlap1( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0 -; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]] -; CHECK: bb7: -; CHECK-NEXT: br label [[BB9:%.*]] -; CHECK: bb8: -; CHECK-NEXT: br label [[BB9]] -; CHECK: bb9: -; CHECK-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8 -; CHECK-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8 -; CHECK-NEXT: store double 3.000000e+00, double* [[TMP]], align 8 -; CHECK-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8 -; CHECK-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8 -; CHECK-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8 -; CHECK-NEXT: ret void +; 
DEFAULT-LIMIT-LABEL: @overlap1( +; DEFAULT-LIMIT-NEXT: bb: +; DEFAULT-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2 +; DEFAULT-LIMIT-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1 +; DEFAULT-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0 +; DEFAULT-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2 +; DEFAULT-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1 +; DEFAULT-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0 +; DEFAULT-LIMIT-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8* +; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32 +; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false) +; DEFAULT-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]] +; DEFAULT-LIMIT: bb7: +; DEFAULT-LIMIT-NEXT: br label [[BB9:%.*]] +; DEFAULT-LIMIT: bb8: +; DEFAULT-LIMIT-NEXT: br label [[BB9]] +; DEFAULT-LIMIT: bb9: +; DEFAULT-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8 +; DEFAULT-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8 +; DEFAULT-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8 +; DEFAULT-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8 +; DEFAULT-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8 +; DEFAULT-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8 +; DEFAULT-LIMIT-NEXT: ret void +; +; LARGER-LIMIT-LABEL: @overlap1( +; LARGER-LIMIT-NEXT: bb: +; LARGER-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2 +; LARGER-LIMIT-NEXT: 
[[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1 +; LARGER-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0 +; LARGER-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2 +; LARGER-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1 +; LARGER-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0 +; LARGER-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]] +; LARGER-LIMIT: bb7: +; LARGER-LIMIT-NEXT: br label [[BB9:%.*]] +; LARGER-LIMIT: bb8: +; LARGER-LIMIT-NEXT: br label [[BB9]] +; LARGER-LIMIT: bb9: +; LARGER-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8 +; LARGER-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8 +; LARGER-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8 +; LARGER-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8 +; LARGER-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8 +; LARGER-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8 +; LARGER-LIMIT-NEXT: ret void ; bb: %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2 diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll @@ -477,10 +477,8 @@ ret i32 0 } -; TODO -; We can remove redundant store, as noalias %p guarantees that the function does -; only access it via %p. This also holds for the call to unknown_func even though -; it could unwind +; We cannot remove any stores, because @unknown_func may unwind and the caller +; may read %p while unwinding. 
define void @test34(i32* noalias %p) { ; CHECK-LABEL: @test34( ; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 @@ -636,9 +634,10 @@ ret void } -; I think this case is currently handled incorrectly by memdeps dse -; throwing should leave store i32 1, not remove from the free. declare void @free(i8* nocapture) + +; We cannot remove `store i32 1, i32* %p`, because @unknown_func may unwind +; and the caller may read %p while unwinding. define void @test41(i32* noalias %P) { ; CHECK-LABEL: @test41( ; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8*