diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -87,6 +87,8 @@
 STATISTIC(NumCFGChecks, "Number of stores modified");
 STATISTIC(NumCFGTries, "Number of stores modified");
 STATISTIC(NumCFGSuccess, "Number of stores modified");
+STATISTIC(NumGetDomMemoryDefPassed,
+          "Number of times a valid candidate is returned from getDomMemoryDef");
 STATISTIC(NumDomMemDefChecks,
           "Number iterations check for reads in getDomMemoryDef");
@@ -116,6 +118,12 @@
     cl::desc("The maximum number of steps while walking upwards to find "
              "MemoryDefs that may be killed (default = 70)"));

+static cl::opt<unsigned> MemorySSAPartialStoreLimit(
+    "dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
+    cl::desc("The maximum number of candidates that only partially overwrite "
+             "the killing MemoryDef to consider"
+             " (default = 5)"));
+
 static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
     "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
     cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
@@ -1809,7 +1817,8 @@
   Optional<MemoryAccess *>
   getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
                   MemoryLocation DefLoc, const Value *DefUO, CheckCache &Cache,
-                  unsigned &ScanLimit, unsigned &WalkerStepLimit) {
+                  unsigned &ScanLimit, unsigned &WalkerStepLimit,
+                  bool IsMemTerm, unsigned &PartialLimit) {
     if (ScanLimit == 0 || WalkerStepLimit == 0) {
       LLVM_DEBUG(dbgs() << "\n    ...  hit scan limit\n");
       return None;
@@ -1817,9 +1826,11 @@
     MemoryAccess *DomAccess;
     MemoryAccess *StartAccess = Current;
+    Instruction *KillingI = KillingDef->getMemoryInst();
     bool StepAgain;
     LLVM_DEBUG(dbgs() << "  trying to get dominating access for " << *Current
                       << "\n");
+    // Find the next clobbering Mod access for DefLoc, starting at Current.
     do {
       StepAgain = false;
@@ -1853,13 +1864,86 @@
       if (isa<MemoryPhi>(DomAccess))
         break;

-      // Check if we can skip DomDef for DSE.
-      MemoryDef *DomDef = dyn_cast<MemoryDef>(DomAccess);
-      if (DomDef && canSkipDef(DomDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+      // Below, check if DomDef is a valid candidate to be eliminated by
+      // KillingDef. If it is not, check the next candidate.
+      MemoryDef *DomDef = cast<MemoryDef>(DomAccess);
+      Instruction *DomI = DomDef->getMemoryInst();
+
+      if (canSkipDef(DomDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+        StepAgain = true;
+        Current = DomDef->getDefiningAccess();
+        continue;
+      }
+
+      // Before we try to remove anything, check for any extra throwing
+      // instructions that block us from DSEing
+      if (mayThrowBetween(KillingI, DomI, DefUO)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
+        return None;
+      }
+
+      // Check for anything that looks like it will be a barrier to further
+      // removal
+      if (isDSEBarrier(DefUO, DomI)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
+        return None;
+      }
+
+      // If DomAccess is known to be on a path that reads DefLoc or is a read
+      // clobber, bail out, as the path is not profitable. We skip this check
+      // for intrinsic calls, because the code knows how to handle memcpy
+      // intrinsics.
+      if (!isa<IntrinsicInst>(DomI) && (Cache.KnownReads.contains(DomAccess) ||
+                                        isReadClobber(DefLoc, DomI))) {
+        Cache.KnownReads.insert(DomAccess);
+        return None;
+      }
+
+      // If DomAccess cannot be analyzed or is not removable, check the next
+      // candidate.
+      if (!hasAnalyzableMemoryWrite(DomI, TLI) || !isRemovable(DomI)) {
         StepAgain = true;
         Current = DomDef->getDefiningAccess();
+        continue;
       }
+      auto DomLoc = getLocForWriteEx(DomI);
+      if (!DomLoc)
+        break;
+
+      if (IsMemTerm) {
+        // If the killing def is a memory terminator (e.g. lifetime.end), check
+        // the next candidate if the current DomAccess does not write the same
+        // underlying object as the terminator.
+        const Value *NIUnd = getUnderlyingObject(DomLoc->Ptr);
+        if (DefUO != NIUnd) {
+          StepAgain = true;
+          Current = DomDef->getDefiningAccess();
+        }
+        continue;
+      } else {
+        int64_t InstWriteOffset, DepWriteOffset;
+        auto OR = isOverwrite(DefLoc, *DomLoc, DL, TLI, DepWriteOffset,
+                              InstWriteOffset, BatchAA, &F);
+        // If DomAccess does not write to the same object as KillingDef, check
+        // the next candidate.
+        if (OR == OW_Unknown) {
+          StepAgain = true;
+          Current = DomDef->getDefiningAccess();
+        } else if (OR == OW_MaybePartial) {
+          // If KillingDef only partially overwrites DomAccess, check the next
+          // candidate if the partial step limit is exceeded. This aggressively
+          // limits the number of candidates for partial store elimination,
+          // which are less likely to be removable in the end.
+          if (PartialLimit <= 1) {
+            StepAgain = true;
+            Current = DomDef->getDefiningAccess();
+            WalkerStepLimit -= 1;
+            continue;
+          }
+          PartialLimit -= 1;
+        }
+      }
     } while (StepAgain);

@@ -1867,7 +1951,7 @@
     // Accesses to objects accessible after the function returns can only be
     // eliminated if the access is killed along all paths to the exit. Collect
     // the blocks with killing (=completely overwriting MemoryDefs) and check if
     // they cover all paths from DomAccess to any function exit.
     SmallPtrSet<Instruction *, 16> KillingDefs;
-    KillingDefs.insert(KillingDef->getMemoryInst());
+    KillingDefs.insert(KillingI);
     Instruction *DomMemInst = isa<MemoryDef>(DomAccess)
                                   ? cast<MemoryDef>(DomAccess)->getMemoryInst()
                                   : nullptr;
@@ -2278,10 +2362,14 @@
     unsigned ScanLimit = MemorySSAScanLimit;
     unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
+    unsigned PartialLimit = MemorySSAPartialStoreLimit;
     // Worklist of MemoryAccesses that may be killed by KillingDef.
     SetVector<MemoryAccess *> ToCheck;
     ToCheck.insert(KillingDef->getDefiningAccess());

+    if (!SILocUnd)
+      continue;
+    bool IsMemTerm = State.isMemTerminatorInst(SI);
     DSEState::CheckCache Cache;
     // Check if MemoryAccesses in the worklist are killed by KillingDef.
     for (unsigned I = 0; I < ToCheck.size(); I++) {
@@ -2289,9 +2377,9 @@
       if (State.SkipStores.count(Current))
         continue;

-      Optional<MemoryAccess *> Next =
-          State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, Cache,
-                                ScanLimit, WalkerStepLimit);
+      Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+          KillingDef, Current, SILoc, SILocUnd, Cache, ScanLimit,
+          WalkerStepLimit, IsMemTerm, PartialLimit);

       if (!Next) {
         LLVM_DEBUG(dbgs() << "  finished walk\n");
@@ -2319,41 +2407,17 @@
       MemoryDef *NextDef = dyn_cast<MemoryDef>(DomAccess);
       Instruction *NI = NextDef->getMemoryInst();
       LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
-
-      // Before we try to remove anything, check for any extra throwing
-      // instructions that block us from DSEing
-      if (State.mayThrowBetween(SI, NI, SILocUnd)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
-        break;
-      }
-
-      // Check for anything that looks like it will be a barrier to further
-      // removal
-      if (State.isDSEBarrier(SILocUnd, NI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
-        continue;
-      }
-
       ToCheck.insert(NextDef->getDefiningAccess());
-
-      if (!hasAnalyzableMemoryWrite(NI, TLI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, cannot analyze def\n");
-        continue;
-      }
-
-      if (!isRemovable(NI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, cannot remove def\n");
-        continue;
-      }
+      NumGetDomMemoryDefPassed++;

       if (!DebugCounter::shouldExecute(MemorySSACounter))
         continue;

       MemoryLocation NILoc = *State.getLocForWriteEx(NI);

-      if (State.isMemTerminatorInst(SI)) {
+      if (IsMemTerm) {
         const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
-        if (!SILocUnd || SILocUnd != NIUnd)
+        if (SILocUnd != NIUnd)
           continue;
         LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *NI
                           << "\n  KILLER: " << *SI << '\n');
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false -dse-memoryssa-partial-store-limit=10 < %s | FileCheck --check-prefix=LARGER-LIMIT %s

 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux"
@@ -34,6 +35,34 @@
 ; CHECK-NEXT:    store float [[MUL_I_I_I]], float* [[_M_VALUE_IMAGP_I_I]], align 4
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @_Z4testSt7complexIfE(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[REF_TMP:%.*]] = alloca i64, align 8
+; LARGER-LIMIT-NEXT:    [[TMPCAST:%.*]] = bitcast i64* [[REF_TMP]] to %"struct.std::complex"*
+; LARGER-LIMIT-NEXT:    [[C_SROA_0_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[C_COERCE:%.*]], 32
+; LARGER-LIMIT-NEXT:    [[C_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[C_SROA_0_0_EXTRACT_SHIFT]] to i32
+; LARGER-LIMIT-NEXT:    [[TMP0:%.*]] = bitcast i32 [[C_SROA_0_0_EXTRACT_TRUNC]] to float
+; LARGER-LIMIT-NEXT:    [[C_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[C_COERCE]] to i32
+; LARGER-LIMIT-NEXT:    [[TMP1:%.*]] = bitcast i32 [[C_SROA_2_0_EXTRACT_TRUNC]] to float
+; LARGER-LIMIT-NEXT:    call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret [[TMPCAST]], i64 [[C_COERCE]])
+; LARGER-LIMIT-NEXT:    [[TMP2:%.*]] = load i64, i64* [[REF_TMP]], align 8
+; LARGER-LIMIT-NEXT:    [[_M_VALUE_REALP_I_I:%.*]] = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* [[AGG_RESULT:%.*]], i64 0, i32 0, i32 0
+; LARGER-LIMIT-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; LARGER-LIMIT-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; LARGER-LIMIT-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
+; LARGER-LIMIT-NEXT:    [[_M_VALUE_IMAGP_I_I:%.*]] = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* [[AGG_RESULT]], i64 0, i32 0, i32 1
+; LARGER-LIMIT-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP2]] to i32
+; LARGER-LIMIT-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP6]] to float
+; LARGER-LIMIT-NEXT:    [[MUL_AD_I_I:%.*]] = fmul fast float [[TMP5]], [[TMP1]]
+; LARGER-LIMIT-NEXT:    [[MUL_BC_I_I:%.*]] = fmul fast float [[TMP7]], [[TMP0]]
+; LARGER-LIMIT-NEXT:    [[MUL_I_I_I:%.*]] = fadd fast float [[MUL_AD_I_I]], [[MUL_BC_I_I]]
+; LARGER-LIMIT-NEXT:    [[MUL_AC_I_I:%.*]] = fmul fast float [[TMP5]], [[TMP0]]
+; LARGER-LIMIT-NEXT:    [[MUL_BD_I_I:%.*]] = fmul fast float [[TMP7]], [[TMP1]]
+; LARGER-LIMIT-NEXT:    [[MUL_R_I_I:%.*]] = fsub fast float [[MUL_AC_I_I]], [[MUL_BD_I_I]]
+; LARGER-LIMIT-NEXT:    store float [[MUL_R_I_I]], float* [[_M_VALUE_REALP_I_I]], align 4
+; LARGER-LIMIT-NEXT:    store float [[MUL_I_I_I]], float* [[_M_VALUE_IMAGP_I_I]], align 4
+; LARGER-LIMIT-NEXT:    ret void
+;
 entry:
   %ref.tmp = alloca i64, align 8
@@ -81,6 +110,18 @@
 ; CHECK-NEXT:    store i16 2020, i16* [[WPTRP]], align 1
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @test1(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR:%.*]] to i8*
+; LARGER-LIMIT-NEXT:    [[WPTR:%.*]] = bitcast i32* [[PTR]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 -30062, i16* [[WPTR]], align 2
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    store i8 47, i8* [[BPTR3]], align 1
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[WPTRP:%.*]] = bitcast i8* [[BPTR1]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 2020, i16* [[WPTRP]], align 1
+; LARGER-LIMIT-NEXT:    ret void
+;
 entry:
   store i32 5, i32* %ptr
@@ -120,6 +161,25 @@
 ; CHECK-NEXT:    store i16 5656, i16* [[WPTR3]], align 1
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @test2(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR:%.*]] to i8*
+; LARGER-LIMIT-NEXT:    [[BPTRM1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 -1
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 2
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    [[WPTR:%.*]] = bitcast i8* [[BPTR]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTRM1:%.*]] = bitcast i8* [[BPTRM1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR1:%.*]] = bitcast i8* [[BPTR1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR2:%.*]] = bitcast i8* [[BPTR2]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR3:%.*]] = bitcast i8* [[BPTR3]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[WPTRM1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1346, i16* [[WPTR]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1756, i16* [[WPTR1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1126, i16* [[WPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 5656, i16* [[WPTR3]], align 1
+; LARGER-LIMIT-NEXT:    ret void
+;
 entry:
   store i32 5, i32* %ptr
@@ -170,6 +230,27 @@
 ; CHECK-NEXT:    store i16 5656, i16* [[WPTR3]], align 1
 ; CHECK-NEXT:    ret i8 [[V]]
 ;
+; LARGER-LIMIT-LABEL: @test3(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    store i32 5, i32* [[PTR:%.*]], align 4
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR]] to i8*
+; LARGER-LIMIT-NEXT:    [[BPTRM1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 -1
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 2
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    [[WPTR:%.*]] = bitcast i8* [[BPTR]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTRM1:%.*]] = bitcast i8* [[BPTRM1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR1:%.*]] = bitcast i8* [[BPTR1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR2:%.*]] = bitcast i8* [[BPTR2]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR3:%.*]] = bitcast i8* [[BPTR3]] to i16*
+; LARGER-LIMIT-NEXT:    [[V:%.*]] = load i8, i8* [[BPTR]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[WPTRM1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1346, i16* [[WPTR]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1756, i16* [[WPTR1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1126, i16* [[WPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 5656, i16* [[WPTR3]], align 1
+; LARGER-LIMIT-NEXT:    ret i8 [[V]]
+;
 entry:
   store i32 5, i32* %ptr
@@ -209,10 +290,15 @@
 declare void @goFunc(%struct.foostruct*)
 declare i32 @fa(i8*, i8**, i32, i8, i8*)

+; We miss this case, because of an aggressive limit on partial overlap
+; analysis. With a larger partial store limit, we remove the memset.
 define void @test4() {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; CHECK-NEXT:    [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8*
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false)
 ; CHECK-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
 ; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
 ; CHECK-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
@@ -226,6 +312,22 @@
 ; CHECK-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @test4(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; LARGER-LIMIT-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
+; LARGER-LIMIT-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
+; LARGER-LIMIT-NEXT:    [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
+; LARGER-LIMIT-NEXT:    [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
+; LARGER-LIMIT-NEXT:    [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
+; LARGER-LIMIT-NEXT:    store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
+; LARGER-LIMIT-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
+; LARGER-LIMIT-NEXT:    ret void
+;
 entry:
   %bang = alloca %struct.foostruct, align 8
@@ -261,6 +363,20 @@
 ; CHECK-NEXT:    store i16 1346, i16* [[WPTR]], align 1
 ; CHECK-NEXT:    ret i8 0
 ;
+; LARGER-LIMIT-LABEL: @test5(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR:%.*]] to i8*
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 2
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    [[WPTR:%.*]] = bitcast i8* [[BPTR]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR1:%.*]] = bitcast i8* [[BPTR1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR2:%.*]] = bitcast i8* [[BPTR2]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 -1, i16* [[WPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[WPTR1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1346, i16* [[WPTR]], align 1
+; LARGER-LIMIT-NEXT:    ret i8 0
+;
 entry:
   store i32 0, i32* %ptr
@@ -292,6 +408,15 @@
 ; CHECK-NEXT:    store i16 -1, i16* [[BPTR1]], align 1
 ; CHECK-NEXT:    ret i8 0
 ;
+; LARGER-LIMIT-LABEL: @test6(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR:%.*]] to i16*
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 0
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[BPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 -1, i16* [[BPTR1]], align 1
+; LARGER-LIMIT-NEXT:    ret i8 0
+;
 entry:
   store i32 0, i32* %ptr
@@ -321,6 +446,19 @@
 ; CHECK-NEXT:    store i16 5656, i16* [[BPTR4]], align 1
 ; CHECK-NEXT:    ret i8 0
 ;
+; LARGER-LIMIT-LABEL: @test7(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i64* [[PTR:%.*]] to i16*
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 0
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 2
+; LARGER-LIMIT-NEXT:    [[BPTR4:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    store i16 1346, i16* [[BPTR1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1756, i16* [[BPTR3]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[BPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 5656, i16* [[BPTR4]], align 1
+; LARGER-LIMIT-NEXT:    ret i8 0
+;
 entry:
   store i64 0, i64* %ptr
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py

-; XFAIL: *
-
 ; REQUIRES: asserts

 ; Eliminates store to %R in the entry block.
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py

-; XFAIL: *
-
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -dse -enable-dse-memoryssa %s -S | FileCheck %s
+; RUN: opt -dse -enable-dse-memoryssa -dse-memoryssa-partial-store-limit=10 %s -S | FileCheck --check-prefix=LARGER-LIMIT %s

 %struct.ham = type { [3 x double], [3 x double]}

@@ -7,6 +8,8 @@
 declare void @may_throw()
 declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)

+; We miss this case, because of an aggressive limit on partial overlap
+; analysis.
 define void @overlap1(%struct.ham* %arg, i1 %cond) {
 ; CHECK-LABEL: @overlap1(
 ; CHECK-NEXT:  bb:
@@ -16,6 +19,9 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8*
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false)
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    br label [[BB9:%.*]]
@@ -30,6 +36,28 @@
 ; CHECK-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @overlap1(
+; LARGER-LIMIT-NEXT:  bb:
+; LARGER-LIMIT-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; LARGER-LIMIT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; LARGER-LIMIT-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; LARGER-LIMIT:       bb7:
+; LARGER-LIMIT-NEXT:    br label [[BB9:%.*]]
+; LARGER-LIMIT:       bb8:
+; LARGER-LIMIT-NEXT:    br label [[BB9]]
+; LARGER-LIMIT:       bb9:
+; LARGER-LIMIT-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
+; LARGER-LIMIT-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
+; LARGER-LIMIT-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
+; LARGER-LIMIT-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
+; LARGER-LIMIT-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
+; LARGER-LIMIT-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
+; LARGER-LIMIT-NEXT:    ret void
+;
 bb:
   %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2
   %tmp1 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 1
@@ -83,6 +111,31 @@
 ; CHECK-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @overlap2(
+; LARGER-LIMIT-NEXT:  bb:
+; LARGER-LIMIT-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; LARGER-LIMIT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; LARGER-LIMIT-NEXT:    [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8*
+; LARGER-LIMIT-NEXT:    call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP6]], i8 0, i64 48, i1 false)
+; LARGER-LIMIT-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; LARGER-LIMIT:       bb7:
+; LARGER-LIMIT-NEXT:    call void @may_throw()
+; LARGER-LIMIT-NEXT:    br label [[BB9:%.*]]
+; LARGER-LIMIT:       bb8:
+; LARGER-LIMIT-NEXT:    br label [[BB9]]
+; LARGER-LIMIT:       bb9:
+; LARGER-LIMIT-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
+; LARGER-LIMIT-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
+; LARGER-LIMIT-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
+; LARGER-LIMIT-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
+; LARGER-LIMIT-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
+; LARGER-LIMIT-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
+; LARGER-LIMIT-NEXT:    ret void
+;
 bb:
   %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2
   %tmp1 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 1
@@ -127,6 +180,18 @@
 ; CHECK-NEXT:    store i16 0, i16* [[CAST_I16]], align 4
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @overlap_no_dominance(
+; LARGER-LIMIT-NEXT:  bb:
+; LARGER-LIMIT-NEXT:    br i1 [[C:%.*]], label [[BB13:%.*]], label [[BB9:%.*]]
+; LARGER-LIMIT:       bb9:
+; LARGER-LIMIT-NEXT:    [[CAST_I32:%.*]] = bitcast [4 x i8]* [[ARG:%.*]] to i32*
+; LARGER-LIMIT-NEXT:    store i32 844283136, i32* [[CAST_I32]], align 4
+; LARGER-LIMIT-NEXT:    br label [[BB13]]
+; LARGER-LIMIT:       bb13:
+; LARGER-LIMIT-NEXT:    [[CAST_I16:%.*]] = bitcast [4 x i8]* [[ARG]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 0, i16* [[CAST_I16]], align 4
+; LARGER-LIMIT-NEXT:    ret void
+;
 bb:
   br i1 %c, label %bb13, label %bb9
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
@@ -477,10 +477,8 @@
   ret i32 0
 }

-; TODO
-; We can remove redundant store, as noalias %p guarantees that the function does
-; only access it via %p. This also holds for the call to unknown_func even though
-; it could unwind
+; We cannot remove any stores, because @unknown_func may unwind and the caller
+; may read %p while unwinding.
 define void @test34(i32* noalias %p) {
 ; CHECK-LABEL: @test34(
 ; CHECK-NEXT:    store i32 1, i32* [[P:%.*]], align 4
@@ -636,9 +634,10 @@
   ret void
 }

-; I think this case is currently handled incorrectly by memdeps dse
-; throwing should leave store i32 1, not remove from the free.
 declare void @free(i8* nocapture)
+
+; We cannot remove `store i32 1, i32* %P`, because @unknown_func may unwind
+; and the caller may read %P while unwinding.
 define void @test41(i32* noalias %P) {
 ; CHECK-LABEL: @test41(
 ; CHECK-NEXT:    [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8*
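
For illustration only (not part of the patch): a reduced standalone example, in the spirit of @test7 in combined-partial-overwrites.ll, that exercises the new -dse-memoryssa-partial-store-limit option. The function name and the exact store count below are invented. Each i8 store only partially overwrites the leading i64 store, and every partially overlapping candidate returned during the upwards walk decrements the killing store's PartialLimit budget; with more such candidates than the default limit of 5, DSE may leave the wide store in place, while a raised limit gives the walk enough budget to prove it dead, which is the behavior the LARGER-LIMIT prefixes above check.

; Hypothetical sketch; compare the output of:
;   opt -S -dse -enable-dse-memoryssa %s
;   opt -S -dse -enable-dse-memoryssa -dse-memoryssa-partial-store-limit=10 %s
define void @wide_store_many_partial_overwrites(i64* %ptr) {
entry:
  ; Candidate for elimination: fully covered by the eight i8 stores below.
  store i64 0, i64* %ptr
  %bptr = bitcast i64* %ptr to i8*
  %b1 = getelementptr inbounds i8, i8* %bptr, i64 1
  %b2 = getelementptr inbounds i8, i8* %bptr, i64 2
  %b3 = getelementptr inbounds i8, i8* %bptr, i64 3
  %b4 = getelementptr inbounds i8, i8* %bptr, i64 4
  %b5 = getelementptr inbounds i8, i8* %bptr, i64 5
  %b6 = getelementptr inbounds i8, i8* %bptr, i64 6
  %b7 = getelementptr inbounds i8, i8* %bptr, i64 7
  ; Each store overwrites only one byte of the i64 store above.
  store i8 0, i8* %bptr
  store i8 1, i8* %b1
  store i8 2, i8* %b2
  store i8 3, i8* %b3
  store i8 4, i8* %b4
  store i8 5, i8* %b5
  store i8 6, i8* %b6
  store i8 7, i8* %b7
  ret void
}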