Index: llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp =================================================================== --- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -40,10 +40,12 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -853,6 +855,11 @@ PostDominatorTree &PDT; const TargetLibraryInfo &TLI; const DataLayout &DL; + const LoopInfo &LI; + + // Whether the function contains any irreducible control flow, useful for + // being accurately able to detect loops. + bool ContainsIrreducibleLoops; // All MemoryDefs that potentially could kill other MemDefs. SmallVector MemDefs; @@ -876,14 +883,15 @@ DenseMap IOLs; DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, - PostDominatorTree &PDT, const TargetLibraryInfo &TLI) + PostDominatorTree &PDT, const TargetLibraryInfo &TLI, + const LoopInfo &LI) : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI), - DL(F.getParent()->getDataLayout()) {} + DL(F.getParent()->getDataLayout()), LI(LI) {} static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI) { - DSEState State(F, AA, MSSA, DT, PDT, TLI); + const TargetLibraryInfo &TLI, const LoopInfo &LI) { + DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. unsigned PO = 0; @@ -911,6 +919,9 @@ State.InvisibleToCallerAfterRet.insert({&AI, true}); } + // Collect whether there is any irreducible control flow in the function. + State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); + return State; } @@ -925,6 +936,12 @@ isOverwrite(const Instruction *LaterI, const Instruction *EarlierI, const MemoryLocation &Later, const MemoryLocation &Earlier, int64_t &EarlierOff, int64_t &LaterOff) { + // AliasAnalysis does not account for loops. Limit overwrite checks to + // candidates for which we can guarantee they always store to the same + // memory location and not located in different loops. + if (!IsGuaranteedLoopInvariant(EarlierI, LaterI, Later)) + return OW_Unknown; + // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll // get imprecise values here, though (except for unknown sizes). if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) { @@ -1253,8 +1270,21 @@ /// Returns true if \p Ptr is guaranteed to be loop invariant for any possible /// loop. In particular, this guarantees that it only references a single /// MemoryLocation during execution of the containing function. - bool IsGuaranteedLoopInvariant(Value *Ptr) { - auto IsGuaranteedLoopInvariantBase = [this](Value *Ptr) { + bool IsGuaranteedLoopInvariant(const Instruction *Current, + const Instruction *KillingDef, + MemoryLocation CurrentLoc) { + // Limit elimination to candidates for which we can guarantee they always + // store to the same memory location and not located in different loops. But + // also be careful with irreducible control flow, which can create cycles + // without appearing as loops. + if (Current->getParent() == KillingDef->getParent()) + return true; + const Loop *CurrentLI = LI.getLoopFor(Current->getParent()); + if (!ContainsIrreducibleLoops && CurrentLI && + CurrentLI == LI.getLoopFor(KillingDef->getParent())) + return true; + + auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) { Ptr = Ptr->stripPointerCasts(); if (auto *I = dyn_cast(Ptr)) { if (isa(Ptr)) @@ -1268,7 +1298,7 @@ return true; }; - Ptr = Ptr->stripPointerCasts(); + const Value *Ptr = CurrentLoc.Ptr->stripPointerCasts(); if (auto *I = dyn_cast(Ptr)) { if (I->getParent()->isEntryBlock()) return true; @@ -1398,9 +1428,9 @@ // AliasAnalysis does not account for loops. Limit elimination to // candidates for which we can guarantee they always store to the same - // memory location and not multiple locations in a loop. - if (Current->getBlock() != KillingDef->getBlock() && - !IsGuaranteedLoopInvariant(const_cast(CurrentLoc->Ptr))) { + // memory location and not located in different loops. + if (!IsGuaranteedLoopInvariant(CurrentI, KillingI, *CurrentLoc)) { + LLVM_DEBUG(dbgs() << " ... not guaranteed loop invariant\n"); StepAgain = true; Current = CurrentDef->getDefiningAccess(); WalkerStepLimit -= 1; @@ -1836,12 +1866,13 @@ } }; -bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, - DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI) { +static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT, PostDominatorTree &PDT, + const TargetLibraryInfo &TLI, + const LoopInfo &LI) { bool MadeChange = false; - DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI); + DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { MemoryDef *KillingDef = State.MemDefs[I]; @@ -2014,8 +2045,9 @@ DominatorTree &DT = AM.getResult(F); MemorySSA &MSSA = AM.getResult(F).getMSSA(); PostDominatorTree &PDT = AM.getResult(F); + LoopInfo &LI = AM.getResult(F); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2029,6 +2061,7 @@ PreservedAnalyses PA; PA.preserveSet(); PA.preserve(); + PA.preserve(); return PA; } @@ -2054,8 +2087,9 @@ MemorySSA &MSSA = getAnalysis().getMSSA(); PostDominatorTree &PDT = getAnalysis().getPostDomTree(); + LoopInfo &LI = getAnalysis().getLoopInfo(); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2077,6 +2111,8 @@ AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } }; @@ -2093,6 +2129,7 @@ INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, false) Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -515,8 +515,8 @@ ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: MemCpy Optimization -; GCN-O2-NEXT: Dead Store Elimination ; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Dead Store Elimination ; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: LCSSA Verifier ; GCN-O2-NEXT: Loop-Closed SSA Form Pass @@ -874,8 +874,8 @@ ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: MemCpy Optimization -; GCN-O3-NEXT: Dead Store Elimination ; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Dead Store Elimination ; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: LCSSA Verifier ; GCN-O3-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Other/opt-O2-pipeline.ll =================================================================== --- llvm/test/Other/opt-O2-pipeline.ll +++ llvm/test/Other/opt-O2-pipeline.ll @@ -162,8 +162,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Other/opt-O3-pipeline-enable-matrix.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -168,8 +168,8 @@ ; LEGACY-NEXT: Function Alias Analysis Results ; LEGACY-NEXT: Memory SSA ; LEGACY-NEXT: MemCpy Optimization -; LEGACY-NEXT: Dead Store Elimination ; LEGACY-NEXT: Natural Loop Information +; LEGACY-NEXT: Dead Store Elimination ; LEGACY-NEXT: Canonicalize natural loops ; LEGACY-NEXT: LCSSA Verifier ; LEGACY-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Other/opt-O3-pipeline.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline.ll +++ llvm/test/Other/opt-O3-pipeline.ll @@ -167,8 +167,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Other/opt-Os-pipeline.ll =================================================================== --- llvm/test/Other/opt-Os-pipeline.ll +++ llvm/test/Other/opt-Os-pipeline.ll @@ -148,8 +148,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll =================================================================== --- llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll +++ llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll @@ -111,7 +111,6 @@ ; CHECK: for.body4.lr.ph: ; CHECK-NEXT: [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]] -; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]] ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.body4: @@ -627,6 +626,7 @@ ; CHECK-LABEL: @partial_override_overloop( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i32 [[I:%.*]] +; CHECK-NEXT: store i16 1, i16* [[ARRAYIDX]], align 1 ; CHECK-NEXT: br label [[DO_BODY:%.*]] ; CHECK: do.body: ; CHECK-NEXT: [[I_0:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[DO_BODY]] ]