Index: llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp =================================================================== --- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -40,10 +40,12 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -972,6 +974,11 @@ PostDominatorTree &PDT; const TargetLibraryInfo &TLI; const DataLayout &DL; + const LoopInfo &LI; + + // Whether the function contains any irreducible control flow, useful for + // being accurately able to detect loops. + bool ContainsIrreducibleLoops; // All MemoryDefs that potentially could kill other MemDefs. SmallVector MemDefs; @@ -995,14 +1002,15 @@ DenseMap IOLs; DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, - PostDominatorTree &PDT, const TargetLibraryInfo &TLI) + PostDominatorTree &PDT, const TargetLibraryInfo &TLI, + const LoopInfo &LI) : F(F), AA(AA), BatchAA(AA, /*CacheOffsets =*/true), MSSA(MSSA), DT(DT), - PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()) {} + PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) {} static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI) { - DSEState State(F, AA, MSSA, DT, PDT, TLI); + const TargetLibraryInfo &TLI, const LoopInfo &LI) { + DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. unsigned PO = 0; @@ -1030,6 +1038,9 @@ State.InvisibleToCallerAfterRet.insert({&AI, true}); } + // Collect whether there is any irreducible control flow in the function. + State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); + return State; } @@ -1402,9 +1413,16 @@ // AliasAnalysis does not account for loops. Limit elimination to // candidates for which we can guarantee they always store to the same - // memory location and not multiple locations in a loop. - if (Current->getBlock() != KillingDef->getBlock() && + // memory location and not located in different loops. But also be + // careful with irreducible control flow, which can create cycles without + // appearing as loops. + if (((!ContainsIrreducibleLoops && + LI.getLoopFor(Current->getBlock()) != + LI.getLoopFor(KillingDef->getBlock())) || + (ContainsIrreducibleLoops && + Current->getBlock() != KillingDef->getBlock())) && !IsGuaranteedLoopInvariant(const_cast(CurrentLoc->Ptr))) { + LLVM_DEBUG(dbgs() << " ... not guaranteed loop invariant\n"); StepAgain = true; Current = CurrentDef->getDefiningAccess(); WalkerStepLimit -= 1; @@ -1833,12 +1851,13 @@ } }; -bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, - DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI) { +static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT, PostDominatorTree &PDT, + const TargetLibraryInfo &TLI, + const LoopInfo &LI) { bool MadeChange = false; - DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI); + DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { MemoryDef *KillingDef = State.MemDefs[I]; @@ -1883,9 +1902,9 @@ if (State.SkipStores.count(Current)) continue; - Optional Next = State.getDomMemoryDef( - KillingDef, Current, SILoc, SILocUnd, ScanLimit, WalkerStepLimit, - IsMemTerm, PartialLimit); + Optional Next = + State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, ScanLimit, + WalkerStepLimit, IsMemTerm, PartialLimit); if (!Next) { LLVM_DEBUG(dbgs() << " finished walk\n"); @@ -2012,8 +2031,9 @@ DominatorTree &DT = AM.getResult(F); MemorySSA &MSSA = AM.getResult(F).getMSSA(); PostDominatorTree &PDT = AM.getResult(F); + LoopInfo &LI = AM.getResult(F); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2028,6 +2048,7 @@ PA.preserveSet(); PA.preserve(); PA.preserve(); + PA.preserve(); return PA; } @@ -2053,8 +2074,9 @@ MemorySSA &MSSA = getAnalysis().getMSSA(); PostDominatorTree &PDT = getAnalysis().getPostDomTree(); + LoopInfo &LI = getAnalysis().getLoopInfo(); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2076,6 +2098,8 @@ AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } }; Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -523,8 +523,8 @@ ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: MemCpy Optimization -; GCN-O2-NEXT: Dead Store Elimination ; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Dead Store Elimination ; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: LCSSA Verifier ; GCN-O2-NEXT: Loop-Closed SSA Form Pass @@ -886,8 +886,8 @@ ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: MemCpy Optimization -; GCN-O3-NEXT: Dead Store Elimination ; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Dead Store Elimination ; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: LCSSA Verifier ; GCN-O3-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Other/opt-O2-pipeline.ll =================================================================== --- llvm/test/Other/opt-O2-pipeline.ll +++ llvm/test/Other/opt-O2-pipeline.ll @@ -166,8 +166,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Other/opt-O3-pipeline-enable-matrix.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -171,8 +171,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Other/opt-O3-pipeline.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline.ll +++ llvm/test/Other/opt-O3-pipeline.ll @@ -171,8 +171,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Other/opt-Os-pipeline.ll =================================================================== --- llvm/test/Other/opt-Os-pipeline.ll +++ llvm/test/Other/opt-Os-pipeline.ll @@ -152,8 +152,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass Index: llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll =================================================================== --- llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll +++ llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll @@ -111,7 +111,6 @@ ; CHECK: for.body4.lr.ph: ; CHECK-NEXT: [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]] -; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]] ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.body4: @@ -356,3 +355,103 @@ store i16 1, i16* %arrayidx2, align 1 ret i16 0 } + +; Similar to above, but with an irreducible loop. The stores should not be removed. +define i16 @irreducible(i1 %c) { +; CHECK-LABEL: @irreducible( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: [[I_0:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[B]] ] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 [[I_0]] +; CHECK-NEXT: br label [[B]] +; CHECK: B: +; CHECK-NEXT: [[J_0:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[I_0]], [[A]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 [[J_0]] +; CHECK-NEXT: store i16 2, i16* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i16 [[J_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i16 [[J_0]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[A]] +; CHECK: exit: +; CHECK-NEXT: store i16 1, i16* [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i16 0 +; +entry: + br i1 %c, label %A, label %B + +A: + %i.0 = phi i16 [ 0, %entry ], [ %inc, %B ] + %arrayidx2 = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 %i.0 + br label %B + +B: + %j.0 = phi i16 [ 0, %entry ], [ %i.0, %A ] + %arrayidx = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 %j.0 + store i16 2, i16* %arrayidx, align 1 + %inc = add nuw nsw i16 %j.0, 1 + %exitcond = icmp eq i16 %j.0, 4 + br i1 %exitcond, label %exit, label %A + +exit: + store i16 1, i16* %arrayidx, align 1 + ret i16 0 +} + +; An irreducible loop inside another loop. +define i16 @irreducible_nested() { +; CHECK-LABEL: @irreducible_nested( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER:%.*]] +; CHECK: outer: +; CHECK-NEXT: [[X:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INCX:%.*]], [[OUTERL:%.*]] ] +; CHECK-NEXT: [[C:%.*]] = icmp sgt i16 [[X]], 2 +; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: [[I_0:%.*]] = phi i16 [ 0, [[OUTER]] ], [ [[INC:%.*]], [[B]] ] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 [[I_0]] +; CHECK-NEXT: br label [[B]] +; CHECK: B: +; CHECK-NEXT: [[J_0:%.*]] = phi i16 [ 0, [[OUTER]] ], [ [[I_0]], [[A]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 [[J_0]] +; CHECK-NEXT: store i16 2, i16* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i16 [[J_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i16 [[J_0]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[OUTERL]], label [[A]] +; CHECK: outerl: +; CHECK-NEXT: store i16 1, i16* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INCX]] = add nuw nsw i16 [[X]], 1 +; CHECK-NEXT: [[EXITCONDX:%.*]] = icmp eq i16 [[X]], 4 +; CHECK-NEXT: br i1 [[EXITCONDX]], label [[END:%.*]], label [[OUTER]] +; CHECK: end: +; CHECK-NEXT: ret i16 0 +; +entry: + br label %outer + +outer: + %x = phi i16 [ 0, %entry ], [ %incx, %outerl ] + %c = icmp sgt i16 %x, 2 + br i1 %c, label %A, label %B + +A: + %i.0 = phi i16 [ 0, %outer ], [ %inc, %B ] + %arrayidx2 = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 %i.0 + br label %B + +B: + %j.0 = phi i16 [ 0, %outer ], [ %i.0, %A ] + %arrayidx = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 %j.0 + store i16 2, i16* %arrayidx, align 1 + %inc = add nuw nsw i16 %j.0, 1 + %exitcond = icmp eq i16 %j.0, 4 + br i1 %exitcond, label %outerl, label %A + +outerl: + store i16 1, i16* %arrayidx, align 1 + %incx = add nuw nsw i16 %x, 1 + %exitcondx = icmp eq i16 %x, 4 + br i1 %exitcondx, label %end, label %outer + +end: + ret i16 0 +}