diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -43,7 +44,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -1461,6 +1461,9 @@ SmallPtrSet InvisibleToCaller; // Keep track of blocks with throwing instructions not modeled in MemorySSA. SmallPtrSet ThrowingBlocks; + // Post-order numbers for each basic block. Used to figure out if memory + // accesses are executed before another access. + DenseMap PostOrderNumbers; DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, const TargetLibraryInfo &TLI) @@ -1472,23 +1475,28 @@ DSEState State(F, AA, MSSA, DT, PDT, TLI); // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. - for (Instruction &I : instructions(F)) { - if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) - State.ThrowingBlocks.insert(I.getParent()); - - auto *MD = dyn_cast_or_null(MSSA.getMemoryAccess(&I)); - if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && - hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) - State.MemDefs.push_back(MD); - - // Track alloca and alloca-like objects. Here we care about objects not - // visible to the caller during function execution. Alloca objects are - // invalid in the caller, for alloca-like objects we ensure that they are - // not captured throughout the function. - if (isa(&I) || - (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) - State.InvisibleToCaller.insert(&I); + unsigned PO = 0; + for (BasicBlock *BB : post_order(&F)) { + State.PostOrderNumbers[BB] = PO++; + for (Instruction &I : *BB) { + if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) + State.ThrowingBlocks.insert(I.getParent()); + + auto *MD = dyn_cast_or_null(MSSA.getMemoryAccess(&I)); + if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && + hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) + State.MemDefs.push_back(MD); + + // Track alloca and alloca-like objects. Here we care about objects not + // visible to the caller during function execution. Alloca objects are + // invalid in the caller, for alloca-like objects we ensure that they + // are not captured throughout the function. + if (isa(&I) || + (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) + State.InvisibleToCaller.insert(&I); + } } + // Treat byval or inalloca arguments the same as Allocas, stores to them are // dead at the end of the function. for (Argument &AI : F.args()) @@ -1568,8 +1576,7 @@ MemoryLocation DefLoc, bool DefVisibleToCaller, int &ScanLimit) const { - MemoryDef *DomDef; - MemoryAccess *StartDef = Current; + MemoryAccess *DomAccess; bool StepAgain; LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current << "\n"); @@ -1580,37 +1587,41 @@ if (MSSA.isLiveOnEntryDef(Current)) return None; + if (isa(Current)) { + DomAccess = Current; + break; + } MemoryUseOrDef *CurrentUD = dyn_cast(Current); if (!CurrentUD) return None; // Look for access that clobber DefLoc. - MemoryAccess *DomAccess = - MSSA.getSkipSelfWalker()->getClobberingMemoryAccess( - CurrentUD->getDefiningAccess(), DefLoc); - DomDef = dyn_cast(DomAccess); - if (!DomDef || MSSA.isLiveOnEntryDef(DomDef)) + DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD, + DefLoc); + if (MSSA.isLiveOnEntryDef(DomAccess)) return None; // Check if we can skip DomDef for DSE. We also require the KillingDef // execute whenever DomDef executes and use post-dominance to ensure that. - if (canSkipDef(DomDef, DefVisibleToCaller) || + + MemoryDef *DomDef = dyn_cast(DomAccess); + if ((DomDef && canSkipDef(DomDef, DefVisibleToCaller)) || !PDT.dominates(KillingDef->getBlock(), DomDef->getBlock())) { StepAgain = true; - Current = DomDef; + Current = DomDef->getDefiningAccess(); } } while (StepAgain); - LLVM_DEBUG(dbgs() << " Checking for reads of " << *DomDef << " (" - << *DomDef->getMemoryInst() << ")\n"); + LLVM_DEBUG(dbgs() << " Checking for reads of " << *DomAccess << " (" + << *cast(DomAccess)->getMemoryInst() << ")\n"); SmallSetVector WorkList; auto PushMemUses = [&WorkList](MemoryAccess *Acc) { for (Use &U : Acc->uses()) WorkList.insert(cast(U.getUser())); }; - PushMemUses(DomDef); + PushMemUses(DomAccess); // Check if DomDef may be read. for (unsigned I = 0; I < WorkList.size(); I++) { @@ -1622,10 +1633,12 @@ return None; } - // Bail out on MemoryPhis for now. if (isa(UseAccess)) { - LLVM_DEBUG(dbgs() << " ... hit MemoryPhi\n"); - return None; + if (KillingDef == UseAccess) + continue; + + PushMemUses(UseAccess); + continue; } Instruction *UseInst = cast(UseAccess)->getMemoryInst(); @@ -1643,7 +1656,7 @@ return None; } - if (StartDef == UseAccess) + if (KillingDef == UseAccess) continue; // Check all uses for MemoryDefs, except for defs completely overwriting @@ -1662,8 +1675,8 @@ } } - // No aliasing MemoryUses of DomDef found, DomDef is potentially dead. - return {DomDef}; + // No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead. + return {DomAccess}; } // Delete dead memory defs @@ -1749,10 +1762,10 @@ DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { - MemoryDef *Current = State.MemDefs[I]; - if (State.SkipStores.count(Current)) + MemoryDef *KillingDef = State.MemDefs[I]; + if (State.SkipStores.count(KillingDef)) continue; - Instruction *SI = cast(Current)->getMemoryInst(); + Instruction *SI = KillingDef->getMemoryInst(); MemoryLocation SILoc = *State.getLocForWriteEx(SI); assert(SILoc.Ptr && "SILoc should not be null"); const Value *SILocUnd = GetUnderlyingObject(SILoc.Ptr, DL); @@ -1763,35 +1776,66 @@ !PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT)))) DefVisibleToCaller = false; - LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " << *SI - << "\n"); + MemoryAccess *Current = KillingDef; + LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " + << *KillingDef << " (" << *SI << ")\n"); int ScanLimit = MemorySSAScanLimit; - MemoryDef *StartDef = Current; Optional Next; + SetVector ToCheck; + ToCheck.insert(KillingDef->getDefiningAccess()); // Walk MemorySSA upward to find MemoryDefs that might be killed by SI. - while ((Next = State.getDomMemoryDef(StartDef, Current, SILoc, - DefVisibleToCaller, ScanLimit))) { + for (unsigned I = 0; I < ToCheck.size(); I++) { + Current = ToCheck[I]; + if (State.SkipStores.count(Current)) + continue; + + Next = State.getDomMemoryDef(KillingDef, Current, SILoc, + DefVisibleToCaller, ScanLimit); + + if (!Next) { + LLVM_DEBUG(dbgs() << " finished walk\n"); + continue; + } + MemoryAccess *DomAccess = *Next; LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess << "\n"); + if (isa(DomAccess)) { + for (Value *V : cast(DomAccess)->incoming_values()) { + MemoryAccess *IncomingAccess = cast(V); + BasicBlock *IncomingBlock = IncomingAccess->getBlock(); + BasicBlock *PhiBlock = DomAccess->getBlock(); + + // We only consider incoming MemoryAccesses that come before the + // MemoryPhi. Otherwise we could discover candidates that do not + // strictly dominate our starting def. + if (State.PostOrderNumbers[IncomingBlock] > + State.PostOrderNumbers[PhiBlock]) + ToCheck.insert(IncomingAccess); + } + continue; + } MemoryDef *NextDef = dyn_cast(DomAccess); Instruction *NI = NextDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " def " << *NI << "\n"); - if (!hasAnalyzableMemoryWrite(NI, TLI)) - break; + if (!hasAnalyzableMemoryWrite(NI, TLI)) { + LLVM_DEBUG(dbgs() << " skip, cannot analyze def\n"); + continue; + } + MemoryLocation NILoc = *State.getLocForWriteEx(NI); // Check for anything that looks like it will be a barrier to further // removal if (State.isDSEBarrier(SI, SILoc, SILocUnd, NI, NILoc)) { - LLVM_DEBUG(dbgs() << " stop, barrier\n"); - break; + LLVM_DEBUG(dbgs() << " skip, barrier\n"); + continue; } // Before we try to remove anything, check for any extra throwing // instructions that block us from DSEing if (State.mayThrowBetween(SI, NI, SILocUnd)) { - LLVM_DEBUG(dbgs() << " stop, may throw!\n"); + LLVM_DEBUG(dbgs() << " skip, may throw!\n"); break; } @@ -1804,16 +1848,16 @@ OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset, InstWriteOffset, NI, IOL, AA, &F); + ToCheck.insert(NextDef->getDefiningAccess()); if (OR == OW_Complete) { LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI << "\n KILLER: " << *SI << '\n'); State.deleteDeadInstruction(NI); ++NumFastStores; MadeChange = true; - } else - Current = NextDef; + } + } } - } return MadeChange; } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll @@ -18,7 +18,6 @@ ; CHECK-NEXT: invoke void @f() ; CHECK-NEXT: to label [[BLOCK3:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: block3: -; CHECK-NEXT: store i32 30, i32* [[SV]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[CS1:%.*]] = catchswitch within none [label %catch] unwind label [[CLEANUP:%.*]] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll @@ -27,10 +27,9 @@ define void @test14(i32* noalias %P) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret void @@ -282,3 +281,36 @@ ret void } +%struct.hoge = type { i32, i32 } + +@global = external local_unnamed_addr global %struct.hoge*, align 8 + +define void @widget(i8* %tmp) { +; CHECK-LABEL: @widget( +; CHECK-NEXT: bb: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP:%.*]], i8* nonnull align 16 undef, i64 64, i1 false) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[TMP2]], i64 undef, i32 1 +; CHECK-NEXT: store i32 0, i32* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[TMP4]], i64 undef, i32 1 +; CHECK-NEXT: store i32 10, i32* [[TMP5]], align 4 +; CHECK-NEXT: br label [[BB1]] +; +bb: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %tmp, i8* nonnull align 16 undef, i64 64, i1 false) + br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp2 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp3 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp2, i64 undef, i32 1 + store i32 0, i32* %tmp3, align 4 + %tmp4 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp5 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp4, i64 undef, i32 1 + store i32 10, i32* %tmp5, align 4 + br label %bb1 +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll @@ -33,13 +33,11 @@ ; CHECK-LABEL: @test5( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: store i32 1, i32* [[P]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 @@ -58,13 +56,12 @@ ; CHECK-LABEL: @test8( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: store i32 1, i32* [[Q:%.*]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 @@ -115,7 +112,6 @@ ; CHECK: bb1: ; CHECK-NEXT: br i1 [[C2:%.*]], label [[BB2:%.*]], label [[BB3]] ; CHECK: bb2: -; CHECK-NEXT: store i32 -1, i32* [[PTR:%.*]], align 4 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: br label [[BB4:%.*]] @@ -126,7 +122,7 @@ ; CHECK-NEXT: i32 2, label [[BB7:%.*]] ; CHECK-NEXT: ] ; CHECK: bb5: -; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4 +; CHECK-NEXT: store i32 0, i32* [[PTR:%.*]], align 4 ; CHECK-NEXT: br label [[BB8]] ; CHECK: bb6: ; CHECK-NEXT: store i32 1, i32* [[PTR]], align 4 diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll @@ -127,7 +127,7 @@ ; CHECK: bb1: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: ret void +; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: store i32 2, i32* [[P]] ; CHECK-NEXT: ret void @@ -142,8 +142,109 @@ bb1: br label %bb3 bb2: - ret void + br label %bb3 bb3: store i32 2, i32* %P ret void } + +define void @test10(i32* %P) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test11() { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[P:%.*]] = alloca i32 +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + %P = alloca i32 + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test12(i32* %P) { +; CHECK-LABEL: @test12( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + ret void +bb3: + ret void +} + + +define void @test13(i32* %P) { +; CHECK-LABEL: @test13( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + br label %bb3 +bb3: + ret void +}