diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -43,7 +44,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -1455,6 +1455,9 @@ SmallPtrSet InvisibleToCaller; // Keep track of blocks with throwing instructions not modeled in MemorySSA. SmallPtrSet ThrowingBlocks; + // Post-order numbers for each basic block. Used to figure out if memory + // accesses are executed before another access. + DenseMap PostOrderNumbers; DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, const TargetLibraryInfo &TLI) @@ -1466,22 +1469,27 @@ DSEState State(F, AA, MSSA, DT, PDT, TLI); // Collect blocks with throwing instructions not modeled in MemroySSA and // alloc-like objects. - for (Instruction &I : instructions(F)) { - if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) - State.ThrowingBlocks.insert(I.getParent()); - - auto *MD = dyn_cast_or_null(MSSA.getMemoryAccess(&I)); - if (MD && hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) - State.MemDefs.push_back(MD); - - // Track alloca and alloca-like objects. Here we care about objects not - // visible to the caller during function execution. Alloca objects are - // invalid in the caller, for alloca-like objects we ensure that they are - // not captured throughout the function. - if (isa(&I) || - (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) - State.InvisibleToCaller.insert(&I); + unsigned PO = 0; + for (BasicBlock *BB : post_order(&F)) { + State.PostOrderNumbers[BB] = PO++; + for (Instruction &I : *BB) { + if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) + State.ThrowingBlocks.insert(I.getParent()); + + auto *MD = dyn_cast_or_null(MSSA.getMemoryAccess(&I)); + if (MD && hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) + State.MemDefs.push_back(MD); + + // Track alloca and alloca-like objects. Here we care about objects not + // visible to the caller during function execution. Alloca objects are + // invalid in the caller, for alloca-like objects we ensure that they + // are not captured throughout the function. + if (isa(&I) || + (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) + State.InvisibleToCaller.insert(&I); + } } + // Treat byval or inalloca arguments the same as Allocas, stores to them are // dead at the end of the function. for (Argument &AI : F.args()) @@ -1554,11 +1562,12 @@ // following // * An aliasing MemoryUse (read). // * A MemoryPHI - Optional getDomMemoryDef(MemoryAccess *Current, MemoryLocation DefLoc, - bool DefVisibleToCaller, int &ScanLimit) const { - MemoryDef *StartDef = cast(Current); + Optional getDomMemoryDef(MemoryAccess *StartAccess, + MemoryAccess *Current, + MemoryLocation DefLoc, + bool DefVisibleToCaller, + int &ScanLimit) const { MemoryAccess *DomAccess; - MemoryDef *DomDef; bool StepAgain; LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current << "\n"); @@ -1569,21 +1578,25 @@ if (MSSA.isLiveOnEntryDef(Current)) return None; + if (isa(Current)) { + DomAccess = Current; + break; + } MemoryUseOrDef *CurrentUD = dyn_cast(Current); if (!CurrentUD) return None; // Look for access that clobber DefLoc. - DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess( - CurrentUD->getDefiningAccess(), DefLoc); - DomDef = dyn_cast(DomAccess); - if (!DomDef || MSSA.isLiveOnEntryDef(DomDef)) + DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD, + DefLoc); + if (MSSA.isLiveOnEntryDef(DomAccess)) return None; // Check if we can skip DomDef for DSE. - if (canSkipDef(DomDef, DefVisibleToCaller)) { + MemoryDef *DomDef = dyn_cast(DomAccess); + if (DomDef && canSkipDef(DomDef, DefVisibleToCaller)) { StepAgain = true; - Current = DomDef; + Current = DomDef->getDefiningAccess(); } } while (StepAgain); @@ -1594,7 +1607,10 @@ LLVM_DEBUG(dbgs() << "\n"); SmallVector WorkList; - auto PushMemUses = [&WorkList](MemoryAccess *Acc) { + SmallPtrSet Visited; + auto PushMemUses = [&WorkList, &Visited](MemoryAccess *Acc) { + if (!Visited.insert(Acc).second) + return; for (Use &U : Acc->uses()) WorkList.push_back(cast(U.getUser())); }; @@ -1610,10 +1626,12 @@ return None; } - // Bail out on MemoryPhis for now. if (isa(UseAccess)) { - LLVM_DEBUG(dbgs() << " ... hit MemoryPhi\n"); - return None; + if (StartAccess == UseAccess) + continue; + + PushMemUses(UseAccess); + continue; } Instruction *UseInst = cast(UseAccess)->getMemoryInst(); @@ -1631,7 +1649,7 @@ return None; } - if (StartDef == UseAccess) + if (StartAccess == UseAccess) continue; // Check all uses for MemoryDefs. We have to check uses of *all* @@ -1738,10 +1756,10 @@ DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { - MemoryDef *Current = State.MemDefs[I]; - if (State.SkipStores.count(Current)) + MemoryDef *StartDef = State.MemDefs[I]; + if (State.SkipStores.count(StartDef)) continue; - Instruction *SI = cast(Current)->getMemoryInst(); + Instruction *SI = StartDef->getMemoryInst(); MemoryLocation SILoc = *State.getLocForWriteEx(SI); assert(SILoc.Ptr && "SILoc should not be null"); const Value *SILocUnd = GetUnderlyingObject(SILoc.Ptr, DL); @@ -1752,60 +1770,96 @@ !PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT)))) DefVisibleToCaller = false; - LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " << *SI - << "\n"); + MemoryAccess *Current = StartDef; + LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " + << *StartDef << " (" << *SI << ")\n"); int ScanLimit = MemorySSAScanLimit; Optional Next; + SetVector ToCheck; + ToCheck.insert(StartDef->getDefiningAccess()); // Walk MemorySSA upward to find MemoryDefs that might be killed by SI. - while ((Next = State.getDomMemoryDef(Current, SILoc, DefVisibleToCaller, - ScanLimit))) { + for (unsigned I = 0; I < ToCheck.size(); I++) { + Current = ToCheck[I]; + if (State.SkipStores.count(Current)) + continue; + + Next = State.getDomMemoryDef(StartDef, Current, SILoc, DefVisibleToCaller, + ScanLimit); + + if (!Next) { + LLVM_DEBUG(dbgs() << " finished walk\n"); + continue; + } + MemoryAccess *DomAccess = *Next; LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess << "\n"); + if (isa(DomAccess)) { + for (Value *V : cast(DomAccess)->incoming_values()) { + MemoryAccess *IncomingAccess = cast(V); + BasicBlock *IncomingBlock = IncomingAccess->getBlock(); + BasicBlock *PhiBlock = DomAccess->getBlock(); + + // We only consider incoming MemoryAccesses that come before the + // MemoryPhi. Otherwise we could discover candidates that do not + // strictly dominate our starting def. + if (State.PostOrderNumbers[IncomingBlock] > + State.PostOrderNumbers[PhiBlock]) + ToCheck.insert(IncomingAccess); + } + continue; + } MemoryDef *NextDef = dyn_cast(DomAccess); Instruction *NI = NextDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " def " << *NI << "\n"); - if (!hasAnalyzableMemoryWrite(NI, TLI)) - break; + if (!hasAnalyzableMemoryWrite(NI, TLI)) { + LLVM_DEBUG(dbgs() << " skip, cannot analyze def\n"); + continue; + } + MemoryLocation NILoc = *State.getLocForWriteEx(NI); // Check for anything that looks like it will be a barrier to further // removal if (State.isDSEBarrier(SI, SILoc, SILocUnd, NI, NILoc)) { - LLVM_DEBUG(dbgs() << " stop, barrier\n"); - break; + LLVM_DEBUG(dbgs() << " skip, barrier\n"); + continue; } - // We must post-dom the instructions to safely remove it. Continue checking from NextDef + // We must post-dom the instructions to safely remove it. Continue + // checking from NextDef if (!PDT.dominates(SI->getParent(), NextDef->getBlock())) { - Current = NextDef; - LLVM_DEBUG(dbgs() << " continue, not post-dominating!\n"); + ToCheck.insert(NextDef->getDefiningAccess()); + LLVM_DEBUG(dbgs() << " skip, not post-dominating!\n"); continue; } // Before we try to remove anything, check for any extra throwing // instructions that block us from DSEing if (State.mayThrowBetween(SI, NI, SILocUnd)) { - LLVM_DEBUG(dbgs() << " stop, may throw!\n"); + LLVM_DEBUG(dbgs() << " skip, may throw!\n"); break; } + if (!DebugCounter::shouldExecute(MemorySSACounter)) + break; + // Check if NI overwrites SI. int64_t InstWriteOffset, DepWriteOffset; InstOverlapIntervalsTy IOL; OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset, InstWriteOffset, NI, IOL, AA, &F); + ToCheck.insert(NextDef->getDefiningAccess()); if (OR == OW_Complete) { LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI << "\n KILLER: " << *SI << '\n'); State.deleteDeadInstruction(NI); ++NumFastStores; MadeChange = true; - } else - Current = NextDef; + } + } } - } return MadeChange; } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll @@ -18,7 +18,6 @@ ; CHECK-NEXT: invoke void @f() ; CHECK-NEXT: to label [[BLOCK3:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: block3: -; CHECK-NEXT: store i32 30, i32* [[SV]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[CS1:%.*]] = catchswitch within none [label %catch] unwind label [[CLEANUP:%.*]] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll @@ -27,10 +27,9 @@ define void @test14(i32* noalias %P) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret void @@ -169,3 +168,39 @@ %exitcond29 = icmp eq i32 %inc11, %N br i1 %exitcond29, label %for.cond.cleanup, label %for.body4.lr.ph } + +%struct.hoge = type { i32, i32 } + +@global = external local_unnamed_addr global %struct.hoge*, align 8 + +define void @widget(i8* %tmp) { +; CHECK-LABEL: @widget( +; CHECK-NEXT: bb: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP:%.*]], i8* nonnull align 16 undef, i64 64, i1 false) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[TMP2]], i64 undef, i32 1 +; CHECK-NEXT: store i32 0, i32* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[TMP4]], i64 undef, i32 1 +; CHECK-NEXT: store i32 10, i32* [[TMP5]], align 4 +; CHECK-NEXT: br label [[BB1]] +; +bb: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %tmp, i8* nonnull align 16 undef, i64 64, i1 false) + br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp2 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp3 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp2, i64 undef, i32 1 + store i32 0, i32* %tmp3, align 4 + %tmp4 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp5 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp4, i64 undef, i32 1 + store i32 10, i32* %tmp5, align 4 + br label %bb1 +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) + + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll @@ -33,13 +33,11 @@ ; CHECK-LABEL: @test5( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: store i32 1, i32* [[P]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 @@ -58,13 +56,12 @@ ; CHECK-LABEL: @test8( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: store i32 1, i32* [[Q:%.*]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll @@ -112,3 +112,105 @@ store i32 0, i32* %P ret void } + + +define void @test10(i32* %P) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test11() { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[P:%.*]] = alloca i32 +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + %P = alloca i32 + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test12(i32* %P) { +; CHECK-LABEL: @test12( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + ret void +bb3: + ret void +} + + +define void @test13(i32* %P) { +; CHECK-LABEL: @test13( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + br label %bb3 +bb3: + ret void +}