Index: llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -51,6 +51,8 @@
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
@@ -109,6 +111,20 @@
 DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
               "Controls which MemoryDefs are eliminated.");
 
+// If enabled, this optimizes loop patterns like this:
+//   memset(P, 0, sizeof(int) * N);  <- removed
+//   for (int i = 0; i < N; ++i)
+//     P[i] = 1;
+static cl::opt<bool> EnableOptimizationAcrossLoops(
+    "enable-dse-optimize-across-loops", cl::init(true), cl::Hidden,
+    cl::desc("Enable elimination of redundant "
+             "loop-variant stores and memory intrinsics"));
+
+static cl::opt<unsigned> AcrossLoopStoreThreshold(
+    "dse-across-loop-store-threshold", cl::init(4), cl::Hidden,
+    cl::desc("The maximum number of stores across loops to optimize"));
+
 static cl::opt<bool>
 EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
   cl::init(true), cl::Hidden,
@@ -725,10 +741,33 @@
   return false;
 }
 
+struct ContinuousMemoryRange {
+  const SCEV *Start;
+  const SCEV *Length;
+
+  ContinuousMemoryRange(const SCEV *Start, const SCEV *Length)
+      : Start(Start), Length(Length) {}
+
+  static ContinuousMemoryRange createEmpty() {
+    return ContinuousMemoryRange(nullptr, nullptr);
+  }
+
+  bool isEmpty() { return Start == nullptr || Length == nullptr; }
+
+  bool operator==(const ContinuousMemoryRange &Other) const {
+    return Start == Other.Start && Length == Other.Length;
+  }
+
+  bool operator!=(const ContinuousMemoryRange &Other) const {
+    return !(*this == Other);
+  }
+};
+
 struct DSEState {
   Function &F;
   AliasAnalysis &AA;
   EarliestEscapeInfo EI;
+  ScalarEvolution *SE;
 
   /// The single BatchAA instance that is used to cache AA queries. It will
   /// not be invalidated over the whole run. This is safe, because:
@@ -768,6 +807,8 @@
   // analysis.
   SmallPtrSet<const Value *, 4> EphValues;
 
+  DenseMap<const Instruction *, ContinuousMemoryRange> MemRanges;
+
   /// Keep track of instructions (partly) overlapping with killing MemoryDefs
   /// per basic block.
   MapVector<BasicBlock *, InstOverlapIntervalsTy> IOLs;
@@ -785,11 +826,30 @@
   DSEState(const DSEState &) = delete;
   DSEState &operator=(const DSEState &) = delete;
 
+  void prepareStateForAcrossLoopOptimization() {
+    MemDefs.clear();
+    SkipStores.clear();
+    for (auto *L : LI) {
+      if (!canAddMoreStoresFromLoop()) {
+        break;
+      }
+      if (!L->isInnermost() || !L->isOutermost()) {
+        continue;
+      }
+
+      if (L->getNumBlocks() != 1) {
+        continue;
+      }
+
+      prepareLoopVariantStoresWithRanges(L);
+    }
+  }
+
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
-           PostDominatorTree &PDT, AssumptionCache &AC,
+           PostDominatorTree &PDT, ScalarEvolution *SE, AssumptionCache &AC,
            const TargetLibraryInfo &TLI, const LoopInfo &LI)
       : F(F), AA(AA), EI(DT, LI, EphValues), BatchAA(AA, &EI), MSSA(MSSA),
-        DT(DT), PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) {
+        DT(DT), PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI),
+        SE(SE) {
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
     unsigned PO = 0;
@@ -823,6 +883,106 @@
     CodeMetrics::collectEphemeralValues(&F, &AC, EphValues);
   }
 
+  bool canAddMoreStoresFromLoop() {
+    return MemDefs.size() <= AcrossLoopStoreThreshold;
+  }
+
+  void prepareLoopVariantStoresWithRanges(Loop *L) {
+    auto *BB = L->getLoopLatch();
+    if (ThrowingBlocks.count(BB))
+      return;
+
+    for (auto &I : *BB) {
+      if (I.mayThrow())
+        break;
+      if (auto *S = dyn_cast<StoreInst>(&I))
+        if (auto *MD = dyn_cast<MemoryDef>(MSSA.getMemoryAccess(S))) {
+          if (S->isVolatile() || S->isAtomic()) {
+            continue;
+          }
+
+          auto Range = computeMemoryRange(L, S);
+          if (!Range.isEmpty()) {
+            MemDefs.push_back(MD);
+            addMemRange(S, Range);
+            // Do not waste resources on analyzing this block further.
+            // One store is enough to eliminate memsets or stores that
+            // are killed by this loop.
+            return;
+          }
+        }
+    }
+  }
+
+  void addMemRange(Instruction *I, ContinuousMemoryRange &Range) {
+    if (Range.isEmpty())
+      return;
+    MemRanges.insert({I, Range});
+  }
+
+  ContinuousMemoryRange computeMemoryRange(const AnyMemIntrinsic *MemIntr) {
+    const auto *Length = SE->getSCEV(MemIntr->getLength());
+    if (isa<SCEVCouldNotCompute>(Length))
+      return ContinuousMemoryRange::createEmpty();
+    auto *Start = SE->getSCEV(MemIntr->getDest());
+    return ContinuousMemoryRange(Start, Length);
+  }
+
+  // Compute the contiguous byte range written by an affine store across the
+  // whole loop: the range starts at the recurrence's base address and its
+  // length is (BackedgeTakenCount + 1) * StoreSize.
+  ContinuousMemoryRange computeMemoryRange(Loop *L, StoreInst *Store) {
+    auto *GEP = dyn_cast<GetElementPtrInst>(Store->getPointerOperand());
+    if (!GEP)
+      return ContinuousMemoryRange::createEmpty();
+
+    // The GEP must be in the same loop as the store that uses it.
+    if (!L->contains(GEP))
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *Type = Store->getValueOperand()->getType();
+    if (!Type->isFloatingPointTy() && !Type->isIntegerTy())
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *StoreRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(GEP));
+    if (!StoreRec)
+      return ContinuousMemoryRange::createEmpty();
+
+    if (!StoreRec->isAffine())
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *BTC = SE->getBackedgeTakenCount(L);
+    if (isa<SCEVCouldNotCompute>(BTC))
+      return ContinuousMemoryRange::createEmpty();
+
+    const auto *Start = StoreRec->getOperand(0);
+    if (isa<SCEVCouldNotCompute>(Start))
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *Count = SE->getAddExpr(BTC, SE->getOne(BTC->getType()));
+    auto SizeInBytes =
+        Store->getValueOperand()->getType()->getScalarSizeInBits() / 8;
+    auto *StoreSize = SE->getConstant(Count->getType(), SizeInBytes);
+
+    // The store must advance by exactly its own size on every iteration, so
+    // that the written bytes form a single contiguous range.
+    auto *Step = StoreRec->getOperand(1);
+    if (Step != StoreSize)
+      return ContinuousMemoryRange::createEmpty();
+
+    const auto *Length = SE->getMulExpr(Count, StoreSize);
+    if (isa<SCEVCouldNotCompute>(Length))
+      return ContinuousMemoryRange::createEmpty();
+    return ContinuousMemoryRange(Start, Length);
+  }
+
+  bool isOptimizableAcrossLoops(const Instruction *KillingI,
+                                const Instruction *DeadI) {
+    if (!EnableOptimizationAcrossLoops)
+      return false;
+
+    // Alias analysis can already handle two memory intrinsics, so skip that
+    // case here.
+    if (isa<AnyMemIntrinsic>(DeadI) && isa<AnyMemIntrinsic>(KillingI))
+      return false;
+    return true;
+  }
+
   /// Return 'OW_Complete' if a store to the 'KillingLoc' location (by \p
   /// KillingI instruction) completely overwrites a store to the 'DeadLoc'
   /// location (by \p DeadI instruction).
@@ -836,6 +996,10 @@
                               const MemoryLocation &KillingLoc,
                               const MemoryLocation &DeadLoc,
                               int64_t &KillingOff, int64_t &DeadOff) {
+
+    if (isOptimizableAcrossLoops(KillingI, DeadI))
+      if (isLoopVariantStoreOverwrite(KillingI, DeadI))
+        return OW_Complete;
     // AliasAnalysis does not always account for loops. Limit overwrite checks
     // to dependencies for which we can guarantee they are independent of any
     // loops they are in.
@@ -1327,9 +1491,11 @@
       // candidates for which we can guarantee they always store to the same
       // memory location and not located in different loops.
       if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
-        LLVM_DEBUG(dbgs() << "  ... not guaranteed loop independent\n");
-        CanOptimize = false;
-        continue;
+        if (!EnableOptimizationAcrossLoops || MemRanges.empty()) {
+          LLVM_DEBUG(dbgs() << "  ... not guaranteed loop independent\n");
+          CanOptimize = false;
+          continue;
+        }
       }
 
       if (IsMemTerm) {
@@ -1466,8 +1632,11 @@
       // store kills itself.
       if (MaybeDeadAccess == UseAccess &&
           !isGuaranteedLoopInvariant(MaybeDeadLoc.Ptr)) {
-        LLVM_DEBUG(dbgs() << "    ... found not loop invariant self access\n");
-        return None;
+        if (!EnableOptimizationAcrossLoops || !MemRanges.count(MaybeDeadI)) {
+          LLVM_DEBUG(dbgs()
+                     << "    ... found not loop invariant self access\n");
+          return None;
+        }
       }
       // Otherwise, for the KillingDef and MaybeDeadAccess we only have to check
       // if it reads the memory location.
@@ -1515,6 +1684,7 @@
     SmallPtrSet<BasicBlock *, 16> KillingBlocks;
     for (Instruction *KD : KillingDefs)
       KillingBlocks.insert(KD->getParent());
+
     assert(!KillingBlocks.empty() &&
            "Expected at least a single killing block");
@@ -1583,6 +1753,27 @@
     return {MaybeDeadAccess};
   }
 
+  bool isLoopVariantStoreOverwrite(const Instruction *KillingI,
+                                   const Instruction *DeadI) {
+    auto KillingRange = MemRanges.find(KillingI);
+    auto DeadRange = MemRanges.find(DeadI);
+
+    if (MemRanges.count(DeadI) && MemRanges.count(KillingI))
+      return DeadRange->second == KillingRange->second;
+
+    if (MemRanges.count(DeadI) && isa<AnyMemIntrinsic>(KillingI)) {
+      auto MemIntrRange = computeMemoryRange(cast<AnyMemIntrinsic>(KillingI));
+      if (!MemIntrRange.isEmpty())
+        return DeadRange->second == MemIntrRange;
+    } else if (MemRanges.count(KillingI) && isa<AnyMemIntrinsic>(DeadI)) {
+      auto MemIntrRange = computeMemoryRange(cast<AnyMemIntrinsic>(DeadI));
+      if (!MemIntrRange.isEmpty())
+        return KillingRange->second == MemIntrRange;
+    }
+
+    return false;
+  }
+
   // Delete dead memory defs
   void deleteDeadInstruction(Instruction *SI) {
     MemorySSAUpdater Updater(&MSSA);
@@ -1949,15 +2140,8 @@
   }
 };
 
-static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
-                                DominatorTree &DT, PostDominatorTree &PDT,
-                                AssumptionCache &AC,
-                                const TargetLibraryInfo &TLI,
-                                const LoopInfo &LI) {
+static bool runDSEOptimizationLoop(DSEState &State) {
   bool MadeChange = false;
-
-  MSSA.ensureOptimizedUses();
-  DSEState State(F, AA, MSSA, DT, PDT, AC, TLI, LI);
   // For each store:
   for (unsigned I = 0; I < State.MemDefs.size(); I++) {
     MemoryDef *KillingDef = State.MemDefs[I];
@@ -2066,10 +2250,10 @@
       // We are re-using tryToMergePartialOverlappingStores, which requires
       // DeadSI to dominate DeadSI.
       // TODO: implement tryToMergeParialOverlappingStores using MemorySSA.
-      if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) {
+      if (DeadSI && KillingSI && State.DT.dominates(DeadSI, KillingSI)) {
        if (Constant *Merged = tryToMergePartialOverlappingStores(
                KillingSI, DeadSI, KillingOffset, DeadOffset, State.DL,
-               State.BatchAA, &DT)) {
+               State.BatchAA, &State.DT)) {
          // Update stored value of earlier store to merged constant.
          DeadSI->setOperand(0, Merged);
@@ -2117,13 +2301,39 @@
       continue;
     }
   }
+  return MadeChange;
+}
+
+static bool runAcrossLoopDSE(DSEState &State) {
+  if (!EnableOptimizationAcrossLoops) {
+    return false;
+  }
+  State.prepareStateForAcrossLoopOptimization();
+  return runDSEOptimizationLoop(State);
+}
+
+static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
+                                DominatorTree &DT, PostDominatorTree &PDT,
+                                ScalarEvolution *SE,
+                                AssumptionCache &AC,
+                                const TargetLibraryInfo &TLI,
+                                const LoopInfo &LI) {
+
+  MSSA.ensureOptimizedUses();
+  DSEState State(F, AA, MSSA, DT, PDT, SE, AC, TLI, LI);
+  bool MadeChange = false;
+  MadeChange |= runDSEOptimizationLoop(State);
 
   if (EnablePartialOverwriteTracking)
     for (auto &KV : State.IOLs)
       MadeChange |= State.removePartiallyOverlappedStores(KV.second);
 
   MadeChange |= State.eliminateRedundantStoresOfExistingValues();
   MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
+  // Optimize across loops at the end because there should be fewer stores
+  // to analyze by then. Also, stores that overwrite other stores in the
+  // same loop have already been removed.
+  MadeChange |= runAcrossLoopDSE(State);
   return MadeChange;
 }
 } // end anonymous namespace
@@ -2140,7 +2350,11 @@
   AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
   LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
 
-  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, AC, TLI, LI);
+  ScalarEvolution *SE = nullptr;
+  if (EnableOptimizationAcrossLoops)
+    SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+
+  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, AC, TLI, LI);
 
 #ifdef LLVM_ENABLE_STATS
   if (AreStatisticsEnabled())
@@ -2148,10 +2362,14 @@
       NumRemainingStores += isa<StoreInst>(&I);
 #endif
 
-  if (!Changed)
+  if (!Changed) {
     return PreservedAnalyses::all();
-
+  }
   PreservedAnalyses PA;
+  if (EnableOptimizationAcrossLoops) {
+    PA.preserve<ScalarEvolutionAnalysis>();
+  }
+
   PA.preserveSet<CFGAnalyses>();
   PA.preserve<MemorySSAAnalysis>();
   PA.preserve<LoopAnalysis>();
@@ -2183,8 +2401,11 @@
     AssumptionCache &AC =
         getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-
-    bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, AC, TLI, LI);
+    ScalarEvolution *SE = nullptr;
+    if (EnableOptimizationAcrossLoops) {
+      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    }
+    bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, AC, TLI, LI);
 
 #ifdef LLVM_ENABLE_STATS
     if (AreStatisticsEnabled())
@@ -2197,6 +2418,10 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    if (EnableOptimizationAcrossLoops) {
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
+    }
     AU.addRequired<AAResultsWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
@@ -2227,6 +2452,7 @@
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
                     false)
Index: llvm/test/Transforms/DeadStoreElimination/loop-variant-store-complete-overwrite.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/DeadStoreElimination/loop-variant-store-complete-overwrite.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S --passes=dse < %s | FileCheck %s
+define dso_local void @init_vector(ptr %p) {
+; CHECK-LABEL: @init_vector(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[N_06:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[N_06]] to i32
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[N_06]]
+; CHECK-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[N_06]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[NRVO_SKIPDTOR:%.*]], label [[FOR_BODY]]
+; CHECK:       nrvo.skipdtor:
+; CHECK-NEXT:    ret void
+;
+entry:
+; 1 = MemoryDef(liveOnEntry)
+  call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 16384, i1 false)
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; 3 = MemoryPhi({entry,1},{for.body,2})
+  %n.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %n.06 to i32
+  %arrayidx = getelementptr inbounds i32, ptr %p, i64 %n.06
+; 2 = MemoryDef(3)
+  store i32 %conv, ptr %arrayidx, align 4
+  %inc = add nuw nsw i64 %n.06, 1
+  %exitcond.not = icmp eq i64 %inc, 4096
+  br i1 %exitcond.not, label %nrvo.skipdtor, label %for.body
+
+nrvo.skipdtor:                                    ; preds = %for.body
+  ret void
+}
+
+define dso_local void @store_in_two_loops(ptr nocapture noundef writeonly %P, i32 noundef %N) {
+; CHECK-LABEL: @store_in_two_loops(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP17_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP17_NOT]], label [[FOR_COND_CLEANUP4:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond2.preheader:
+; CHECK-NEXT:    br i1 [[CMP17_NOT]], label [[FOR_COND_CLEANUP4]], label [[FOR_BODY5_PREHEADER:%.*]]
+; CHECK:       for.body5.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT25:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY5:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND2_PREHEADER:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup4:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body5:
+; CHECK-NEXT:    [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[FOR_BODY5_PREHEADER]] ], [ [[INDVARS_IV_NEXT23:%.*]], [[FOR_BODY5]] ]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV22]]
+; CHECK-NEXT:    store i32 1, ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT23]] = add nuw nsw i64 [[INDVARS_IV22]], 1
+; CHECK-NEXT:    [[EXITCOND26_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT23]], [[WIDE_TRIP_COUNT25]]
+; CHECK-NEXT:    br i1 [[EXITCOND26_NOT]], label [[FOR_COND_CLEANUP4]], label [[FOR_BODY5]]
+;
+entry:
+  %cmp17.not = icmp eq i32 %N, 0
+  br i1 %cmp17.not, label %for.cond.cleanup4, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond2.preheader:                              ; preds = %for.body
+  br i1 %cmp17.not, label %for.cond.cleanup4, label %for.body5.preheader
+
+for.body5.preheader:                              ; preds = %for.cond2.preheader
+  %wide.trip.count25 = zext i32 %N to i64
+  br label %for.body5
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+; 5 = MemoryPhi({for.body.preheader,liveOnEntry},{for.body,1})
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %P, i64 %indvars.iv
+; 1 = MemoryDef(5)
+  store i32 1, ptr %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond2.preheader, label %for.body
+
+for.cond.cleanup4:                                ; preds = %for.body5, %for.cond2.preheader, %entry
+; 3 = MemoryPhi({entry,liveOnEntry},{for.cond2.preheader,1},{for.body5,2})
+  ret void
+
+for.body5:                                        ; preds = %for.body5, %for.body5.preheader
+; 4 = MemoryPhi({for.body5.preheader,1},{for.body5,2})
+  %indvars.iv22 = phi i64 [ 0, %for.body5.preheader ], [ %indvars.iv.next23, %for.body5 ]
+  %arrayidx7 = getelementptr inbounds i32, ptr %P, i64 %indvars.iv22
+; 2 = MemoryDef(4)
+  store i32 1, ptr %arrayidx7, align 4
+  %indvars.iv.next23 = add nuw nsw i64 %indvars.iv22, 1
+  %exitcond26.not = icmp eq i64 %indvars.iv.next23, %wide.trip.count25
+  br i1 %exitcond26.not, label %for.cond.cleanup4, label %for.body5
+}
+
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)