Index: llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -38,8 +38,10 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -47,7 +49,10 @@
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -79,8 +84,11 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -108,9 +116,64 @@
 STATISTIC(NumDomMemDefChecks, "Number iterations check for reads in getDomMemoryDef");
+STATISTIC(NotRotatedFormLoop, "Number of loops that are not in rotated form "
+                              "and could not be transformed into it");
+STATISTIC(NotSimplifyFormLoop, "Number of loops that are not in loop-simplify "
+                               "form and could not be transformed into it");
+STATISTIC(ComputedStoreMemRanges,
+          "Number of memory ranges computed for stores");
+STATISTIC(ComputedMemIntrMemRanges,
+          "Number of memory ranges computed for memory intrinsics");
+STATISTIC(ReplacedGuard,
+          "Number of store blocks replaced by a loop guard in the killing "
+          "block set");
+STATISTIC(MissedGuard, "Number of missed guards");
+STATISTIC(TooManyBlocksInLoop,
+          "Number of loops that have too many blocks (above the threshold)");
+STATISTIC(
+    VolatileOrAtomicInLoop,
+    "Number of stores in loops skipped because the store is volatile or atomic");
+STATISTIC(LoopCount, "Number of loops to handle");
+STATISTIC(CollectedStoreCount, "Number of collected loop-variant stores");
+STATISTIC(FailedToComputeRangeForStore,
+          "Number of stores without a computed memory range");
+STATISTIC(
+    MayThrowInLoop,
+    "Number of loops skipped because they contain an instruction that may throw");
+STATISTIC(OuterOrInnermost, "Number of loops skipped because they are nested "
+                            "in another loop or have loops nested in them");
+STATISTIC(LoopVariantStoreRemoved, "Number of deleted loop-variant stores");
+STATISTIC(RemovedMemIntrWithRange,
+          "Number of deleted memory intrinsics with a computed memory range");
+
 DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
               "Controls which MemoryDefs are eliminated.");

+// If enabled, this optimizes loop patterns like these:
+//   for (int i = 0; i < N; ++i)
+//     P[i] = 1;
+//   for (int i = 0; i < N; ++i)
+//     P[i] = 2;
+//
+// or
+//
+//   memset(P, 0, sizeof(int) * N);
+//   for (int i = 0; i < N; ++i)
+//     P[i] = 2;
+static cl::opt<bool> EnableOptimizationAcrossLoops(
+    "dse-optimize-across-loops", cl::init(false), cl::Hidden,
+    cl::desc("Enable elimination of redundant "
+             "loop-variant stores and memory intrinsics"));
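+
+// For example (illustrative; this mirrors the RUN lines of the new tests
+// under llvm/test/Transforms/DeadStoreElimination/Loop):
+//   opt -passes=dse -dse-optimize-across-loops -S input.ll
+// runs DSE with the across-loop elimination enabled on input.ll.
+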
+// Loops with a relatively small number of basic blocks are the common case,
+// but loops with more than 1000 basic blocks do occur, so we introduce an
+// upper limit to bound compile time.
+static cl::opt<unsigned> BasicBlocksInOneLoopThreshold(
+    "dse-basic-blocks-in-loop-threshold", cl::init(20), cl::Hidden,
+    cl::desc("The maximum total number of basic blocks in a loop"));
+
+static cl::opt<unsigned> AcrossLoopStoreThreshold(
+    "dse-across-loop-store-threshold", cl::init(32), cl::Hidden,
+    cl::desc("The maximum number of stores across loops to optimize"));
+
 static cl::opt<bool> EnablePartialOverwriteTracking(
     "enable-dse-partial-overwrite-tracking", cl::init(true), cl::Hidden,
@@ -852,6 +915,32 @@
   return false;
 }

+// Represents a range of memory that a MemoryDef overwrites completely,
+// without gaps. If the written range may contain gaps, do not create an
+// instance of this type.
+// TODO: We also need an offset for the base pointer.
+struct ContinuousMemoryRange {
+  const SCEV *Start;
+  const SCEV *Length;
+
+  ContinuousMemoryRange(const SCEV *Start, const SCEV *Length)
+      : Start(Start), Length(Length) {}
+
+  static ContinuousMemoryRange createEmpty() {
+    return ContinuousMemoryRange(nullptr, nullptr);
+  }
+
+  bool isEmpty() { return Start == nullptr || Length == nullptr; }
+
+  bool operator==(const ContinuousMemoryRange &Other) const {
+    return Start == Other.Start && Length == Other.Length;
+  }
+
+  bool operator!=(const ContinuousMemoryRange &Other) const {
+    return !(*this == Other);
+  }
+};
+
 struct DSEState {
   Function &F;
   AliasAnalysis &AA;
@@ -865,18 +954,20 @@
   /// information for a deleted value cannot be accessed by a re-used new
   /// value pointer.
   BatchAAResults BatchAA;
-
+  AssumptionCache *AC;
   MemorySSA &MSSA;
   DominatorTree &DT;
   PostDominatorTree &PDT;
   const TargetLibraryInfo &TLI;
   const DataLayout &DL;
   const LoopInfo &LI;
-
+  ScalarEvolution *SE;
   // Whether the function contains any irreducible control flow, useful for
   // being accurately able to detect loops.
   bool ContainsIrreducibleLoops;
+  bool ControlFlowChanged;
+
   // All MemoryDefs that potentially could kill other MemDefs.
   SmallVector<MemoryDef *, 64> MemDefs;
   // Any that should be skipped as they are already deleted
@@ -894,6 +985,19 @@
   // accesses are executed before another access.
   DenseMap<BasicBlock *, unsigned> PostOrderNumbers;

+  // This map may contain loop-variant stores that satisfy the following
+  // requirements:
+  // - The store is not volatile or atomic
+  // - It may not throw
+  // - Store size == stride
+  // - The stored type is an integer or floating-point scalar
+  // - It always executes
+  // and whose loop satisfies the following requirements:
+  // - Its backedge-taken count is computable
+  // - It is in simplified, rotated form
+  // - The induction variable starts from zero
+  // - There is no throwing instruction
+  DenseMap<const Instruction *, ContinuousMemoryRange> MemRanges;
   /// Keep track of instructions (partly) overlapping with killing MemoryDefs per
   /// basic block.
   DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
@@ -902,40 +1006,192 @@
   DSEState(const DSEState &) = delete;
   DSEState &operator=(const DSEState &) = delete;

-  DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
-           PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
-           const LoopInfo &LI)
-      : F(F), AA(AA), EI(DT, LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
-        PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) {
-    // Collect blocks with throwing instructions not modeled in MemorySSA and
-    // alloc-like objects.
- unsigned PO = 0; - for (BasicBlock *BB : post_order(&F)) { - PostOrderNumbers[BB] = PO++; - for (Instruction &I : *BB) { - MemoryAccess *MA = MSSA.getMemoryAccess(&I); - if (I.mayThrow() && !MA) - ThrowingBlocks.insert(I.getParent()); - - auto *MD = dyn_cast_or_null(MA); - if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit && - (getLocForWriteEx(&I) || isMemTerminatorInst(&I))) - MemDefs.push_back(MD); + bool prepareStateForAcrossLoopOptimization() { + if (!EnableOptimizationAcrossLoops) { + return false; + } + bool MadeChange = false; + MemDefs.clear(); + // If optimization across loops is enabled, transform loops into a form + // suitable for further analysis and collect stores that + // we can analyze and remove + MemorySSAUpdater MSSAU(&MSSA); + const auto &DL = F.getParent()->getDataLayout(); + SimplifyQuery SQ(DL, &TLI, &DT, AC); + TargetTransformInfo TTI(DL); + for (auto *L : LI) { + ++LoopCount; + + if (!L->isOutermost() || !L->isInnermost()) { + ++OuterOrInnermost; + continue; + } + + if (L->getNumBlocks() > BasicBlocksInOneLoopThreshold) { + ++TooManyBlocksInLoop; + continue; + } + + if (auto *S = prepareLastLoopVariantStoreWithRange(L)) { + // Either rotated form or single-block loop will be okay + if (!L->isRotatedForm() && L->getNumBlocks() != 1) { + if (!llvm::LoopRotation(L, const_cast(&LI), &TTI, AC, + &DT, SE, &MSSAU, SQ, false, -1, true, + false)) { + ++NotRotatedFormLoop; + continue; + } + MadeChange = true; + } + if (!L->isLoopSimplifyForm()) { + if (!llvm::simplifyLoop(L, &DT, const_cast(&LI), SE, AC, + &MSSAU, false)) { + ++NotSimplifyFormLoop; + continue; + } + MadeChange = true; + } + auto *MD = cast(MSSA.getMemoryAccess(S)); + MemDefs.push_back(MD); + if (MemDefs.size() >= AcrossLoopStoreThreshold) { + break; + } + } + } + CollectedStoreCount += MemDefs.size(); + return MadeChange; + } + +DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, + PostDominatorTree &PDT, ScalarEvolution *SE, + const TargetLibraryInfo &TLI, AssumptionCache *AC, const LoopInfo &LI) + : F(F), AA(AA), EI(DT, LI), BatchAA(AA, &EI), AC(AC), MSSA(MSSA), DT(DT), PDT(PDT), + TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI), SE(SE) { + // Collect blocks with throwing instructions not modeled in MemorySSA and + // alloc-like objects. + unsigned PO = 0; + for (BasicBlock *BB : post_order(&F)) { + PostOrderNumbers[BB] = PO++; + for (Instruction &I : *BB) { + MemoryAccess *MA = MSSA.getMemoryAccess(&I); + if (I.mayThrow() && !MA) + ThrowingBlocks.insert(I.getParent()); + auto *MD = dyn_cast_or_null(MA); + if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit && + (getLocForWriteEx(&I) || isMemTerminatorInst(&I))) { + MemDefs.push_back(MD); } } + } + + // Treat byval or inalloca arguments the same as Allocas, stores to them are + // dead at the end of the function. + for (Argument &AI : F.args()) + if (AI.hasPassPointeeByValueCopyAttr()) { + // For byval, the caller doesn't know the address of the allocation. + if (AI.hasByValAttr()) + InvisibleToCallerBeforeRet.insert({&AI, true}); + InvisibleToCallerAfterRet.insert({&AI, true}); + } + + // Collect whether there is any irreducible control flow in the function. + ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); +} + - // Treat byval or inalloca arguments the same as Allocas, stores to them are - // dead at the end of the function. - for (Argument &AI : F.args()) - if (AI.hasPassPointeeByValueCopyAttr()) { - // For byval, the caller doesn't know the address of the allocation. 
-      if (AI.hasByValAttr())
-        InvisibleToCallerBeforeRet.insert({&AI, true});
-      InvisibleToCallerAfterRet.insert({&AI, true});
+  StoreInst *prepareLastLoopVariantStoreWithRange(Loop *L) {
+    ICFLoopSafetyInfo SafetyInfo;
+    SafetyInfo.computeLoopSafetyInfo(L);
+    if (SafetyInfo.anyBlockMayThrow()) {
+      ++MayThrowInLoop;
+      return nullptr;
+    }
+    auto LastRange = ContinuousMemoryRange::createEmpty();
+    StoreInst *LastStore = nullptr;
+    for (auto *BB : L->getBlocks()) {
+      for (auto &I : *BB) {
+        if (!isa<StoreInst>(I))
+          continue;
+
+        if (I.isVolatile() || I.isAtomic()) {
+          ++VolatileOrAtomicInLoop;
+          continue;
+        }
+
+        auto *S = cast<StoreInst>(&I);
+        auto Range = computeMemoryRange(L, S);
+        if (!Range.isEmpty()) {
+          ++ComputedStoreMemRanges;
+          LastStore = S;
+          LastRange = Range;
+        } else {
+          ++FailedToComputeRangeForStore;
+        }
+      }
+    }
+    addMemRange(LastStore, LastRange);
+    return LastStore;
+  }

-    // Collect whether there is any irreducible control flow in the function.
-    ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);
+  void addMemRange(Instruction *I, ContinuousMemoryRange &Range) {
+    if (Range.isEmpty())
+      return;
+    MemRanges.insert({I, Range});
+  }
+
+  // Computes the continuous range that a memory intrinsic fills.
+  //
+  // If it cannot compute a range, it returns an empty range.
+  ContinuousMemoryRange computeMemoryRange(const AnyMemIntrinsic *MemIntr) {
+    const auto *Length = SE->getSCEV(MemIntr->getLength());
+    if (isa<SCEVCouldNotCompute>(Length))
+      return ContinuousMemoryRange::createEmpty();
+    auto *Start = SE->getSCEV(MemIntr->getDest());
+    return ContinuousMemoryRange(Start, Length);
+  }
+
+  // Computes the continuous range that a store within a loop writes to.
+  // It only works with loops that have a canonical induction variable.
+  //
+  // If it cannot compute a range, it returns an empty range.
+  ContinuousMemoryRange computeMemoryRange(Loop *L, StoreInst *Store) {
+    auto *Type = Store->getValueOperand()->getType();
+    if (!Type->isFloatingPointTy() && !Type->isIntegerTy()) {
+      return ContinuousMemoryRange::createEmpty();
+    }
+
+    auto *GEP = dyn_cast<GetElementPtrInst>(Store->getPointerOperand());
+    if (!GEP)
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *StoreRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(GEP));
+    if (!StoreRec)
+      return ContinuousMemoryRange::createEmpty();
+
+    if (!StoreRec->isAffine())
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *BTC = SE->getBackedgeTakenCount(L);
+    if (isa<SCEVCouldNotCompute>(BTC))
+      return ContinuousMemoryRange::createEmpty();
+
+    const auto *Start = StoreRec->getOperand(0);
+    if (isa<SCEVCouldNotCompute>(Start))
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *Count = SE->getAddExpr(BTC, SE->getOne(BTC->getType()));
+    auto SizeInBytes =
+        Store->getValueOperand()->getType()->getScalarSizeInBits() / 8;
+    auto *StoreSize = SE->getConstant(Count->getType(), SizeInBytes);
+
+    // The store must cover exactly one stride per iteration, i.e. the address
+    // step must equal the store size, otherwise gaps would remain.
+    auto *Step = StoreRec->getOperand(1);
+    if (Step != StoreSize)
+      return ContinuousMemoryRange::createEmpty();
+
+    const auto *Length = SE->getMulExpr(Count, StoreSize);
+    if (isa<SCEVCouldNotCompute>(Length))
+      return ContinuousMemoryRange::createEmpty();
+    return ContinuousMemoryRange(Start, Length);
+  }

   /// Return 'OW_Complete' if a store to the 'KillingLoc' location (by \p
@@ -950,6 +1206,14 @@
                              const MemoryLocation &KillingLoc,
                              const MemoryLocation &DeadLoc,
                              int64_t &KillingOff, int64_t &DeadOff) {
+    if (EnableOptimizationAcrossLoops) {
+      if ((isa<StoreInst>(DeadI) && isa<StoreInst>(KillingI)) ||
+          (isa<StoreInst>(KillingI) && isa<AnyMemIntrinsic>(DeadI)) ||
+          (isa<StoreInst>(DeadI) && isa<AnyMemIntrinsic>(KillingI))) {
+        if (isLoopVariantStoreOverwrite(KillingI, DeadI))
+          return OW_Complete;
+      }
+    }
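+    // Illustrative example: if the dead access is an i32 store filling
+    // P[0..N) in one loop and the killing access is a store or memset
+    // filling the same P[0..N), both ranges are
+    // (Start = SCEV(P), Length = 4 * N) and compare equal, so the dead
+    // access is reported as completely overwritten.
    // AliasAnalysis does not always account for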
loops. Limit overwrite checks // to dependencies for which we can guarantee they are independent of any // loops they are in. @@ -1300,6 +1564,7 @@ if (!ContainsIrreducibleLoops && CurrentLI && CurrentLI == LI.getLoopFor(KillingDef->getParent())) return true; + // Otherwise check the memory location is invariant to any loops. return isGuaranteedLoopInvariant(CurrentLoc.Ptr); } @@ -1440,13 +1705,16 @@ if (!CurrentLoc) continue; + // AliasAnalysis does not account for loops. Limit elimination to // candidates for which we can guarantee they always store to the same // memory location and not located in different loops. if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) { - LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n"); - WalkerStepLimit -= 1; - continue; + if (!EnableOptimizationAcrossLoops || MemRanges.empty()) { + LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n"); + WalkerStepLimit -= 1; + continue; + } } if (IsMemTerm) { @@ -1562,12 +1830,15 @@ } // If this worklist walks back to the original memory access (and the - // pointer is not guarenteed loop invariant) then we cannot assume that a - // store kills itself. + // pointer is not guarenteed loop invariant) then we cannot assume that + // a store kills itself. if (MaybeDeadAccess == UseAccess && !isGuaranteedLoopInvariant(MaybeDeadLoc.Ptr)) { - LLVM_DEBUG(dbgs() << " ... found not loop invariant self access\n"); - return None; + if (!EnableOptimizationAcrossLoops || !MemRanges.count(MaybeDeadI)) { + LLVM_DEBUG(dbgs() + << " ... found not loop invariant self access\n"); + return None; + } } // Otherwise, for the KillingDef and MaybeDeadAccess we only have to check // if it reads the memory location. @@ -1613,8 +1884,13 @@ // MaybeDeadAccess to the exit. if (!isInvisibleToCallerAfterRet(KillingUndObj)) { SmallPtrSet KillingBlocks; - for (Instruction *KD : KillingDefs) - KillingBlocks.insert(KD->getParent()); + for (Instruction *KD : KillingDefs) { + if (EnableOptimizationAcrossLoops) { + KillingBlocks.insert(tryToReplaceByGuard(KD, KD->getParent())); + } else { + KillingBlocks.insert(KD->getParent()); + } + } assert(!KillingBlocks.empty() && "Expected at least a single killing block"); @@ -1625,7 +1901,6 @@ break; CommonPred = PDT.findNearestCommonDominator(CommonPred, BB); } - // If CommonPred is in the set of killing blocks, just check if it // post-dominates MaybeDeadAccess. 
    if (KillingBlocks.count(CommonPred)) {
@@ -1681,8 +1956,82 @@
     return {MaybeDeadAccess};
   }

+  // For loop-invariant killing instructions this does nothing and simply
+  // returns their KillingBlock; otherwise it tries to return the block of
+  // the loop's guard branch instead.
+  BasicBlock *tryToReplaceByGuard(Instruction *KillingI,
+                                  BasicBlock *KillingBlock) {
+    assert(KillingI->getParent() == KillingBlock &&
+           "Killing instruction must be inside KillingBlock");
+    if (!EnableOptimizationAcrossLoops)
+      return KillingBlock;
+
+    if (!MemRanges.count(KillingI))
+      return KillingBlock;
+
+    if (!isa<StoreInst>(KillingI))
+      return KillingBlock;
+
+    auto *L = LI.getLoopFor(KillingBlock);
+    assert(L && "Killing instruction must be inside a loop here");
+    assert(L->isLoopSimplifyForm() && L->isRotatedForm() &&
+           "Loop must be in simplified, rotated form");
+    if (!L->isGuarded()) {
+      ++MissedGuard;
+      return KillingBlock;
+    }
+
+    auto *Br = L->getLoopGuardBranch();
+    assert(Br &&
+           "Loop is in simplify form, but it does not seem to have a guard");
+    if (auto *Guard = Br->getParent()) {
+      ++ReplacedGuard;
+      return Guard;
+    }
+    return KillingBlock;
+  }
+
+  bool isLoopVariantStoreOverwrite(const Instruction *KillingI,
+                                   const Instruction *DeadI) {
+    if (!MemRanges.count(DeadI) && !MemRanges.count(KillingI))
+      return false;
+    auto DeadRange = MemRanges.find(DeadI);
+    auto KillingRange = MemRanges.find(KillingI);
+
+    if (MemRanges.count(DeadI) && MemRanges.count(KillingI))
+      return DeadRange->second == KillingRange->second;
+
+    if (MemRanges.count(DeadI) && isa<AnyMemIntrinsic>(KillingI)) {
+      auto MemIntrRange = computeMemoryRange(cast<AnyMemIntrinsic>(KillingI));
+      if (!MemIntrRange.isEmpty())
+        ++ComputedMemIntrMemRanges;
+      return DeadRange->second == MemIntrRange;
+    } else if (MemRanges.count(KillingI) && isa<AnyMemIntrinsic>(DeadI)) {
+      auto MemIntrRange = computeMemoryRange(cast<AnyMemIntrinsic>(DeadI));
+      if (!MemIntrRange.isEmpty())
+        ++ComputedMemIntrMemRanges;
+      return KillingRange->second == MemIntrRange;
+    }
+
+    return false;
+  }
+
+  // TODO: If the deleted instruction is a loop-variant store, we do not yet
+  // remove a loop that contained nothing but that store.
   // Delete dead memory defs
   void deleteDeadInstruction(Instruction *SI) {
+#ifdef LLVM_ENABLE_STATS
+    if (AreStatisticsEnabled()) {
+      if (isa<StoreInst>(SI)) {
+        if (MemRanges.count(SI))
+          ++LoopVariantStoreRemoved;
+      }
+    }
+#endif
+    if (isa<AnyMemIntrinsic>(SI)) {
+      ++RemovedMemIntrWithRange;
+    }
     MemorySSAUpdater Updater(&MSSA);
     SmallVector<Instruction *, 32> NowDeadInsts;
     NowDeadInsts.push_back(SI);
@@ -1922,13 +2271,8 @@
   }
 };

-static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
-                                DominatorTree &DT, PostDominatorTree &PDT,
-                                const TargetLibraryInfo &TLI,
-                                const LoopInfo &LI) {
+static bool runDSEOptimizationLoop(DSEState &State) {
   bool MadeChange = false;
-
-  DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
   // For each store:
   for (unsigned I = 0; I < State.MemDefs.size(); I++) {
     MemoryDef *KillingDef = State.MemDefs[I];
@@ -2039,10 +2383,10 @@
           // We are re-using tryToMergePartialOverlappingStores, which requires
           // DeadSI to dominate DeadSI.
           // TODO: implement tryToMergeParialOverlappingStores using MemorySSA.
-          if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) {
+          if (DeadSI && KillingSI && State.DT.dominates(DeadSI, KillingSI)) {
             if (Constant *Merged = tryToMergePartialOverlappingStores(
                     KillingSI, DeadSI, KillingOffset, DeadOffset, State.DL,
-                    State.BatchAA, &DT)) {
+                    State.BatchAA, &State.DT)) {

               // Update stored value of earlier store to merged constant.
              DeadSI->setOperand(0, Merged);
@@ -2082,12 +2426,61 @@
       continue;
     }
   }
+  return MadeChange;
+}
+
+static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
+                                DominatorTree &DT, PostDominatorTree &PDT,
+                                ScalarEvolution *SE,
+                                const TargetLibraryInfo &TLI,
+                                AssumptionCache *AC, const LoopInfo &LI) {
+
+  DSEState State(F, AA, MSSA, DT, PDT, SE, TLI, AC, LI);
+  bool MadeChange = false;
+
+  MadeChange |= runDSEOptimizationLoop(State);
   if (EnablePartialOverwriteTracking)
     for (auto &KV : State.IOLs)
       MadeChange |= removePartiallyOverlappedStores(State.DL, KV.second, TLI);

   MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
+
+  // Run the across-loop optimization at the end because doing so saves
+  // compile time on things like SCEV computation. Consider a loop such as
+  //   for (...) {
+  //     P[i] = 1;
+  //     ...
+  //     P[i] = 3;
+  //   }
+  // Both stores are at the same loop level and plain DSE can already remove
+  // the first one. Since both stores are also candidates for the across-loop
+  // optimization, we first remove every store we can and only then try to
+  // optimize the survivors, so the loop above is first reduced to
+  //   for (...) {
+  //     ...
+  //     P[i] = 3;
+  //   }
+  // and P[i] = 3 may then be removed by the across-loop optimization if some
+  // store in another loop overwrites it. In addition, P[i] = 3 is visited
+  // first and would not be recognized as a dead store killed by another
+  // loop's store; see the example below:
+  //   loop:
+  //     ; 10 = MemoryPhi({preheader1,liveOnEntry},{loop,2})
+  //     %loop.iv = phi i64 [ 0, %preheader1 ], [ %loop.iv, %loop ]
+  //     %loop.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.iv
+  //     ; 1 = MemoryDef(10)
+  //     store i32 2, ptr %loop.1.idx, align 4
+  //     ; 2 = MemoryDef(1)
+  //     store i32 2, ptr %loop.1.idx, align 4
+  //     %loop.iv.1 = add nuw nsw i64 %loop.iv, 1
+  //     %loop.cond = icmp eq i64 %loop.iv.1, %n.zext1
+  //     br i1 %loop.cond, label %loop.exit, label %loop
+  // 2 = MemoryDef(1) is read by the MemoryPhi, and that Phi is used by the
+  // preceding store, so it is better to try to kill 1 with 2.
+  MadeChange |= State.prepareStateForAcrossLoopOptimization();
+  MadeChange |= runDSEOptimizationLoop(State);
   return MadeChange;
 }
 } // end anonymous namespace
@@ -2102,8 +2495,13 @@
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
   PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
   LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
-
-  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+  ScalarEvolution *SE = nullptr;
+  AssumptionCache *AC = nullptr;
+  if (EnableOptimizationAcrossLoops) {
+    SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+    AC = &AM.getResult<AssumptionAnalysis>(F);
+  }
+  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, TLI, AC, LI);

 #ifdef LLVM_ENABLE_STATS
   if (AreStatisticsEnabled())
@@ -2111,13 +2509,24 @@
       NumRemainingStores += isa<StoreInst>(&I);
 #endif

-  if (!Changed)
+  if (!Changed) {
     return PreservedAnalyses::all();
-
+  }
   PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
-  PA.preserve<MemorySSAAnalysis>();
-  PA.preserve<LoopAnalysis>();
+  if (EnableOptimizationAcrossLoops) {
+    PA.preserve();
+    PA.preserve();
+    PA.preserve();
+    PA.preserve();
+    PA.preserve();
+    PA.preserve();
+    PA.preserve();
+    PA.preserve();
+  } else {
+    PA.preserveSet<CFGAnalyses>();
+    PA.preserve<MemorySSAAnalysis>();
+    PA.preserve<LoopAnalysis>();
+  }
   return PA;
 }

@@ -2144,8 +2553,13 @@
     PostDominatorTree &PDT =
        getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
     LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-
-    bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+    AssumptionCache *AC = nullptr;
+    ScalarEvolution *SE = nullptr;
+    if (EnableOptimizationAcrossLoops) {
+      AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+      SE =
&getAnalysis().getSE(); + } + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, TLI, AC, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2157,7 +2571,15 @@ } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); + if (EnableOptimizationAcrossLoops) { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + } + else { + AU.setPreservesCFG(); + } AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -2171,7 +2593,6 @@ AU.addPreserved(); } }; - } // end anonymous namespace char DSELegacyPass::ID = 0; @@ -2186,6 +2607,7 @@ INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, false) Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -516,12 +516,12 @@ ; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: MemCpy Optimization ; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Dead Store Elimination ; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: LCSSA Verifier ; GCN-O2-NEXT: Loop-Closed SSA Form Pass ; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Lazy Branch Probability Analysis ; GCN-O2-NEXT: Lazy Block Frequency Analysis ; GCN-O2-NEXT: Loop Pass Manager @@ -875,12 +875,12 @@ ; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: MemCpy Optimization ; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Dead Store Elimination ; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: LCSSA Verifier ; GCN-O3-NEXT: Loop-Closed SSA Form Pass ; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Lazy Branch Probability Analysis ; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Loop Pass Manager Index: llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-and-dom-memory-intrinsic.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-and-dom-memory-intrinsic.ll @@ -0,0 +1,287 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -opaque-pointers -dse-optimize-across-loops -passes=dse -S | FileCheck %s + +; These tests check if memory intrinsic is removed in %entry because %loop overwrites it +; and we account for invariants %n < 0 or %n == 0 (see function arguments in all tests) +; on some paths from %entry to %exit where it is possible + +; Check if memory intrinsic is removed when there is smax +; TODO: We can remove memset +define void @smax_and_memory_intrinsic(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @smax_and_memory_intrinsic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: [[MAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 0) +; CHECK-NEXT: [[MAX_CAST:%.*]] = zext i32 [[MAX]] to i64 +; CHECK-NEXT: [[HEADER_COND1:%.*]] = icmp eq i64 0, [[MAX_CAST]] +; 
CHECK-NEXT: br i1 [[HEADER_COND1]], label [[EXIT:%.*]], label [[LATCH_LR_PH:%.*]] +; CHECK: latch.lr.ph: +; CHECK-NEXT: br label [[LATCH:%.*]] +; CHECK: header.exit_crit_edge: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: latch: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ 0, [[LATCH_LR_PH]] ], [ [[IV_1:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV2]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV2]], 1 +; CHECK-NEXT: [[HEADER_COND:%.*]] = icmp eq i64 [[IV_1]], [[MAX_CAST]] +; CHECK-NEXT: br i1 [[HEADER_COND]], label [[HEADER_EXIT_CRIT_EDGE:%.*]], label [[LATCH]] +; +entry: + %n.cast = sext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %max = call i32 @llvm.smax.i32(i32 %n, i32 0) + %max.cast = zext i32 %max to i64 + br label %header + +header: ; preds = %latch, %entry + %iv = phi i64 [ %iv.1, %latch ], [ 0, %entry ] + %header.cond = icmp eq i64 %iv, %max.cast + br i1 %header.cond, label %exit, label %latch + +exit: ; preds = %header + ret void + +latch: ; preds = %header + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + br label %header +} + +; Check a simple case when we can remove memory intrinsic +define void @memset_eq(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_eq( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp eq i32 %n, 0 + br i1 %entry.cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check if memory intrinsic in %entry is removed when store in %loop overwrites it completely, but +; the induction variable is decremented by moving from %n to 0 +; TODO: We don't remove memset in %entry now +define void @decremented_indvar_and_memory_intrinsic(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @decremented_indvar_and_memory_intrinsic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: br 
label [[HEADER:%.*]] +; CHECK: header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[IV_1:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV_1]] = add i32 [[IV]], -1 +; CHECK-NEXT: [[HEADER_COND:%.*]] = icmp sgt i32 [[IV_1]], -1 +; CHECK-NEXT: br i1 [[HEADER_COND]], label [[LATCH]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: latch: +; CHECK-NEXT: [[IV_1_CAST:%.*]] = zext i32 [[IV_1]] to i64 +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1_CAST]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: br label [[HEADER]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + br label %header + +header: ; preds = %latch, %entry + %iv = phi i32 [ %n, %entry ], [ %iv.1, %latch ] + %iv.1 = add i32 %iv, -1 + %header.cond = icmp sgt i32 %iv.1, -1 + br i1 %header.cond, label %latch, label %exit + +exit: ; preds = %header + ret void + +latch: ; preds = %header + %iv.1.cast = zext i32 %iv.1 to i64 + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv.1.cast + store i32 1, ptr %idx, align 4 + br label %header +} + +; Check if memory intrinsic in %entry is removed because its offset is different +; from what we have for store in %loop +; TODO: We don't account for different offset +define void @different_offsets(ptr nocapture %ptr.a, ptr nocapture %ptr.b, i32 %n) { +; CHECK-LABEL: @different_offsets( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PTR_A_OFFSET:%.*]] = getelementptr i32, ptr [[PTR_A:%.*]], i32 4 +; CHECK-NEXT: [[PTR_A_CAST:%.*]] = bitcast ptr [[PTR_A_OFFSET]] to ptr +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[PTR_A_CAST]], i8 0, i64 [[N_CAST]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i64 [[N_CAST]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_2:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX_A:%.*]] = getelementptr inbounds i32, ptr [[PTR_A]], i64 [[IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[IDX_A]], align 4 +; CHECK-NEXT: [[IDX_B:%.*]] = getelementptr inbounds i32, ptr [[PTR_B:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 [[TMP]], ptr [[IDX_B]], align 4 +; CHECK-NEXT: store i32 1, ptr [[IDX_A]], align 4 +; CHECK-NEXT: [[IV_2]] = add nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_2]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %ptr.a.offset = getelementptr i32, ptr %ptr.a, i32 4 + %ptr.a.cast = bitcast ptr %ptr.a.offset to ptr + %n.cast = zext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + call void @llvm.memset.p0i8.i64(ptr %ptr.a.cast, i8 0, i64 %n.cast, i1 false) + %entry.cond = icmp eq i64 %n.cast, 0 + br i1 %entry.cond, label %exit, label %loop +exit: + ret void + +loop: + %iv = phi i64 [ %iv.2, %loop ], [ 0, %entry ] + %idx.a = getelementptr inbounds i32, ptr %ptr.a, i64 %iv + %tmp = load i32, ptr %idx.a + %idx.b = getelementptr inbounds i32, ptr %ptr.b, i64 %iv + store i32 %tmp, ptr %idx.b + store i32 1, ptr %idx.a + %iv.2 = add nuw nsw i64 %n.cast, 2 + %loop.cond = icmp eq i64 %iv.2, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; It checks if memory intrinsic is not removed when there is a path on which +; we don't store anything 
and on another one we have a store so this intrinsic is not redundant +define void @on_overwrite_in_loop(ptr nocapture %ptr, i32 %n, i1 zeroext %x) { +; CHECK-LABEL: @on_overwrite_in_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_SEXT:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_SEXT]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: preheader: +; CHECK-NEXT: [[N_ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[IV_1:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[X:%.*]], label [[DUMMY_BLOCK:%.*]], label [[STORE_BLOCK:%.*]] +; CHECK: dummy.block: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[LATCH]] +; CHECK: store.block: +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LATCH_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_ZEXT]] +; CHECK-NEXT: br i1 [[LATCH_COND]], label [[EXIT]], label [[HEADER]] +; +entry: + %n.sext = sext i32 %n to i64 + %len = shl nsw i64 %n.sext, 2 + call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp sgt i32 %n, 0 + br i1 %entry.cond, label %preheader, label %exit + +preheader: ; preds = %entry + %n.zext = zext i32 %n to i64 + br label %header + +exit: ; preds = %latch, %entry + ret void + +header: ; preds = %preheader, %latch + %iv = phi i64 [ 0, %preheader ], [ %iv.1, %latch ] + br i1 %x, label %dummy.block, label %store.block + +dummy.block: ; preds = %header + call void @dummy() + br label %latch + +store.block: ; preds = %header + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + br label %latch + +latch: ; preds = %dummy.block, %store.block + %iv.1 = add nuw nsw i64 %iv, 1 + %latch.cond = icmp eq i64 %iv.1, %n.zext + br i1 %latch.cond, label %exit, label %header +} + +define void @constant_length(ptr nocapture %ptr) { +; CHECK-LABEL: @constant_length( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], 128 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT:%.*]], label [[LOOP]] +; +entry: + call void @llvm.memset.p0i8.i64(ptr noundef nonnull align 4 dereferenceable(512) %ptr, i8 0, i64 512, i1 false) + br label %loop + +exit: ; preds = %loop + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ 0, %entry ], [ %iv.1, %loop ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, 128 + br i1 %loop.cond, label %exit, label %loop +} + +declare void @dummy() +declare i32 @llvm.smax.i32(i32, i32) +declare void @llvm.memset.p0i8.i64(ptr nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memcpy.p0i8.p0i8.i64(ptr, 
ptr, i64, i1) Index: llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-and-post-dom-memory-intrinsic.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-and-post-dom-memory-intrinsic.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -opaque-pointers -dse-optimize-across-loops -passes=dse -S | FileCheck %s + +; These tests check if store in %loop because memory intrinsic in %exit overwrites it + +; Check if store in %loop is removed because memory intrinsic in +; %exit overwrites it +; TODO: Store is not removed even if it is redundant +define void @post_dom_memset(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @post_dom_memset( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[PREHEADER:%.*]] +; CHECK: preheader: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: preexit: +; CHECK-NEXT: [[PREEXIT_LEN:%.*]] = shl nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[LEN:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[PREEXIT_LEN]], [[PREEXIT:%.*]] ] +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[IV_1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[PREEXIT]], label [[LOOP]] +; +entry: + %entry.cond = icmp eq i32 %n, 0 + br i1 %entry.cond, label %exit, label %preheader + +preheader: ; preds = %entry + %n.cast = zext i32 %n to i64 + br label %loop + +preexit: ; preds = %loop + %preexit.len = shl nuw nsw i64 %n.cast, 2 + br label %exit + +exit: ; preds = %preexit, %entry + %len = phi i64 [ 0, %entry ], [ %preexit.len, %preexit ] + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + ret void + +loop: ; preds = %preheader, %loop + %iv = phi i64 [ 0, %preheader ], [ %iv.1, %loop ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %preexit, label %loop +} + +declare void @llvm.memset.p0i8.i64(ptr nocapture writeonly, i8, i64, i1 immarg) Index: llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-negative-tests.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-negative-tests.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -opaque-pointers -dse-optimize-across-loops -passes=dse -S | FileCheck %s + +; These tests check if memory intrinsic is not removed +; + +; Check if memory intrinsic in %entry is not removed if there is a step more than one +; because store in %loop does not overwrite it completely leaving gaps after each step +define void @step_with_gaps(ptr nocapture %ptr, i32 %n) { +; CHECK-LABEL: @step_with_gaps( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
[[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[PTR:%.*]], i8 0, i64 [[N_CAST]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i64 [[N_CAST]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_2:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_2]] = add nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_2]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + call void @llvm.memset.p0i8.i64(ptr %ptr, i8 0, i64 %n.cast, i1 false) + %entry.cond = icmp eq i64 %n.cast, 0 + br i1 %entry.cond, label %exit, label %loop +exit: + ret void + +loop: + %iv = phi i64 [ %iv.2, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx + %iv.2 = add nuw nsw i64 %n.cast, 2 + %loop.cond = icmp eq i64 %iv.2, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check if memory intrinsic in %entry is not removed if they write to different pointers with the store in %loop +; even if the size of underlying arrays are equals +define void @different_base_ptrs(ptr nocapture %ptr.a, i32* nocapture %ptr.b, i32 %n) { +; CHECK-LABEL: @different_base_ptrs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[PTR_A:%.*]], i8 0, i64 [[N_CAST]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i64 [[N_CAST]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_2:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR_B:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_2]] = add nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_2]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + call void @llvm.memset.p0i8.i64(ptr %ptr.a, i8 0, i64 %n.cast, i1 false) + %entry.cond = icmp eq i64 %n.cast, 0 + br i1 %entry.cond, label %exit, label %loop +exit: + ret void + +loop: + %iv = phi i64 [ %iv.2, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr.b, i64 %iv + store i32 1, ptr %idx + %iv.2 = add nuw nsw i64 %n.cast, 2 + %loop.cond = icmp eq i64 %iv.2, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +declare void @llvm.memset.p0i8.i64(ptr nocapture writeonly, i8, i64, i1 immarg) Index: llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-stores.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-stores.ll @@ -0,0 +1,342 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -opaque-pointers -dse-optimize-across-loops -passes=dse -S | FileCheck %s + +; These tests check if store is removed because there is overwrite in another +; loop + +; The 
simplest case when one store overwrites another when they are in different loops +; Store in %loop.2 must cause removal of store in %loop.1 +define void @overwrite_store(ptr nocapture %ptr, i32 %n) { +; CHECK-LABEL: @overwrite_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER1:%.*]], label [[EXIT:%.*]] +; CHECK: preheader1: +; CHECK-NEXT: [[N_ZEXT1:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_1:%.*]] +; CHECK: guard: +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER2:%.*]], label [[EXIT]] +; CHECK: preheader2: +; CHECK-NEXT: [[N_ZEXT2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1: +; CHECK-NEXT: [[LOOP_1_IV:%.*]] = phi i64 [ 0, [[PREHEADER1]] ], [ [[LOOP_1_IV_1:%.*]], [[LOOP_1]] ] +; CHECK-NEXT: [[LOOP_1_IV_1]] = add nuw nsw i64 [[LOOP_1_IV]], 1 +; CHECK-NEXT: [[LOOP_1_COND:%.*]] = icmp eq i64 [[LOOP_1_IV_1]], [[N_ZEXT1]] +; CHECK-NEXT: br i1 [[LOOP_1_COND]], label [[GUARD:%.*]], label [[LOOP_1]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.2: +; CHECK-NEXT: [[LOOP_2_IV:%.*]] = phi i64 [ 0, [[PREHEADER2]] ], [ [[LOOP_2_IV_1:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[LOOP_2_IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[LOOP_2_IV]] +; CHECK-NEXT: store i32 3, ptr [[LOOP_2_IDX]], align 4 +; CHECK-NEXT: [[LOOP_2_IV_1]] = add nuw nsw i64 [[LOOP_2_IV]], 1 +; CHECK-NEXT: [[LOOP_2_COND:%.*]] = icmp eq i64 [[LOOP_2_IV_1]], [[N_ZEXT2]] +; CHECK-NEXT: br i1 [[LOOP_2_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]] +; +entry: + %cond = icmp sgt i32 %n, 0 + br i1 %cond, label %preheader1, label %exit + +preheader1: ; preds = %entry + %n.zext1 = zext i32 %n to i64 + br label %loop.1 + +guard: ; preds = %loop.1 + br i1 %cond, label %preheader2, label %exit + +preheader2: ; preds = %guard + %n.zext2 = zext i32 %n to i64 + br label %loop.2 + +loop.1: ; preds = %loop.1, %preheader1 +; 5 = MemoryPhi({preheader1,liveOnEntry},{loop.1,1}) + %loop.1.iv = phi i64 [ 0, %preheader1 ], [ %loop.1.iv.1, %loop.1 ] + %loop.1.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv +; 1 = MemoryDef(5) + store i32 2, ptr %loop.1.idx, align 4 + %loop.1.iv.1 = add nuw nsw i64 %loop.1.iv, 1 + %loop.1.cond = icmp eq i64 %loop.1.iv.1, %n.zext1 + br i1 %loop.1.cond, label %guard, label %loop.1 + +exit: ; preds = %loop.2, %guard, %entry +; 3 = MemoryPhi({entry,liveOnEntry},{guard,1},{loop.2,2}) + ret void + +loop.2: ; preds = %loop.2, %preheader2 +; 4 = MemoryPhi({preheader2,1},{loop.2,2}) + %loop.2.iv = phi i64 [ 0, %preheader2 ], [ %loop.2.iv.1, %loop.2 ] + %loop.2.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.2.iv +; 2 = MemoryDef(4) + store i32 3, ptr %loop.2.idx, align 4 + %loop.2.iv.1 = add nuw nsw i64 %loop.2.iv, 1 + %loop.2.cond = icmp eq i64 %loop.2.iv.1, %n.zext2 + br i1 %loop.2.cond, label %exit, label %loop.2 +} + +; Check if store in %loop.1.header is removed even when there are more than one path +; to %loop.1.latch +define void @must_execute_store(ptr %ptr, i32 %n, i1 %x) { +; CHECK-LABEL: @must_execute_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER1:%.*]], label [[EXIT:%.*]] +; CHECK: preheader1: +; CHECK-NEXT: [[N_ZEXT1:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]] +; CHECK: guard: +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER2:%.*]], label [[EXIT]] +; CHECK: 
preheader2: +; CHECK-NEXT: [[N_ZEXT2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1.header: +; CHECK-NEXT: [[LOOP_1_IV:%.*]] = phi i64 [ 0, [[PREHEADER1]] ], [ [[LOOP_1_IV_1:%.*]], [[LOOP_1_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[X:%.*]], label [[LOOP_1_DUMMY:%.*]], label [[LOOP_1_DUMMY2:%.*]] +; CHECK: loop.1.dummy: +; CHECK-NEXT: tail call void @dummy1() +; CHECK-NEXT: br label [[LOOP_1_LATCH]] +; CHECK: loop.1.dummy2: +; CHECK-NEXT: tail call void @dummy2() +; CHECK-NEXT: br label [[LOOP_1_LATCH]] +; CHECK: loop.1.latch: +; CHECK-NEXT: [[LOOP_1_IV_1]] = add nuw nsw i64 [[LOOP_1_IV]], 1 +; CHECK-NEXT: [[LOOP_1_COND:%.*]] = icmp eq i64 [[LOOP_1_IV_1]], [[N_ZEXT1]] +; CHECK-NEXT: br i1 [[LOOP_1_COND]], label [[GUARD:%.*]], label [[LOOP_1_HEADER]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.2: +; CHECK-NEXT: [[LOOP_2_IV:%.*]] = phi i64 [ 0, [[PREHEADER2]] ], [ [[LOOP_2_IV_1:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[LOOP_2_IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[LOOP_2_IV]] +; CHECK-NEXT: store i32 3, ptr [[LOOP_2_IDX]], align 4 +; CHECK-NEXT: [[LOOP_2_IV_1]] = add nuw nsw i64 [[LOOP_2_IV]], 1 +; CHECK-NEXT: [[LOOP_2_COND:%.*]] = icmp eq i64 [[LOOP_2_IV_1]], [[N_ZEXT2]] +; CHECK-NEXT: br i1 [[LOOP_2_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]] +; +entry: + %cond = icmp sgt i32 %n, 0 + br i1 %cond, label %preheader1, label %exit + +preheader1: ; preds = %entry + %n.zext1 = zext i32 %n to i64 + br label %loop.1.header + +guard: ; preds = %loop.1.latch + br i1 %cond, label %preheader2, label %exit + +preheader2: ; preds = %guard + %n.zext2 = zext i32 %n to i64 + br label %loop.2 + +loop.1.header: ; preds = %loop.1.latch, %preheader1 +; 5 = MemoryPhi({preheader1,liveOnEntry},{loop.1.latch,1}) + %loop.1.iv = phi i64 [ 0, %preheader1 ], [ %loop.1.iv.1, %loop.1.latch ] + %loop.1.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv +; 1 = MemoryDef(5) + store i32 2, ptr %loop.1.idx, align 4 + br i1 %x, label %loop.1.dummy, label %loop.1.dummy2 + +loop.1.dummy: ; preds = %loop.1.header + tail call void @dummy1() + br label %loop.1.latch + +loop.1.dummy2: ; preds = %loop.1.header + tail call void @dummy2() + br label %loop.1.latch + +loop.1.latch: ; preds = %loop.1.dummy2, %loop.1.dummy + %loop.1.iv.1 = add nuw nsw i64 %loop.1.iv, 1 + %loop.1.cond = icmp eq i64 %loop.1.iv.1, %n.zext1 + br i1 %loop.1.cond, label %guard, label %loop.1.header + +exit: ; preds = %loop.2, %guard, %entry +; 3 = MemoryPhi({entry,liveOnEntry},{guard,1},{loop.2,2}) + ret void + +loop.2: ; preds = %loop.2, %preheader2 +; 4 = MemoryPhi({preheader2,1},{loop.2,2}) + %loop.2.iv = phi i64 [ 0, %preheader2 ], [ %loop.2.iv.1, %loop.2 ] + %loop.2.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.2.iv +; 2 = MemoryDef(4) + store i32 3, ptr %loop.2.idx, align 4 + %loop.2.iv.1 = add nuw nsw i64 %loop.2.iv, 1 + %loop.2.cond = icmp eq i64 %loop.2.iv.1, %n.zext2 + br i1 %loop.2.cond, label %exit, label %loop.2 +} +; Check if store in %loop.1.store is removed even when it does not always execute +define void @not_always_execute(ptr %ptr, i32 %n, i1 %x) { +; CHECK-LABEL: @not_always_execute( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER1:%.*]], label [[EXIT:%.*]] +; CHECK: preheader1: +; CHECK-NEXT: [[N_ZEXT1:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]] +; CHECK: guard: +; CHECK-NEXT: br i1 
[[COND]], label [[PREHEADER2:%.*]], label [[EXIT]] +; CHECK: preheader2: +; CHECK-NEXT: [[N_ZEXT2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1.header: +; CHECK-NEXT: [[LOOP_1_IV:%.*]] = phi i64 [ 0, [[PREHEADER1]] ], [ [[LOOP_1_IV_1:%.*]], [[LOOP_1_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[X:%.*]], label [[LOOP_1_STORE:%.*]], label [[LOOP_1_DUMMY2:%.*]] +; CHECK: loop.1.store: +; CHECK-NEXT: br label [[LOOP_1_LATCH]] +; CHECK: loop.1.dummy2: +; CHECK-NEXT: tail call void @dummy2() +; CHECK-NEXT: br label [[LOOP_1_LATCH]] +; CHECK: loop.1.latch: +; CHECK-NEXT: [[LOOP_1_IV_1]] = add nuw nsw i64 [[LOOP_1_IV]], 1 +; CHECK-NEXT: [[LOOP_1_COND:%.*]] = icmp eq i64 [[LOOP_1_IV_1]], [[N_ZEXT1]] +; CHECK-NEXT: br i1 [[LOOP_1_COND]], label [[GUARD:%.*]], label [[LOOP_1_HEADER]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.2: +; CHECK-NEXT: [[LOOP_2_IV:%.*]] = phi i64 [ 0, [[PREHEADER2]] ], [ [[LOOP_2_IV_1:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[LOOP_2_IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[LOOP_2_IV]] +; CHECK-NEXT: store i32 3, ptr [[LOOP_2_IDX]], align 4 +; CHECK-NEXT: [[LOOP_2_IV_1]] = add nuw nsw i64 [[LOOP_2_IV]], 1 +; CHECK-NEXT: [[LOOP_2_COND:%.*]] = icmp eq i64 [[LOOP_2_IV_1]], [[N_ZEXT2]] +; CHECK-NEXT: br i1 [[LOOP_2_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]] +; +entry: + %cond = icmp sgt i32 %n, 0 + br i1 %cond, label %preheader1, label %exit + +preheader1: ; preds = %entry + %n.zext1 = zext i32 %n to i64 + br label %loop.1.header + +guard: ; preds = %loop.1.latch + br i1 %cond, label %preheader2, label %exit + +preheader2: ; preds = %guard + %n.zext2 = zext i32 %n to i64 + br label %loop.2 + +loop.1.header: ; preds = %loop.1.latch, %preheader1 +; 6 = MemoryPhi({preheader1,liveOnEntry},{loop.1.latch,5}) + %loop.1.iv = phi i64 [ 0, %preheader1 ], [ %loop.1.iv.1, %loop.1.latch ] + br i1 %x, label %loop.1.store, label %loop.1.dummy2 + +loop.1.store: ; preds = %loop.1.header + %loop.1.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv +; 1 = MemoryDef(6) + store i32 2, ptr %loop.1.idx, align 4 + br label %loop.1.latch + +loop.1.dummy2: ; preds = %loop.1.header + tail call void @dummy2() + br label %loop.1.latch + +loop.1.latch: ; preds = %loop.1.dummy2, %loop.1.store +; 5 = MemoryPhi({loop.1.store,1},{loop.1.dummy2,6}) + %loop.1.iv.1 = add nuw nsw i64 %loop.1.iv, 1 + %loop.1.cond = icmp eq i64 %loop.1.iv.1, %n.zext1 + br i1 %loop.1.cond, label %guard, label %loop.1.header + +exit: ; preds = %loop.2, %guard, %entry +; 3 = MemoryPhi({entry,liveOnEntry},{guard,5},{loop.2,2}) + ret void + +loop.2: ; preds = %loop.2, %preheader2 +; 4 = MemoryPhi({preheader2,5},{loop.2,2}) + %loop.2.iv = phi i64 [ 0, %preheader2 ], [ %loop.2.iv.1, %loop.2 ] + %loop.2.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.2.iv +; 2 = MemoryDef(4) + store i32 3, ptr %loop.2.idx, align 4 + %loop.2.iv.1 = add nuw nsw i64 %loop.2.iv, 1 + %loop.2.cond = icmp eq i64 %loop.2.iv.1, %n.zext2 + br i1 %loop.2.cond, label %exit, label %loop.2 +} + +define void @multiple_stores_in_one_loop(ptr nocapture %ptr, i32 %n) { +; CHECK-LABEL: @multiple_stores_in_one_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER1:%.*]], label [[EXIT:%.*]] +; CHECK: preheader1: +; CHECK-NEXT: [[N_ZEXT1:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_1:%.*]] +; CHECK: guard: +; CHECK-NEXT: br i1 [[COND]], label 
[[PREHEADER2:%.*]], label [[EXIT]] +; CHECK: preheader2: +; CHECK-NEXT: [[N_ZEXT2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1: +; CHECK-NEXT: [[LOOP_1_IV:%.*]] = phi i64 [ 0, [[PREHEADER1]] ], [ [[LOOP_1_IV_1:%.*]], [[LOOP_1]] ] +; CHECK-NEXT: [[LOOP_1_IV_1]] = add nuw nsw i64 [[LOOP_1_IV]], 1 +; CHECK-NEXT: [[LOOP_1_COND:%.*]] = icmp eq i64 [[LOOP_1_IV_1]], [[N_ZEXT1]] +; CHECK-NEXT: br i1 [[LOOP_1_COND]], label [[GUARD:%.*]], label [[LOOP_1]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.2: +; CHECK-NEXT: [[LOOP_2_IV:%.*]] = phi i64 [ 0, [[PREHEADER2]] ], [ [[LOOP_2_IV_1:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[LOOP_2_IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[LOOP_2_IV]] +; CHECK-NEXT: store i32 3, ptr [[LOOP_2_IDX]], align 4 +; CHECK-NEXT: [[LOOP_2_IV_1]] = add nuw nsw i64 [[LOOP_2_IV]], 1 +; CHECK-NEXT: [[LOOP_2_COND:%.*]] = icmp eq i64 [[LOOP_2_IV_1]], [[N_ZEXT2]] +; CHECK-NEXT: br i1 [[LOOP_2_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]] +; +entry: + %cond = icmp sgt i32 %n, 0 + br i1 %cond, label %preheader1, label %exit + +preheader1: ; preds = %entry + %n.zext1 = zext i32 %n to i64 + br label %loop.1 + +guard: ; preds = %loop.1 + br i1 %cond, label %preheader2, label %exit + +preheader2: ; preds = %guard + %n.zext2 = zext i32 %n to i64 + br label %loop.2 + +loop.1: ; preds = %loop.1, %preheader1 +; 10 = MemoryPhi({preheader1,liveOnEntry},{loop.1,6}) + %loop.1.iv = phi i64 [ 0, %preheader1 ], [ %loop.1.iv.1, %loop.1 ] + %loop.1.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv + %loop.1.idx2 = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv + %loop.1.idx3 = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv +; 1 = MemoryDef(10) + store i32 2, ptr %loop.1.idx2, align 4 +; 2 = MemoryDef(1) + store i32 2, ptr %loop.1.idx, align 4 +; 3 = MemoryDef(2) + store i32 2, ptr %loop.1.idx, align 4 +; 4 = MemoryDef(3) + store i32 2, ptr %loop.1.idx2, align 4 +; 5 = MemoryDef(4) + store i32 2, ptr %loop.1.idx, align 4 +; 6 = MemoryDef(5) + store i32 2, ptr %loop.1.idx3, align 4 + %loop.1.iv.1 = add nuw nsw i64 %loop.1.iv, 1 + %loop.1.cond = icmp eq i64 %loop.1.iv.1, %n.zext1 + br i1 %loop.1.cond, label %guard, label %loop.1 + +exit: ; preds = %loop.2, %guard, %entry +; 8 = MemoryPhi({entry,liveOnEntry},{guard,6},{loop.2,7}) + ret void + +loop.2: ; preds = %loop.2, %preheader2 +; 9 = MemoryPhi({preheader2,6},{loop.2,7}) + %loop.2.iv = phi i64 [ 0, %preheader2 ], [ %loop.2.iv.1, %loop.2 ] + %loop.2.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.2.iv +; 7 = MemoryDef(9) + store i32 3, ptr %loop.2.idx, align 4 + %loop.2.iv.1 = add nuw nsw i64 %loop.2.iv, 1 + %loop.2.cond = icmp eq i64 %loop.2.iv.1, %n.zext2 + br i1 %loop.2.cond, label %exit, label %loop.2 +} + + +declare void @dummy1() nounwind willreturn argmemonly readnone +declare void @dummy2() nounwind willreturn argmemonly readnone Index: llvm/test/Transforms/DeadStoreElimination/debug-counter.ll =================================================================== --- llvm/test/Transforms/DeadStoreElimination/debug-counter.ll +++ llvm/test/Transforms/DeadStoreElimination/debug-counter.ll @@ -20,29 +20,29 @@ define void @test(i32* noalias %P, i32* noalias %Q, i32* noalias %R) { ; SKIP0-COUNT1-LABEL: @test( -; SKIP0-COUNT1-NEXT: store i32 1, i32* [[P:%.*]] +; SKIP0-COUNT1-NEXT: store i32 1, i32* [[P:%.*]], align 4 ; SKIP0-COUNT1-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; 
SKIP0-COUNT1: bb1: ; SKIP0-COUNT1-NEXT: br label [[BB3:%.*]] ; SKIP0-COUNT1: bb2: ; SKIP0-COUNT1-NEXT: br label [[BB3]] ; SKIP0-COUNT1: bb3: -; SKIP0-COUNT1-NEXT: store i32 0, i32* [[Q:%.*]] -; SKIP0-COUNT1-NEXT: store i32 0, i32* [[R:%.*]] -; SKIP0-COUNT1-NEXT: store i32 0, i32* [[P]] +; SKIP0-COUNT1-NEXT: store i32 0, i32* [[Q:%.*]], align 4 +; SKIP0-COUNT1-NEXT: store i32 0, i32* [[R:%.*]], align 4 +; SKIP0-COUNT1-NEXT: store i32 0, i32* [[P]], align 4 ; SKIP0-COUNT1-NEXT: ret void ; ; SKIP1-COUNT1-LABEL: @test( -; SKIP1-COUNT1-NEXT: store i32 1, i32* [[R:%.*]] +; SKIP1-COUNT1-NEXT: store i32 1, i32* [[R:%.*]], align 4 ; SKIP1-COUNT1-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; SKIP1-COUNT1: bb1: ; SKIP1-COUNT1-NEXT: br label [[BB3:%.*]] ; SKIP1-COUNT1: bb2: ; SKIP1-COUNT1-NEXT: br label [[BB3]] ; SKIP1-COUNT1: bb3: -; SKIP1-COUNT1-NEXT: store i32 0, i32* [[Q:%.*]] -; SKIP1-COUNT1-NEXT: store i32 0, i32* [[R]] -; SKIP1-COUNT1-NEXT: store i32 0, i32* [[P:%.*]] +; SKIP1-COUNT1-NEXT: store i32 0, i32* [[Q:%.*]], align 4 +; SKIP1-COUNT1-NEXT: store i32 0, i32* [[R]], align 4 +; SKIP1-COUNT1-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; SKIP1-COUNT1-NEXT: ret void ; ; SKIP0-COUNT2-LABEL: @test( @@ -52,23 +52,22 @@ ; SKIP0-COUNT2: bb2: ; SKIP0-COUNT2-NEXT: br label [[BB3]] ; SKIP0-COUNT2: bb3: -; SKIP0-COUNT2-NEXT: store i32 0, i32* [[Q:%.*]] -; SKIP0-COUNT2-NEXT: store i32 0, i32* [[R:%.*]] -; SKIP0-COUNT2-NEXT: store i32 0, i32* [[P:%.*]] +; SKIP0-COUNT2-NEXT: store i32 0, i32* [[Q:%.*]], align 4 +; SKIP0-COUNT2-NEXT: store i32 0, i32* [[R:%.*]], align 4 +; SKIP0-COUNT2-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; SKIP0-COUNT2-NEXT: ret void ; ; SKIP2-COUNT1-LABEL: @test( -; SKIP2-COUNT1-NEXT: store i32 1, i32* [[P:%.*]] -; SKIP2-COUNT1-NEXT: store i32 1, i32* [[R:%.*]] +; SKIP2-COUNT1-NEXT: store i32 1, i32* [[P:%.*]], align 4 ; SKIP2-COUNT1-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; SKIP2-COUNT1: bb1: ; SKIP2-COUNT1-NEXT: br label [[BB3:%.*]] ; SKIP2-COUNT1: bb2: ; SKIP2-COUNT1-NEXT: br label [[BB3]] ; SKIP2-COUNT1: bb3: -; SKIP2-COUNT1-NEXT: store i32 0, i32* [[Q:%.*]] -; SKIP2-COUNT1-NEXT: store i32 0, i32* [[R]] -; SKIP2-COUNT1-NEXT: store i32 0, i32* [[P]] +; SKIP2-COUNT1-NEXT: store i32 0, i32* [[Q:%.*]], align 4 +; SKIP2-COUNT1-NEXT: store i32 0, i32* [[R:%.*]], align 4 +; SKIP2-COUNT1-NEXT: store i32 0, i32* [[P]], align 4 ; SKIP2-COUNT1-NEXT: ret void ; store i32 1, i32* %P