Index: llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -47,6 +47,8 @@
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
@@ -79,6 +81,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -108,9 +111,65 @@
 STATISTIC(NumDomMemDefChecks,
           "Number iterations check for reads in getDomMemoryDef");
+STATISTIC(NotRotatedFormLoop, "Number of loops skipped because they are not "
+                              "in rotated form");
+STATISTIC(NotSimplifyFormLoop, "Number of loops skipped because they are not "
+                               "in loop-simplify form");
+STATISTIC(ComputedStoreMemRanges,
+          "Number of memory ranges computed for stores");
+STATISTIC(ComputedMemIntrMemRanges,
+          "Number of memory ranges computed for memory intrinsics");
+STATISTIC(ReplacedGuard, "Number of store blocks replaced by their loop "
+                         "guard in the killing block set");
+STATISTIC(MissedGuard, "Number of killing blocks not replaced because the "
+                       "loop has no guard");
+STATISTIC(TooManyBlocksInLoop,
+          "Number of loops skipped because they have too many basic blocks");
+STATISTIC(
+    VolatileOrAtomicInLoop,
+    "Number of stores in loops skipped because they are volatile or atomic");
+STATISTIC(LoopCount, "Number of loops considered");
+STATISTIC(CollectedStoreCount, "Number of loop-variant stores collected");
+STATISTIC(FailedToComputeRangeForStore,
+          "Number of stores for which no memory range could be computed");
+STATISTIC(
+    MayThrowInLoop,
+    "Number of loops skipped because they contain an instruction that may "
+    "throw");
+STATISTIC(LoopVariantStoreRemoved, "Number of deleted loop-variant stores");
+STATISTIC(RemovedMemIntrWithRange,
+          "Number of deleted memory intrinsics with a computed memory range");
+
 DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
               "Controls which MemoryDefs are eliminated.");
 
+// If enabled, this optimizes loop patterns like the following:
+//   for (int i = 0; i < N; ++i)
+//     P[i] = 1;
+//   for (int i = 0; i < N; ++i)
+//     P[i] = 2;
+//
+// or
+//
+//   memset(P, 0, sizeof(int) * N);
+//   for (int i = 0; i < N; ++i)
+//     P[i] = 2;
+static cl::opt<bool> EnableOptimizationAcrossLoops(
+    "dse-optimize-across-loops", cl::init(false), cl::Hidden,
+    cl::desc("Enable elimination of redundant "
+             "loop-variant stores and memory intrinsics"));
+
+// Loops with a relatively small number of basic blocks are the common case,
+// but loops with more than 1000 basic blocks do occur, so introduce an upper
+// limit to bound compile time.
+static cl::opt<unsigned> BasicBlocksInOneLoopThreshold(
+    "dse-basic-blocks-in-loop-threshold", cl::init(20), cl::Hidden,
+    cl::desc("The maximum number of basic blocks allowed in a loop"));
+
+const int MaxStoreInAcrossLoopDSE = 32;
+
+static cl::opt<unsigned> AcrossLoopStoreThreshold(
+    "dse-across-loop-store-threshold", cl::init(MaxStoreInAcrossLoopDSE),
+    cl::Hidden,
+    cl::desc("The maximum number of stores across loops to optimize"));
+
 static cl::opt<bool>
 EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
   cl::init(true), cl::Hidden,
@@ -727,6 +786,32 @@
   return false;
 }
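The new flag is off by default, so the pass behaves as before unless the optimization is requested explicitly. The tests added below enable it on the opt command line; a minimal sketch of such a RUN line, mirroring those tests (the threshold values shown are simply the defaults defined above, spelled out here for illustration):

; RUN: opt < %s -opaque-pointers -dse-optimize-across-loops -passes=dse -S | FileCheck %s
;
; The two limits can be adjusted the same way if a test needs them:
; RUN: opt < %s -dse-optimize-across-loops -dse-across-loop-store-threshold=32 \
; RUN:   -dse-basic-blocks-in-loop-threshold=20 -passes=dse -S | FileCheck %s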
+// It represents a range in memory that MemoryDef overwrites completely +// without gaps. If there are supposed to be at least one gap within a range +// then do not create an instance of this type +// TODO: We need offset for base pointer too +struct ContinuousMemoryRange { + const SCEV *Start; + const SCEV *Length; + + ContinuousMemoryRange(const SCEV *Start, const SCEV *Length) + : Start(Start), Length(Length) {} + + static ContinuousMemoryRange createEmpty() { + return ContinuousMemoryRange(nullptr, nullptr); + } + + bool isEmpty() { return Start == nullptr || Length == nullptr; } + + bool operator==(const ContinuousMemoryRange &Other) const { + return Start == Other.Start && Length == Other.Length; + } + + bool operator!=(const ContinuousMemoryRange &Other) const { + return !(*this == Other); + } +}; + struct DSEState { Function &F; AliasAnalysis &AA; @@ -740,18 +825,19 @@ /// information for a deleted value cannot be accessed by a re-used new /// value pointer. BatchAAResults BatchAA; - MemorySSA &MSSA; DominatorTree &DT; PostDominatorTree &PDT; const TargetLibraryInfo &TLI; const DataLayout &DL; const LoopInfo &LI; - + ScalarEvolution *SE; // Whether the function contains any irreducible control flow, useful for // being accurately able to detect loops. bool ContainsIrreducibleLoops; + bool ControlFlowChanged; + // All MemoryDefs that potentially could kill other MemDefs. SmallVector MemDefs; // Any that should be skipped as they are already deleted @@ -767,6 +853,26 @@ // accesses are executed before another access. DenseMap PostOrderNumbers; + // This map may contain loop-variant store which fits the following + // requirements + // - It is not volatile or atomic + // - It may not throw + // - Store size == stride + // - Store type is integer or floating scalar + // - It must always execute + // And their loop must match the following requirements + // - Its backedge taken count is computable + // - It has simplify rotated form + // - Induction variable starts from zero + // - There is no throwing instruction + DenseMap MemRanges; + + // A set for stores that are not guaranteed to execute + // TODO: Do we need to save stores which are guaranteed to execute or not? + // Which is more likely to happen? + SmallPtrSet + StoresNotGuaranteedToExecute; + /// Keep track of instructions (partly) overlapping with killing MemoryDefs per /// basic block. MapVector IOLs; @@ -775,11 +881,57 @@ DSEState(const DSEState &) = delete; DSEState &operator=(const DSEState &) = delete; + void prepareStateForAcrossLoopOptimization() { + MemDefs.clear(); + ICFLoopSafetyInfo SafetyInfo; + for (auto *L : LI) { + if (!canAddMoreStoresFromLoop()) { + break; + } + ++LoopCount; + + // TODO: There is a potential enhancement to handle stores in nested + // loops, but skip it for now. Do we even need this? 
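As an illustration of the loop shape this collection is aiming at (a sketch distilled from the requirement list above and the tests added below, not an exhaustive characterization), a single-block, bottom-tested loop whose store writes exactly one element per iteration qualifies:

; Qualifying shape (cf. @overwrite_store in the added tests): one block,
; rotated, induction variable starts at zero, backedge-taken count is
; computable, the store is neither volatile nor atomic, executes on every
; iteration, and its 4-byte stride equals the i32 store size. The added
; tests additionally guard the loop with an `%n != 0` check.
define void @qualifying_shape(ptr %ptr, i64 %n) {
preheader:
  br label %loop

loop:
  %iv = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv
  store i32 1, ptr %idx, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %cond = icmp eq i64 %iv.next, %n
  br i1 %cond, label %exit, label %loop

exit:
  ret void
}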
+ if (!L->isInnermost() || !L->isOutermost()) { + continue; + } + + // We can easily find loops with a large number of basic blocks so + // limit it to some extent + if (L->getNumBlocks() > BasicBlocksInOneLoopThreshold) { + ++TooManyBlocksInLoop; + continue; + } + // If one block in loop then it is easy to handle, but otherwise check + // if loop has some canonical form + if (L->getNumBlocks() != 1) { + if (!L->isRotatedForm() && L->getNumBlocks() != 1) { + ++NotRotatedFormLoop; + continue; + } + + if (!L->isLoopSimplifyForm()) { + ++NotSimplifyFormLoop; + continue; + } + } + + SafetyInfo.computeLoopSafetyInfo(L); + if (SafetyInfo.anyBlockMayThrow()) { + ++MayThrowInLoop; + continue; + } + + prepareLoopVariantStoresWithRanges(L, SafetyInfo); + } + CollectedStoreCount += MemDefs.size(); + } + DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, - PostDominatorTree &PDT, const TargetLibraryInfo &TLI, - const LoopInfo &LI) + PostDominatorTree &PDT, ScalarEvolution *SE, + const TargetLibraryInfo &TLI, const LoopInfo &LI) : F(F), AA(AA), EI(DT, LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT), - PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) { + PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI), SE(SE) { // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. unsigned PO = 0; @@ -789,11 +941,11 @@ MemoryAccess *MA = MSSA.getMemoryAccess(&I); if (I.mayThrow() && !MA) ThrowingBlocks.insert(I.getParent()); - auto *MD = dyn_cast_or_null(MA); if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit && - (getLocForWrite(&I) || isMemTerminatorInst(&I))) + (getLocForWrite(&I) || isMemTerminatorInst(&I))) { MemDefs.push_back(MD); + } } } @@ -807,6 +959,145 @@ ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); } + bool canAddMoreStoresFromLoop() { + return MemDefs.size() <= AcrossLoopStoreThreshold; + } + + void prepareLoopVariantStoresWithRanges(Loop *L, + ICFLoopSafetyInfo &SafetyInfo) { + for (auto *BB : L->getBlocks()) { + if (!canAddMoreStoresFromLoop()) { + break; + } + + for (auto &I : *BB) { + if (!canAddMoreStoresFromLoop()) { + break; + } + + if (auto *S = dyn_cast(&I)) { + if (!SafetyInfo.isGuaranteedToExecute(I, &DT, L)) { + StoresNotGuaranteedToExecute.insert(&I); + } + + // TODO: If store is volatile, it can overwrite another non-volatile + // store We shouldn't skip it here, but probably just not to remove + // volatile stores + if (I.isVolatile() || I.isAtomic()) { + ++VolatileOrAtomicInLoop; + continue; + } + + auto Range = computeMemoryRange(L, S); + if (!Range.isEmpty()) { + auto *MD = cast(MSSA.getMemoryAccess(S)); + MemDefs.push_back(MD); + addMemRange(S, Range); + ++ComputedStoreMemRanges; + continue; + } + ++FailedToComputeRangeForStore; + } + } + } + } + + void addMemRange(Instruction *I, ContinuousMemoryRange &Range) { + if (Range.isEmpty()) + return; + MemRanges.insert({I, Range}); + } + + // This computes a continuous range that memory intrinsic fills + // + // If it cannot compute a range, it will return an empty range + ContinuousMemoryRange computeMemoryRange(const AnyMemIntrinsic *MemIntr) { + const auto *Length = SE->getSCEV(MemIntr->getLength()); + if (isa(Length)) + return ContinuousMemoryRange::createEmpty(); + auto *Start = SE->getSCEV(MemIntr->getDest()); + return ContinuousMemoryRange(Start, Length); + } + + // This computes a continuous range to which a store within a loop write + // It works only with loops where there is a canonical induction variable + // + // If it 
cannot compute a range, it will return an empty range + ContinuousMemoryRange computeMemoryRange(Loop *L, StoreInst *Store) { + auto *GEP = dyn_cast(Store->getPointerOperand()); + if (!GEP) + return ContinuousMemoryRange::createEmpty(); + + // This method implies that GEP and store are from the same loop and as a + // result GEP depends on induction variable of the loop. It is possible that + // GEP can be from another loop and we still compute memory range for it, + // but we should skip it because the resulting range can be wrong + if (!L->contains(GEP)) + return ContinuousMemoryRange::createEmpty(); + + auto *Type = Store->getValueOperand()->getType(); + if (!Type->isFloatingPointTy() && !Type->isIntegerTy()) + return ContinuousMemoryRange::createEmpty(); + + auto *StoreRec = dyn_cast(SE->getSCEV(GEP)); + if (!StoreRec) + return ContinuousMemoryRange::createEmpty(); + + if (!StoreRec->isAffine()) + return ContinuousMemoryRange::createEmpty(); + + auto *BTC = SE->getBackedgeTakenCount(L); + if (isa(BTC)) + return ContinuousMemoryRange::createEmpty(); + + const auto *Start = StoreRec->getOperand(0); + if (isa(Start)) + return ContinuousMemoryRange::createEmpty(); + + auto *Count = SE->getAddExpr(BTC, SE->getOne(BTC->getType())); + auto SizeInBytes = + Store->getValueOperand()->getType()->getScalarSizeInBits() / 8; + auto *StoreSize = SE->getConstant(Count->getType(), SizeInBytes); + + auto *Step = StoreRec->getOperand(1); + if (Step != StoreSize) + return ContinuousMemoryRange::createEmpty(); + + const auto *Length = SE->getMulExpr(Count, StoreSize); + if (isa(Length)) + return ContinuousMemoryRange::createEmpty(); + return ContinuousMemoryRange(Start, Length); + } + + bool isOptimizableAcrossLoops(const Instruction *KillingI, + const Instruction *DeadI) { + if (!EnableOptimizationAcrossLoops) + return false; + + // None of these instructions is a store in a loop, so skip it. + // Alias Analysis can handle two memory intrinsics. + if (isa(DeadI) && isa(KillingI)) + return false; + + // Killing store must always execute. Otherwise, it needs additional + // analysis to prove that KillingI overwrites DeadI, see below for (...) + // if (x) + // arr[i] = C1 <-- Is arr filled by C2 later or not? + // ... + // for (...) + // if (y) + // arr[i] = C2 + // + // TODO: What if x == y as in the example above? We should probably rely on + // loop unswitch and other similar optimizations to remove if-statement that + // allow DSE to optimize it + if (isa(KillingI) && + StoresNotGuaranteedToExecute.contains(KillingI)) + return false; + + return true; + } + /// Return 'OW_Complete' if a store to the 'KillingLoc' location (by \p /// KillingI instruction) completely overwrites a store to the 'DeadLoc' /// location (by \p DeadI instruction). @@ -820,6 +1111,10 @@ const MemoryLocation &KillingLoc, const MemoryLocation &DeadLoc, int64_t &KillingOff, int64_t &DeadOff) { + if (isOptimizableAcrossLoops(KillingI, DeadI)) + if (isLoopVariantStoreOverwrite(KillingI, DeadI)) + return OW_Complete; + // AliasAnalysis does not always account for loops. Limit overwrite checks // to dependencies for which we can guarantee they are independent of any // loops they are in. @@ -1172,6 +1467,7 @@ if (!ContainsIrreducibleLoops && CurrentLI && CurrentLI == LI.getLoopFor(KillingDef->getParent())) return true; + // Otherwise check the memory location is invariant to any loops. 
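To make the arithmetic in computeMemoryRange concrete, this is the derivation for the loop in the @memset_eq test added below (a worked sketch; the SCEV spellings use the usual {Start,+,Step} notation and are not verbatim analysis output):

;   loop:
;     %iv  = phi i64 [ %iv.1, %loop ], [ 0, %loop.preheader ]
;     %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv
;     store i32 1, ptr %idx, align 4
;     %iv.1 = add nuw nsw i64 %iv, 1
;     %loop.cond = icmp eq i64 %iv.1, %n.cast
;     br i1 %loop.cond, label %exit.loopexit, label %loop
;
; SCEV of %idx         : {%ptr,+,4}<%loop>   (affine AddRec)
;   Start              = %ptr
;   Step               = 4                   == size of the stored i32, as required
; Backedge-taken count = %n.cast - 1         (loop entered only when %n != 0)
; Count                = BTC + 1             = %n.cast
; Length               = Count * StoreSize   = 4 * %n.cast
;
; This matches the range of the memset in the entry block, whose length is
; `shl nuw nsw i64 %n.cast, 2`, which is why that memset can be treated as
; fully overwritten by the loop.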
     return isGuaranteedLoopInvariant(CurrentLoc.Ptr);
   }
 
@@ -1308,10 +1604,13 @@
       // candidates for which we can guarantee they always store to the same
       // memory location and not located in different loops.
       if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
-        LLVM_DEBUG(dbgs() << "  ... not guaranteed loop independent\n");
-        WalkerStepLimit -= 1;
-        CanOptimize = false;
-        continue;
+
+        if (!EnableOptimizationAcrossLoops || MemRanges.empty()) {
+          LLVM_DEBUG(dbgs() << "  ... not guaranteed loop independent\n");
+          WalkerStepLimit -= 1;
+          CanOptimize = false;
+          continue;
+        }
       }
 
       if (IsMemTerm) {
@@ -1444,12 +1743,15 @@
       }
 
       // If this worklist walks back to the original memory access (and the
-      // pointer is not guarenteed loop invariant) then we cannot assume that a
-      // store kills itself.
+      // pointer is not guaranteed loop invariant) then we cannot assume that
+      // a store kills itself.
       if (MaybeDeadAccess == UseAccess &&
           !isGuaranteedLoopInvariant(MaybeDeadLoc.Ptr)) {
-        LLVM_DEBUG(dbgs() << "    ... found not loop invariant self access\n");
-        return None;
+        if (!EnableOptimizationAcrossLoops || !MemRanges.count(MaybeDeadI)) {
+          LLVM_DEBUG(dbgs()
+                     << "    ... found not loop invariant self access\n");
+          return None;
+        }
       }
       // Otherwise, for the KillingDef and MaybeDeadAccess we only have to check
       // if it reads the memory location.
@@ -1495,8 +1797,13 @@
     // MaybeDeadAccess to the exit.
     if (!isInvisibleToCallerAfterRet(KillingUndObj)) {
       SmallPtrSet<BasicBlock *, 16> KillingBlocks;
-      for (Instruction *KD : KillingDefs)
-        KillingBlocks.insert(KD->getParent());
+      for (Instruction *KD : KillingDefs) {
+        if (EnableOptimizationAcrossLoops) {
+          KillingBlocks.insert(tryToReplaceByGuard(KD, KD->getParent()));
+        } else {
+          KillingBlocks.insert(KD->getParent());
+        }
+      }
       assert(!KillingBlocks.empty() &&
              "Expected at least a single killing block");
 
@@ -1507,7 +1814,6 @@
           break;
         CommonPred = PDT.findNearestCommonDominator(CommonPred, BB);
       }
-
       // If CommonPred is in the set of killing blocks, just check if it
if (KillingBlocks.count(CommonPred)) { @@ -1563,8 +1869,81 @@ return {MaybeDeadAccess}; } + /** + * This function does not do anything with loop-invariant instructions and + * simply returns their KillingBlock, otherwise it tries to return a guard + */ + BasicBlock *tryToReplaceByGuard(Instruction *KillingI, + BasicBlock *KillingBlock) { + assert(KillingI->getParent() == KillingBlock && + "Killing instruction must be inside KillingBlock"); + if (!EnableOptimizationAcrossLoops) + return KillingBlock; + + if (!MemRanges.count(KillingI)) + return KillingBlock; + + if (!isa(KillingI)) + return KillingBlock; + + auto *L = LI.getLoopFor(KillingBlock); + assert(L && "Killing instruction must be inside a loop here"); + assert((L->isLoopSimplifyForm() && L->isRotatedForm()) || + L->getNumBlocks() == 1 && + "Loop must be in simplify rotated form or consist of one block"); + if (!L->isGuarded()) { + ++MissedGuard; + return KillingBlock; + } + + auto *Br = L->getLoopGuardBranch(); + assert(Br && + "Loop is in simplify form, but it does not seem to have a guard"); + if (auto *Guard = Br->getParent()) { + ++ReplacedGuard; + return Guard; + } + return KillingBlock; + } + + bool isLoopVariantStoreOverwrite(const Instruction *KillingI, + const Instruction *DeadI) { + auto KillingRange = MemRanges.find(KillingI); + auto DeadRange = MemRanges.find(DeadI); + + if (MemRanges.count(DeadI) && MemRanges.count(KillingI)) + return DeadRange->second == KillingRange->second; + + if (MemRanges.count(DeadI) && isa(KillingI)) { + auto MemIntrRange = computeMemoryRange(cast(KillingI)); + if (!MemIntrRange.isEmpty()) + ++ComputedMemIntrMemRanges; + return DeadRange->second == MemIntrRange; + } else if (MemRanges.count(KillingI) && isa(DeadI)) { + auto MemIntrRange = computeMemoryRange(cast(DeadI)); + if (!MemIntrRange.isEmpty()) + ++ComputedMemIntrMemRanges; + return KillingRange->second == MemIntrRange; + } + + return false; + } + + // TODO: If this instruction is loop-variant store, then we don't remove + // loops that had nothing more than just deleted store // Delete dead memory defs void deleteDeadInstruction(Instruction *SI) { +#ifdef LLVM_ENABLE_STATS + if (AreStatisticsEnabled()) { + if (isa(SI)) { + if (MemRanges.count(SI)) + ++LoopVariantStoreRemoved; + } + } +#endif + if (isa(SI)) { + ++RemovedMemIntrWithRange; + } MemorySSAUpdater Updater(&MSSA); SmallVector NowDeadInsts; NowDeadInsts.push_back(SI); @@ -1919,13 +2298,8 @@ } }; -static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, - DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI, - const LoopInfo &LI) { +static bool runDSEOptimizationLoop(DSEState &State) { bool MadeChange = false; - - DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { MemoryDef *KillingDef = State.MemDefs[I]; @@ -2034,10 +2408,10 @@ // We are re-using tryToMergePartialOverlappingStores, which requires // DeadSI to dominate DeadSI. // TODO: implement tryToMergeParialOverlappingStores using MemorySSA. - if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) { + if (DeadSI && KillingSI && State.DT.dominates(DeadSI, KillingSI)) { if (Constant *Merged = tryToMergePartialOverlappingStores( KillingSI, DeadSI, KillingOffset, DeadOffset, State.DL, - State.BatchAA, &DT)) { + State.BatchAA, &State.DT)) { // Update stored value of earlier store to merged constant. 
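For context on tryToReplaceByGuard above: in the guarded two-loop tests the killing store's own block is the loop body, which does not post-dominate the dead loop, while the loop-guard branch block does, so it is the guard block that ends up in the killing block set. A sketch of the relevant control flow from @overwrite_store (the annotations are mine, not pass output):

; guard:                              ; getLoopGuardBranch()->getParent();
;   br i1 %cond, label %preheader2, label %exit   ; every path out of %loop.1
; preheader2:                                     ; passes through this block
;   br label %loop.2
; loop.2:                             ; the killing store lives here, but this
;   ...                               ; block does not post-dominate %loop.1
;   store i32 3, ptr %loop.2.idx, align 4
;   ...
;   br i1 %loop.2.cond, label %exit.loopexit, label %loop.2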
DeadSI->setOperand(0, Merged); @@ -2085,6 +2459,61 @@ continue; } } + return MadeChange; +} + +static bool runAcrossLoopDSE(DSEState &State) { + if (!EnableOptimizationAcrossLoops) { + return false; + } + // Run across loop optimization at the end because it will preserve some time + // on things like calculating SCEVs because there are loops like for (...) + // P[i] = 1; + // ... + // P[i] = 3; + // Both stores are at the same loop level and DSE can remove them + // As both stores can be candidates for removal in across loop optimization + // we will remove all store we can first and later try to optimize survived + // ones So this one below will be transformed for (...) + // P[i] = 1; + // ... + // P[i] = 3; + // --> + // for (...) + // ... + // P[i] = 3; + // And P[i] = 3 will be removed in across loop optimization by some other + // store in other loop Plus, P[i] = 3 would be checked first and won't be + // considered as dead store killed by another loop's store see the examle + // below + // loop: + // ; 10 = MemoryPhi({preheader1,liveOnEntry},{loop,2}) + // %loop.iv = phi i64 [ 0, %preheader1 ], [ %loop.iv, %loop ] + // %loop.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.iv + // ; 1 = MemoryDef(10) + // store i32 2, ptr %loop.1.idx, align 4 + // ; 2 = MemoryDef(1) + // store i32 2, ptr %loop.1.idx, align 4 + // %loop.iv.1 = add nuw nsw i64 %loop.iv, 1 + // %loop.cond = icmp eq i64 %loop.iv.1, %n.zext1 + // br i1 %loop.cond, label %loop.exit, label % + // 2 = MemoryDef(1) is read by MemoryPhi and this Phi used by preceding store + // So it is better to try to kill 1 by 2 + + State.prepareStateForAcrossLoopOptimization(); + return runDSEOptimizationLoop(State); +} + +static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT, PostDominatorTree &PDT, + ScalarEvolution *SE, + const TargetLibraryInfo &TLI, + const LoopInfo &LI) { + + DSEState State(F, AA, MSSA, DT, PDT, SE, TLI, LI); + bool MadeChange = false; + + MadeChange |= runDSEOptimizationLoop(State); if (EnablePartialOverwriteTracking) for (auto &KV : State.IOLs) @@ -2092,6 +2521,7 @@ MadeChange |= State.eliminateRedundantStoresOfExistingValues(); MadeChange |= State.eliminateDeadWritesAtEndOfFunction(); + MadeChange |= runAcrossLoopDSE(State); return MadeChange; } } // end anonymous namespace @@ -2106,8 +2536,11 @@ MemorySSA &MSSA = AM.getResult(F).getMSSA(); PostDominatorTree &PDT = AM.getResult(F); LoopInfo &LI = AM.getResult(F); - - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + ScalarEvolution *SE = nullptr; + if (EnableOptimizationAcrossLoops) { + SE = &AM.getResult(F); + } + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2115,10 +2548,13 @@ NumRemainingStores += isa(&I); #endif - if (!Changed) + if (!Changed) { return PreservedAnalyses::all(); - + } PreservedAnalyses PA; + if (EnableOptimizationAcrossLoops) { + PA.preserve(); + } PA.preserveSet(); PA.preserve(); PA.preserve(); @@ -2148,8 +2584,11 @@ PostDominatorTree &PDT = getAnalysis().getPostDomTree(); LoopInfo &LI = getAnalysis().getLoopInfo(); - - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + ScalarEvolution *SE = nullptr; + if (EnableOptimizationAcrossLoops) { + SE = &getAnalysis().getSE(); + } + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2162,6 +2601,10 @@ void getAnalysisUsage(AnalysisUsage &AU) const override 
{ AU.setPreservesCFG(); + if (EnableOptimizationAcrossLoops) { + AU.addRequired(); + AU.addPreserved(); + } AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -2175,7 +2618,6 @@ AU.addPreserved(); } }; - } // end anonymous namespace char DSELegacyPass::ID = 0; @@ -2190,6 +2632,7 @@ INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, false) Index: llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-and-dom-memory-intrinsic.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-and-dom-memory-intrinsic.ll @@ -0,0 +1,288 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -opaque-pointers -dse-optimize-across-loops -passes=dse -S | FileCheck %s + +; These tests check if memory intrinsic is removed in %entry because %loop overwrites it +; and we account for invariants %n < 0 or %n == 0 (see function arguments in all tests) +; on some paths from %entry to %exit where it is possible + +; Check if memory intrinsic is removed when there is smax +; TODO: We can remove memset +define void @smax_and_memory_intrinsic(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @smax_and_memory_intrinsic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: [[MAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 0) +; CHECK-NEXT: [[MAX_CAST:%.*]] = zext i32 [[MAX]] to i64 +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[HEADER_COND:%.*]] = icmp eq i64 [[IV]], [[MAX_CAST]] +; CHECK-NEXT: br i1 [[HEADER_COND]], label [[EXIT:%.*]], label [[LATCH]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: latch: +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: br label [[HEADER]] +; +entry: + %n.cast = sext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %max = call i32 @llvm.smax.i32(i32 %n, i32 0) + %max.cast = zext i32 %max to i64 + br label %header + +header: ; preds = %latch, %entry + %iv = phi i64 [ %iv.1, %latch ], [ 0, %entry ] + %header.cond = icmp eq i64 %iv, %max.cast + br i1 %header.cond, label %exit, label %latch + +exit: ; preds = %header + ret void + +latch: ; preds = %header + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + br label %header +} + +; Check a simple case when we can remove memory intrinsic +define void @memset_eq(ptr nocapture %ptr, i32 %n) local_unnamed_addr { +; CHECK-LABEL: @memset_eq( +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: exit.loopexit: +; 
CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP]] +; + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp eq i32 %n, 0 + br i1 %entry.cond, label %exit, label %loop.preheader + +loop.preheader: ; preds = %entry + br label %loop + +exit.loopexit: ; preds = %loop + br label %exit + +exit: ; preds = %exit.loopexit, %entry + ret void + +loop: ; preds = %loop.preheader, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %loop.preheader ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit.loopexit, label %loop +} + +; Check if memory intrinsic in %entry is removed when store in %loop overwrites it completely, but +; the induction variable is decremented by moving from %n to 0 +; TODO: We don't remove memset in %entry now +define void @decremented_indvar_and_memory_intrinsic(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @decremented_indvar_and_memory_intrinsic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[IV_1:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV_1]] = add i32 [[IV]], -1 +; CHECK-NEXT: [[HEADER_COND:%.*]] = icmp sgt i32 [[IV_1]], -1 +; CHECK-NEXT: br i1 [[HEADER_COND]], label [[LATCH]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: latch: +; CHECK-NEXT: [[IV_1_CAST:%.*]] = zext i32 [[IV_1]] to i64 +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1_CAST]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: br label [[HEADER]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + br label %header + +header: ; preds = %latch, %entry + %iv = phi i32 [ %n, %entry ], [ %iv.1, %latch ] + %iv.1 = add i32 %iv, -1 + %header.cond = icmp sgt i32 %iv.1, -1 + br i1 %header.cond, label %latch, label %exit + +exit: ; preds = %header + ret void + +latch: ; preds = %header + %iv.1.cast = zext i32 %iv.1 to i64 + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv.1.cast + store i32 1, ptr %idx, align 4 + br label %header +} + +; Check if memory intrinsic in %entry is removed because its offset is different +; from what we have for store in %loop +; TODO: We don't account for different offset +define void @different_offsets(ptr nocapture %ptr.a, ptr nocapture %ptr.b, i32 %n) { +; CHECK-LABEL: @different_offsets( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PTR_A_OFFSET:%.*]] = getelementptr i32, ptr [[PTR_A:%.*]], i32 4 +; CHECK-NEXT: [[PTR_A_CAST:%.*]] = bitcast ptr [[PTR_A_OFFSET]] to ptr +; CHECK-NEXT: 
[[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[PTR_A_CAST]], i8 0, i64 [[N_CAST]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i64 [[N_CAST]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_2:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX_A:%.*]] = getelementptr inbounds i32, ptr [[PTR_A]], i64 [[IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[IDX_A]], align 4 +; CHECK-NEXT: [[IDX_B:%.*]] = getelementptr inbounds i32, ptr [[PTR_B:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 [[TMP]], ptr [[IDX_B]], align 4 +; CHECK-NEXT: store i32 1, ptr [[IDX_A]], align 4 +; CHECK-NEXT: [[IV_2]] = add nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_2]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %ptr.a.offset = getelementptr i32, ptr %ptr.a, i32 4 + %ptr.a.cast = bitcast ptr %ptr.a.offset to ptr + %n.cast = zext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + call void @llvm.memset.p0i8.i64(ptr %ptr.a.cast, i8 0, i64 %n.cast, i1 false) + %entry.cond = icmp eq i64 %n.cast, 0 + br i1 %entry.cond, label %exit, label %loop +exit: + ret void + +loop: + %iv = phi i64 [ %iv.2, %loop ], [ 0, %entry ] + %idx.a = getelementptr inbounds i32, ptr %ptr.a, i64 %iv + %tmp = load i32, ptr %idx.a + %idx.b = getelementptr inbounds i32, ptr %ptr.b, i64 %iv + store i32 %tmp, ptr %idx.b + store i32 1, ptr %idx.a + %iv.2 = add nuw nsw i64 %n.cast, 2 + %loop.cond = icmp eq i64 %iv.2, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; It checks if memory intrinsic is not removed when there is a path on which +; we don't store anything and on another one we have a store so this intrinsic is not redundant +define void @on_overwrite_in_loop(ptr nocapture %ptr, i32 %n, i1 zeroext %x) { +; CHECK-LABEL: @on_overwrite_in_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_SEXT:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_SEXT]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: preheader: +; CHECK-NEXT: [[N_ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[IV_1:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[X:%.*]], label [[DUMMY_BLOCK:%.*]], label [[STORE_BLOCK:%.*]] +; CHECK: dummy.block: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[LATCH]] +; CHECK: store.block: +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LATCH_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_ZEXT]] +; CHECK-NEXT: br i1 [[LATCH_COND]], label [[EXIT]], label [[HEADER]] +; +entry: + %n.sext = sext i32 %n to i64 + %len = shl nsw i64 %n.sext, 2 + call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp sgt i32 %n, 0 + br i1 %entry.cond, label %preheader, label %exit + +preheader: ; preds = %entry + %n.zext = zext i32 %n to 
i64 + br label %header + +exit: ; preds = %latch, %entry + ret void + +header: ; preds = %preheader, %latch + %iv = phi i64 [ 0, %preheader ], [ %iv.1, %latch ] + br i1 %x, label %dummy.block, label %store.block + +dummy.block: ; preds = %header + call void @dummy() + br label %latch + +store.block: ; preds = %header + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + br label %latch + +latch: ; preds = %dummy.block, %store.block + %iv.1 = add nuw nsw i64 %iv, 1 + %latch.cond = icmp eq i64 %iv.1, %n.zext + br i1 %latch.cond, label %exit, label %header +} + +define void @constant_length(ptr nocapture %ptr) { +; CHECK-LABEL: @constant_length( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], 128 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT:%.*]], label [[LOOP]] +; +entry: + call void @llvm.memset.p0i8.i64(ptr noundef nonnull align 4 dereferenceable(512) %ptr, i8 0, i64 512, i1 false) + br label %loop + +exit: ; preds = %loop + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ 0, %entry ], [ %iv.1, %loop ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, 128 + br i1 %loop.cond, label %exit, label %loop +} + +declare void @dummy() +declare i32 @llvm.smax.i32(i32, i32) +declare void @llvm.memset.p0i8.i64(ptr nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memcpy.p0i8.p0i8.i64(ptr, ptr, i64, i1) Index: llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-and-post-dom-memory-intrinsic.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-and-post-dom-memory-intrinsic.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -opaque-pointers -dse-optimize-across-loops -passes=dse -S | FileCheck %s + +; These tests check if store in %loop because memory intrinsic in %exit overwrites it + +; Check if store in %loop is removed because memory intrinsic in +; %exit overwrites it +; TODO: Store is not removed even if it is redundant +define void @post_dom_memset(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @post_dom_memset( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[PREHEADER:%.*]] +; CHECK: preheader: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: preexit: +; CHECK-NEXT: [[PREEXIT_LEN:%.*]] = shl nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[LEN:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[PREEXIT_LEN]], [[PREEXIT:%.*]] ] +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[IV_1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr 
[[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[PREEXIT]], label [[LOOP]] +; +entry: + %entry.cond = icmp eq i32 %n, 0 + br i1 %entry.cond, label %exit, label %preheader + +preheader: ; preds = %entry + %n.cast = zext i32 %n to i64 + br label %loop + +preexit: ; preds = %loop + %preexit.len = shl nuw nsw i64 %n.cast, 2 + br label %exit + +exit: ; preds = %preexit, %entry + %len = phi i64 [ 0, %entry ], [ %preexit.len, %preexit ] + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + ret void + +loop: ; preds = %preheader, %loop + %iv = phi i64 [ 0, %preheader ], [ %iv.1, %loop ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %preexit, label %loop +} + +declare void @llvm.memset.p0i8.i64(ptr nocapture writeonly, i8, i64, i1 immarg) Index: llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-negative-tests.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-store-negative-tests.ll @@ -0,0 +1,227 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -opaque-pointers -dse-optimize-across-loops -passes=dse -S | FileCheck %s + +; These tests check if memory intrinsic is not removed +; + +; Check if store in %11 is not removed because store in %19 does not always overwrite it +define dso_local void @store_is_not_always_overwritten(i32* nocapture noundef writeonly %0, i32 noundef %1, i1 noundef %2) local_unnamed_addr #0 { +; CHECK-LABEL: @store_is_not_always_overwritten( +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP1:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP7:%.*]] +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: br label [[TMP11:%.*]] +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP16:%.*]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: br label [[TMP17:%.*]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ 0, [[TMP5]] ], [ [[TMP14:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 [[TMP12]] +; CHECK-NEXT: store i32 1, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14]] = add nuw nsw i64 [[TMP12]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[TMP14]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP15]], label [[TMP7]], label [[TMP11]] +; CHECK: 16: +; CHECK-NEXT: ret void +; CHECK: 17: +; CHECK-NEXT: [[TMP18:%.*]] = phi i64 [ 0, [[TMP9]] ], [ [[TMP22:%.*]], [[TMP21:%.*]] ] +; CHECK-NEXT: br i1 [[TMP2:%.*]], label [[TMP19:%.*]], label [[TMP21]] +; CHECK: 19: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[TMP18]] +; CHECK-NEXT: store i32 2, ptr [[TMP20]], align 4 +; CHECK-NEXT: br label [[TMP21]] +; CHECK: 21: +; CHECK-NEXT: [[TMP22]] = add nuw nsw i64 [[TMP18]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], [[TMP10]] +; CHECK-NEXT: br i1 [[TMP23]], label [[TMP16]], label [[TMP17]] +; + %4 = icmp sgt i32 %1, 0 + br i1 %4, label %5, label %7 + +5: ; preds = %3 + %6 = zext i32 %1 to i64 + br label %11 + +7: ; preds = %11, %3 + %8 = icmp sgt i32 %1, 0 + br i1 %8, label %9, label %16 + +9: ; 
preds = %7 + %10 = zext i32 %1 to i64 + br label %17 + +11: ; preds = %5, %11 + %12 = phi i64 [ 0, %5 ], [ %14, %11 ] + %13 = getelementptr inbounds i32, i32* %0, i64 %12 + store i32 1, i32* %13, align 4 + %14 = add nuw nsw i64 %12, 1 + %15 = icmp eq i64 %14, %6 + br i1 %15, label %7, label %11 + +16: ; preds = %21, %7 + ret void + +17: ; preds = %9, %21 + %18 = phi i64 [ 0, %9 ], [ %22, %21 ] + br i1 %2, label %19, label %21 + +19: ; preds = %17 + %20 = getelementptr inbounds i32, i32* %0, i64 %18 + store i32 2, i32* %20, align 4 + br label %21 + +21: ; preds = %17, %19 + %22 = add nuw nsw i64 %18, 1 + %23 = icmp eq i64 %22, %10 + br i1 %23, label %16, label %17 +} + +; Check if memory intrinsic in %entry is not removed if there is a step more than one +; because store in %loop does not overwrite it completely leaving gaps after each step +define void @step_with_gaps(ptr nocapture %ptr, i32 %n) { +; CHECK-LABEL: @step_with_gaps( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[PTR:%.*]], i8 0, i64 [[N_CAST]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i64 [[N_CAST]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_2:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_2]] = add nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_2]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + call void @llvm.memset.p0i8.i64(ptr %ptr, i8 0, i64 %n.cast, i1 false) + %entry.cond = icmp eq i64 %n.cast, 0 + br i1 %entry.cond, label %exit, label %loop +exit: + ret void + +loop: + %iv = phi i64 [ %iv.2, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx + %iv.2 = add nuw nsw i64 %n.cast, 2 + %loop.cond = icmp eq i64 %iv.2, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check if memory intrinsic in %entry is not removed if they write to different pointers with the store in %loop +; even if the size of underlying arrays are equals +define void @different_base_ptrs(ptr nocapture %ptr.a, i32* nocapture %ptr.b, i32 %n) { +; CHECK-LABEL: @different_base_ptrs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[PTR_A:%.*]], i8 0, i64 [[N_CAST]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i64 [[N_CAST]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_2:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR_B:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_2]] = add nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_2]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + call void @llvm.memset.p0i8.i64(ptr %ptr.a, i8 0, i64 
%n.cast, i1 false) + %entry.cond = icmp eq i64 %n.cast, 0 + br i1 %entry.cond, label %exit, label %loop +exit: + ret void + +loop: + %iv = phi i64 [ %iv.2, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr.b, i64 %iv + store i32 1, ptr %idx + %iv.2 = add nuw nsw i64 %n.cast, 2 + %loop.cond = icmp eq i64 %iv.2, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Tricky case where GEP is in another loop that can cause false removal. +; Store in %loop.2 must not cause removal of store in %loop.1 +define void @store_and_gep_in_different_loops(ptr nocapture %ptr, i32 %n) { +; CHECK-LABEL: @store_and_gep_in_different_loops( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER1:%.*]], label [[EXIT:%.*]] +; CHECK: preheader1: +; CHECK-NEXT: [[N_ZEXT1:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_1:%.*]] +; CHECK: guard: +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER2:%.*]], label [[EXIT]] +; CHECK: preheader2: +; CHECK-NEXT: [[N_ZEXT2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1: +; CHECK-NEXT: [[LOOP_1_IV:%.*]] = phi i64 [ 0, [[PREHEADER1]] ], [ [[LOOP_1_IV_1:%.*]], [[LOOP_1]] ] +; CHECK-NEXT: [[LOOP_1_IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[LOOP_1_IV]] +; CHECK-NEXT: store i32 2, ptr [[LOOP_1_IDX]], align 4 +; CHECK-NEXT: [[LOOP_1_IV_1]] = add nuw nsw i64 [[LOOP_1_IV]], 1 +; CHECK-NEXT: [[LOOP_1_COND:%.*]] = icmp eq i64 [[LOOP_1_IV_1]], [[N_ZEXT1]] +; CHECK-NEXT: br i1 [[LOOP_1_COND]], label [[GUARD:%.*]], label [[LOOP_1]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.2: +; CHECK-NEXT: [[LOOP_2_IV:%.*]] = phi i64 [ 0, [[PREHEADER2]] ], [ [[LOOP_2_IV_1:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: store i32 3, ptr [[LOOP_1_IDX]], align 4 +; CHECK-NEXT: [[LOOP_2_IV_1]] = add nuw nsw i64 [[LOOP_2_IV]], 1 +; CHECK-NEXT: [[LOOP_2_COND:%.*]] = icmp eq i64 [[LOOP_2_IV_1]], [[N_ZEXT2]] +; CHECK-NEXT: br i1 [[LOOP_2_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]] +; + %cond = icmp sgt i32 %n, 0 + br i1 %cond, label %preheader1, label %exit + +preheader1: ; preds = %entry + %n.zext1 = zext i32 %n to i64 + br label %loop.1 + +guard: ; preds = %loop.1 + br i1 %cond, label %preheader2, label %exit + +preheader2: ; preds = %guard + %n.zext2 = zext i32 %n to i64 + br label %loop.2 + +loop.1: ; preds = %loop.1, %preheader1 + %loop.1.iv = phi i64 [ 0, %preheader1 ], [ %loop.1.iv.1, %loop.1 ] + %loop.1.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv + store i32 2, ptr %loop.1.idx, align 4 + %loop.1.iv.1 = add nuw nsw i64 %loop.1.iv, 1 + %loop.1.cond = icmp eq i64 %loop.1.iv.1, %n.zext1 + br i1 %loop.1.cond, label %guard, label %loop.1 + +exit.loopexit: ; preds = %loop.2 + br label %exit + +exit: ; preds = %exit.loopexit, %guard, %entry + ret void + +loop.2: ; preds = %loop.2, %preheader2 + %loop.2.iv = phi i64 [ 0, %preheader2 ], [ %loop.2.iv.1, %loop.2 ] + store i32 3, ptr %loop.1.idx, align 4 + %loop.2.iv.1 = add nuw nsw i64 %loop.2.iv, 1 + %loop.2.cond = icmp eq i64 %loop.2.iv.1, %n.zext2 + br i1 %loop.2.cond, label %exit.loopexit, label %loop.2 +} + +declare void @llvm.memset.p0i8.i64(ptr nocapture writeonly, i8, i64, i1 immarg) Index: llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-stores.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/Loop/loop-variant-stores.ll @@ -0,0 +1,325 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -opaque-pointers -dse-optimize-across-loops -passes=dse -S | FileCheck %s + +; These tests check if store is removed because there is overwrite in another +; loop + + +; The simplest case when one store overwrites another when they are in different loops +; Store in %loop.2 must cause removal of store in %loop.1 +define void @overwrite_store(ptr nocapture %ptr, i32 %n) { +; CHECK-LABEL: @overwrite_store( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER1:%.*]], label [[EXIT:%.*]] +; CHECK: preheader1: +; CHECK-NEXT: [[N_ZEXT1:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_1:%.*]] +; CHECK: guard: +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER2:%.*]], label [[EXIT]] +; CHECK: preheader2: +; CHECK-NEXT: [[N_ZEXT2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1: +; CHECK-NEXT: [[LOOP_1_IV:%.*]] = phi i64 [ 0, [[PREHEADER1]] ], [ [[LOOP_1_IV_1:%.*]], [[LOOP_1]] ] +; CHECK-NEXT: [[LOOP_1_IV_1]] = add nuw nsw i64 [[LOOP_1_IV]], 1 +; CHECK-NEXT: [[LOOP_1_COND:%.*]] = icmp eq i64 [[LOOP_1_IV_1]], [[N_ZEXT1]] +; CHECK-NEXT: br i1 [[LOOP_1_COND]], label [[GUARD:%.*]], label [[LOOP_1]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.2: +; CHECK-NEXT: [[LOOP_2_IV:%.*]] = phi i64 [ 0, [[PREHEADER2]] ], [ [[LOOP_2_IV_1:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[LOOP_2_IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[LOOP_2_IV]] +; CHECK-NEXT: store i32 3, ptr [[LOOP_2_IDX]], align 4 +; CHECK-NEXT: [[LOOP_2_IV_1]] = add nuw nsw i64 [[LOOP_2_IV]], 1 +; CHECK-NEXT: [[LOOP_2_COND:%.*]] = icmp eq i64 [[LOOP_2_IV_1]], [[N_ZEXT2]] +; CHECK-NEXT: br i1 [[LOOP_2_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]] +; + %cond = icmp sgt i32 %n, 0 + br i1 %cond, label %preheader1, label %exit + +preheader1: ; preds = %entry + %n.zext1 = zext i32 %n to i64 + br label %loop.1 + +guard: ; preds = %loop.1 + br i1 %cond, label %preheader2, label %exit + +preheader2: ; preds = %guard + %n.zext2 = zext i32 %n to i64 + br label %loop.2 + +loop.1: ; preds = %loop.1, %preheader1 + %loop.1.iv = phi i64 [ 0, %preheader1 ], [ %loop.1.iv.1, %loop.1 ] + %loop.1.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv + store i32 2, ptr %loop.1.idx, align 4 + %loop.1.iv.1 = add nuw nsw i64 %loop.1.iv, 1 + %loop.1.cond = icmp eq i64 %loop.1.iv.1, %n.zext1 + br i1 %loop.1.cond, label %guard, label %loop.1 + +exit.loopexit: ; preds = %loop.2 + br label %exit + +exit: ; preds = %exit.loopexit, %guard, %entry + ret void + +loop.2: ; preds = %loop.2, %preheader2 + %loop.2.iv = phi i64 [ 0, %preheader2 ], [ %loop.2.iv.1, %loop.2 ] + %loop.2.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.2.iv + store i32 3, ptr %loop.2.idx, align 4 + %loop.2.iv.1 = add nuw nsw i64 %loop.2.iv, 1 + %loop.2.cond = icmp eq i64 %loop.2.iv.1, %n.zext2 + br i1 %loop.2.cond, label %exit.loopexit, label %loop.2 +} + +; Check if store in %loop.1.header is removed even when there are more than one path +; to %loop.1.latch +define void @must_execute_store(ptr %ptr, i32 %n, i1 %x) { +; CHECK-LABEL: @must_execute_store( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER1:%.*]], label [[EXIT:%.*]] +; CHECK: preheader1: +; CHECK-NEXT: [[N_ZEXT1:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]] +; CHECK: guard: +; 
CHECK-NEXT: br i1 [[COND]], label [[PREHEADER2:%.*]], label [[EXIT]] +; CHECK: preheader2: +; CHECK-NEXT: [[N_ZEXT2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1.header: +; CHECK-NEXT: [[LOOP_1_IV:%.*]] = phi i64 [ 0, [[PREHEADER1]] ], [ [[LOOP_1_IV_1:%.*]], [[LOOP_1_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[X:%.*]], label [[LOOP_1_DUMMY:%.*]], label [[LOOP_1_DUMMY2:%.*]] +; CHECK: loop.1.dummy: +; CHECK-NEXT: tail call void @dummy1() +; CHECK-NEXT: br label [[LOOP_1_LATCH]] +; CHECK: loop.1.dummy2: +; CHECK-NEXT: tail call void @dummy2() +; CHECK-NEXT: br label [[LOOP_1_LATCH]] +; CHECK: loop.1.latch: +; CHECK-NEXT: [[LOOP_1_IV_1]] = add nuw nsw i64 [[LOOP_1_IV]], 1 +; CHECK-NEXT: [[LOOP_1_COND:%.*]] = icmp eq i64 [[LOOP_1_IV_1]], [[N_ZEXT1]] +; CHECK-NEXT: br i1 [[LOOP_1_COND]], label [[GUARD:%.*]], label [[LOOP_1_HEADER]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.2: +; CHECK-NEXT: [[LOOP_2_IV:%.*]] = phi i64 [ 0, [[PREHEADER2]] ], [ [[LOOP_2_IV_1:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[LOOP_2_IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[LOOP_2_IV]] +; CHECK-NEXT: store i32 3, ptr [[LOOP_2_IDX]], align 4 +; CHECK-NEXT: [[LOOP_2_IV_1]] = add nuw nsw i64 [[LOOP_2_IV]], 1 +; CHECK-NEXT: [[LOOP_2_COND:%.*]] = icmp eq i64 [[LOOP_2_IV_1]], [[N_ZEXT2]] +; CHECK-NEXT: br i1 [[LOOP_2_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]] +; +%cond = icmp sgt i32 %n, 0 +br i1 %cond, label %preheader1, label %exit + +preheader1: ; preds = %entry +%n.zext1 = zext i32 %n to i64 +br label %loop.1.header + +guard: ; preds = %loop.1.latch +br i1 %cond, label %preheader2, label %exit + +preheader2: ; preds = %guard +%n.zext2 = zext i32 %n to i64 +br label %loop.2 + +loop.1.header: ; preds = %loop.1.latch, %preheader1 +%loop.1.iv = phi i64 [ 0, %preheader1 ], [ %loop.1.iv.1, %loop.1.latch ] +%loop.1.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv +store i32 2, ptr %loop.1.idx, align 4 +br i1 %x, label %loop.1.dummy, label %loop.1.dummy2 + +loop.1.dummy: ; preds = %loop.1.header +tail call void @dummy1() +br label %loop.1.latch + +loop.1.dummy2: ; preds = %loop.1.header +tail call void @dummy2() +br label %loop.1.latch + +loop.1.latch: ; preds = %loop.1.dummy2, %loop.1.dummy +%loop.1.iv.1 = add nuw nsw i64 %loop.1.iv, 1 +%loop.1.cond = icmp eq i64 %loop.1.iv.1, %n.zext1 +br i1 %loop.1.cond, label %guard, label %loop.1.header + +exit.loopexit: ; preds = %loop.2 +br label %exit + +exit: ; preds = %exit.loopexit, %guard, %entry +ret void + +loop.2: ; preds = %loop.2, %preheader2 +%loop.2.iv = phi i64 [ 0, %preheader2 ], [ %loop.2.iv.1, %loop.2 ] +%loop.2.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.2.iv +store i32 3, ptr %loop.2.idx, align 4 +%loop.2.iv.1 = add nuw nsw i64 %loop.2.iv, 1 +%loop.2.cond = icmp eq i64 %loop.2.iv.1, %n.zext2 +br i1 %loop.2.cond, label %exit.loopexit, label %loop.2 +} + +; Check if store in %loop.1.store is removed even when it does not always execute +define void @not_always_execute(ptr %ptr, i32 %n, i1 %x) { +; CHECK-LABEL: @not_always_execute( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER1:%.*]], label [[EXIT:%.*]] +; CHECK: preheader1: +; CHECK-NEXT: [[N_ZEXT1:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]] +; CHECK: guard: +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER2:%.*]], label [[EXIT]] +; CHECK: preheader2: +; CHECK-NEXT: [[N_ZEXT2:%.*]] = zext 
i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1.header: +; CHECK-NEXT: [[LOOP_1_IV:%.*]] = phi i64 [ 0, [[PREHEADER1]] ], [ [[LOOP_1_IV_1:%.*]], [[LOOP_1_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[X:%.*]], label [[LOOP_1_STORE:%.*]], label [[LOOP_1_DUMMY2:%.*]] +; CHECK: loop.1.store: +; CHECK-NEXT: br label [[LOOP_1_LATCH]] +; CHECK: loop.1.dummy2: +; CHECK-NEXT: tail call void @dummy2() +; CHECK-NEXT: br label [[LOOP_1_LATCH]] +; CHECK: loop.1.latch: +; CHECK-NEXT: [[LOOP_1_IV_1]] = add nuw nsw i64 [[LOOP_1_IV]], 1 +; CHECK-NEXT: [[LOOP_1_COND:%.*]] = icmp eq i64 [[LOOP_1_IV_1]], [[N_ZEXT1]] +; CHECK-NEXT: br i1 [[LOOP_1_COND]], label [[GUARD:%.*]], label [[LOOP_1_HEADER]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.2: +; CHECK-NEXT: [[LOOP_2_IV:%.*]] = phi i64 [ 0, [[PREHEADER2]] ], [ [[LOOP_2_IV_1:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[LOOP_2_IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[LOOP_2_IV]] +; CHECK-NEXT: store i32 3, ptr [[LOOP_2_IDX]], align 4 +; CHECK-NEXT: [[LOOP_2_IV_1]] = add nuw nsw i64 [[LOOP_2_IV]], 1 +; CHECK-NEXT: [[LOOP_2_COND:%.*]] = icmp eq i64 [[LOOP_2_IV_1]], [[N_ZEXT2]] +; CHECK-NEXT: br i1 [[LOOP_2_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]] +; + %cond = icmp sgt i32 %n, 0 + br i1 %cond, label %preheader1, label %exit + +preheader1: ; preds = %entry + %n.zext1 = zext i32 %n to i64 + br label %loop.1.header + +guard: ; preds = %loop.1.latch + br i1 %cond, label %preheader2, label %exit + +preheader2: ; preds = %guard + %n.zext2 = zext i32 %n to i64 + br label %loop.2 + +loop.1.header: ; preds = %loop.1.latch, %preheader1 + %loop.1.iv = phi i64 [ 0, %preheader1 ], [ %loop.1.iv.1, %loop.1.latch ] + br i1 %x, label %loop.1.store, label %loop.1.dummy2 + +loop.1.store: ; preds = %loop.1.header + %loop.1.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv + store i32 2, ptr %loop.1.idx, align 4 + br label %loop.1.latch + +loop.1.dummy2: ; preds = %loop.1.header + tail call void @dummy2() + br label %loop.1.latch + +loop.1.latch: ; preds = %loop.1.dummy2, %loop.1.store + %loop.1.iv.1 = add nuw nsw i64 %loop.1.iv, 1 + %loop.1.cond = icmp eq i64 %loop.1.iv.1, %n.zext1 + br i1 %loop.1.cond, label %guard, label %loop.1.header + +exit.loopexit: ; preds = %loop.2 + br label %exit + +exit: ; preds = %exit.loopexit, %guard, %entry + ret void + +loop.2: ; preds = %loop.2, %preheader2 + %loop.2.iv = phi i64 [ 0, %preheader2 ], [ %loop.2.iv.1, %loop.2 ] + %loop.2.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.2.iv + store i32 3, ptr %loop.2.idx, align 4 + %loop.2.iv.1 = add nuw nsw i64 %loop.2.iv, 1 + %loop.2.cond = icmp eq i64 %loop.2.iv.1, %n.zext2 + br i1 %loop.2.cond, label %exit.loopexit, label %loop.2 +} + +define void @multiple_stores_in_one_loop(ptr nocapture %ptr, i32 %n) { +; CHECK-LABEL: @multiple_stores_in_one_loop( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER1:%.*]], label [[EXIT:%.*]] +; CHECK: preheader1: +; CHECK-NEXT: [[N_ZEXT1:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_1:%.*]] +; CHECK: guard: +; CHECK-NEXT: br i1 [[COND]], label [[PREHEADER2:%.*]], label [[EXIT]] +; CHECK: preheader2: +; CHECK-NEXT: [[N_ZEXT2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1: +; CHECK-NEXT: [[LOOP_1_IV:%.*]] = phi i64 [ 0, [[PREHEADER1]] ], [ [[LOOP_1_IV_1:%.*]], [[LOOP_1]] ] +; CHECK-NEXT: [[LOOP_1_IV_1]] = add nuw nsw i64 [[LOOP_1_IV]], 
1 +; CHECK-NEXT: [[LOOP_1_COND:%.*]] = icmp eq i64 [[LOOP_1_IV_1]], [[N_ZEXT1]] +; CHECK-NEXT: br i1 [[LOOP_1_COND]], label [[GUARD:%.*]], label [[LOOP_1]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.2: +; CHECK-NEXT: [[LOOP_2_IV:%.*]] = phi i64 [ 0, [[PREHEADER2]] ], [ [[LOOP_2_IV_1:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[LOOP_2_IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[LOOP_2_IV]] +; CHECK-NEXT: store i32 3, ptr [[LOOP_2_IDX]], align 4 +; CHECK-NEXT: [[LOOP_2_IV_1]] = add nuw nsw i64 [[LOOP_2_IV]], 1 +; CHECK-NEXT: [[LOOP_2_COND:%.*]] = icmp eq i64 [[LOOP_2_IV_1]], [[N_ZEXT2]] +; CHECK-NEXT: br i1 [[LOOP_2_COND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]] +; + %cond = icmp sgt i32 %n, 0 + br i1 %cond, label %preheader1, label %exit + +preheader1: ; preds = %entry + %n.zext1 = zext i32 %n to i64 + br label %loop.1 + +guard: ; preds = %loop.1 + br i1 %cond, label %preheader2, label %exit + +preheader2: ; preds = %guard + %n.zext2 = zext i32 %n to i64 + br label %loop.2 + +loop.1: ; preds = %loop.1, %preheader1 + %loop.1.iv = phi i64 [ 0, %preheader1 ], [ %loop.1.iv.1, %loop.1 ] + %loop.1.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv + %loop.1.idx2 = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv + %loop.1.idx3 = getelementptr inbounds i32, ptr %ptr, i64 %loop.1.iv + store i32 2, ptr %loop.1.idx2, align 4 + store i32 2, ptr %loop.1.idx, align 4 + store i32 2, ptr %loop.1.idx, align 4 + store i32 2, ptr %loop.1.idx2, align 4 + store i32 2, ptr %loop.1.idx, align 4 + store i32 2, ptr %loop.1.idx3, align 4 + %loop.1.iv.1 = add nuw nsw i64 %loop.1.iv, 1 + %loop.1.cond = icmp eq i64 %loop.1.iv.1, %n.zext1 + br i1 %loop.1.cond, label %guard, label %loop.1 + +exit.loopexit: ; preds = %loop.2 + br label %exit + +exit: ; preds = %exit.loopexit, %guard, %entry + ret void + +loop.2: ; preds = %loop.2, %preheader2 + %loop.2.iv = phi i64 [ 0, %preheader2 ], [ %loop.2.iv.1, %loop.2 ] + %loop.2.idx = getelementptr inbounds i32, ptr %ptr, i64 %loop.2.iv + store i32 3, ptr %loop.2.idx, align 4 + %loop.2.iv.1 = add nuw nsw i64 %loop.2.iv, 1 + %loop.2.cond = icmp eq i64 %loop.2.iv.1, %n.zext2 + br i1 %loop.2.cond, label %exit.loopexit, label %loop.2 +} + + +declare void @dummy1() #0 + +declare void @dummy2() #0 + +attributes #0 = { argmemonly nounwind readnone willreturn }
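One coverage note: the tests declare llvm.memcpy but only exercise llvm.memset, while computeMemoryRange(const AnyMemIntrinsic *) is written against AnyMemIntrinsic, so a memcpy that is fully overwritten by a later loop should be analyzable the same way. A possible additional case in the style of @memset_eq (an untested sketch based on that assumption; CHECK lines deliberately omitted):

define void @memcpy_eq(ptr nocapture %dst, ptr nocapture readonly %src, i32 %n) {
  %n.cast = zext i32 %n to i64
  %len = shl nuw nsw i64 %n.cast, 2
  tail call void @llvm.memcpy.p0i8.p0i8.i64(ptr align 4 %dst, ptr align 4 %src, i64 %len, i1 false)
  %entry.cond = icmp eq i32 %n, 0
  br i1 %entry.cond, label %exit, label %loop.preheader

loop.preheader:
  br label %loop

exit.loopexit:
  br label %exit

exit:
  ret void

loop:
  %iv = phi i64 [ %iv.1, %loop ], [ 0, %loop.preheader ]
  %idx = getelementptr inbounds i32, ptr %dst, i64 %iv
  store i32 1, ptr %idx, align 4
  %iv.1 = add nuw nsw i64 %iv, 1
  %loop.cond = icmp eq i64 %iv.1, %n.cast
  br i1 %loop.cond, label %exit.loopexit, label %loop
}

declare void @llvm.memcpy.p0i8.p0i8.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)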