Index: llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -47,6 +47,8 @@
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
@@ -111,6 +113,14 @@
 DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
               "Controls which MemoryDefs are eliminated.");
 
+// Temporary feature flag for eliminating memory intrinsics that are
+// overwritten by stores in loops.
+// TODO: Remove this flag when this feature becomes stable.
+static cl::opt<bool> EnableMemIntrinsicEliminationByStores(
+    "enable-mem-intrinsic-elimination-by-stores", cl::init(true), cl::Hidden,
+    cl::desc("Enable elimination of redundant memory intrinsics "
+             "that are overwritten by stores with loop-variant addresses"));
+
 static cl::opt<bool>
 EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
                                cl::init(true), cl::Hidden,
@@ -857,6 +867,32 @@
   return false;
 }
 
+// Represents a memory range that a MemoryDef overwrites completely, without
+// gaps. If the range may contain at least one gap, do not create an instance
+// of this type.
+// TODO: We also need an offset from the base pointer.
+struct ContinuousMemoryRange {
+  const SCEV *Start;
+  const SCEV *Length;
+
+  ContinuousMemoryRange(const SCEV *Start, const SCEV *Length)
+      : Start(Start), Length(Length) {}
+
+  static ContinuousMemoryRange createEmpty() {
+    return ContinuousMemoryRange(nullptr, nullptr);
+  }
+
+  bool isEmpty() { return Start == nullptr || Length == nullptr; }
+
+  bool operator==(const ContinuousMemoryRange &Other) const {
+    return Start == Other.Start && Length == Other.Length;
+  }
+
+  bool operator!=(const ContinuousMemoryRange &Other) const {
+    return !(*this == Other);
+  }
+};
+
 struct DSEState {
   Function &F;
   AliasAnalysis &AA;
@@ -876,7 +912,7 @@
   const TargetLibraryInfo &TLI;
   const DataLayout &DL;
   const LoopInfo &LI;
-
+  ScalarEvolution &SE;
   // Whether the function contains any irreducible control flow, useful for
   // being accurately able to detect loops.
   bool ContainsIrreducibleLoops;
@@ -898,20 +934,35 @@
   // accesses are executed before another access.
   DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
 
+  // Computed memory ranges, in particular for loop-variant stores and for
+  // memory intrinsics.
+  DenseMap<const Instruction *, ContinuousMemoryRange> MemRanges;
+  DenseMap<const Loop *, SmallPtrSet<StoreInst *, 8>> LoopDependantStores;
+
   /// Keep track of instructions (partly) overlapping with killing MemoryDefs per
   /// basic block.
   DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
 
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
-           const LoopInfo &LI)
+           const LoopInfo &LI, ScalarEvolution &SE)
       : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
-        DL(F.getParent()->getDataLayout()), LI(LI) {}
+        DL(F.getParent()->getDataLayout()), LI(LI), SE(SE) {}
 
   static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                       DominatorTree &DT, PostDominatorTree &PDT,
-                      const TargetLibraryInfo &TLI, const LoopInfo &LI) {
-    DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+                      ScalarEvolution &SE, const TargetLibraryInfo &TLI,
+                      const LoopInfo &LI) {
+    DSEState State(F, AA, MSSA, DT, PDT, TLI, LI, SE);
+
+    auto AddMemRange = [&State](MemoryDef *MD, ContinuousMemoryRange &Range) {
+      if (!EnableMemIntrinsicEliminationByStores)
+        return;
+      if (Range.isEmpty())
+        return;
+      State.MemRanges.insert({MD->getMemoryInst(), Range});
+    };
+
+    bool HasLoops = !LI.empty();
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
     unsigned PO = 0;
@@ -921,11 +972,29 @@
         MemoryAccess *MA = MSSA.getMemoryAccess(&I);
         if (I.mayThrow() && !MA)
           State.ThrowingBlocks.insert(I.getParent());
-
         auto *MD = dyn_cast_or_null<MemoryDef>(MA);
         if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
-            (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I)))
+            (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I))) {
+          if (EnableMemIntrinsicEliminationByStores && HasLoops) {
+            if (auto *Store = dyn_cast<StoreInst>(&I)) {
+              if (auto *L = LI.getLoopFor(Store->getParent())) {
+                // TODO: Handle cases with several latches
+                if (!State.isGuaranteedLoopInvariant(
+                        Store->getPointerOperand()) &&
+                    L->getLoopLatch()) {
+                  State.LoopDependantStores[L].insert(Store);
+                  auto Range = State.computeMemoryRange(Store);
+                  AddMemRange(MD, Range);
+                }
+              }
+            } else if (auto *MemIntr = dyn_cast<AnyMemIntrinsic>(&I)) {
+              auto Range = State.computeMemoryRange(MemIntr);
+              AddMemRange(MD, Range);
+            }
+          }
+
           State.MemDefs.push_back(MD);
+        }
       }
     }
 
@@ -945,6 +1014,118 @@
     return State;
   }
 
+  Value *getMemIntrItemCount(AnyMemIntrinsic *MemIntr) {
+    if (auto *Length = MemIntr->getLength())
+      if (auto *I = dyn_cast<Instruction>(Length)) {
+        if (I->getOpcode() == Instruction::Mul) {
+          if (auto *Count = getCount(I))
+            return Count;
+        } else if (I->getOpcode() == Instruction::Shl) {
+          if (auto *Count = getCount(I))
+            return Count;
+        }
+      }
+    return nullptr;
+  }
+
+  Value *getCount(Instruction *I) {
+    Value *FirstOp = I->getOperand(0);
+    Value *SecondOp = I->getOperand(1);
+    if (Value *Count = getArgumentOperand(FirstOp, SecondOp))
+      return Count;
+    else if (Value *Count = getArgumentOperand(SecondOp, FirstOp))
+      return Count;
+    return nullptr;
+  }
+
+  Value *getArgumentOperand(Value *First, Value *Second) {
+    if (isa<ConstantInt>(First)) {
+      if (isa<Argument>(Second)) {
+        if (Second->getType()->isIntegerTy())
+          return Second;
+      } else if (auto *ZExt = dyn_cast<ZExtInst>(Second)) {
+        if (ZExt->getType()->isIntegerTy())
+          return ZExt->getOperand(0);
+      }
+    }
+    return nullptr;
+  }
+
+  // Computes the continuous range that a memory intrinsic fills.
+  //
+  // If it cannot compute a range, it returns an empty range.
+  ContinuousMemoryRange computeMemoryRange(AnyMemIntrinsic *MemIntr) {
+    const auto *Length = SE.getSCEV(MemIntr->getLength());
+    if (isa<SCEVCouldNotCompute>(Length))
+      return ContinuousMemoryRange::createEmpty();
+    auto *Start = SE.getSCEV(MemIntr->getDest());
+    return ContinuousMemoryRange(Start, Length);
+  }
+
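+  // Illustrative walk-through (annotation, not part of the original patch),
+  // using the memset_eq test added below: for
+  //   memset(%ptr, 0, (zext %n) << 2)
+  // the computed range is roughly (Start = SCEV of %ptr, Length = 4 * zext(%n)).
+  // For the i32 store to `getelementptr i32, ptr %ptr, i64 %iv` in the loop,
+  // the pointer AddRec is {%ptr,+,4}, the trip count is zext(%n), so its range
+  // is also (Start = %ptr, Length = 4 * zext(%n)). Because SCEV expressions are
+  // uniqued, the two ranges compare equal and isOverwrite can report
+  // OW_Complete for the memset.
+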
+  // Computes the continuous range that a store inside a loop writes to.
+  // It only handles loops with a canonical induction variable.
+  //
+  // If it cannot compute a range, it returns an empty range.
+  ContinuousMemoryRange computeMemoryRange(StoreInst *Store) {
+    auto *L = LI.getLoopFor(Store->getParent());
+    // It is better to handle loop-invariant stores via alias analysis.
+    if (!L || isGuaranteedLoopInvariant(Store->getPointerOperand()))
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *GEP = dyn_cast<GetElementPtrInst>(Store->getPointerOperand());
+    if (!GEP)
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *SourceType = GEP->getSourceElementType();
+    // TODO: Support types other than scalars.
+    if (SourceType->isVectorTy() || SourceType->isArrayTy())
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *GEPEvol = SE.getSCEV(GEP);
+    if (!isa<SCEVAddRecExpr>(GEPEvol))
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *StoreRec = cast<SCEVAddRecExpr>(GEPEvol);
+    if (!StoreRec)
+      return ContinuousMemoryRange::createEmpty();
+
+    if (!StoreRec->isAffine())
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *BTC = SE.getBackedgeTakenCount(L);
+    if (isa<SCEVCouldNotCompute>(BTC))
+      return ContinuousMemoryRange::createEmpty();
+
+    const auto *Start = StoreRec->getOperand(0);
+    if (isa<SCEVCouldNotCompute>(Start))
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *Count = SE.getAddExpr(BTC, SE.getOne(BTC->getType()));
+    if (isa<SCEVCouldNotCompute>(Count))
+      return ContinuousMemoryRange::createEmpty();
+
+    if (isa(Count))
+      return ContinuousMemoryRange::createEmpty();
+
+    auto SizeInBytes =
+        Store->getValueOperand()->getType()->getScalarSizeInBits() / 8;
+    auto *StoreSize = SE.getConstant(Count->getType(), SizeInBytes);
+
+    auto *Step = StoreRec->getOperand(1);
+    if (Step != StoreSize)
+      return ContinuousMemoryRange::createEmpty();
+
+    const auto *Length = SE.getMulExpr(Count, StoreSize);
+    if (isa<SCEVCouldNotCompute>(Length))
+      return ContinuousMemoryRange::createEmpty();
+    return ContinuousMemoryRange(Start, Length);
+  }
+
   /// Return 'OW_Complete' if a store to the 'Later' location (by \p LaterI
   /// instruction) completely overwrites a store to the 'Earlier' location.
   /// (by \p EarlierI instruction).
@@ -956,6 +1137,14 @@
   isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
               const MemoryLocation &Later, const MemoryLocation &Earlier,
               int64_t &EarlierOff, int64_t &LaterOff) {
+
+    if (EnableMemIntrinsicEliminationByStores)
+      if (auto *MemIntr = dyn_cast<AnyMemIntrinsic>(EarlierI)) {
+        if (isGuaranteedLoopInvariant(MemIntr->getDest()))
+          if (!isGuaranteedLoopInvariant(Later.Ptr))
+            if (isMemoryIntrCompletelyOverwritten(LaterI, EarlierI))
+              return OW_Complete;
+      }
     // AliasAnalysis does not always account for loops. Limit overwrite checks
     // to dependencies for which we can guarantee they are independant of any
     // loops they are in.
@@ -1305,6 +1494,7 @@
     if (!ContainsIrreducibleLoops && CurrentLI &&
         CurrentLI == LI.getLoopFor(KillingDef->getParent()))
       return true;
+
     // Otherwise check the memory location is invariant to any loops.
     return isGuaranteedLoopInvariant(CurrentLoc.Ptr);
   }
@@ -1447,7 +1637,8 @@
       // AliasAnalysis does not account for loops. Limit elimination to
       // candidates for which we can guarantee they always store to the same
       // memory location and not located in different loops.
-      if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
+      if (!EnableMemIntrinsicEliminationByStores &&
+          !isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
        LLVM_DEBUG(dbgs() << "  ... 
not guaranteed loop independent\n"); WalkerStepLimit -= 1; continue; @@ -1491,6 +1682,7 @@ MemoryAccess *EarlierAccess = Current; Instruction *EarlierMemInst = cast(EarlierAccess)->getMemoryInst(); + LLVM_DEBUG(dbgs() << " Checking for reads of " << *EarlierAccess << " (" << *EarlierMemInst << ")\n"); @@ -1618,6 +1810,12 @@ SmallPtrSet KillingBlocks; for (Instruction *KD : KillingDefs) KillingBlocks.insert(KD->getParent()); + + if (EnableMemIntrinsicEliminationByStores) { + tryToCollectBBWithNoMemIntrEffect(KillingBlocks, EarlierMemInst, + KillingI); + } + assert(!KillingBlocks.empty() && "Expected at least a single killing block"); @@ -1685,6 +1883,149 @@ return {EarlierAccess}; } + // Consider an example + // memset(P, 0, N * sizeof(int)); + // if (N > 0) + // { + // int i = 0; + // do { P[i] = 1; ... ++i; } while (i < N); + // } + // else { + // foo(); + // } + // + // If else-block is executed, then we know that N <= 0, then memset + // hasn't filled anything, and as it doesn't have any visible effect + // it is reasonable to think like this memory intrinsic is completely + // overwritten on this path + void tryToCollectBBWithNoMemIntrEffect( + SmallPtrSet &KillingBlocks, Instruction *EarlierMemInst, + Instruction *KillingI) { + auto *MemIntr = dyn_cast(EarlierMemInst); + if (!MemIntr) + return; + auto *Count = getMemIntrItemCount(MemIntr); + if (!Count) + return; + auto *Store = dyn_cast(KillingI); + if (!Store) + return; + if (isGuaranteedLoopInvariant(Store->getPointerOperand())) + return; + // TODO: Handle switch and other terminator instructions + // TODO: Adjust SmallSize + SmallPtrSet Visited; + SmallVector Worklist; + Worklist.push_back(EarlierMemInst->getParent()); + while (!Worklist.empty()) { + auto *Current = Worklist.pop_back_val(); + if (Visited.count(Current)) + continue; + Visited.insert(Current); + auto *Term = Current->getTerminator(); + if (isa(Term)) + continue; + + // Finding killing basic blocks which are executed when memory intrinsic + // does not do anything which implies that N <= 0 where N is byte length + // to be written + if (auto *Br = dyn_cast(Term)) { + if (Br->isUnconditional()) { + Worklist.push_back(Br->getSuccessor(0)); + } else if (auto *Cond = dyn_cast(Br->getCondition())) { + collectBranchKillingSuccessors(Br, Cond, Count, KillingBlocks, + Worklist); + } + } + } + } + + // It collects killing BBs where may be no MemoryDef, + // but they are killing because branch instruction introduces + // invariants that imply the fact + void + collectBranchKillingSuccessors(BranchInst *Br, ICmpInst *Cond, Value *Count, + SmallPtrSet &KillingBlocks, + SmallVector &Worklist) { + unsigned OpIdx; + if (auto *LHS = dyn_cast(Cond->getOperand(0))) { + if (!LHS->isZero()) + return; + OpIdx = 1; + } else if (auto *RHS = dyn_cast(Cond->getOperand(1))) { + if (!RHS->isZero()) + return; + OpIdx = 0; + } else { + return; + } + + auto Pred = Cond->getPredicate(); + unsigned MaybeKillingBBIdx; + unsigned BBToProcessIdx; + if (Pred == ICmpInst::Predicate::ICMP_EQ) { + MaybeKillingBBIdx = 0; + BBToProcessIdx = 1; + } else if (Pred == ICmpInst::Predicate::ICMP_ULT || + Pred == ICmpInst::Predicate::ICMP_SLT) { + if (OpIdx) { + MaybeKillingBBIdx = 1; + BBToProcessIdx = 0; + } else { + return; + } + } else if (Pred == ICmpInst::Predicate::ICMP_UGT || + Pred == ICmpInst::Predicate::ICMP_SGT) { + if (!OpIdx) { + MaybeKillingBBIdx = 1; + BBToProcessIdx = 0; + } else { + return; + } + } else if (Pred == ICmpInst::Predicate::ICMP_ULE || + Pred == ICmpInst::Predicate::ICMP_SLE) { + if 
(!OpIdx) { + MaybeKillingBBIdx = 0; + BBToProcessIdx = 1; + } else { + return; + } + } else if (Pred == ICmpInst::Predicate::ICMP_UGE || + Pred == ICmpInst::Predicate::ICMP_SGE) { + if (OpIdx) { + MaybeKillingBBIdx = 0; + BBToProcessIdx = 1; + } else { + return; + } + } else { + return; + } + + if (Cond->getOperand(OpIdx) == Count) + KillingBlocks.insert(Br->getSuccessor(MaybeKillingBBIdx)); + else + Worklist.push_back(Br->getSuccessor(MaybeKillingBBIdx)); + Worklist.push_back(Br->getSuccessor(BBToProcessIdx)); + } + + bool isMemoryIntrCompletelyOverwritten(const Instruction *KillingI, + const Instruction *KilledI) { + if (const auto *MemIntr = dyn_cast(KilledI)) { + if (isa(KillingI)) { + if (!MemRanges.count(KilledI)) + return false; + if (!MemRanges.count(KillingI)) + return false; + auto MemIntrRange = *MemRanges.find(KilledI); + auto StoreRange = *MemRanges.find(KillingI); + // Memory intrinsic and store in loop are not overlapping + return MemIntrRange.second == StoreRange.second; + } + } + return false; + } + // Delete dead memory defs void deleteDeadInstruction(Instruction *SI) { MemorySSAUpdater Updater(&MSSA); @@ -1925,11 +2266,12 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, + ScalarEvolution &SE, const TargetLibraryInfo &TLI, const LoopInfo &LI) { bool MadeChange = false; - DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI); + DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, SE, TLI, LI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { MemoryDef *KillingDef = State.MemDefs[I]; @@ -2103,8 +2445,8 @@ MemorySSA &MSSA = AM.getResult(F).getMSSA(); PostDominatorTree &PDT = AM.getResult(F); LoopInfo &LI = AM.getResult(F); - - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + ScalarEvolution &SE = AM.getResult(F); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2145,8 +2487,8 @@ PostDominatorTree &PDT = getAnalysis().getPostDomTree(); LoopInfo &LI = getAnalysis().getLoopInfo(); - - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + ScalarEvolution &SE = getAnalysis().getSE(); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2170,6 +2512,8 @@ AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } }; @@ -2187,6 +2531,7 @@ INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, false) Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -516,12 +516,12 @@ ; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: MemCpy Optimization ; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Dead Store Elimination ; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: LCSSA Verifier ; GCN-O2-NEXT: Loop-Closed SSA Form Pass ; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Lazy Branch Probability Analysis ; GCN-O2-NEXT: Lazy Block Frequency Analysis ; GCN-O2-NEXT: Loop Pass 
Manager @@ -875,12 +875,12 @@ ; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: MemCpy Optimization ; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Dead Store Elimination ; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: LCSSA Verifier ; GCN-O3-NEXT: Loop-Closed SSA Form Pass ; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Lazy Branch Probability Analysis ; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Loop Pass Manager Index: llvm/test/Transforms/DeadStoreElimination/loop-variant-store-and-dom-memory-intrinsic.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/loop-variant-store-and-dom-memory-intrinsic.ll @@ -0,0 +1,616 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s --force-opaque-pointers -passes=dse -S | FileCheck %s + +; These tests check if memory intrinsic is removed in %entry because %loop overwrites it +; and we account for invariants %n < 0 or %n == 0 (see function arguments in all tests) +; on some paths from %entry to %exit where it is possible + +; Check if memory intrinsic is removed when there is smax +; TODO: We can remove memset in this case because +; 1) %n == 0 then memset and store in %latch both do nothing +; 2) %n > 0, then memset is overwritten by store in %latch for sure +; 3) %n < 0, then it must be UB +define void @smax_and_memory_intrinsic(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @smax_and_memory_intrinsic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: [[MAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 0) +; CHECK-NEXT: [[MAX_CAST:%.*]] = zext i32 [[MAX]] to i64 +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[HEADER_COND:%.*]] = icmp eq i64 [[IV]], [[MAX_CAST]] +; CHECK-NEXT: br i1 [[HEADER_COND]], label [[EXIT:%.*]], label [[LATCH]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: latch: +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: br label [[HEADER]] +; +entry: + %n.cast = sext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %max = call i32 @llvm.smax.i32(i32 %n, i32 0) + %max.cast = zext i32 %max to i64 + br label %header + +header: ; preds = %latch, %entry + %iv = phi i64 [ %iv.1, %latch ], [ 0, %entry ] + %header.cond = icmp eq i64 %iv, %max.cast + br i1 %header.cond, label %exit, label %latch + +exit: ; preds = %header + ret void + +latch: ; preds = %header + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + br label %header +} + +; Check condition %n == 0 before %loop +define void @memset_eq(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_eq( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label 
[[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp eq i32 %n, 0 + br i1 %entry.cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check condition 0 == %n before %loop +define void @memset_eq_swapped(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_eq_swapped( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i32 0, [[N]] +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp eq i32 0, %n + br i1 %entry.cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check condition %n > 0 before loop for unsigned %n +define void @memset_ugt(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_ugt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp ugt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp ugt i32 %n, 0 + br i1 %entry.cond, label %loop, label %exit + +exit: 
; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check condition %n > 0 before loop for signed %n +define void @memset_sgt(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_sgt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp sgt i32 %n, 0 + br i1 %entry.cond, label %loop, label %exit + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check condition %n < 0 before loop for unsigned %n +define void @memset_ult(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_ult( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp ult i32 0, [[N]] +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp ult i32 0, %n + br i1 %entry.cond, label %loop, label %exit + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check condition %n < 0 before loop for signed %n +define void @memset_slt(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_slt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp slt i32 0, [[N]] +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: exit: +; 
CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp slt i32 0, %n + br i1 %entry.cond, label %loop, label %exit + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check condition %n <= 0 before loop for unsigned %n +define void @memset_ule(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_ule( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp ule i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp ule i32 %n, 0 + br i1 %entry.cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check condition %n <= 0 before loop for signed %n +define void @memset_sle(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_sle( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp sle i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp sle i32 %n, 0 + br i1 %entry.cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + 
ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check condition 0 >= %n before loop for unsigned %n +define void @memset_uge(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_uge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp uge i32 0, [[N]] +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp uge i32 0, %n + br i1 %entry.cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check condition 0 >= %n before loop for signed %n +define void @memset_sge(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memset_sge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp sge i32 0, [[N]] +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp sge i32 0, %n + br i1 %entry.cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check that memcpy in %entry is removed because store in %loop overwrites it +define void @memcpy_eq(ptr nocapture %dst, ptr nocapture readonly %src, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @memcpy_eq( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label 
[[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %dst.cast = bitcast ptr %dst to ptr + %src.cast = bitcast ptr %src to ptr + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memcpy.p0i8.p0i8.i64(ptr align 4 %dst.cast, ptr align 4 %src.cast, i64 %len, i1 false) + %entry.cond = icmp eq i32 %n, 0 + br i1 %entry.cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.1, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check if memory intrinsic in %entry is removed when store in %loop overwrites it completely, but +; the induction variable is decremented by moving from %n to 0 +; TODO: We don't remove memset in %entry now +define void @decremented_indvar_and_memory_intrinsic(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @decremented_indvar_and_memory_intrinsic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[IV_1:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV_1]] = add i32 [[IV]], -1 +; CHECK-NEXT: [[HEADER_COND:%.*]] = icmp sgt i32 [[IV_1]], -1 +; CHECK-NEXT: br i1 [[HEADER_COND]], label [[LATCH]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: latch: +; CHECK-NEXT: [[IV_1_CAST:%.*]] = zext i32 [[IV_1]] to i64 +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1_CAST]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: br label [[HEADER]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nuw nsw i64 %n.cast, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + br label %header + +header: ; preds = %latch, %entry + %iv = phi i32 [ %n, %entry ], [ %iv.1, %latch ] + %iv.1 = add i32 %iv, -1 + %header.cond = icmp sgt i32 %iv.1, -1 + br i1 %header.cond, label %latch, label %exit + +exit: ; preds = %header + ret void + +latch: ; preds = %header + %iv.1.cast = zext i32 %iv.1 to i64 + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv.1.cast + store i32 1, ptr %idx, align 4 + br label %header +} + +; Check if memory intrinsic in %entry is removed because its offset is different +; from what we have for store in %loop +; TODO: We don't account for different offset +define void @different_offsets(ptr nocapture %ptr.a, ptr nocapture %ptr.b, i32 %n) { +; CHECK-LABEL: @different_offsets( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PTR_A_OFFSET:%.*]] = getelementptr i32, ptr [[PTR_A:%.*]], i32 4 +; CHECK-NEXT: [[PTR_A_CAST:%.*]] = bitcast ptr [[PTR_A_OFFSET]] to ptr +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = 
shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[PTR_A_CAST]], i8 0, i64 [[N_CAST]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i64 [[N_CAST]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_2:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX_A:%.*]] = getelementptr inbounds i32, ptr [[PTR_A]], i64 [[IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[IDX_A]], align 4 +; CHECK-NEXT: [[IDX_B:%.*]] = getelementptr inbounds i32, ptr [[PTR_B:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 [[TMP]], ptr [[IDX_B]], align 4 +; CHECK-NEXT: store i32 1, ptr [[IDX_A]], align 4 +; CHECK-NEXT: [[IV_2]] = add nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_2]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %ptr.a.offset = getelementptr i32, ptr %ptr.a, i32 4 + %ptr.a.cast = bitcast ptr %ptr.a.offset to ptr + %n.cast = zext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + call void @llvm.memset.p0i8.i64(ptr %ptr.a.cast, i8 0, i64 %n.cast, i1 false) + %entry.cond = icmp eq i64 %n.cast, 0 + br i1 %entry.cond, label %exit, label %loop +exit: + ret void + +loop: + %iv = phi i64 [ %iv.2, %loop ], [ 0, %entry ] + %idx.a = getelementptr inbounds i32, ptr %ptr.a, i64 %iv + %tmp = load i32, ptr %idx.a + %idx.b = getelementptr inbounds i32, ptr %ptr.b, i64 %iv + store i32 %tmp, ptr %idx.b + store i32 1, ptr %idx.a + %iv.2 = add nuw nsw i64 %n.cast, 2 + %loop.cond = icmp eq i64 %iv.2, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; It checks if memory intrinsic is not removed when there is a path on which +; we don't store anything and on another one we have a store so this intrinsic is not redundant +; TODO: We handle only loops with one latch, but if one looked at e.g. 
SQLite, then they find out one-latch-loop only in ~30% of all cases +define void @on_overwrite_in_loop(ptr nocapture %ptr, i32 %n, i1 zeroext %x) { +; CHECK-LABEL: @on_overwrite_in_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_SEXT:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_SEXT]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: preheader: +; CHECK-NEXT: [[N_ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[IV_1:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[X:%.*]], label [[DUMMY_BLOCK:%.*]], label [[STORE_BLOCK:%.*]] +; CHECK: dummy.block: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[LATCH]] +; CHECK: store.block: +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LATCH_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_ZEXT]] +; CHECK-NEXT: br i1 [[LATCH_COND]], label [[EXIT]], label [[HEADER]] +; +entry: + %n.sext = sext i32 %n to i64 + %len = shl nsw i64 %n.sext, 2 + call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + %entry.cond = icmp sgt i32 %n, 0 + br i1 %entry.cond, label %preheader, label %exit + +preheader: ; preds = %entry + %n.zext = zext i32 %n to i64 + br label %header + +exit: ; preds = %latch, %entry + ret void + +header: ; preds = %preheader, %latch + %iv = phi i64 [ 0, %preheader ], [ %iv.1, %latch ] + br i1 %x, label %dummy.block, label %store.block + +dummy.block: ; preds = %header + call void @dummy() + br label %latch + +store.block: ; preds = %header + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + br label %latch + +latch: ; preds = %dummy.block, %store.block + %iv.1 = add nuw nsw i64 %iv, 1 + %latch.cond = icmp eq i64 %iv.1, %n.zext + br i1 %latch.cond, label %exit, label %header +} + +declare void @dummy() +declare i32 @llvm.smax.i32(i32, i32) +declare void @llvm.memset.p0i8.i64(ptr nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memcpy.p0i8.p0i8.i64(ptr, ptr, i64, i1) Index: llvm/test/Transforms/DeadStoreElimination/loop-variant-store-and-post-dom-memory-intrinsic.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/loop-variant-store-and-post-dom-memory-intrinsic.ll @@ -0,0 +1,116 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s --force-opaque-pointers -passes=dse -S | FileCheck %s + +; These tests check if store in %loop because memory intrinsic in %exit overwrites it + +; Check if store in %loop is removed because memory intrinsic in +; %exit overwrites it +; TODO: Store is not removed even if it is redundant +define void @post_dom_memset(ptr nocapture %ptr, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @post_dom_memset( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[PREHEADER:%.*]] +; CHECK: preheader: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label 
[[LOOP:%.*]] +; CHECK: preexit: +; CHECK-NEXT: [[PREEXIT_LEN:%.*]] = shl nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[LEN:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[PREEXIT_LEN]], [[PREEXIT:%.*]] ] +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR:%.*]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[IV_1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[PREEXIT]], label [[LOOP]] +; +entry: + %entry.cond = icmp eq i32 %n, 0 + br i1 %entry.cond, label %exit, label %preheader + +preheader: ; preds = %entry + %n.cast = zext i32 %n to i64 + br label %loop + +preexit: ; preds = %loop + %preexit.len = shl nuw nsw i64 %n.cast, 2 + br label %exit + +exit: ; preds = %preexit, %entry + %len = phi i64 [ 0, %entry ], [ %preexit.len, %preexit ] + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + ret void + +loop: ; preds = %preheader, %loop + %iv = phi i64 [ 0, %preheader ], [ %iv.1, %loop ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.cast + br i1 %loop.cond, label %preexit, label %loop +} + +; It checks that store is not removed because there is a path through %bypass to %exit with ret where +; it is not overwritten +define void @no_post_dom_memset(ptr nocapture %ptr, i32 %n, i32 %k) local_unnamed_addr #0 { +; CHECK-LABEL: @no_post_dom_memset( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[PREHEADER:%.*]], label [[BYPASS:%.*]] +; CHECK: preheader: +; CHECK-NEXT: [[N_ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: bypass: +; CHECK-NEXT: [[BYPASS_COND:%.*]] = icmp eq i32 [[K:%.*]], 0 +; CHECK-NEXT: br i1 [[BYPASS_COND]], label [[EXIT:%.*]], label [[PREEXIT:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[IV_1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_1]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_1]], [[N_ZEXT]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[BYPASS]], label [[LOOP]] +; CHECK: preexit: +; CHECK-NEXT: [[N_SEXT:%.*]] = sext i32 [[N]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_SEXT]], 2 +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 4 [[PTR]], i8 0, i64 [[LEN]], i1 false) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %entry.cond = icmp sgt i32 %n, 0 + br i1 %entry.cond, label %preheader, label %bypass + +preheader: ; preds = %entry + %n.zext = zext i32 %n to i64 + br label %loop + +bypass: ; preds = %loop, %entry + %bypass.cond = icmp eq i32 %k, 0 + br i1 %bypass.cond, label %exit, label %preexit + +loop: ; preds = %preheader, %loop + %iv = phi i64 [ 0, %preheader ], [ %iv.1, %loop ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx, align 4 + %iv.1 = add nuw nsw i64 %iv, 1 + %loop.cond = icmp eq i64 %iv.1, %n.zext + br i1 %loop.cond, label 
%bypass, label %loop + +preexit: ; preds = %bypass + %n.sext = sext i32 %n to i64 + %len = shl nsw i64 %n.sext, 2 + tail call void @llvm.memset.p0i8.i64(ptr align 4 %ptr, i8 0, i64 %len, i1 false) + br label %exit + +exit: ; preds = %preexit, %bypass + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) Index: llvm/test/Transforms/DeadStoreElimination/loop-variant-store-negative-tests.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/loop-variant-store-negative-tests.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s --force-opaque-pointers -passes=dse -S | FileCheck %s + +; These tests check if memory intrinsic is not removed +; + +; Check if memory intrinsic in %entry is not removed if there is a step more than one +; because store in %loop does not overwrite it completely leaving gaps after each step +define void @step_with_gaps(ptr nocapture %ptr, i32 %n) { +; CHECK-LABEL: @step_with_gaps( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[PTR:%.*]], i8 0, i64 [[N_CAST]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i64 [[N_CAST]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_2:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_2]] = add nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_2]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + call void @llvm.memset.p0i8.i64(ptr %ptr, i8 0, i64 %n.cast, i1 false) + %entry.cond = icmp eq i64 %n.cast, 0 + br i1 %entry.cond, label %exit, label %loop +exit: + ret void + +loop: + %iv = phi i64 [ %iv.2, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr, i64 %iv + store i32 1, ptr %idx + %iv.2 = add nuw nsw i64 %n.cast, 2 + %loop.cond = icmp eq i64 %iv.2, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +; Check if memory intrinsic in %entry is not removed if they write to different pointers with the store in %loop +; even if the size of underlying arrays are equals +define void @different_base_ptrs(ptr nocapture %ptr.a, i32* nocapture %ptr.b, i32 %n) { +; CHECK-LABEL: @different_base_ptrs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_CAST:%.*]] = zext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[LEN:%.*]] = shl nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[PTR_A:%.*]], i8 0, i64 [[N_CAST]], i1 false) +; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp eq i64 [[N_CAST]], 0 +; CHECK-NEXT: br i1 [[ENTRY_COND]], label [[EXIT:%.*]], label [[LOOP:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_2:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds i32, ptr [[PTR_B:%.*]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[IDX]], align 4 +; CHECK-NEXT: [[IV_2]] = add nuw nsw i64 [[N_CAST]], 2 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[IV_2]], [[N_CAST]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], 
label [[LOOP]] +; +entry: + %n.cast = zext i32 %n to i64 + %len = shl nsw i64 %n.cast, 2 + call void @llvm.memset.p0i8.i64(ptr %ptr.a, i8 0, i64 %n.cast, i1 false) + %entry.cond = icmp eq i64 %n.cast, 0 + br i1 %entry.cond, label %exit, label %loop +exit: + ret void + +loop: + %iv = phi i64 [ %iv.2, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, ptr %ptr.b, i64 %iv + store i32 1, ptr %idx + %iv.2 = add nuw nsw i64 %n.cast, 2 + %loop.cond = icmp eq i64 %iv.2, %n.cast + br i1 %loop.cond, label %exit, label %loop +} + +declare void @llvm.memset.p0i8.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
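
For orientation, the source-level pattern that the positive tests above model is roughly the following C (a sketch with illustrative names, not taken from the patch):

    #include <string.h>

    /* The memset writes n * sizeof(int) bytes starting at p, and the loop then
       stores to every one of those bytes, so the memset is fully overwritten
       and DSE may drop it. With a larger step (gaps between stores) or a
       different base pointer, the ranges differ and the memset must stay, as
       in the negative tests above. */
    void fill(int *p, unsigned n) {
      memset(p, 0, n * sizeof(int)); /* candidate for elimination */
      for (unsigned i = 0; i < n; ++i)
        p[i] = 1;
    }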