Index: llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -34,6 +34,7 @@
 class MemorySSA;
 class MemorySSAUpdater;
 class MemSetInst;
+class PostDominatorTree;
 class StoreInst;
 class TargetLibraryInfo;
 class Value;
@@ -43,6 +44,7 @@
   AAResults *AA = nullptr;
   AssumptionCache *AC = nullptr;
   DominatorTree *DT = nullptr;
+  PostDominatorTree *PDT = nullptr;
   MemorySSA *MSSA = nullptr;
   MemorySSAUpdater *MSSAU = nullptr;
 
@@ -53,7 +55,8 @@
   // Glue for the old PM.
   bool runImpl(Function &F, TargetLibraryInfo *TLI, AAResults *AA,
-               AssumptionCache *AC, DominatorTree *DT, MemorySSA *MSSA);
+               AssumptionCache *AC, DominatorTree *DT, PostDominatorTree *PDT,
+               MemorySSA *MSSA);
 
 private:
   // Helper functions
Index: llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -19,12 +19,14 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
@@ -96,7 +98,7 @@
   MaybeAlign Alignment;
 
   /// TheStores - The actual stores that make up this range.
-  SmallVector<Instruction*, 16> TheStores;
+  SmallVector<Instruction *, 16> TheStores;
 
   bool isProfitableToUseMemset(const DataLayout &DL) const;
 };
@@ -105,10 +107,12 @@
 
 bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
   // If we found more than 4 stores to merge or 16 bytes, use memset.
-  if (TheStores.size() >= 4 || End-Start >= 16) return true;
+  if (TheStores.size() >= 4 || End - Start >= 16)
+    return true;
 
   // If there is nothing to merge, don't do anything.
-  if (TheStores.size() < 2) return false;
+  if (TheStores.size() < 2)
+    return false;
 
   // If any of the stores are a memset, then it is always good to extend the
   // memset.
@@ -118,7 +122,8 @@
 
   // Assume that the code generator is capable of merging pairs of stores
   // together if it wants to.
-  if (TheStores.size() == 2) return false;
+  if (TheStores.size() == 2)
+    return false;
 
   // If we have fewer than 8 stores, it can still be worthwhile to do this.
   // For example, merging 4 i8 stores into an i32 store is useful almost always.
@@ -130,7 +135,7 @@
   // the maximum GPR width is the same size as the largest legal integer
   // size. If so, check to see whether we will end up actually reducing the
   // number of stores used.
-  unsigned Bytes = unsigned(End-Start);
+  unsigned Bytes = unsigned(End - Start);
   unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
   if (MaxIntSize == 0)
     MaxIntSize = 1;
@@ -142,7 +147,7 @@
   // If we will reduce the # stores (according to this heuristic), do the
   // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
   // etc.
-  return TheStores.size() > NumPointerStores+NumByteStores;
+  return TheStores.size() > NumPointerStores + NumByteStores;
 }
 
 namespace {
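Aside, not part of the patch: the profitability check above boils down to comparing the number of stores being merged against the number of stores CodeGen would likely emit anyway, namely one max-width integer store per MaxIntSize bytes plus byte stores for the remainder. A minimal standalone sketch of that arithmetic, with the DataLayout query replaced by a plain MaxIntSize parameter and the extend-an-existing-memset early-out omitted:

#include <cstdint>

// Sketch of MemsetRange::isProfitableToUseMemset; MaxIntSize stands in for
// DL.getLargestLegalIntTypeSizeInBits() / 8.
static bool profitableToUseMemset(unsigned NumStores, int64_t Start,
                                  int64_t End, unsigned MaxIntSize) {
  if (NumStores >= 4 || End - Start >= 16) // many stores or a wide range
    return true;
  if (NumStores < 2) // nothing to merge
    return false;
  if (NumStores == 2) // assume CodeGen merges a pair of stores by itself
    return false;
  unsigned Bytes = unsigned(End - Start);
  if (MaxIntSize == 0)
    MaxIntSize = 1;
  // Estimate the store count without a memset: as many max-width integer
  // stores as fit, plus one byte store per leftover byte.
  unsigned NumPointerStores = Bytes / MaxIntSize;
  unsigned NumByteStores = Bytes % MaxIntSize;
  return NumStores > NumPointerStores + NumByteStores;
}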
@@ -194,7 +199,7 @@
 /// existing ranges as appropriate.
 void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
                             MaybeAlign Alignment, Instruction *Inst) {
-  int64_t End = Start+Size;
+  int64_t End = Start + Size;
 
   range_iterator I = partition_point(
       Ranges, [=](const MemsetRange &O) { return O.End < Start; });
 
@@ -204,10 +209,10 @@
   // to insert a new range. Handle this now.
   if (I == Ranges.end() || End < I->Start) {
     MemsetRange &R = *Ranges.insert(I, MemsetRange());
-    R.Start        = Start;
-    R.End          = End;
-    R.StartPtr     = Ptr;
-    R.Alignment    = Alignment;
+    R.Start = Start;
+    R.End = End;
+    R.StartPtr = Ptr;
+    R.Alignment = Alignment;
     R.TheStores.push_back(Inst);
     return;
   }
@@ -383,7 +388,8 @@
 
     if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
       // If this is a store, see if we can merge it in.
-      if (!NextStore->isSimple()) break;
+      if (!NextStore->isSimple())
+        break;
 
       Value *StoredVal = NextStore->getValueOperand();
 
@@ -446,7 +452,8 @@
   // emit memset's for anything big enough to be worthwhile.
   Instruction *AMemSet = nullptr;
   for (const MemsetRange &Range : Ranges) {
-    if (Range.TheStores.size() == 1) continue;
+    if (Range.TheStores.size() == 1)
+      continue;
 
     // If it is profitable to lower this range to memset, do so now.
     if (!Range.isProfitableToUseMemset(DL))
@@ -467,12 +474,10 @@
     if (!Range.TheStores.empty())
       AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
 
-    auto *NewDef =
-        cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI
-                            ? MSSAU->createMemoryAccessBefore(
-                                  AMemSet, nullptr, MemInsertPoint)
-                            : MSSAU->createMemoryAccessAfter(
-                                  AMemSet, nullptr, MemInsertPoint));
+    auto *NewDef = cast<MemoryDef>(
+        MemInsertPoint->getMemoryInst() == &*BI
+            ? MSSAU->createMemoryAccessBefore(AMemSet, nullptr, MemInsertPoint)
+            : MSSAU->createMemoryAccessAfter(AMemSet, nullptr, MemInsertPoint));
     MSSAU->insertDef(NewDef, /*RenameUses=*/true);
     MemInsertPoint = NewDef;
 
@@ -498,12 +503,13 @@
 
   // Keep track of the arguments of all instruction we plan to lift
   // so we can make sure to lift them as well if appropriate.
-  DenseSet<Instruction*> Args;
+  DenseSet<Instruction *> Args;
   auto AddArg = [&](Value *Arg) {
     auto *I = dyn_cast<Instruction>(Arg);
     if (I && I->getParent() == SI->getParent()) {
       // Cannot hoist user of P above P
-      if (I == P) return false;
+      if (I == P)
+        return false;
       Args.insert(I);
     }
     return true;
   };
@@ -616,8 +622,7 @@
 bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
                                        const DataLayout &DL,
                                        BasicBlock::iterator &BBI) {
-  if (!LI->isSimple() || !LI->hasOneUse() ||
-      LI->getParent() != SI->getParent())
+  if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent())
     return false;
 
   auto *T = LI->getType();
@@ -668,17 +673,16 @@
       IRBuilder<> Builder(P);
       Instruction *M;
       if (UseMemMove)
-        M = Builder.CreateMemMove(
-            SI->getPointerOperand(), SI->getAlign(),
-            LI->getPointerOperand(), LI->getAlign(), Size);
+        M = Builder.CreateMemMove(SI->getPointerOperand(), SI->getAlign(),
+                                  LI->getPointerOperand(), LI->getAlign(),
+                                  Size);
       else
-        M = Builder.CreateMemCpy(
-            SI->getPointerOperand(), SI->getAlign(),
-            LI->getPointerOperand(), LI->getAlign(), Size);
+        M = Builder.CreateMemCpy(SI->getPointerOperand(), SI->getAlign(),
+                                 LI->getPointerOperand(), LI->getAlign(), Size);
       M->copyMetadata(*SI, LLVMContext::MD_DIAssignID);
 
-      LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
-                        << *M << "\n");
+      LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " << *M
+                        << "\n");
 
       auto *LastDef =
           cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
@@ -741,7 +745,8 @@
 }
 
 bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
-  if (!SI->isSimple()) return false;
+  if (!SI->isSimple())
+    return false;
 
   // Avoid merging nontemporal stores since the resulting
   // memcpy/memset would not be able to preserve the nontemporal hint.
@@ -780,8 +785,8 @@
     // 0xA0A0A0A0 and 0.0.
     auto *V = SI->getOperand(0);
     if (Value *ByteVal = isBytewiseValue(V, DL)) {
-      if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
-                                                ByteVal)) {
+      if (Instruction *I =
+              tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) {
         BBI = I->getIterator(); // Don't invalidate iterator.
         return true;
       }
@@ -802,8 +807,7 @@
       // The newly inserted memset is immediately overwritten by the original
       // store, so we do not need to rename uses.
       auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
-      auto *NewAccess = MSSAU->createMemoryAccessBefore(
-          M, nullptr, StoreDef);
+      auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef);
       MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false);
       eraseInstruction(SI);
 
@@ -822,8 +826,8 @@
   // See if there is another memset or store neighboring this memset which
   // allows us to widen out the memset to do a single larger store.
   if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
-    if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
-                                              MSI->getValue())) {
+    if (Instruction *I =
+            tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) {
       BBI = I->getIterator(); // Don't invalidate iterator.
      return true;
    }
 
@@ -836,7 +840,8 @@
 bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
                                          Instruction *cpyStore, Value *cpyDest,
                                          Value *cpySrc, TypeSize cpySize,
-                                         Align cpyDestAlign, BatchAAResults &BAA,
+                                         Align cpyDestAlign,
+                                         BatchAAResults &BAA,
                                          std::function<CallInst *()> GetC) {
   // The general transformation to keep in mind is
   //
@@ -881,15 +886,15 @@
   if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
     return false;
 
-
   if (C->getParent() != cpyStore->getParent()) {
     LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n");
     return false;
   }
 
-  MemoryLocation DestLoc = isa<StoreInst>(cpyStore) ?
-    MemoryLocation::get(cpyStore) :
-    MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore));
+  MemoryLocation DestLoc =
+      isa<StoreInst>(cpyStore)
+          ? MemoryLocation::get(cpyStore)
+          : MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore));
 
   // Check that nothing touches the dest of the copy between
   // the call and the store/memcpy.
@@ -1066,9 +1071,11 @@
   bool changedArgument = false;
   for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
     if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
-      Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
-        : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
-                                      cpyDest->getName(), C);
+      Value *Dest =
+          cpySrc->getType() == cpyDest->getType()
+              ? cpyDest
+              : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+                                            cpyDest->getName(), C);
       changedArgument = true;
       if (C->getArgOperand(ArgI)->getType() == Dest->getType())
         C->setArgOperand(ArgI, Dest);
 
@@ -1167,7 +1174,8 @@
 
   // If all checks passed, then we can transform M.
   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
-                    << *MDep << '\n' << *M << '\n');
+                    << *MDep << '\n'
+                    << *M << '\n');
 
   // TODO: Is this worth it if we're creating a less aligned memcpy? For
   // example we could be moving from movaps -> movq on x86.
@@ -1303,8 +1311,8 @@
   // memcpy's defining access is the memset about to be removed.
   auto *LastDef =
       cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
-  auto *NewAccess = MSSAU->createMemoryAccessBefore(
-      NewMemSet, nullptr, LastDef);
+  auto *NewAccess =
+      MSSAU->createMemoryAccessBefore(NewMemSet, nullptr, LastDef);
   MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
 
   eraseInstruction(MemSet);
 
@@ -1347,6 +1355,24 @@
   return false;
 }
 
+/// Find the nearest instruction I that post-dominates both I1 and I2.
+static Instruction *findNearestCommonPostDominator(Instruction *I1,
+                                                   Instruction *I2,
+                                                   PostDominatorTree *PDT) {
+  BasicBlock *BB1 = I1->getParent();
+  BasicBlock *BB2 = I2->getParent();
+  if (BB1 == BB2)
+    return I1->comesBefore(I2) ? I2 : I1;
+  BasicBlock *PDomBB = PDT->findNearestCommonDominator(BB1, BB2);
+  if (!PDomBB)
+    return nullptr;
+  if (BB2 == PDomBB)
+    return I2;
+  if (BB1 == PDomBB)
+    return I1;
+  return PDomBB->getFirstNonPHI();
+}
+
 /// Transform memcpy to memset when its source was just memset.
 /// In other words, turn:
 /// \code
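Aside, not part of the patch: the block-level work in the new helper is done by PostDominatorTree::findNearestCommonDominator, which on a post-dominator tree returns the nearest common post-dominator; the helper only adds the same-block case, the nullptr case (no common post-dominator), and the refinement to an instruction via getFirstNonPHI. A standalone sketch checking the diamond-CFG case (the function and block names here are made up for illustration):

#include "llvm/Analysis/PostDominators.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
using namespace llvm;

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  // A diamond: bb0 and bb1 both fall through to bb2.
  std::unique_ptr<Module> M = parseAssemblyString(R"IR(
    define void @f(i1 %c) {
    entry:
      br i1 %c, label %bb0, label %bb1
    bb0:
      br label %bb2
    bb1:
      br label %bb2
    bb2:
      ret void
    }
  )IR", Err, Ctx);

  Function &F = *M->getFunction("f");
  PostDominatorTree PDT(F);
  BasicBlock *BB0 = nullptr, *BB1 = nullptr;
  for (BasicBlock &BB : F) {
    if (BB.getName() == "bb0")
      BB0 = &BB;
    if (BB.getName() == "bb1")
      BB1 = &BB;
  }
  // Prints "bb2": the join block post-dominates both branch arms.
  outs() << PDT.findNearestCommonDominator(BB0, BB1)->getName() << "\n";
  return 0;
}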
@@ -1380,7 +1406,7 @@
     return false;
 
   // A known memcpy size is also required.
-  auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
+  auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
   if (!CCopySize)
     return false;
   if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
@@ -1457,17 +1483,16 @@
 
   // 2-1. Check that src and dest are static allocas, which are not affected by
   // stacksave/stackrestore.
-  if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca() ||
-      SrcAlloca->getParent() != Load->getParent() ||
-      SrcAlloca->getParent() != Store->getParent())
+  if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca())
     return false;
 
-  // 2-2. Check that src and dest are never captured, unescaped allocas. Also
-  // collect lifetime markers first/last users in order to shrink wrap the
-  // lifetimes, and instructions with noalias metadata to remove them.
+  // 2-2. Check that src and dest are never captured, unescaped allocas. Also
+  // find the nearest common dominator and postdominator for all users in
+  // order to shrink wrap the lifetimes, and instructions with noalias metadata
+  // to remove them.
   SmallVector<Instruction *, 4> LifetimeMarkers;
-  Instruction *FirstUser = nullptr, *LastUser = nullptr;
+  Instruction *Dom = nullptr, *PDom = nullptr;
   SmallSet<Instruction *, 4> NoAliasInstrs;
 
   // Recursively track the user and check whether modified alias exist.
@@ -1505,12 +1530,13 @@
       continue;
     case UseCaptureKind::NO_CAPTURE: {
       auto *UI = cast<Instruction>(U.getUser());
-      if (DestAlloca->getParent() != UI->getParent())
-        return false;
-      if (!FirstUser || UI->comesBefore(FirstUser))
-        FirstUser = UI;
-      if (!LastUser || LastUser->comesBefore(UI))
-        LastUser = UI;
+      if (!Dom)
+        Dom = PDom = UI;
+      else {
+        Dom = DT->findNearestCommonDominator(Dom, UI);
+        if (PDom != nullptr)
+          PDom = findNearestCommonPostDominator(PDom, UI, PDT);
+      }
       if (UI->isLifetimeStartOrEnd()) {
         // We note the locations of these intrinsic calls so that we can
        // delete them later if the optimization succeeds, this is safe
@@ -1534,27 +1560,55 @@
     return true;
   };
 
+  // TODO: update comment
   // 3. Check that dest has no Mod/Ref, except full size lifetime intrinsics,
-  // from the alloca to the Store.
+  // from the alloca to the Store. Also collect the ModRef instructions for
+  // the reachability check.
   ModRefInfo DestModRef = ModRefInfo::NoModRef;
   MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
+  SmallVector<BasicBlock *, 8> ReachabilityWorklist;
   auto DestModRefCallback = [&](Instruction *UI) -> bool {
     // We don't care about the store itself.
     if (UI == Store)
       return true;
     ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
-    // FIXME: For multi-BB cases, we need to see reachability from it to
-    // store.
-    // Bailout if Dest may have any ModRef before Store.
-    if (UI->comesBefore(Store) && isModOrRefSet(Res))
-      return false;
-    DestModRef |= BAA.getModRefInfo(UI, DestLoc);
+    DestModRef |= Res;
+    if (isModOrRefSet(Res)) {
+      // Instruction reachability checks via CFG analysis.
+      // FIXME: if it would help many cases, add an instruction-level
+      // isPotentiallyReachableFromMany to the CFG analysis.
+      if (UI->getParent() == Store->getParent()) {
+        // The same-block case is special because it's the only time we're
+        // looking within a single block to see which instruction comes first.
+        // Once we start looking at multiple blocks, the first instruction of
+        // the block is reachable, so we only need to determine reachability
+        // between whole blocks.
+        BasicBlock *BB = UI->getParent();
+
+        // If A comes before B, then B is definitively reachable from A.
+        if (UI->comesBefore(Store))
+          return false;
+
+        // If the user's parent block is entry, no predecessor exists.
+        if (BB->isEntryBlock())
+          return true;
+
+        // Otherwise, continue doing the normal per-BB CFG walk.
+        ReachabilityWorklist.append(succ_begin(BB), succ_end(BB));
+      } else {
+        ReachabilityWorklist.push_back(UI->getParent());
+      }
+    }
     return true;
   };
 
   if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback))
     return false;
 
+  // Bail out if Dest may have any ModRef before Store.
+  if (!ReachabilityWorklist.empty() &&
+      isPotentiallyReachableFromMany(ReachabilityWorklist, Store->getParent(),
+                                     nullptr, DT, nullptr))
+    return false;
+
   // 3. Check that, from after the Load to the end of the BB,
   // 3-1. if the dest has any Mod, src has no Ref, and
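Aside, not part of the patch: the new bail-out is deliberately coarse. Every instruction that may read or write the dest and is not already known to execute before the Store seeds a worklist of blocks, and the transformation is rejected if any seeded block can reach the Store's block. A toy version of that decision over a plain adjacency list instead of LLVM's CFG, for intuition only:

#include <queue>
#include <set>
#include <vector>

// Returns true if any block in Worklist can reach StoreBB by following
// successor edges, i.e. the case where the transformation must bail out.
static bool anyCanReach(const std::vector<std::vector<int>> &Succs,
                        const std::vector<int> &Worklist, int StoreBB) {
  std::set<int> Seen(Worklist.begin(), Worklist.end());
  std::queue<int> Q;
  for (int B : Worklist)
    Q.push(B);
  while (!Q.empty()) {
    int B = Q.front();
    Q.pop();
    if (B == StoreBB)
      return true; // a ModRef block reaches the store
    for (int S : Succs[B])
      if (Seen.insert(S).second) // enqueue each successor once
        Q.push(S);
  }
  return false; // no ModRef instruction can reach the store
}

The real isPotentiallyReachableFromMany additionally uses the dominator tree and loop info to cut the walk short, but the answer it computes is the same kind of conservative block-level reachability.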
@@ -1562,9 +1616,9 @@
   MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size));
 
   auto SrcModRefCallback = [&](Instruction *UI) -> bool {
-    // Any ModRef before Load doesn't matter, also Load and Store can be
-    // ignored.
-    if (UI->comesBefore(Load) || UI == Load || UI == Store)
+    // Any ModRef post-dominated by Load doesn't matter, also Load and Store
+    // themselves can be ignored.
+    if (PDT->dominates(Load, UI) || UI == Load || UI == Store)
       return true;
     ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc);
     if ((isModSet(DestModRef) && isRefSet(Res)) ||
@@ -1596,21 +1650,37 @@
   ConstantInt *AllocaSize = ConstantInt::get(Type::getInt64Ty(C), Size);
   // Create a new lifetime start marker before the first user of src or alloca
   // users.
-  Builder.SetInsertPoint(FirstUser->getParent(), FirstUser->getIterator());
+  Builder.SetInsertPoint(Dom->getParent(), Dom->getIterator());
   auto *Start = Builder.CreateLifetimeStart(SrcAlloca, AllocaSize);
-  auto *FirstMA = MSSA->getMemoryAccess(FirstUser);
+  auto *FirstMA = MSSA->getMemoryAccess(Dom);
   auto *StartMA = MSSAU->createMemoryAccessBefore(Start, nullptr, FirstMA);
   MSSAU->insertDef(cast<MemoryDef>(StartMA), /*RenameUses=*/true);
 
   // Create a new lifetime end marker after the last user of src or alloca
-  // users.
-  // FIXME: If the last user is the terminator for the bb, we can insert
-  // lifetime.end marker to the immidiate post-dominator, but currently do
-  // nothing.
-  if (!LastUser->isTerminator()) {
-    Builder.SetInsertPoint(LastUser->getParent(), ++LastUser->getIterator());
+  // users. If there's no such postdominator, just don't bother; we could
+  // create one at each exit block, but that'd be essentially semantically
+  // meaningless. If the last user is the terminator for the bb, we can insert
+  // the lifetime.end marker at the immediate post-dominator. We need to
+  // create a MemoryAccess to keep MemorySSA consistent.
+  // FIXME: Currently, we only insert lifetime.end if the post-dominator is a
+  // user of SrcAlloca, but even when it's not, we can probably insert a
+  // MemoryPhi.
+  Instruction *PDomUser =
+      (PDom && isModOrRefSet(AA->getModRefInfo(PDom, SrcLoc))) ? PDom
+                                                               : nullptr;
+  if (PDom && PDom->isTerminator() && PDomUser) {
+    auto *IPDomNode = (*PDT)[PDom->getParent()]->getIDom();
+    auto *IPDomBB = IPDomNode ? IPDomNode->getBlock() : nullptr;
+    PDom = IPDomBB ? IPDomBB->getFirstNonPHI() : nullptr;
+  }
+
+  if (PDom && PDomUser) {
+    auto InsertionPt = PDom->getIterator();
+    if (isModOrRefSet(AA->getModRefInfo(PDom, SrcLoc)))
+      ++InsertionPt;
+    Builder.SetInsertPoint(PDom->getParent(), InsertionPt);
     auto *End = Builder.CreateLifetimeEnd(SrcAlloca, AllocaSize);
-    auto *LastMA = MSSA->getMemoryAccess(LastUser);
+    auto *LastMA = MSSA->getMemoryAccess(PDomUser);
     auto *EndMA = MSSAU->createMemoryAccessAfter(End, nullptr, LastMA);
     MSSAU->insertDef(cast<MemoryDef>(EndMA), /*RenameUses=*/true);
   }
@@ -1639,7 +1709,8 @@
 /// altogether.
 bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
   // We can only optimize non-volatile memcpy's.
-  if (M->isVolatile()) return false;
+  if (M->isVolatile())
+    return false;
 
   // If the source and destination of the memcpy are the same, then zap it.
   if (M->getSource() == M->getDest()) {
@@ -1769,11 +1840,10 @@
                     << "\n");
 
   // If not, then we know we can transform this.
-  Type *ArgTys[3] = { M->getRawDest()->getType(),
-                      M->getRawSource()->getType(),
-                      M->getLength()->getType() };
-  M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
-                                                 Intrinsic::memcpy, ArgTys));
+  Type *ArgTys[3] = {M->getRawDest()->getType(), M->getRawSource()->getType(),
+                     M->getLength()->getType()};
+  M->setCalledFunction(
+      Intrinsic::getDeclaration(M->getModule(), Intrinsic::memcpy, ArgTys));
 
   // For MemorySSA nothing really changes (except that memcpy may imply stricter
   // aliasing guarantees).
 
@@ -1816,7 +1886,8 @@
   // Get the alignment of the byval. If the call doesn't specify the alignment,
   // then it is some target specific value that we can't know.
   MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
-  if (!ByValAlign) return false;
+  if (!ByValAlign)
+    return false;
 
   // If it is greater than the memcpy, then we check to see if we can force the
   // source of the memcpy to the alignment we need. If we fail, we bail out.
@@ -1960,7 +2031,7 @@
       continue;
 
    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
-        // Avoid invalidating the iterator.
+      // Avoid invalidating the iterator.
       Instruction *I = &*BI++;
 
      bool RepeatInstruction = false;
 
@@ -1999,9 +2070,10 @@
   auto *AA = &AM.getResult<AAManager>(F);
   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
   auto *MSSA = &AM.getResult<MemorySSAAnalysis>(F);
 
-  bool MadeChange = runImpl(F, &TLI, AA, AC, DT, &MSSA->getMSSA());
+  bool MadeChange = runImpl(F, &TLI, AA, AC, DT, PDT, &MSSA->getMSSA());
   if (!MadeChange)
     return PreservedAnalyses::all();
 
@@ -2013,12 +2085,14 @@
 
 bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
                             AliasAnalysis *AA_, AssumptionCache *AC_,
-                            DominatorTree *DT_, MemorySSA *MSSA_) {
+                            DominatorTree *DT_, PostDominatorTree *PDT_,
+                            MemorySSA *MSSA_) {
   bool MadeChange = false;
   TLI = TLI_;
   AA = AA_;
   AC = AC_;
   DT = DT_;
+  PDT = PDT_;
   MSSA = MSSA_;
   MemorySSAUpdater MSSAU_(MSSA_);
   MSSAU = &MSSAU_;
Index: llvm/test/Other/new-pm-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-defaults.ll
+++ llvm/test/Other/new-pm-defaults.ll
@@ -190,6 +190,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
 ; CHECK-O1-NEXT: Running pass: MemCpyOptPass
+; CHECK-O1-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O-NEXT: Running pass: SCCPPass
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
@@ -201,7 +202,7 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O1-NEXT: Running pass: CoroElidePass
 ; CHECK-O-NEXT: Running pass: ADCEPass
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass
 ; CHECK-O23SZ-NEXT: Running pass: DSEPass
 ; CHECK-O23SZ-NEXT: Running pass: MoveAutoInitPass on foo
Index: llvm/test/Other/new-pm-lto-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-lto-defaults.ll
+++ llvm/test/Other/new-pm-lto-defaults.ll
@@ -103,8 +103,8 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass on foo
-; CHECK-O23SZ-NEXT: Running pass: DSEPass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
+; CHECK-O23SZ-NEXT: Running pass: DSEPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: MoveAutoInitPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass on foo
Index: llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -125,6 +125,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
 ; CHECK-O1-NEXT: Running pass: MemCpyOptPass
+; CHECK-O1-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O-NEXT: Running pass: SCCPPass
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
@@ -135,7 +136,7 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O1-NEXT: Running pass: CoroElidePass
 ; CHECK-O-NEXT: Running pass: ADCEPass
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass
 ; CHECK-O23SZ-NEXT: Running pass: DSEPass
 ; CHECK-O23SZ-NEXT: Running pass: MoveAutoInitPass on foo
Index: llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -157,6 +157,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
 ; CHECK-O1-NEXT: Running pass: MemCpyOptPass
+; CHECK-O1-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O-NEXT: Running pass: SCCPPass
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
@@ -167,7 +168,7 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O1-NEXT: Running pass: CoroElidePass
 ; CHECK-O-NEXT: Running pass: ADCEPass
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass
 ; CHECK-O23SZ-NEXT: Running pass: DSEPass
 ; CHECK-O23SZ-NEXT: Running pass: MoveAutoInitPass
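Aside, not part of the patch: the pipeline-test churn above is only the analysis moving earlier in the run. Requesting a result is what makes the pass manager's debug log print "Running analysis: PostDominatorTreeAnalysis", so once MemCpyOptPass asks for it at O1, later consumers such as ADCEPass find it cached. A hypothetical minimal pass showing the request (PDTUserPass is a made-up name):

#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

// Illustration only: requesting the analysis triggers its computation (and
// the corresponding "Running analysis:" line) unless it is already cached.
struct PDTUserPass : PassInfoMixin<PDTUserPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
    (void)PDT; // ... query PDT here ...
    return PreservedAnalyses::all();
  }
};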
Index: llvm/test/Transforms/MemCpyOpt/stack-move.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -406,24 +406,19 @@
   ret void
 }
 
-; TODO: merge allocas for bb-separated, but logically straight
 define void @multi_bb_memcpy(i1 %b) {
 ; CHECK-LABEL: define void @multi_bb_memcpy
 ; CHECK-SAME: (i1 [[B:%.*]]) {
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[DEST:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SRC]])
 ; CHECK-NEXT:    store i32 42, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture [[SRC]])
 ; CHECK-NEXT:    br label [[BB0:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 4, i1 false)
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[DEST]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SRC]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca i32, align 4
@@ -445,23 +440,17 @@
   ret void
 }
 
-; TODO: Merge alloca
 define void @multi_bb_load_store(i1 %b) {
 ; CHECK-LABEL: define void @multi_bb_load_store
 ; CHECK-SAME: (i1 [[B:%.*]]) {
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[DEST:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SRC]])
 ; CHECK-NEXT:    store i32 42, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture [[SRC]])
-; CHECK-NEXT:    [[SRC_VAL:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT:    store i32 [[SRC_VAL]], ptr [[DEST]], align 4
 ; CHECK-NEXT:    br label [[BB0:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[DEST]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SRC]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca i32, align 4
@@ -525,28 +514,22 @@
   ret void
 }
 
-; TODO: merge allocas for multi basicblocks, s.t. all copy-dominated
-; uses are satisfy the condition.
 define void @multi_bb_simple_br(i1 %b) {
 ; CHECK-LABEL: define void @multi_bb_simple_br
 ; CHECK-SAME: (i1 [[B:%.*]]) {
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
-; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]])
 ; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
 ; CHECK-NEXT:    br i1 [[B]], label [[BB0:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4
@@ -572,21 +555,17 @@
   ret void
 }
 
-; TODO: merge allocas for multi basicblock loop case.
 define void @multi_bb_loop(i32 %n) {
 ; CHECK-LABEL: define void @multi_bb_loop
 ; CHECK-SAME: (i32 [[N:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[NLT1:%.*]] = icmp slt i32 [[N]], 1
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 8
-; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 8
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]])
 ; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 0, i32 1, i32 42 }, ptr [[SRC]], align 4
 ; CHECK-NEXT:    br i1 [[NLT1]], label [[LOOP_EXIT:%.*]], label [[LOOP_BODY:%.*]]
 ; CHECK:       loop_body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[NEW_I:%.*]], [[LOOP_BODY]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DEST]], ptr align 8 [[SRC]], i64 12, i1 false)
 ; CHECK-NEXT:    [[NEW_I]] = add i32 [[I]], 1
 ; CHECK-NEXT:    store i32 [[NEW_I]], ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[IGTN:%.*]] = icmp sgt i32 [[NEW_I]], [[N]]
@@ -615,14 +594,11 @@
   ret void
 }
 
-; TODO: merge allocas for multi basicblocks, s.t. some modref which is unreachable from copy exists.
 define void @multi_bb_unreachable_modref(i1 %b0) {
 ; CHECK-LABEL: define void @multi_bb_unreachable_modref
 ; CHECK-SAME: (i1 [[B0:%.*]]) {
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
-; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]])
 ; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    br i1 [[B0]], label [[BB0:%.*]], label [[EXIT:%.*]]
@@ -630,9 +606,6 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb0:
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4
@@ -654,26 +627,21 @@
   ret void
 }
 
-; TODO: merge allocas for multi basicblocks, s.t. memcpy doesn't dominate the uses.
 define void @multi_bb_non_dominated(i1 %b0, i1 %b1) {
 ; CHECK-LABEL: define void @multi_bb_non_dominated
 ; CHECK-SAME: (i1 [[B0:%.*]], i1 [[B1:%.*]]) {
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
-; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]])
 ; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    br i1 [[B0]], label [[BB0:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4
@@ -690,11 +658,9 @@
 
 bb1:
   %2 = call i32 @use_nocapture(ptr noundef nocapture %src)
-  ; %3 = call i32 @use_writeonly(ptr noundef nocapture %dest)
   br label %bb2
 
 bb2:
-  ; %4 = call i32 @use_nocapture(ptr noundef nocapture %src)
   call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src)
   call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest)
   ret void
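Aside, not part of the patch: the CHECK lines in stack-move.ll are of the kind generated by llvm/utils/update_test_checks.py rather than written by hand, so after changing the pass they are regenerated instead of edited manually. The new behavior can also be reproduced in isolation with something like opt -passes=memcpyopt -S llvm/test/Transforms/MemCpyOpt/stack-move.ll (the opt binary path depends on your build tree).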