diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h
--- a/llvm/include/llvm/Analysis/CaptureTracking.h
+++ b/llvm/include/llvm/Analysis/CaptureTracking.h
@@ -96,6 +96,13 @@
   /// capturing instructions that will not be passed into captured().
   virtual void tooManyUses() = 0;
+  /// visitUse - We found a use of a value derived from the pointer. This is
+  /// called after shouldExplore(). Return true to stop the traversal or
+  /// false to continue looking for more uses.
+  ///
+  /// U->getUser() is always an Instruction.
+  virtual bool visitUse(const Use *U);
+
   /// shouldExplore - This is the use of a value derived from the pointer.
   /// To prune the search (ie., assume that none of its users could possibly
   /// capture) return false. To search it, return true.
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -20,6 +20,7 @@
 namespace llvm {
 class AAResults;
+class AllocaInst;
 class BatchAAResults;
 class AssumptionCache;
 class CallBase;
@@ -33,6 +34,7 @@
 class MemorySSA;
 class MemorySSAUpdater;
 class MemSetInst;
+class PostDominatorTree;
 class StoreInst;
 class TargetLibraryInfo;
 class Value;
@@ -42,6 +44,7 @@
   AAResults *AA = nullptr;
   AssumptionCache *AC = nullptr;
   DominatorTree *DT = nullptr;
+  PostDominatorTree *PDT = nullptr;
   MemorySSA *MSSA = nullptr;
   MemorySSAUpdater *MSSAU = nullptr;
@@ -52,7 +55,8 @@
   // Glue for the old PM.
   bool runImpl(Function &F, TargetLibraryInfo *TLI, AAResults *AA,
-               AssumptionCache *AC, DominatorTree *DT, MemorySSA *MSSA);
+               AssumptionCache *AC, DominatorTree *DT, PostDominatorTree *PDT,
+               MemorySSA *MSSA);
 private:
   // Helper functions
@@ -74,6 +78,9 @@
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
   bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
+  bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                             AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
+                             uint64_t Size);
   void eraseInstruction(Instruction *I);
   bool iterateOnFunction(Function &F);
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -55,6 +55,8 @@
 CaptureTracker::~CaptureTracker() = default;
+bool CaptureTracker::visitUse(const Use *U) { return false; }
+
 bool CaptureTracker::shouldExplore(const Use *U) { return true; }
 bool CaptureTracker::isDereferenceableOrNull(Value *O, const DataLayout &DL) {
@@ -444,7 +446,7 @@
   Worklist.reserve(getDefaultMaxUsesToExploreForCaptureTracking());
   SmallSet<const Use *, 20> Visited;
-  auto AddUses = [&](const Value *V) {
+  auto VisitUses = [&](const Value *V) {
     for (const Use &U : V->uses()) {
       // If there are lots of uses, conservatively say that the value
       // is captured to avoid taking too much compile time.
@@ -460,7 +462,7 @@
     }
     return true;
   };
-  if (!AddUses(V))
+  if (!VisitUses(V))
    return;
  auto IsDereferenceableOrNull = [Tracker](Value *V, const DataLayout &DL) {
@@ -468,6 +470,8 @@
  };
  while (!Worklist.empty()) {
    const Use *U = Worklist.pop_back_val();
+    if (Tracker->visitUse(U))
+      return;
    switch (DetermineUseCaptureKind(*U, IsDereferenceableOrNull)) {
    case UseCaptureKind::NO_CAPTURE:
      continue;
@@ -476,7 +480,7 @@
        return;
      continue;
    case UseCaptureKind::PASSTHROUGH:
-      if (!AddUses(U->getUser()))
+      if (!VisitUses(U->getUser()))
        return;
      continue;
    }
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -12,7 +12,10 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Bitfields.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -25,6 +28,7 @@
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
@@ -66,12 +70,18 @@
 static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
     "enable-memcpyopt-without-libcalls", cl::Hidden,
     cl::desc("Enable memcpyopt even when libcalls are disabled"));
+static cl::opt<unsigned>
+    MemCpyOptStackMoveThreshold("memcpyopt-stack-move-threshold", cl::Hidden,
+                                cl::desc("Maximum number of basic blocks the "
+                                         "stack-move optimization may examine"),
+                                cl::init(250));
 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
 STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
 STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
+STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
 namespace {
@@ -276,6 +286,8 @@
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addPreserved<PostDominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
@@ -296,6 +308,7 @@
                    false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
@@ -799,6 +812,25 @@
      ++NumMemCpyInstr;
      return true;
    }
+
+    // If this is a load-store pair from a stack slot to a stack slot, we
+    // might be able to perform the stack-move optimization just as we do for
+    // memcpys from an alloca to an alloca.
+    if (AllocaInst *DestAlloca =
+            dyn_cast<AllocaInst>(SI->getPointerOperand())) {
+      if (AllocaInst *SrcAlloca =
+              dyn_cast<AllocaInst>(LI->getPointerOperand())) {
+        if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
+                                  DL.getTypeStoreSize(T))) {
+          // Avoid invalidating the iterator.
+          BBI = SI->getNextNonDebugInstruction()->getIterator();
+          eraseInstruction(SI);
+          eraseInstruction(LI);
+          ++NumMemCpyInstr;
+          return true;
+        }
+      }
+    }
  }
}
@@ -1433,6 +1465,577 @@
  return true;
}
+// These helper classes are used for the stack-move optimization. See the
+// comments above performStackMoveOptzn() for more details.
+
+namespace {
+
+// Tracks liveness on the basic block level. This is conservative; see the
+// comments above performStackMoveOptzn() for justification.
+class BasicBlockLiveness {
+  // The earliest definition or use we've seen, combined with the three bits
+  // below.
+  PointerIntPair<Instruction *, 3, unsigned> Value;
+
+  // Whether the alloca is live-in to the block (from predecessor basic blocks).
+  using LiveIn = Bitfield::Element<bool, 0, 1>;
+  // Whether the alloca is live-out from the block (to successor basic blocks).
+  using LiveOut = Bitfield::Element<bool, 1, 1>;
+  // Whether there's at least one use of the alloca in this basic block. This
+  // flag is important for detecting liveness conflicts, since the other
+  // information stored here isn't sufficient to determine that a use is present
+  // if a definition precedes it.
+  using HasUse = Bitfield::Element<bool, 2, 1>;
+
+  // Records a new def or use instruction.
+  void setDefUseInst(Instruction *I) {
+    assert((!hasDefUseInst() || I->comesBefore(getDefUseInst())) &&
+           "Tried to overwrite an earlier def or use with a later one!");
+    Value.setPointer(I);
+  }
+
+  // Sets the flag which determines whether this block has a use.
+  void setHasUse(bool On) {
+    unsigned V = Value.getInt();
+    Bitfield::set<HasUse>(V, On);
+    Value.setInt(V);
+  }
+
+public:
+  BasicBlockLiveness() : Value(nullptr) {}
+
+  // Returns the earliest definition or use we've seen in this block.
+  Instruction *getDefUseInst() const { return Value.getPointer(); }
+  // Returns true if there's a definition or use of the memory in this block.
+  bool hasDefUseInst() const { return Value.getPointer() != nullptr; }
+  // Returns true if the memory is live-in to this block (i.e. live-out of a
+  // predecessor).
+  bool isLiveIn() const { return Bitfield::get<LiveIn>(Value.getInt()); }
+  // Returns true if the memory is live-out of this block (i.e. live-in to a
+  // successor).
+  bool isLiveOut() const { return Bitfield::get<LiveOut>(Value.getInt()); }
+  // Returns true if there is at least one use of the memory in this block.
+  bool hasUse() const { return Bitfield::get<HasUse>(Value.getInt()); }
+  // Returns true if this alloca is live anywhere in this block or has
+  // at least one use in it. If this returns false, the alloca is
+  // guaranteed to be completely dead within this basic block.
+  bool isLiveAnywhereOrHasUses() const {
+    return isLiveIn() || isLiveOut() || hasUse();
+  }
+
+  // Records a new definition or use of the alloca being tracked within this
+  // basic block.
+  void update(Instruction *I, bool IsDef) {
+    if (!hasDefUseInst() || I->comesBefore(getDefUseInst())) {
+      setDefUseInst(I);
+      setLiveIn(!IsDef);
+    }
+    if (!IsDef)
+      setHasUse(true);
+  }
+
+  // Adjusts the live-in flag for this block.
+  void setLiveIn(bool On) {
+    unsigned V = Value.getInt();
+    Bitfield::set<LiveIn>(V, On);
+    Value.setInt(V);
+  }
+
+  // Adjusts the live-out flag for this block.
+  void setLiveOut(bool On) {
+    unsigned V = Value.getInt();
+    Bitfield::set<LiveOut>(V, On);
+    Value.setInt(V);
+  }
+};
+
+using BasicBlockLivenessMap = DenseMap<BasicBlock *, BasicBlockLiveness>;
+
+// Tracks uses of an alloca for the purposes of the stack-move optimization.
+//
+// This class does three things: (1) it makes sure that the alloca is never
+// captured; (2) it records defs and uses of the alloca in a map for the
+// liveness analysis to use; (3) it finds the nearest dominator and
+// postdominator of all uses of this alloca for the purpose of lifetime
+// intrinsic "shrink wrapping" if the optimization goes through.
+class StackMoveTracker : public CaptureTracker {
+  // Data layout info.
+  const DataLayout &DL;
+  // Dominator tree info.
+  DominatorTree &DT;
+  // Postdominator tree info.
+  PostDominatorTree &PDT;
+  // The memcpy or store instruction being optimized.
+  Instruction *Store;
+  // The size of the underlying alloca, in bits.
+  TypeSize AllocaSizeInBits;
+
+public:
+  // Keeps track of the lifetime intrinsics that we find. We'll need to remove
+  // these if the optimization goes through.
+  SmallVector<IntrinsicInst *> LifetimeMarkers;
+  // Keeps track of instructions that have !noalias metadata. We need to drop
+  // that metadata if the optimization succeeds.
+  std::vector<Instruction *> NoAliasInstrs;
+  // Liveness information for this alloca, tracked on the basic block level.
+  BasicBlockLivenessMap BBLiveness;
+  // Liveness information for this alloca, tracked on the instruction level for
+  // the single basic block containing the memcpy.
+  DenseMap<Instruction *, bool> StoreBBDefUseMap;
+  // The nearest basic block that dominates all uses of the alloca that we've
+  // seen so far. This is only null if we haven't seen any uses yet.
+  BasicBlock *Dom;
+  // The nearest basic block that postdominates all uses of the alloca that
+  // we've seen so far. This can be null if there's no such postdominator.
+  BasicBlock *PostDom;
+  // The user that caused us to bail out, if any.
+  User *AbortingUser;
+  // Whether we should bail out of the stack-move optimization.
+  bool Abort;
+
+  StackMoveTracker(Instruction *Store, AllocaInst *Alloca, DominatorTree &DT,
+                   PostDominatorTree &PDT)
+      : DL(Store->getModule()->getDataLayout()), DT(DT), PDT(PDT), Store(Store),
+        AllocaSizeInBits(*Alloca->getAllocationSizeInBits(DL)), Dom(nullptr),
+        PostDom(nullptr), AbortingUser(nullptr), Abort(false) {}
+
+private:
+  // Called whenever we see a use or a definition of the alloca. If IsDef is
+  // true, this is a def; otherwise, it's a use.
+  void recordUseOrDef(Instruction *I, bool IsDef) {
+    BasicBlock *BB = I->getParent();
+    BBLiveness[BB].update(I, IsDef);
+
+    // For the basic block containing the store, track liveness on the
+    // instruction level.
+    if (BB == Store->getParent())
+      StoreBBDefUseMap[I] = IsDef;
+
+    // If the instruction has !noalias metadata, record it so that we can delete
+    // the metadata if the optimization succeeds.
+    if (I->hasMetadata(LLVMContext::MD_noalias))
+      NoAliasInstrs.push_back(I);
+  }
+
+public:
+  // If there are too many uses, just bail out to avoid spending excessive
+  // compile time.
+  void tooManyUses() override { Abort = true; }
+
+  // If the pointer was captured, we can't usefully track it, so just bail out.
+  bool captured(const Use *U) override {
+    if (!Abort) {
+      AbortingUser = U->getUser();
+      Abort = true;
+      return true;
+    }
+
+    return false;
+  }
+
+  // Classifies a use as either a true use or a definition, records that, and
+  // updates the nearest common dominator and postdominator accordingly.
+  bool visitUse(const Use *U) override {
+    Instruction *I = cast<Instruction>(U->getUser());
+    BasicBlock *BB = I->getParent();
+
+    // GEPs don't count as uses of the alloca memory (just of the pointer to the
+    // alloca), so we don't care about them here.
+    if (isa<GetElementPtrInst>(I) && U->getOperandNo() == 0)
+      return false;
+
+    // Update the nearest common dominator and postdominator. We know that this
+    // is the first use if Dom is null, because multiple blocks always have a
+    // mutual common dominator (though not necessarily a common postdominator).
+    if (Dom == nullptr) {
+      Dom = PostDom = BB;
+    } else {
+      Dom = DT.findNearestCommonDominator(Dom, BB);
+      if (PostDom != nullptr)
+        PostDom = PDT.findNearestCommonDominator(PostDom, BB);
+    }
+
+    // If an instruction overwrites all bytes of the alloca, it's a definition,
+    // not a use. Detect those cases here.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      if (II->isLifetimeStartOrEnd()) {
+        // We treat a call to a lifetime intrinsic that covers the entire alloca
+        // as a definition, since both llvm.lifetime.start and llvm.lifetime.end
+        // intrinsics conceptually fill all the bytes of the alloca with an
+        // undefined value. We also note the locations of these intrinsic
+        // calls so that we can delete them later if the optimization succeeds.
+        int64_t Size = cast<ConstantInt>(II->getArgOperand(0))->getSExtValue();
+        if (Size < 0 || uint64_t(Size) * 8 == AllocaSizeInBits) {
+          recordUseOrDef(II, true);
+          LifetimeMarkers.push_back(II);
+          return false;
+        }
+      } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+        if (MI->getArgOperandNo(U) == 0) {
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getLength())) {
+            if (CI->getZExtValue() * 8 == AllocaSizeInBits.getFixedSize()) {
+              // Memcpy, memmove, and memset instructions that fill every byte
+              // of the alloca are definitions.
+              recordUseOrDef(MI, true);
+              return false;
+            }
+          }
+        }
+      }
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      // Stores that overwrite all bytes of the alloca are definitions.
+      if (U->getOperandNo() == 1 &&
+          DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType()) ==
+              AllocaSizeInBits.getFixedSize()) {
+        recordUseOrDef(SI, true);
+        return false;
+      }
+    }
+
+    // Otherwise, this instruction is a use. Make a note of that fact and
+    // continue.
+    recordUseOrDef(I, false);
+    return false;
+  }
+};
+
+} // namespace
+
+// Performs liveness dataflow analysis for an alloca at the basic block level as
+// part of the stack-move optimization.
+//
+// This implements the "backwards variable-at-a-time" variant of liveness
+// analysis, propagating liveness information backwards from uses until it sees
+// a basic block with a definition or one in which the variable is already
+// live-out. As implemented, this is a linear-time algorithm, because it visits
+// each basic block at most once and the number of tracked variables is
+// constant (two: the source and destination of the memcpy).
+//
+// In order to avoid spending too much compile time, this operates on the level
+// of basic blocks instead of instructions, making it a conservative
+// analysis. See the comments in performStackMoveOptzn() for more details.
+//
+// Returns true if the analysis succeeded or false if it failed due to examining
+// too many basic blocks.
+static bool computeLiveness(BasicBlockLivenessMap &BBLiveness) {
+  // Start by initializing a worklist with all basic blocks that are live-in
+  // (i.e. they potentially need to propagate liveness to their predecessors).
+  SmallVector<BasicBlock *> Worklist;
+  for (auto &Pair : BBLiveness) {
+    if (Pair.second.isLiveIn())
+      Worklist.push_back(Pair.first);
+  }
+
+  // Iterate until we have no more blocks to process.
+  unsigned Count = 0;
+  while (!Worklist.empty()) {
+    BasicBlock *BB = Worklist.back();
+    Worklist.pop_back();
+
+    // Cap the number of basic blocks we examine in order to avoid blowing up
+    // compile time. The default threshold was empirically determined to be
+    // sufficient 90% of the time in the Rust compiler.
+    ++Count;
+    if (Count >= MemCpyOptStackMoveThreshold) {
+      LLVM_DEBUG(
+          dbgs()
+          << "Stack Move: Exceeded max basic block threshold, bailing\n");
+      return false;
+    }
+
+    // We know that the alloca must be live-in to this basic block, or else we
+    // wouldn't have added the block to the worklist in the first place.
+    assert(BBLiveness.lookup(BB).isLiveIn() &&
+           "Shouldn't have added a BB that wasn't live-in to the worklist!");
+
+    // Propagate liveness back to predecessors.
+    for (BasicBlock *Pred : predecessors(BB)) {
+      BasicBlockLiveness PredLiveness = BBLiveness.lookup(Pred);
+
+      // Skip predecessors in which the variable is already known to be
+      // live-out.
+      if (!PredLiveness.isLiveOut()) {
+        PredLiveness.setLiveOut(true);
+
+        // Don't enqueue predecessors if they contain direct defs or uses of the
+        // variable. If a predecessor contains a use of the variable that
+        // dominates all the other uses or defs of the variable within that
+        // block, then we already added that predecessor to the worklist at the
+        // beginning of this procedure, so we don't need to add it again. If, on
+        // the other hand, the predecessor contains a definition of the variable
+        // that dominates all the other uses or defs of the variable within the
+        // block, then the predecessor won't propagate any liveness to *its*
+        // predecessors, so we don't need to enqueue it either.
+        if (!PredLiveness.hasDefUseInst()) {
+          // We know that this predecessor is a basic block that contains
+          // neither defs nor uses of the variable and in which the variable is
+          // live-out. So the variable must be live-in to this predecessor too.
+          PredLiveness.setLiveIn(true);
+          Worklist.push_back(Pred);
+        }
+
+        BBLiveness[Pred] = PredLiveness;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Returns true if the alloca is at the start of the entry block, modulo a few
+// instructions like GEPs and debug info. We only perform the stack-move
+// optimization for such allocas, which simplifies the logic.
+static bool allocaIsAtStartOfEntryBlock(AllocaInst *AI) {
+  BasicBlock *BB = AI->getParent();
+  if (!BB->isEntryBlock()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Alloca isn't in entry block\n");
+    return false;
+  }
+
+  for (Instruction &I : *BB) {
+    if (&I == AI)
+      return true;
+    if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) ||
+        isa<DbgInfoIntrinsic>(I) || I.isLifetimeStartOrEnd()) {
+      continue;
+    }
+    LLVM_DEBUG(
+        dbgs()
+        << "Stack Move: Alloca isn't at start of entry block\n Instruction:"
+        << I << "\n");
+    return false;
+  }
+
+  llvm_unreachable("Alloca wasn't found in its parent basic block");
+}
+
+// Attempts to optimize the pattern whereby memory is copied from an alloca to
+// another alloca, where the two allocas aren't live simultaneously except
+// during the transfer. If successful, the two allocas can be merged into one
+// and the transfer can be deleted. This pattern is generated frequently in
+// Rust, due to the ubiquity of move operations in that language.
+//
+// We choose to limit this optimization to cases in which neither alloca was
+// captured, in order to avoid interprocedural analysis. The same
+// CaptureTracking framework that is needed to detect this condition also
+// turns out to be useful for gathering definitions and uses. So our general
+// approach is to run CaptureTracking to find captures and simultaneously gather
+// up uses and defs, followed by the standard liveness dataflow analysis to
+// ensure that the source and destination aren't simultaneously live anywhere.
+//
+// To avoid blowing up compile time, we perform the liveness analysis
+// conservatively on the basic block level rather than on the instruction level,
+// with the exception of the basic block containing the memcpy itself. This
+// means that any basic block that contains a use of both the source and
+// destination causes us to conservatively bail out, even if the source and
+// destination aren't actually simultaneously live. Empirically, this happens
+// less than 2% of the time in typical Rust code, making the
+// precision/compile-time tradeoff well worth it.
+//
+// Once we determine that the optimization is safe to perform, we replace all
+// uses of the destination alloca with the source alloca. We also "shrink wrap"
+// the lifetime markers of the single merged alloca to the nearest dominating
+// and postdominating basic block. Note that the "shrink wrapping" procedure is
+// a safe transformation only because we restrict the scope of this optimization
+// to allocas that aren't captured.
+bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                                          AllocaInst *DestAlloca,
+                                          AllocaInst *SrcAlloca,
+                                          uint64_t Size) {
+  // If the optimization is disabled, forget it.
+  if (MemCpyOptStackMoveThreshold == 0)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
+                    << *Store << "\n");
+
+  // Make sure the two allocas are in the same address space.
+  if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
+    return false;
+  }
+
+  // Calculate the static size of the allocas to be merged, bailing out if we
+  // can't.
+  const DataLayout &DL = DestAlloca->getModule()->getDataLayout();
+  std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSizeInBits(DL);
+  if (!SrcSize || SrcSize->isScalable() ||
+      Size * 8 != SrcSize->getFixedSize()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
+    return false;
+  }
+  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSizeInBits(DL);
+  if (!DestSize || DestSize->isScalable() ||
+      Size * 8 != DestSize->getFixedSize()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+    return false;
+  }
+
+  // Make sure the allocas are at the start of the entry block. This lets us
+  // avoid having to do annoying checks to ensure the allocas dominate their
+  // uses, as well as problems related to llvm.stacksave and llvm.stackrestore
+  // intrinsics.
+  if (!allocaIsAtStartOfEntryBlock(DestAlloca) ||
+      !allocaIsAtStartOfEntryBlock(SrcAlloca)) {
+    return false;
+  }
+
+  // Gather up all uses of the destination. Make sure that it wasn't captured
+  // anywhere.
+  StackMoveTracker DestTracker(Store, DestAlloca, *DT, *PDT);
+  PointerMayBeCaptured(DestAlloca, &DestTracker);
+  if (DestTracker.Abort) {
+    LLVM_DEBUG({
+      dbgs() << "Stack Move: Destination was captured:";
+      if (DestTracker.AbortingUser != nullptr)
+        dbgs() << "\n" << *DestTracker.AbortingUser;
+      dbgs() << "\n";
+    });
+    return false;
+  }
+
+  // Likewise, collect all uses of the source, again making sure that it wasn't
+  // captured anywhere.
+  StackMoveTracker SrcTracker(Store, SrcAlloca, *DT, *PDT);
+  PointerMayBeCaptured(SrcAlloca, &SrcTracker);
+  if (SrcTracker.Abort) {
+    LLVM_DEBUG({
+      dbgs() << "Stack Move: Source was captured:";
+      if (SrcTracker.AbortingUser != nullptr)
+        dbgs() << "\n" << *SrcTracker.AbortingUser;
+      dbgs() << "\n";
+    });
+    return false;
+  }
+
+  // Compute liveness on the basic block level.
+  BasicBlock *StoreBB = Store->getParent();
+  if (!computeLiveness(DestTracker.BBLiveness) ||
+      !computeLiveness(SrcTracker.BBLiveness)) {
+    return false;
+  }
+
+  // Check for liveness conflicts on the basic block level (with the exception
+  // of the basic block containing the memcpy). This is conservative compared to
+  // computing liveness on the instruction level. The precision loss is only 2%
+  // on the Rust compiler, however, making this compile-time tradeoff
+  // worthwhile.
+  for (auto DestPair : DestTracker.BBLiveness) {
+    BasicBlock *BB = DestPair.first;
+    if (BB != StoreBB && DestPair.second.isLiveAnywhereOrHasUses() &&
+        SrcTracker.BBLiveness.lookup(BB).isLiveAnywhereOrHasUses()) {
+      LLVM_DEBUG(dbgs() << "Stack Move: Detected liveness conflict, "
+                           "bailing:\n Basic Block: "
+                        << BB->getNameOrAsOperand() << "\n");
+      return false;
+    }
+  }
+
+  // Check liveness inside the single basic block containing the load and
+  // store.
+  bool DestLive = DestTracker.BBLiveness.lookup(StoreBB).isLiveOut();
+  bool SrcLive = SrcTracker.BBLiveness.lookup(StoreBB).isLiveOut();
+  for (auto &BI : reverse(*StoreBB)) {
+    if (DestLive && SrcLive && &BI != Load && &BI != Store) {
+      LLVM_DEBUG(
+          dbgs() << "Stack Move: Detected liveness conflict inside the basic "
+                    "block containing the memcpy, bailing:\n Instruction: "
+                 << BI << "\n");
+      return false;
+    }
+
+    auto DestDefUseIt = DestTracker.StoreBBDefUseMap.find(&BI);
+    auto SrcDefUseIt = SrcTracker.StoreBBDefUseMap.find(&BI);
+    if (DestDefUseIt != DestTracker.StoreBBDefUseMap.end())
+      DestLive = !DestDefUseIt->second;
+    if (SrcDefUseIt != SrcTracker.StoreBBDefUseMap.end())
+      SrcLive = !SrcDefUseIt->second;
+  }
+
+  // We can do the transformation. First, align the allocas appropriately.
+  SrcAlloca->setAlignment(
+      std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
+
+  // Merge the two allocas.
+  DestAlloca->replaceAllUsesWith(SrcAlloca);
+
+  // Drop metadata on the source alloca.
+  SrcAlloca->dropUnknownNonDebugMetadata();
+
+  // Now "shrink wrap" the lifetimes. Begin by creating a new lifetime start
+  // marker at the start of the nearest common dominator of all defs and uses of
+  // the merged alloca.
+  //
+  // We could be more precise here and query AA to find the latest point in the
+  // basic block at which to place the call to the intrinsic, but that doesn't
+  // seem worth it at the moment.
+  assert(DestTracker.Dom != nullptr && SrcTracker.Dom != nullptr &&
+         "There must be a common dominator for all defs and uses of the source "
+         "and destination");
+  Type *IntPtrTy =
+      Type::getIntNTy(SrcAlloca->getContext(), DL.getPointerSizeInBits());
+  ConstantInt *CI = cast<ConstantInt>(ConstantInt::get(IntPtrTy, Size));
+  BasicBlock *Dom =
+      DT->findNearestCommonDominator(DestTracker.Dom, SrcTracker.Dom);
+  BasicBlock::iterator InsertionPt = Dom->getFirstNonPHIOrDbgOrAlloca();
+  if (Dom == SrcAlloca->getParent() && InsertionPt != Dom->end() &&
+      InsertionPt->comesBefore(SrcAlloca)) {
+    // Make sure that the alloca dominates the lifetime start intrinsic.
+    // Usually, the call to getFirstNonPHIOrDbgOrAlloca() above ensures that,
+    // but if the allocas aren't all at the start of the basic block we might
+    // have to fix things up.
+    InsertionPt = ++BasicBlock::iterator(SrcAlloca);
+  }
+  IRBuilder<>(Dom, InsertionPt).CreateLifetimeStart(SrcAlloca, CI);
+
+  // Next, create a new lifetime end marker at the end of the nearest common
+  // postdominator of all defs and uses of the merged alloca, if there is one.
+ // If there's no such postdominator, just don't bother; we could create one at + // each exit block, but that'd be essentially semantically meaningless. + if (DestTracker.PostDom != nullptr && SrcTracker.PostDom != nullptr) { + if (BasicBlock *PostDom = PDT->findNearestCommonDominator( + DestTracker.PostDom, SrcTracker.PostDom)) { + // Edge case: It's possible that the terminating instruction of the + // postdominating basic block is itself an invoke instruction that uses + // the alloca. Placing the lifetime end intrinsic before that call would + // be incorrect. Detect this situation and choose the next postdominator + // instead. + MemoryLocation Loc = MemoryLocation::getBeforeOrAfter(SrcAlloca); + if (isModOrRefSet(AA->getModRefInfo(PostDom->getTerminator(), Loc))) { + auto PostDomNode = (*PDT)[PostDom]->getIDom(); + PostDom = PostDomNode != nullptr ? PostDomNode->getBlock() : nullptr; + } + + // Add the lifetime end intrinsic. + if (PostDom != nullptr) { + IRBuilder<>(PostDom, BasicBlock::iterator(PostDom->getTerminator())) + .CreateLifetimeEnd(SrcAlloca, CI); + } + } + } + + // Remove all other lifetime markers. + for (IntrinsicInst *II : DestTracker.LifetimeMarkers) + eraseInstruction(II); + for (IntrinsicInst *II : SrcTracker.LifetimeMarkers) + eraseInstruction(II); + + // As this transformation can cause memory accesses that didn't previously + // alias to begin to alias one another, we remove !noalias metadata from any + // uses of either alloca. This is conservative, but more precision doesn't + // seem worthwhile right now. + for (Instruction *I : DestTracker.NoAliasInstrs) + I->setMetadata(LLVMContext::MD_noalias, nullptr); + for (Instruction *I : SrcTracker.NoAliasInstrs) + I->setMetadata(LLVMContext::MD_noalias, nullptr); + + // We're done! We don't need to delete the memcpy because later passes will do + // it. + LLVM_DEBUG(dbgs() << "Stack Move: Performed stack-move optimization\n"); + ++NumStackMove; + return true; +} + /// Perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on @@ -1490,13 +2093,14 @@ MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess( AnyClobber, MemoryLocation::getForSource(M), BAA); - // There are four possible optimizations we can do for memcpy: + // There are five possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. // c) memcpy from freshly alloca'd space or space that has just started // its lifetime copies undefined data, and we can therefore eliminate // the memcpy in favor of the data that was already at the destination. // d) memcpy from a just-memset'd source can be turned into memset. + // e) elimination of memcpy via stack-move optimization. 
  if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
    if (Instruction *MI = MD->getMemoryInst()) {
      if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
@@ -1518,8 +2122,10 @@
          }
        }
      }
-      if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-        return processMemCpyMemCpyDependence(M, MDep, BAA);
+      if (auto *MDep = dyn_cast<MemCpyInst>(MI)) {
+        if (processMemCpyMemCpyDependence(M, MDep, BAA))
+          return true;
+      }
      if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
        if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
          LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
@@ -1538,6 +2144,26 @@
    }
  }
+
+  // If the transfer is from a stack slot to a stack slot, then we may be able
+  // to perform the stack-move optimization. See the comments in
+  // performStackMoveOptzn() for more details.
+  AllocaInst *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
+  if (DestAlloca == nullptr)
+    return false;
+  AllocaInst *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
+  if (SrcAlloca == nullptr)
+    return false;
+  ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
+  if (Len == nullptr)
+    return false;
+  if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue())) {
+    // Avoid invalidating the iterator.
+    BBI = M->getNextNonDebugInstruction()->getIterator();
+    eraseInstruction(M);
+    ++NumMemCpyInstr;
+    return true;
+  }
+
  return false;
}
@@ -1693,9 +2319,10 @@
  auto *AA = &AM.getResult<AAManager>(F);
  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
  auto *MSSA = &AM.getResult<MemorySSAAnalysis>(F);
-  bool MadeChange = runImpl(F, &TLI, AA, AC, DT, &MSSA->getMSSA());
+  bool MadeChange = runImpl(F, &TLI, AA, AC, DT, PDT, &MSSA->getMSSA());
  if (!MadeChange)
    return PreservedAnalyses::all();
@@ -1707,12 +2334,14 @@
bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
                            AliasAnalysis *AA_, AssumptionCache *AC_,
-                            DominatorTree *DT_, MemorySSA *MSSA_) {
+                            DominatorTree *DT_, PostDominatorTree *PDT_,
+                            MemorySSA *MSSA_) {
  bool MadeChange = false;
  TLI = TLI_;
  AA = AA_;
  AC = AC_;
  DT = DT_;
+  PDT = PDT_;
  MSSA = MSSA_;
  MemorySSAUpdater MSSAU_(MSSA_);
  MSSAU = &MSSAU_;
@@ -1738,7 +2367,8 @@
  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
  auto *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
-  return Impl.runImpl(F, TLI, AA, AC, DT, MSSA);
+  return Impl.runImpl(F, TLI, AA, AC, DT, PDT, MSSA);
}
diff --git a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
--- a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
+++ b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
+; RUN: opt < %s -S -passes=memcpyopt -memcpyopt-stack-move-threshold=0 | FileCheck --match-full-lines %s
 ; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains
 define i8 @test(i8 %input) {
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -190,6 +190,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
 ; CHECK-O1-NEXT: Running pass: MemCpyOptPass
+; CHECK-O1-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O-NEXT: Running pass: SCCPPass
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
@@ -201,7 +202,7 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O1-NEXT: Running pass: CoroElidePass
 ; CHECK-O-NEXT: Running pass: ADCEPass
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass
 ; CHECK-O23SZ-NEXT: Running pass: DSEPass
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -110,8 +110,8 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass on foo
-; CHECK-O23SZ-NEXT: Running pass: DSEPass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
+; CHECK-O23SZ-NEXT: Running pass: DSEPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LCSSAPass on foo
diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -163,6 +163,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
 ; CHECK-O1-NEXT: Running pass: MemCpyOptPass
+; CHECK-O1-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O-NEXT: Running pass: SCCPPass
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
@@ -173,7 +174,7 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O1-NEXT: Running pass: CoroElidePass
 ; CHECK-O-NEXT: Running pass: ADCEPass
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass
 ; CHECK-O23SZ-NEXT: Running pass: DSEPass
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Transforms/MemCpyOpt/callslot.ll b/llvm/test/Transforms/MemCpyOpt/callslot.ll
--- a/llvm/test/Transforms/MemCpyOpt/callslot.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=memcpyopt < %s -verify-memoryssa | FileCheck %s
+; RUN: opt -S -passes=memcpyopt -memcpyopt-stack-move-threshold=0 < %s -verify-memoryssa | FileCheck %s
 define i8 @read_dest_between_call_and_memcpy() {
 ; CHECK-LABEL: @read_dest_between_call_and_memcpy(
diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
--- a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
+; RUN: opt < %s -S -passes=memcpyopt -memcpyopt-stack-move-threshold=0 | FileCheck --match-full-lines %s
 ; Make sure callslot optimization merges alias.scope metadata correctly when it merges instructions.
 ; Merging here naively generates:
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -0,0 +1,894 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Tests that the stack-move optimization functions properly.
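+;
+; As a rough illustration of the transformation exercised below (a hand-written
+; sketch, not autogenerated checker output; the values simply mirror the tests
+; that follow), the pass looks for the pattern
+;
+;   %src = alloca %struct.Foo, align 4
+;   %dest = alloca %struct.Foo, align 4
+;   ; ... initialize and use %src ...
+;   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false)
+;   ; ... only %dest is used from here on ...
+;
+; and, when the two allocas are never live at the same time (apart from the copy
+; itself) and neither pointer is captured, it rewrites every use of %dest to use
+; %src, deletes the copy, and shrink-wraps the lifetime markers around the
+; remaining uses; @basic_memcpy below shows the resulting checks.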
+; +; RUN: opt -passes=memcpyopt -memcpyopt-stack-move-threshold=8 -S < %s | FileCheck %s + +%struct.Foo = type { i32, i32, i32 } + +@constant = private unnamed_addr constant %struct.Foo { i32 1, i32 2, i32 3 }, align 4 + +; Optimization successes follow: + +; Tests that the optimization succeeds with a basic call to memcpy. +define void @basic_memcpy() { +; CHECK-LABEL: @basic_memcpy( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization succeeds with a basic call to memmove. +define void @basic_memmove() { +; CHECK-LABEL: @basic_memmove( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memmove.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memmove.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization succeeds with a load/store pair. 
+define void @load_store() { +; CHECK-LABEL: @load_store( +; CHECK-NEXT: [[SRC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[SRC]]) +; CHECK-NEXT: store i32 42, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca i32, align 4 + %dest = alloca i32, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %dest) + store i32 42, ptr %src + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + %2 = load i32, ptr %src + store i32 %2, ptr %dest + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %dest) + ret void +} + +; Tests that merging two allocas with different alignments results in an +; alloca with the broader alignment. +define void @align_up() { +; CHECK-LABEL: @align_up( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 8 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 8 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that we correctly remove extra lifetime intrinsics when performing the +; optimization. 
+define void @remove_extra_lifetime_intrinsics() { +; CHECK-LABEL: @remove_extra_lifetime_intrinsics( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that we remove scoped noalias metadata from a call. +define void @remove_scoped_noalias() { +; CHECK-LABEL: @remove_scoped_noalias( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]), !alias.scope !0 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src), !alias.scope !2 + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest), !noalias !2 + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that we remove metadata on the merged alloca. 
+define void @remove_alloca_metadata() { +; CHECK-LABEL: @remove_alloca_metadata( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]), !alias.scope !0 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4, !annotation !3 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src), !alias.scope !2 + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest), !noalias !2 + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that we correctly "shrinkwrap" lifetime intrinsics to the nearest +; common dominator and postdominator when performing the optimization. +define void @shrinkwrap_lifetimes() { +; CHECK-LABEL: @shrinkwrap_lifetimes( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: bb0: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[TMP3]], label [[BB3:%.*]], label [[BB4:%.*]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb5: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + %1 = call i1 @cond() + br i1 %1, label %bb0, label %bb1 + +bb0: + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + br label %bb2 + +bb1: + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + br label %bb2 + +bb2: + %2 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %3 = call i1 @cond() + br 
i1 %3, label %bb3, label %bb4 + +bb3: + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %4 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + br label %bb5 + +bb4: + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %5 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + br label %bb5 + +bb5: + ret void +} + +; Tests that GEP doesn't count as a use for the purposes of liveness analysis. +define void @gep_isnt_a_use() { +; CHECK-LABEL: @gep_isnt_a_use( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO]], ptr [[SRC]], i32 0, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %2 = getelementptr %struct.Foo, ptr %src, i32 0, i32 0 + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a memcpy that completely overwrites a stack value is a definition +; for the purposes of liveness analysis. 
+define void @memcpy_is_def() { +; CHECK-LABEL: @memcpy_is_def( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %3 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a memset that completely overwrites a stack value is a definition +; for the purposes of liveness analysis. +define void @memset_is_def() { +; CHECK-LABEL: @memset_is_def( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[SRC]], i8 42, i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.memset.p0.i64(ptr align 4 %src, i8 42, i64 12, i1 false) + %3 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a store that completely overwrites a stack value is a definition +; for the purposes of liveness analysis. 
+define void @store_is_def() { +; CHECK-LABEL: @store_is_def( +; CHECK-NEXT: [[SRC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[SRC]]) +; CHECK-NEXT: store i32 42, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: store i32 64, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca i32, align 4 + %dest = alloca i32, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %dest) + store i32 42, ptr %src + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + %2 = load i32, ptr %src + store i32 %2, ptr %dest + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + store i32 64, ptr %src + %4 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %dest) + ret void +} + +; Optimization failures follow: + +; Tests that the optimization fails with a load/store pair that isn't +; block-local. +define void @global_load_store() { +; CHECK-LABEL: @global_load_store( +; CHECK-NEXT: [[SRC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[DEST]]) +; CHECK-NEXT: store i32 42, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: br label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: store i32 [[TMP2]], ptr [[DEST]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca i32, align 4 + %dest = alloca i32, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %dest) + store i32 42, ptr %src + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + %2 = load i32, ptr %src + br label %bb0 + +bb0: + store i32 %2, ptr %dest + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %dest) + ret void +} + +; Tests that dynamically-sized allocas are never merged. 
+define void @dynamically_sized_alloca(i64 %i) { +; CHECK-LABEL: @dynamically_sized_alloca( +; CHECK-NEXT: [[SRC:%.*]] = alloca i8, i64 [[I:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i64 [[I]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca i8, i64 %i, align 4 + %dest = alloca i8, i64 %i, align 4 + call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture %dest) + ret void +} + +; Tests that a memcpy with a dynamic size is never optimized. +define void @dynamically_sized_memcpy(i64 %size) { +; CHECK-LABEL: @dynamically_sized_memcpy( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 [[SIZE:%.*]], i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 %size, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that allocas with different sizes aren't merged together. 
+define void @mismatched_alloca_size() { +; CHECK-LABEL: @mismatched_alloca_size( +; CHECK-NEXT: [[SRC:%.*]] = alloca i8, i64 24, align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i64 12, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca i8, i64 24, align 4 + %dest = alloca i8, i64 12, align 4 + call void @llvm.lifetime.start.p0(i64 24, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 24, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that allocas with mismatched address spaces aren't combined. +define void @mismatched_alloca_addrspace() { +; CHECK-LABEL: @mismatched_alloca_addrspace( +; CHECK-NEXT: [[SRC:%.*]] = alloca i8, i64 24, align 4, addrspace(1) +; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i64 12, align 4, addrspace(2) +; CHECK-NEXT: call void @llvm.lifetime.start.p1(i64 24, ptr addrspace(1) nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p2(i64 12, ptr addrspace(2) nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr addrspace(1) nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p2.p1.i64(ptr addrspace(2) align 4 [[DEST]], ptr addrspace(1) align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p1(i64 24, ptr addrspace(1) nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr addrspace(2) nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p2(i64 12, ptr addrspace(2) nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca i8, i64 24, align 4, addrspace(1) + %dest = alloca i8, i64 12, align 4, addrspace(2) + call void @llvm.lifetime.start.p1(i64 24, ptr addrspace(1) nocapture %src) + call void @llvm.lifetime.start.p2(i64 12, ptr addrspace(2) nocapture %dest) + call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr addrspace(1) noundef nocapture %src) + call void @llvm.memcpy.p2.p1.i64(ptr addrspace(2) align 4 %dest, ptr addrspace(1) align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p1(i64 24, ptr addrspace(1) nocapture %src) + %2 = call i32 @use_nocapture(ptr addrspace(2) noundef nocapture %dest) + call void @llvm.lifetime.end.p2(i64 12, ptr addrspace(2) nocapture %dest) + ret void +} + +; Tests that volatile memcpys 
aren't removed. +define void @volatile_memcpy() { +; CHECK-LABEL: @volatile_memcpy( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 true) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 true) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 true) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization isn't performed when the destination is captured. +define void @dest_captured() { +; CHECK-LABEL: @dest_captured( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_capture(ptr noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_capture(ptr noundef %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization isn't performed when the source is captured. 
+define void @src_captured() { +; CHECK-LABEL: @src_captured( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_capture(ptr noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_capture(ptr noundef %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization isn't performed when the source and destination +; are simultaneously live within the basic block. +define void @local_liveness_conflict() { +; CHECK-LABEL: @local_liveness_conflict( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization isn't performed when the source and destination +; are simultaneously live in a way that can only be determined by examining +; multiple basic blocks. 
+define void @global_liveness_conflict() { +; CHECK-LABEL: @global_liveness_conflict( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %2 = call i1 @cond() + br i1 %2, label %bb0, label %bb1 + +bb0: + %3 = call i32 @use_nocapture(ptr noundef nocapture %src) + br label %bb2 + +bb1: + %4 = call i32 @use_nocapture(ptr noundef nocapture %dest) + br label %bb2 + +bb2: + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a memcpy that doesn't completely overwrite a stack value is a use +; for the purposes of liveness analysis, not a definition. 
+define void @incomplete_memcpy_is_use() { +; CHECK-LABEL: @incomplete_memcpy_is_use( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 11, i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 11, i1 false) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %3 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a store that doesn't completely overwrite a stack value is a use +; for the purposes of liveness analysis, not a definition. 
+define void @incomplete_store_is_use() {
+; CHECK-LABEL: @incomplete_store_is_use(
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[DEST]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca %struct.Foo, align 4
+  %dest = alloca %struct.Foo, align 4
+  call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src)
+  call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false)
+  %1 = call i32 @use_nocapture(ptr noundef nocapture %src)
+  %2 = load i32, ptr %src
+  store i32 %2, ptr %dest
+  %3 = call i32 @use_nocapture(ptr noundef nocapture %dest)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false)
+  %4 = call i32 @use_nocapture(ptr noundef nocapture %src)
+  call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src)
+  call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest)
+  ret void
+}
+
+; Tests that we don't incorrectly try to perform the optimization for allocas
+; that follow llvm.stacksave intrinsics, by restricting the optimization to
+; allocas defined at the start of the entry block.
+define void @stacksave() { +; CHECK-LABEL: @stacksave( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.stacksave() +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %1 = call ptr @llvm.stacksave() + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %2 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization fails if too many basic blocks are examined. +define void @too_many_basic_blocks() { +; CHECK-LABEL: @too_many_basic_blocks( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: br label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb3: +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb5: +; CHECK-NEXT: br label [[BB6:%.*]] +; CHECK: bb6: +; CHECK-NEXT: br label [[BB7:%.*]] +; CHECK: bb7: +; CHECK-NEXT: br label [[BB8:%.*]] +; CHECK: bb8: +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb9: +; CHECK-NEXT: br label [[BB10:%.*]] +; CHECK: bb10: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 
@use_nocapture(ptr noundef nocapture %src) + br label %bb0 + +bb0: + br label %bb1 +bb1: + br label %bb2 +bb2: + br label %bb3 +bb3: + br label %bb4 +bb4: + br label %bb5 +bb5: + br label %bb6 +bb6: + br label %bb7 +bb7: + br label %bb8 +bb8: + br label %bb9 +bb9: + br label %bb10 + +bb10: + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) +declare void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) +declare void @llvm.memcpy.p2.p1.i64(ptr addrspace(2) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) +declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.lifetime.start.p0(i64, ptr nocapture) +declare void @llvm.lifetime.end.p0(i64, ptr nocapture) +declare void @llvm.lifetime.start.p1(i64, ptr addrspace(1) nocapture) +declare void @llvm.lifetime.end.p1(i64, ptr addrspace(1) nocapture) +declare void @llvm.lifetime.start.p2(i64, ptr addrspace(2) nocapture) +declare void @llvm.lifetime.end.p2(i64, ptr addrspace(2) nocapture) +declare ptr @llvm.stacksave() + +declare i32 @use_nocapture(ptr noundef nocapture) +declare i32 @use_nocapture_p1(ptr addrspace(1) noundef nocapture) +declare i32 @use_nocapture_p2(ptr addrspace(2) noundef nocapture) +declare i32 @use_capture(ptr noundef) +declare i1 @cond() + +; Scope domain +!0 = !{!0} +; Scope in that domain +!1 = !{!1, !0} +; Scope list +!2 = !{!1} + +!3 = !{!"Whatever"} +