Index: include/llvm/Transforms/Scalar/MemCpyOptimizer.h
===================================================================
--- include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -50,7 +50,7 @@
   // Helper functions
   bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
   bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
-  bool processMemCpy(MemCpyInst *M);
+  bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
   bool processMemMove(MemMoveInst *M);
   bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
                             uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
@@ -60,6 +60,8 @@
   bool processByValArgument(CallSite CS, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
+  Instruction *tryMergingIntoMemcpy(Instruction *StartInst, Value *StartDstPtr,
+                                    Value *StartSrcPtr);
 
   bool iterateOnFunction(Function &F);
 };
Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -31,6 +31,7 @@
 #define DEBUG_TYPE "memcpyopt"
 
 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemCpyInfer, "Number of memcpys inferred");
 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
 STATISTIC(NumMoveToCpy,   "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet,    "Number of memcpys converted to memset");
@@ -119,6 +120,18 @@
   return true;
 }
 
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+                                    const LoadInst *LI) {
+  unsigned StoreAlign = SI->getAlignment();
+  if (!StoreAlign)
+    StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+  unsigned LoadAlign = LI->getAlignment();
+  if (!LoadAlign)
+    LoadAlign = DL.getABITypeAlignment(LI->getType());
+
+  return std::min(StoreAlign, LoadAlign);
+}
+
 
 /// Represents a range of memset'd bytes with the ByteVal value.
 /// This allows us to analyze stores like:
@@ -131,14 +144,18 @@
 /// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
 /// two ranges into [0, 3) which is memset'able.
 namespace {
-struct MemsetRange {
+struct MemIntrinsicRange {
   // Start/End - A semi range that describes the span that this range covers.
   // The range is closed at the start and open at the end: [Start, End).
   int64_t Start, End;
 
-  /// StartPtr - The getelementptr instruction that points to the start of the
-  /// range.
-  Value *StartPtr;
+  /// DestStartPtr - The getelementptr instruction that points to the start of
+  /// the range being stored to.
+  Value *DestStartPtr;
+
+  /// SrcStartPtr - The getelementptr instruction that points to the start of
+  /// the range being read from.
+  Value *SrcStartPtr;
 
   /// Alignment - The known alignment of the first store.
   unsigned Alignment;
@@ -146,21 +163,22 @@
   /// TheStores - The actual stores that make up this range.
   SmallVector<Instruction*, 16> TheStores;
 
-  bool isProfitableToUseMemset(const DataLayout &DL) const;
+  bool isProfitableToUseMemIntrinsic(const DataLayout &DL) const;
 };
 } // end anon namespace
 
-bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
-  // If we found more than 4 stores to merge or 16 bytes, use memset.
+bool MemIntrinsicRange::isProfitableToUseMemIntrinsic(
+    const DataLayout &DL) const {
+  // If we found more than 4 stores to merge or 16 bytes, use a mem intrinsic.
   if (TheStores.size() >= 4 || End-Start >= 16) return true;
 
   // If there is nothing to merge, don't do anything.
   if (TheStores.size() < 2) return false;
 
-  // If any of the stores are a memset, then it is always good to extend the
-  // memset.
+  // If any of the stores are already a mem intrinsic, then it is always good
+  // to extend it.
   for (Instruction *SI : TheStores)
-    if (!isa<StoreInst>(SI))
+    if (isa<MemIntrinsic>(SI))
       return true;
 
   // Assume that the code generator is capable of merging pairs of stores
@@ -194,15 +212,15 @@
 namespace {
-class MemsetRanges {
+class MemIntrinsicRanges {
   /// A sorted list of the memset ranges.
-  SmallVector<MemsetRange, 8> Ranges;
-  typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
+  SmallVector<MemIntrinsicRange, 8> Ranges;
+  typedef SmallVectorImpl<MemIntrinsicRange>::iterator range_iterator;
   const DataLayout &DL;
 
 public:
-  MemsetRanges(const DataLayout &DL) : DL(DL) {}
+  MemIntrinsicRanges(const DataLayout &DL) : DL(DL) {}
 
-  typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator;
+  typedef SmallVectorImpl<MemIntrinsicRange>::const_iterator const_iterator;
   const_iterator begin() const { return Ranges.begin(); }
   const_iterator end() const { return Ranges.end(); }
   bool empty() const { return Ranges.empty(); }
@@ -216,17 +234,35 @@
   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
     int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+    unsigned Alignment =
+        SI->getAlignment()
+            ? SI->getAlignment()
+            : DL.getABITypeAlignment(SI->getValueOperand()->getType());
 
     addRange(OffsetFromFirst, StoreSize,
-             SI->getPointerOperand(), SI->getAlignment(), SI);
+             SI->getPointerOperand(), nullptr, Alignment, SI);
+  }
+
+  void addLoadStore(int64_t OffsetFromFirst, LoadInst *LI, StoreInst *SI) {
+    int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+
+    addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
+             LI->getPointerOperand(), findCommonAlignment(DL, SI, LI), SI);
   }
 
   void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
     int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
-    addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+    addRange(OffsetFromFirst, Size, MSI->getDest(), nullptr,
+             MSI->getAlignment(), MSI);
   }
 
-  void addRange(int64_t Start, int64_t Size, Value *Ptr,
+  void addMemTransfer(int64_t OffsetFromFirst, MemTransferInst *MTI) {
+    int64_t Size = cast<ConstantInt>(MTI->getLength())->getZExtValue();
+    addRange(OffsetFromFirst, Size, MTI->getDest(), MTI->getSource(),
+             MTI->getAlignment(), MTI);
+  }
+
+  void addRange(int64_t Start, int64_t Size, Value *DestPtr, Value *SrcPtr,
                 unsigned Alignment, Instruction *Inst);
 };
 
@@ -234,24 +270,26 @@
 } // end anon namespace
 
-/// Add a new store to the MemsetRanges data structure. This adds a
+/// Add a new store to the MemIntrinsicRanges data structure. This adds a
 /// new range for the specified store at the specified offset, merging into
 /// existing ranges as appropriate.
-void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
-                            unsigned Alignment, Instruction *Inst) {
+void MemIntrinsicRanges::addRange(int64_t Start, int64_t Size, Value *DestPtr,
+                                  Value *SrcPtr, unsigned Alignment,
+                                  Instruction *Inst) {
   int64_t End = Start+Size;
 
   range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start,
-    [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });
+    [](const MemIntrinsicRange &LHS, int64_t RHS) { return LHS.End < RHS; });
 
   // We now know that I == E, in which case we didn't find anything to merge
   // with, or that Start <= I->End. If End < I->Start or I == E, then we need
   // to insert a new range. Handle this now.
   if (I == Ranges.end() || End < I->Start) {
-    MemsetRange &R = *Ranges.insert(I, MemsetRange());
+    MemIntrinsicRange &R = *Ranges.insert(I, MemIntrinsicRange());
     R.Start = Start;
     R.End = End;
-    R.StartPtr = Ptr;
+    R.DestStartPtr = DestPtr;
+    R.SrcStartPtr = SrcPtr;
     R.Alignment = Alignment;
     R.TheStores.push_back(Inst);
     return;
@@ -273,7 +311,8 @@
   // stopped on *it*.
   if (Start < I->Start) {
     I->Start = Start;
-    I->StartPtr = Ptr;
+    I->DestStartPtr = DestPtr;
+    I->SrcStartPtr = SrcPtr;
     I->Alignment = Alignment;
   }
 
@@ -369,7 +408,7 @@
   // all subsequent stores of the same value to offset from the same pointer.
   // Join these together into ranges, so we can decide whether contiguous blocks
   // are stored.
-  MemsetRanges Ranges(DL);
+  MemIntrinsicRanges Ranges(DL);
 
   BasicBlock::iterator BI(StartInst);
   for (++BI; !isa<TerminatorInst>(BI); ++BI) {
@@ -431,28 +470,22 @@
   // Now that we have full information about ranges, loop over the ranges and
   // emit memset's for anything big enough to be worthwhile.
   Instruction *AMemSet = nullptr;
-  for (const MemsetRange &Range : Ranges) {
+  for (const MemIntrinsicRange &Range : Ranges) {
 
     if (Range.TheStores.size() == 1) continue;
 
     // If it is profitable to lower this range to memset, do so now.
-    if (!Range.isProfitableToUseMemset(DL))
+    if (!Range.isProfitableToUseMemIntrinsic(DL))
       continue;
 
     // Otherwise, we do want to transform this! Create a new memset.
     // Get the starting pointer of the block.
-    StartPtr = Range.StartPtr;
-
-    // Determine alignment
+    StartPtr = Range.DestStartPtr;
     unsigned Alignment = Range.Alignment;
-    if (Alignment == 0) {
-      Type *EltType =
-        cast<PointerType>(StartPtr->getType())->getElementType();
-      Alignment = DL.getABITypeAlignment(EltType);
-    }
+    assert(!Range.SrcStartPtr && "memset containing transfer instruction?");
 
-    AMemSet =
-      Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+    AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
+                                   Alignment);
 
     DEBUG(dbgs() << "Replace stores:\n";
           for (Instruction *SI : Range.TheStores)
@@ -473,16 +506,149 @@
   return AMemSet;
 }
 
-static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
-                                    const LoadInst *LI) {
-  unsigned StoreAlign = SI->getAlignment();
-  if (!StoreAlign)
-    StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
-  unsigned LoadAlign = LI->getAlignment();
-  if (!LoadAlign)
-    LoadAlign = DL.getABITypeAlignment(LI->getType());
+/// When scanning forward over instructions, we look for some other patterns to
+/// fold away. In particular, this looks for loads and stores that copy
+/// neighboring locations of memory. If it sees enough consecutive ones, it
+/// attempts to merge them together into a memcpy.
+Instruction *MemCpyOptPass::tryMergingIntoMemcpy(Instruction *StartInst,
+                                                 Value *StartDestPtr,
+                                                 Value *StartSrcPtr) {
+  const DataLayout &DL = StartInst->getModule()->getDataLayout();
+  AliasAnalysis &AA = LookupAliasAnalysis();
 
-  return std::min(StoreAlign, LoadAlign);
+  // We have a candidate load/store pair (or memcpy). Scan forward to find
+  // subsequent loads and stores that copy adjacent offsets from the same
+  // source pointer to the same destination pointer. Join these together into
+  // ranges, so we can decide whether contiguous blocks are copied.
+  MemIntrinsicRanges Ranges(DL);
+
+  BasicBlock::iterator BI(StartInst);
+  LoadInst *NextLoad = nullptr;
+  for (; !isa<TerminatorInst>(BI); ++BI) {
+    if (!isa<LoadInst>(BI) && !isa<StoreInst>(BI) &&
+        !isa<MemTransferInst>(BI)) {
+      // If the instruction is readnone, ignore it, otherwise bail out.  We
+      // don't even allow readonly here because we don't want something like:
+      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+        break;
+      continue;
+    }
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(BI)) {
+      if (NextLoad || !LI->isSimple() || !LI->hasOneUse())
+        break;
+      NextLoad = LI;
+    } else if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+      // If this is a store, see if we can merge it in.
+      if (!NextLoad || NextLoad != NextStore->getValueOperand() ||
+          !NextStore->isSimple())
+        break;
+
+      // Check to see if this store is to a constant offset from the start ptr.
+      int64_t DestOffset;
+      if (!IsPointerOffset(StartDestPtr, NextStore->getPointerOperand(),
+                           DestOffset, DL))
+        break;
+
+      int64_t SrcOffset;
+      if (!IsPointerOffset(StartSrcPtr, NextLoad->getPointerOperand(),
+                           SrcOffset, DL))
+        break;
+
+      if (DestOffset != SrcOffset)
+        break;
+
+      Ranges.addLoadStore(DestOffset, NextLoad, NextStore);
+      NextLoad = nullptr;
+    } else {
+      MemTransferInst *MTI = cast<MemTransferInst>(BI);
+
+      if (NextLoad || MTI->isVolatile() || !isa<ConstantInt>(MTI->getLength()))
+        break;
+
+      // Check to see if this transfer is at a constant offset from the start ptr.
+      int64_t DestOffset;
+      if (!IsPointerOffset(StartDestPtr, MTI->getDest(), DestOffset, DL))
+        break;
+
+      int64_t SrcOffset;
+      if (!IsPointerOffset(StartSrcPtr, MTI->getSource(), SrcOffset, DL))
+        break;
+
+      if (SrcOffset != DestOffset)
+        break;
+
+      Ranges.addMemTransfer(SrcOffset, MTI);
+    }
+  }
+
+  // If we have no ranges, then we just had a single store with nothing that
+  // could be merged in. This is a very common case of course.
+  if (Ranges.empty())
+    return nullptr;
+
+  // If we create any memcpys, we put them right before the first instruction
+  // that isn't part of the merged block. This ensures that the memcpy is
+  // dominated by any addressing instruction needed by the start of the block.
+  IRBuilder<> Builder(&*BI);
+
+  // Now that we have full information about ranges, loop over the ranges and
+  // emit memcpys for anything big enough to be worthwhile.
+  Instruction *AMemCpy = nullptr;
+  for (const MemIntrinsicRange &Range : Ranges) {
+
+    if (Range.TheStores.size() == 1) continue;
+
+    // If it is profitable to lower this range to memcpy, do so now.
+    if (!Range.isProfitableToUseMemIntrinsic(DL))
+      continue;
+
+    // Otherwise, we do want to transform this! Create a new memcpy.
+    // Get the starting pointers of the block.
+    Value *DestStartPtr = Range.DestStartPtr;
+    Value *SrcStartPtr = Range.SrcStartPtr;
+    unsigned Alignment = Range.Alignment;
+
+    // We don't keep track of load/store pairs well enough to determine whether
+    // a memmove is permitted for possibly-aliasing addresses (both order and
+    // duplicates matter in that case, possibly in ways only determined
+    // dynamically).
+    uint64_t Size = Range.End - Range.Start;
+    if (!AA.isNoAlias(MemoryLocation(DestStartPtr, Size),
+                      MemoryLocation(SrcStartPtr, Size)))
+      continue;
+
+    AMemCpy = Builder.CreateMemCpy(DestStartPtr, SrcStartPtr, Size, Alignment);
+
+    DEBUG(dbgs() << "Replace load/stores:\n";
+          for (Instruction *I : Range.TheStores) {
+            if (StoreInst *SI = dyn_cast<StoreInst>(I))
+              dbgs() << *SI->getValueOperand() << '\n';
+            dbgs() << *I << '\n';
+          }
+          dbgs() << "With: " << *AMemCpy << '\n');
+
+    if (!Range.TheStores.empty())
+      AMemCpy->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
+    // Zap all the excess operations.
+    for (Instruction *I : Range.TheStores) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        auto LI = cast<LoadInst>(SI->getValueOperand());
+        MD->removeInstruction(LI);
+        MD->removeInstruction(SI);
+        SI->eraseFromParent();
+        LI->eraseFromParent();
+      } else {
+        MD->removeInstruction(I);
+        I->eraseFromParent();
+      }
+    }
+    ++NumMemCpyInfer;
+  }
+
+  return AMemCpy;
 }
 
 // This method try to lift a store instruction before position P.
@@ -653,6 +819,10 @@
           BBI = M->getIterator();
           return true;
         }
+      } else if (Instruction *I = tryMergingIntoMemcpy(
+                     LI, SI->getPointerOperand(), LI->getPointerOperand())) {
+        BBI = I->getIterator();
+        return true;
       }
 
       // Detect cases where we're performing call slot forwarding, but
@@ -1132,7 +1302,7 @@
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
   // We can only optimize non-volatile memcpy's.
   if (M->isVolatile()) return false;
 
@@ -1225,6 +1395,9 @@
     return true;
   }
 
+  if (auto I = tryMergingIntoMemcpy(M, M->getDest(), M->getSource()))
+    BBI = I->getIterator();
+
   return false;
 }
 
@@ -1346,7 +1519,7 @@
       else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
         RepeatInstruction = processMemSet(M, BI);
       else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
-        RepeatInstruction = processMemCpy(M);
+        RepeatInstruction = processMemCpy(M, BI);
      else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
        RepeatInstruction = processMemMove(M);
      else if (auto CS = CallSite(I)) {
Index: test/Transforms/MemCpyOpt/form-memcpy.ll
===================================================================
--- /dev/null
+++ test/Transforms/MemCpyOpt/form-memcpy.ll
@@ -0,0 +1,326 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+
+define void @test_simple_memcpy(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_simple_memcpy
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_simple_memmove(i32* %dst, i32* %src) {
+; CHECK-LABEL: @test_simple_memmove
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+; Make sure we can handle calculating bases & offsets from a real memcpy.
+define void @test_initial_memcpy(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_initial_memcpy
+; CHECK: {{%.*}} = bitcast i32* %dst to i8*
+; CHECK: {{%.*}} = bitcast i32* %src to i8*
+; CHECK: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+
+  %dst.0 = bitcast i32* %dst to i8*
+  %src.0 = bitcast i32* %src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.0, i8* %src.0, i64 4, i32 4, i1 false)
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_volatile_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_volatile_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load volatile i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_atomic_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_atomic_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store atomic i32 %val.1, i32* %dst.1 unordered, align 4
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define i32 @test_multi_use_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_multi_use_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret i32 %val.1
+}
+
+define void @test_side_effect_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_side_effect_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  call void asm sideeffect "", ""()
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_holes_handled(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_holes_handled
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst.7 to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src.7 to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+
+  %src.7 = getelementptr i32, i32* %src, i32 7
+  %dst.7 = getelementptr i32, i32* %dst, i32 7
+  %val.7 = load i32, i32* %src.7
+  store i32 %val.7, i32* %dst.7
+
+  %src.9 = getelementptr i32, i32* %src, i32 9
+  %dst.9 = getelementptr i32, i32* %dst, i32 9
+  %val.9 = load i32, i32* %src.9
+  store i32 %val.9, i32* %dst.9
+
+  %src.10 = getelementptr i32, i32* %src, i32 10
+  %dst.10 = getelementptr i32, i32* %dst, i32 10
+  %val.10 = load i32, i32* %src.10
+  store i32 %val.10, i32* %dst.10
+
+  %src.8 = getelementptr i32, i32* %src, i32 8
+  %dst.8 = getelementptr i32, i32* %dst, i32 8
+  %val.8 = load i32, i32* %src.8
+  store i32 %val.8, i32* %dst.8
+
+  ret void
+}
+
+define void @test_offset_mismatch(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_offset_mismatch
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 1
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 2
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_non_idempotent_ops(i8* %dst, i8* %src) {
+; CHECK-LABEL: @test_non_idempotent_ops
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i8, i8* %src
+  store i8 %val.0, i8* %dst
+
+  %src.2 = getelementptr i8, i8* %src, i8 2
+  %dst.2 = getelementptr i8, i8* %dst, i8 2
+  %val.2 = load i8, i8* %src.2
+  store i8 %val.2, i8* %dst.2
+
+  %val.0.dup = load i8, i8* %src
+  store i8 %val.0.dup, i8* %dst
+
+  %src.1 = getelementptr i8, i8* %src, i8 1
+  %dst.1 = getelementptr i8, i8* %dst, i8 1
+  %val.1 = load i8, i8* %src.1
+  store i8 %val.1, i8* %dst.1
+
+  %src.3 = getelementptr i8, i8* %src, i8 3
+  %dst.3 = getelementptr i8, i8* %dst, i8 3
+  %val.3 = load i8, i8* %src.3
+  store i8 %val.3, i8* %dst.3
+
+  ret void
+}
+
+define void @test_intervening_op(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_intervening_op
+; CHECK-NOT: call void @llvm.memcpy
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %src16.2 = bitcast i32* %src.2 to i16*
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val16.2 = load i16, i16* %src16.2
+  %val.2 = sext i16 %val16.2 to i32
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
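Illustration (editor-added, not part of the patch; the %dst.i8/%src.i8 names below are hypothetical stand-ins for the bitcasts the pass inserts). The new tryMergingIntoMemcpy walk is intended to take a run of adjacent load/store pairs like the one in @test_simple_memcpy above:

  %val.0 = load i32, i32* %src
  store i32 %val.0, i32* %dst
  %src.1 = getelementptr i32, i32* %src, i32 1
  %dst.1 = getelementptr i32, i32* %dst, i32 1
  %val.1 = load i32, i32* %src.1
  store i32 %val.1, i32* %dst.1
  ; ... likewise for offsets 2 and 3 ...

and, once the range is profitable and alias analysis proves the 16-byte source and destination spans cannot overlap (hence the noalias arguments in the positive tests), replace it with a single call:

  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.i8, i8* %src.i8, i64 16, i32 4, i1 false)

The new test file can be exercised on its own using the RUN line it carries, e.g.:

  opt < test/Transforms/MemCpyOpt/form-memcpy.ll -memcpyopt -S | FileCheck test/Transforms/MemCpyOpt/form-memcpy.ll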