Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -348,6 +348,7 @@
                               uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
     bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                        uint64_t MSize);
+    bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
     bool processByValArgument(CallSite CS, unsigned ArgNo);
     Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                       Value *ByteVal);
@@ -834,6 +835,65 @@
   return true;
 }
 
+/// We've found that the (upward scanning) memory dependence of \p MemCpy is
+/// \p MemSet.  Try to simplify \p MemSet to only set the trailing bytes that
+/// weren't copied over by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+///   memset(dest, set_val, dest_size);
+///   memcpy(dest, src, src_size);
+/// \endcode
+/// into:
+/// \code
+///   memcpy(dest, src, src_size);
+///   memset((char *)dest + src_size, set_val,
+///          dest_size < src_size ? 0 : dest_size - src_size);
+/// \endcode
+bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+                                              MemSetInst *MemSet) {
+  // We can only transform memset/memcpy with the same destination.
+  if (MemSet->getDest() != MemCpy->getDest())
+    return false;
+
+  Value *Dest = MemSet->getOperand(0);
+  Value *DestSize = MemSet->getOperand(2);
+  Value *SrcSize = MemCpy->getOperand(2);
+  Value *SetVal = MemSet->getOperand(1);
+
+  // By default, create an unaligned memset.
+  unsigned Align = 1;
+  // If the old memset Dest was aligned, and SrcSize is constant, the new
+  // memset starts at Dest + SrcSize, so use the minimum alignment of that sum.
+  const unsigned DestAlign = MemSet->getAlignment();
+  if (DestAlign > 1)
+    if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+      Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
+
+  IRBuilder<> Builder(MemCpy->getNextNode());
+
+  Value *Zero = ConstantInt::getNullValue(DestSize->getType());
+  Value *MemsetStart = Builder.CreateGEP(Dest, SrcSize);
+  Value *SetValTrunc = Builder.CreateTrunc(SetVal, Builder.getInt8Ty());
+
+  // LenDiff == (dest_size - src_size)
+  Value *LenDiff = Builder.CreateSub(DestSize, SrcSize);
+  // LenDiffNeg == (dest_size - src_size) < 0 == (dest_size < src_size)
+  Value *LenDiffNeg = Builder.CreateICmpSLT(LenDiff, Zero);
+  // MemsetLen == (dest_size < src_size) ? 0 : (dest_size - src_size)
+  Value *MemsetLen = Builder.CreateSelect(LenDiffNeg, Zero, LenDiff);
+
+  CallInst *NewMemSetCall = Builder.CreateMemSet(MemsetStart, SetValTrunc,
+                                                 MemsetLen, Align);
+
+  DEBUG(dbgs() << "MemCpyOpt: Optimizing redundant memset in memset;memcpy: "
+               << *NewMemSetCall << "\n");
+  (void)NewMemSetCall;
+
+  MD->removeInstruction(MemSet);
+  MemSet->eraseFromParent();
+  return true;
+}
+
 /// processMemCpy - perform simplification of memcpy's.  If we have memcpy A
 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
@@ -864,6 +924,17 @@
         return true;
       }
 
+  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
+  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
+                                                         M, M->getParent());
+
+  // Try to turn a partially redundant memset + memcpy into
+  // memcpy + smaller memset.
+  if (SrcDepInfo.isClobber())
+    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+      if (processMemSetMemCpyDependence(M, MDep))
+        return true;
+
   // The optimizations after this point require the memcpy size.
   ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
   if (!CopySize) return false;
@@ -887,9 +958,6 @@
       }
     }
 
-  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
-  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
-                                                         M, M->getParent());
   if (SrcDepInfo.isClobber()) {
     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
Index: test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll
===================================================================
--- /dev/null
+++ test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll
@@ -0,0 +1,37 @@
+; RUN: opt -memcpyopt -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: test
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 %src_size, i32 1, i1 false)
+; CHECK: %1 = getelementptr i8* %dest, i64 %src_size
+; CHECK: %2 = sub i64 %dest_size, %src_size
+; CHECK: %3 = icmp slt i64 %2, 0
+; CHECK: %4 = select i1 %3, i64 0, i64 %2
+; CHECK: call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 %4, i32 1, i1 false)
+; CHECK: ret void
+define void @test(i8* %src, i64 %src_size, i8* %dest, i64 %dest_size) {
+  call void @llvm.memset.p0i8.i64(i8* %dest, i8 0, i64 %dest_size, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 %src_size, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: test_align_same
+; CHECK: call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 %4, i32 8, i1 false)
+define void @test_align_same(i8* %src, i8* %dest, i64 %dest_size) {
+  call void @llvm.memset.p0i8.i64(i8* %dest, i8 0, i64 %dest_size, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 80, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: test_align_min
+; CHECK: call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 %4, i32 4, i1 false)
define void @test_align_min(i8* %src, i8* %dest, i64 %dest_size) {
+  call void @llvm.memset.p0i8.i64(i8* %dest, i8 0, i64 %dest_size, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 36, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
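
Illustration (not part of the patch): a minimal standalone C++ sketch of the source-level equivalence the rewrite relies on. The function names, buffer sizes, and the 0xAB fill value below are arbitrary illustration choices; the clamped tail length mirrors the icmp slt + select emitted by processMemSetMemCpyDependence.

// Sketch only: checks that the rewritten memcpy + trailing memset produces
// the same bytes as the original memset + memcpy, for both the
// trailing-bytes case and the fully-covered case.
#include <cassert>
#include <cstddef>
#include <cstring>

// Original pattern: memset the whole destination, then overwrite a prefix.
static void originalPattern(char *dest, std::size_t dest_size,
                            const char *src, std::size_t src_size) {
  std::memset(dest, 0xAB, dest_size);   // 0xAB is an arbitrary fill value.
  std::memcpy(dest, src, src_size);
}

// Rewritten pattern: copy first, then set only the trailing bytes.  The
// length is clamped to zero when the copy covers the whole destination,
// mirroring the icmp slt + select emitted by the transform.
static void rewrittenPattern(char *dest, std::size_t dest_size,
                             const char *src, std::size_t src_size) {
  std::memcpy(dest, src, src_size);
  std::size_t tail = dest_size < src_size ? 0 : dest_size - src_size;
  std::memset(dest + src_size, 0xAB, tail);
}

int main() {
  const char src[8] = "1234567";
  char a[16], b[16];

  // dest_size > src_size: the memset still has to cover the last 8 bytes.
  originalPattern(a, sizeof(a), src, sizeof(src));
  rewrittenPattern(b, sizeof(b), src, sizeof(src));
  assert(std::memcmp(a, b, sizeof(a)) == 0);

  // dest_size == src_size: the trailing memset length clamps to zero.
  originalPattern(a, sizeof(src), src, sizeof(src));
  rewrittenPattern(b, sizeof(src), src, sizeof(src));
  assert(std::memcmp(a, b, sizeof(src)) == 0);
  return 0;
}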