Index: llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -346,6 +346,7 @@
                                   uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
     bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                        uint64_t MSize);
+    bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
     bool processByValArgument(CallSite CS, unsigned ArgNo);
     Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                       Value *ByteVal);
@@ -839,6 +840,53 @@
   return true;
 }
 
+/// We've found that the (upward scanning) memory dependence of \p MemCpy is
+/// \p MemSet.  Try to simplify \p MemSet to only set the trailing bytes that
+/// weren't copied over by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+///   memset(dst, c, dst_size);
+///   memcpy(dst, src, src_size);
+/// \endcode
+/// into:
+/// \code
+///   memcpy(dst, src, src_size);
+///   memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+/// \endcode
+bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+                                              MemSetInst *MemSet) {
+  // We can only transform memset/memcpy with the same destination.
+  if (MemSet->getDest() != MemCpy->getDest())
+    return false;
+
+  Value *Dest = MemSet->getDest();
+  Value *DestSize = MemSet->getLength();
+  Value *SrcSize = MemCpy->getLength();
+
+  // By default, create an unaligned memset.
+  unsigned Align = 1;
+  // If Dest is aligned, and SrcSize is constant, use the minimum alignment
+  // of the sum.
+  const unsigned DestAlign =
+      std::max(MemSet->getAlignment(), MemCpy->getAlignment());
+  if (DestAlign > 1)
+    if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+      Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
+
+  IRBuilder<> Builder(MemCpy->getNextNode());
+
+  Value *MemsetLen =
+      Builder.CreateSelect(Builder.CreateICmpULE(DestSize, SrcSize),
+                           ConstantInt::getNullValue(DestSize->getType()),
+                           Builder.CreateSub(DestSize, SrcSize));
+  Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1),
+                       MemsetLen, Align);
+
+  MD->removeInstruction(MemSet);
+  MemSet->eraseFromParent();
+  return true;
+}
 
 /// processMemCpy - perform simplification of memcpy's.  If we have memcpy A
 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
@@ -869,6 +917,17 @@
     return true;
   }
 
+  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
+  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
+                                                         M, M->getParent());
+
+  // Try to turn a partially redundant memset + memcpy into
+  // memcpy + smaller memset.  We don't need the memcpy size for this.
+  if (SrcDepInfo.isClobber())
+    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+      if (processMemSetMemCpyDependence(M, MDep))
+        return true;
+
   // The optimizations after this point require the memcpy size.
   ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
   if (!CopySize) return false;
@@ -892,9 +951,6 @@
     }
   }
 
-  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
-  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
-                                                         M, M->getParent());
   if (SrcDepInfo.isClobber()) {
     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
Index: llvm/trunk/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll
===================================================================
--- llvm/trunk/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll
+++ llvm/trunk/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll
@@ -0,0 +1,54 @@
+; RUN: opt -memcpyopt -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: define void @test
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i32 1, i1 false)
+; CHECK-DAG: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i64 %src_size
+; CHECK-DAG: [[ULE:%[0-9]+]] = icmp ule i64 %dst_size, %src_size
+; CHECK-DAG: [[SIZEDIFF:%[0-9]+]] = sub i64 %dst_size, %src_size
+; CHECK-DAG: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i64 0, i64 [[SIZEDIFF]]
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[DST]], i8 0, i64 [[SIZE]], i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test(i8* %src, i64 %src_size, i8* %dst, i64 %dst_size) {
+  call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_align_same
+; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 {{.*}}, i32 8, i1 false)
+define void @test_align_same(i8* %src, i8* %dst, i64 %dst_size) {
+  call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 80, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_align_min
+; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 {{.*}}, i32 4, i1 false)
+define void @test_align_min(i8* %src, i8* %dst, i64 %dst_size) {
+  call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 36, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_align_memcpy
+; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 {{.*}}, i32 8, i1 false)
+define void @test_align_memcpy(i8* %src, i8* %dst, i64 %dst_size) {
+  call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 80, i32 8, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_different_dst
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i32 1, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %src, i64 %src_size, i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test_different_dst(i8* %dst2, i8* %src, i64 %src_size, i8* %dst, i64 %dst_size) {
+  call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %src, i64 %src_size, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
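
For reference, the rewrite performed by processMemSetMemCpyDependence corresponds to the source-level transformation below. This is a minimal C++ sketch, not part of the patch: the function and variable names are illustrative, the fill byte is fixed to 0 to mirror the test, and it assumes both calls are well defined (dst can hold max(dst_size, src_size) bytes and src has src_size readable bytes).

    #include <cstddef>
    #include <cstring>

    // Original pattern: clear the whole destination, then overwrite a prefix.
    void before(char *dst, std::size_t dst_size, const char *src,
                std::size_t src_size) {
      std::memset(dst, 0, dst_size);
      std::memcpy(dst, src, src_size);
    }

    // Pattern after the rewrite: copy first, then clear only the trailing
    // bytes the copy did not touch.  When src_size >= dst_size the computed
    // tail length is 0, so the trailing memset writes nothing.
    void after(char *dst, std::size_t dst_size, const char *src,
               std::size_t src_size) {
      std::memcpy(dst, src, src_size);
      std::size_t tail = dst_size <= src_size ? 0 : dst_size - src_size;
      std::memset(dst + src_size, 0, tail);
    }

The tail computation is exactly what the pass materializes in IR: an icmp ule on the two sizes, a sub, and a select feeding the new, possibly zero-length memset, which is what the CHECK-DAG lines in memset-memcpy-redundant-memset.ll verify.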