Index: llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -346,6 +346,7 @@
                               uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
     bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
     bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
+    bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
     bool processByValArgument(CallSite CS, unsigned ArgNo);
     Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                       Value *ByteVal);
@@ -896,6 +897,39 @@
   return true;
 }
 
+/// Transform memcpy to memset when its source was just memset.
+/// In other words, turn:
+/// \code
+///   memset(dst1, c, dst1_size);
+///   memcpy(dst2, dst1, dst2_size);
+/// \endcode
+/// into:
+/// \code
+///   memset(dst1, c, dst1_size);
+///   memset(dst2, c, dst2_size);
+/// \endcode
+/// When dst2_size <= dst1_size.
+///
+/// The \p MemCpy must have a Constant length.
+bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
+                                           MemSetInst *MemSet) {
+  // This only makes sense on memcpy(..., memset(...), ...).
+  if (MemSet->getRawDest() != MemCpy->getRawSource())
+    return false;
+
+  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+  ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
+  // Make sure the memcpy doesn't read any more than what the memset wrote.
+  // Don't worry about sizes larger than i64.
+  if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
+    return false;
+
+  IRBuilder<> Builder(MemCpy->getNextNode());
+  Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
+                       CopySize, MemCpy->getAlignment());
+  return true;
+}
+
 /// processMemCpy - perform simplification of memcpy's. If we have memcpy A
 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
@@ -938,12 +972,13 @@
   ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
   if (!CopySize) return false;
 
-  // The are three possible optimizations we can do for memcpy:
+  // There are four possible optimizations we can do for memcpy:
   //   a) memcpy-memcpy xform which exposes redundance for DSE.
   //   b) call-memcpy xform for return slot optimization.
   //   c) memcpy from freshly alloca'd space or space that has just started its
   //      lifetime copies undefined data, and we can therefore eliminate the
   //      memcpy in favor of the data that was already at the destination.
+  //   d) memcpy from a just-memset'd source can be turned into memset.
   if (DepInfo.isClobber()) {
     if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
       if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
@@ -984,6 +1019,15 @@
     }
   }
 
+  if (SrcDepInfo.isClobber())
+    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+      if (performMemCpyToMemSetOptzn(M, MDep)) {
+        MD->removeInstruction(M);
+        M->eraseFromParent();
+        ++NumCpyToSet;
+        return true;
+      }
+
   return false;
 }
 
Index: llvm/trunk/test/Transforms/MemCpyOpt/callslot_deref.ll
===================================================================
--- llvm/trunk/test/Transforms/MemCpyOpt/callslot_deref.ll
+++ llvm/trunk/test/Transforms/MemCpyOpt/callslot_deref.ll
@@ -17,10 +17,11 @@
 }
 
 ; memset touch more bytes than those guaranteed to be dereferenceable
+; We can't remove the memcpy, but we can turn it into an independent memset.
 define void @must_not_remove_memcpy(i8* noalias nocapture dereferenceable(1024) %dst) {
 ; CHECK-LABEL: @must_not_remove_memcpy(
 ; CHECK: call void @llvm.memset.p0i8.i64
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+; CHECK: call void @llvm.memset.p0i8.i64
   %src = alloca [4096 x i8], align 1
   %p = getelementptr inbounds [4096 x i8], [4096 x i8]* %src, i64 0, i64 0
   call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 4096, i32 1, i1 false)
Index: llvm/trunk/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
===================================================================
--- llvm/trunk/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
+++ llvm/trunk/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
@@ -40,7 +40,7 @@
 ; CHECK: %[[a_cast:[^=]+]] = bitcast [8 x i64]* %[[a]] to i8*
 ; CHECK: call void @llvm.memset.p0i8.i64(i8* %[[a_cast]], i8 0, i64 64
 ; CHECK: %[[sret_cast:[^=]+]] = bitcast [8 x i64]* %sret to i8*
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[sret_cast]], i8* %[[a_cast]], i64 64
+; CHECK: call void @llvm.memset.p0i8.i64(i8* %[[sret_cast]], i8 0, i64 64
 ; CHECK: call void @llvm.memset.p0i8.i64(i8* %[[a_cast]], i8 42, i64 32
 ; CHECK: %[[out_cast:[^=]+]] = bitcast [8 x i64]* %out to i8*
 ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[out_cast]], i8* %[[a_cast]], i64 64
Index: llvm/trunk/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
===================================================================
--- llvm/trunk/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
+++ llvm/trunk/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
@@ -0,0 +1,101 @@
+; RUN: opt -memcpyopt -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: define void @test(
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 128, i32 8, i1 false)
+; CHECK-NEXT: ret void
+define void @test(i8* %dst1, i8* %dst2, i8 %c) {
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 8, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_smaller_memcpy(
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 100, i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test_smaller_memcpy(i8* %dst1, i8* %dst2, i8 %c) {
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 100, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_smaller_memset(
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 100, i32 1, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test_smaller_memset(i8* %dst1, i8* %dst2, i8 %c) {
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 100, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_align_memset(
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 8, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 128, i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test_align_memset(i8* %dst1, i8* %dst2, i8 %c) {
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_different_types(
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 8, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %dst2, i8 %c, i32 100, i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test_different_types(i8* %dst1, i8* %dst2, i8 %c) {
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst2, i8* %dst1, i32 100, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_different_types_2(
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %dst1, i8 %c, i32 128, i32 8, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 100, i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test_different_types_2(i8* %dst1, i8* %dst2, i8 %c) {
+  call void @llvm.memset.p0i8.i32(i8* %dst1, i8 %c, i32 128, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 100, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_different_source_gep(
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
+; CHECK-NEXT: %p = getelementptr i8, i8* %dst1, i64 64
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %p, i64 64, i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test_different_source_gep(i8* %dst1, i8* %dst2, i8 %c) {
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
+  ; FIXME: We could optimize this as well.
+  %p = getelementptr i8, i8* %dst1, i64 64
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %p, i64 64, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_variable_size_1(
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i32 1, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test_variable_size_1(i8* %dst1, i64 %dst1_size, i8* %dst2, i8 %c) {
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_variable_size_2(
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 %dst2_size, i32 1, i1 false)
+; CHECK-NEXT: ret void
+define void @test_variable_size_2(i8* %dst1, i8* %dst2, i64 %dst2_size, i8 %c) {
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 %dst2_size, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)