Index: lib/Analysis/MemoryDependenceAnalysis.cpp
===================================================================
--- lib/Analysis/MemoryDependenceAnalysis.cpp
+++ lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -154,6 +154,12 @@
     return ModRefInfo::Mod;
   }
 
+  if (const MemSetInst *MI = dyn_cast<MemSetInst>(Inst)) {
+    Loc = MemoryLocation::getForDest(MI);
+    // Conservatively assume ModRef for volatile memset.
+    return MI->isVolatile() ? ModRefInfo::ModRef : ModRefInfo::Mod;
+  }
+
   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
     switch (II->getIntrinsicID()) {
     case Intrinsic::lifetime_start:
Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1144,6 +1144,21 @@
   return true;
 }
 
+/// Determine whether the instruction has undefined content for the given Size,
+/// either because it was freshly alloca'd or started its lifetime.
+static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
+  if (isa<AllocaInst>(I))
+    return true;
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+        if (LTSize->getZExtValue() >= Size->getZExtValue())
+          return true;
+
+  return false;
+}
+
 /// Transform memcpy to memset when its source was just memset.
 /// In other words, turn:
 /// \code
@@ -1167,12 +1182,23 @@
   if (!AA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
     return false;
 
-  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+  // A known memset size is required.
   ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
+  if (!MemSetSize)
+    return false;
+
   // Make sure the memcpy doesn't read any more than what the memset wrote.
   // Don't worry about sizes larger than i64.
-  if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
-    return false;
+  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+  if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) {
+    // If the memcpy is larger than the memset, but the memory was undef prior
+    // to the memset, we can just ignore the tail.
+    MemDepResult DepInfo = MD->getDependency(MemSet);
+    if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
+      CopySize = MemSetSize;
+    else
+      return false;
+  }
 
   IRBuilder<> Builder(MemCpy);
   Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
@@ -1252,19 +1278,7 @@
     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep);
   } else if (SrcDepInfo.isDef()) {
-    Instruction *I = SrcDepInfo.getInst();
-    bool hasUndefContents = false;
-
-    if (isa<AllocaInst>(I)) {
-      hasUndefContents = true;
-    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      if (II->getIntrinsicID() == Intrinsic::lifetime_start)
-        if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
-          if (LTSize->getZExtValue() >= CopySize->getZExtValue())
-            hasUndefContents = true;
-    }
-
-    if (hasUndefContents) {
+    if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
       MD->removeInstruction(M);
       M->eraseFromParent();
       ++NumMemCpyInstr;
Index: test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
===================================================================
--- /dev/null
+++ test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -memcpyopt -S %s | FileCheck %s
+
+; memset -> memcpy forwarding, if memcpy is larger than memset, but trailing
+; bytes are known to be undef.
+
+
+%T = type { i64, i32, i32 }
+
+define void @test_alloca(i8* %result) {
+; CHECK-LABEL: @test_alloca(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+define void @test_alloca_with_lifetimes(i8* %result) {
+; CHECK-LABEL: @test_alloca_with_lifetimes(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* [[B]])
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* [[B]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %b)
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %b)
+  ret void
+}
+
+define void @test_malloc_with_lifetimes(i8* %result) {
+; CHECK-LABEL: @test_malloc_with_lifetimes(
+; CHECK-NEXT:    [[A:%.*]] = call i8* @malloc(i64 16)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* [[A]])
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[A]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* [[A]])
+; CHECK-NEXT:    call void @free(i8* [[A]])
+; CHECK-NEXT:    ret void
+;
+  %a = call i8* @malloc(i64 16)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %a)
+  call void @llvm.memset.p0i8.i64(i8* align 8 %a, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %a, i64 16, i1 false)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %a)
+  call void @free(i8* %a)
+  ret void
+}
+
+; memcpy size is larger than lifetime, don't optimize.
+define void @test_copy_larger_than_lifetime_size(i8* %result) {
+; CHECK-LABEL: @test_copy_larger_than_lifetime_size(
+; CHECK-NEXT:    [[A:%.*]] = call i8* @malloc(i64 16)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 12, i8* [[A]])
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[A]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[A]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 12, i8* [[A]])
+; CHECK-NEXT:    call void @free(i8* [[A]])
+; CHECK-NEXT:    ret void
+;
+  %a = call i8* @malloc(i64 16)
+  call void @llvm.lifetime.start.p0i8(i64 12, i8* %a)
+  call void @llvm.memset.p0i8.i64(i8* align 8 %a, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %a, i64 16, i1 false)
+  call void @llvm.lifetime.end.p0i8(i64 12, i8* %a)
+  call void @free(i8* %a)
+  ret void
+}
+
+; The trailing bytes are not known to be undef, we can't ignore them.
+define void @test_not_undef_memory(i8* %result, i8* %input) {
+; CHECK-LABEL: @test_not_undef_memory(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[INPUT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[INPUT]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 8 %input, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %input, i64 16, i1 false)
+  ret void
+}
+
+; Memset is volatile, memcpy is not. Can be optimized.
+define void @test_volatile_memset(i8* %result) {
+; CHECK-LABEL: @test_volatile_memset(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 true)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 true)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+; Memcpy is volatile, memset is not. Cannot be optimized.
+define void @test_volatile_memcpy(i8* %result) {
+; CHECK-LABEL: @test_volatile_memcpy(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 true)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 true)
+  ret void
+}
+
+; Write between memset and memcpy, can't optimize.
+define void @test_write_between(i8* %result) {
+; CHECK-LABEL: @test_write_between(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    store i8 -1, i8* [[B]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false)
+  store i8 -1, i8* %b
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
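
For reference, the new code path in processMemSetMemCpyDependence extends the
existing memset->memcpy forwarding to copies that read past the end of the
memset. A sketch in the style of the function's \code doc comment (the names
dst1/dst2/dst1_size/dst2_size are illustrative, not taken from the patch):

  memset(dst1, c, dst1_size);
  memcpy(dst2, dst1, dst2_size);   // dst2_size > dst1_size

becomes, when the bytes of dst1 past dst1_size are known to be undef (dst1 is
a fresh alloca, or the memset's defining dependency is a lifetime.start of at
least dst2_size bytes, per hasUndefContents):

  memset(dst1, c, dst1_size);
  memset(dst2, c, dst1_size);      // CopySize clamped to MemSetSize; the
                                   // undef tail of the copy is dropped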