Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1144,6 +1144,21 @@
   return true;
 }
 
+/// Determine whether the instruction has undefined content for the given Size,
+/// either because it was freshly alloca'd or started its lifetime.
+static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
+  if (isa<AllocaInst>(I))
+    return true;
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+        if (LTSize->getZExtValue() >= Size->getZExtValue())
+          return true;
+
+  return false;
+}
+
 /// Transform memcpy to memset when its source was just memset.
 /// In other words, turn:
 /// \code
@@ -1167,12 +1182,27 @@
   if (!AA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
     return false;
 
-  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+  // A known memset size is required.
   ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
+  if (!MemSetSize)
+    return false;
+
   // Make sure the memcpy doesn't read any more than what the memset wrote.
   // Don't worry about sizes larger than i64.
-  if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
-    return false;
+  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+  if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) {
+    // If the memcpy is larger than the memset, but the memory was undef prior
+    // to the memset, we can just ignore the tail. Technically we're only
+    // interested in the bytes from MemSetSize..CopySize here, but as we can't
+    // easily represent this location, we use the full 0..CopySize range.
+    MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
+    MemDepResult DepInfo = MD->getPointerDependencyFrom(
+        MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
+    if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
+      CopySize = MemSetSize;
+    else
+      return false;
+  }
 
   IRBuilder<> Builder(MemCpy);
   Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
@@ -1252,19 +1282,7 @@
     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep);
   } else if (SrcDepInfo.isDef()) {
-    Instruction *I = SrcDepInfo.getInst();
-    bool hasUndefContents = false;
-
-    if (isa<AllocaInst>(I)) {
-      hasUndefContents = true;
-    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      if (II->getIntrinsicID() == Intrinsic::lifetime_start)
-        if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
-          if (LTSize->getZExtValue() >= CopySize->getZExtValue())
-            hasUndefContents = true;
-    }
-
-    if (hasUndefContents) {
+    if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
       MD->removeInstruction(M);
       M->eraseFromParent();
       ++NumMemCpyInstr;
Index: test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
===================================================================
--- test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
+++ test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
@@ -12,7 +12,7 @@
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
 ; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
 ; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %a = alloca %T, align 8
@@ -28,7 +28,7 @@
 ; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* [[B]])
 ; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* [[B]])
 ; CHECK-NEXT:    ret void
 ;
@@ -46,7 +46,7 @@
 ; CHECK-NEXT:    [[A:%.*]] = call i8* @malloc(i64 16)
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* [[A]])
 ; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[A]], i8 0, i64 12, i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[A]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* [[A]])
 ; CHECK-NEXT:    call void @free(i8* [[A]])
 ; CHECK-NEXT:    ret void
@@ -98,7 +98,7 @@
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
 ; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
 ; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 true)
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %a = alloca %T, align 8
@@ -142,6 +142,67 @@
   ret void
 }
 
+; A write prior to the memset, which is part of the memset region.
+; We could optimize this, but currently don't, because the used memory location is imprecise.
+define void @test_write_before_memset_in_memset_region(i8* %result) {
+; CHECK-LABEL: @test_write_before_memset_in_memset_region(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    store i8 -1, i8* [[B]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  store i8 -1, i8* %b
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+; A write prior to the memset, which is part of the memcpy (but not memset) region.
+; This cannot be optimized.
+define void @test_write_before_memset_in_memcpy_region(i8* %result) {
+; CHECK-LABEL: @test_write_before_memset_in_memcpy_region(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[T]], %T* [[A]], i64 0, i32 2
+; CHECK-NEXT:    store i32 -1, i32* [[C]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  %c = getelementptr inbounds %T, %T* %a, i64 0, i32 2
+  store i32 -1, i32* %c
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+; A write prior to the memset, which is part of both the memset and memcpy regions.
+; This cannot be optimized.
+define void @test_write_before_memset_in_both_regions(i8* %result) {
+; CHECK-LABEL: @test_write_before_memset_in_both_regions(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[T]], %T* [[A]], i64 0, i32 1
+; CHECK-NEXT:    store i32 -1, i32* [[C]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 10, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  %c = getelementptr inbounds %T, %T* %a, i64 0, i32 1
+  store i32 -1, i32* %c
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 10, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
 declare i8* @malloc(i64)
 declare void @free(i8*)
 