diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -708,9 +708,11 @@ if (P) { // If we load from memory that may alias the memory we store to, // memmove must be used to preserve semantic. If not, memcpy can - // be used. + // be used. Also, if we load from constant memory, memcpy can be used + // as the constant memory won't be modified. bool UseMemMove = false; - if (!AA->isNoAlias(MemoryLocation::get(SI), LoadLoc)) + if (!AA->isNoAlias(MemoryLocation::get(SI), LoadLoc) && + !AA->pointsToConstantMemory(LoadLoc)) UseMemMove = true; uint64_t Size = DL.getTypeStoreSize(T); @@ -1102,11 +1104,14 @@ } // If the dest of the second might alias the source of the first, then the - // source and dest might overlap. We still want to eliminate the intermediate - // value, but we have to generate a memmove instead of memcpy. + // source and dest might overlap. In addition, if the source of the first + // points to constant memory, they won't overlap by definition. Otherwise, we + // still want to eliminate the intermediate value, but we have to generate a + // memmove instead of memcpy. bool UseMemMove = false; if (!AA->isNoAlias(MemoryLocation::getForDest(M), - MemoryLocation::getForSource(MDep))) + MemoryLocation::getForSource(MDep)) && + !AA->pointsToConstantMemory(MemoryLocation::getForSource(MDep))) UseMemMove = true; // If all checks passed, then we can transform M. @@ -1168,10 +1173,10 @@ // Check that src and dst of the memcpy aren't the same. While memcpy // operands cannot partially overlap, exact equality is allowed. 
- if (!AA->isNoAlias(MemoryLocation(MemCpy->getSource(), - LocationSize::precise(1)), - MemoryLocation(MemCpy->getDest(), - LocationSize::precise(1)))) + if (!AA->isNoAlias( + MemoryLocation(MemCpy->getSource(), LocationSize::precise(1)), + MemoryLocation(MemCpy->getDest(), LocationSize::precise(1))) && + !AA->pointsToConstantMemory(MemoryLocation::getForSource(MemCpy))) return false; if (EnableMemorySSA) { @@ -1560,9 +1565,10 @@ /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed /// not to alias. bool MemCpyOptPass::processMemMove(MemMoveInst *M) { - // See if the pointers alias. + // See if the pointers alias or the source points to constant memory. if (!AA->isNoAlias(MemoryLocation::getForDest(M), - MemoryLocation::getForSource(M))) + MemoryLocation::getForSource(M)) && + !AA->pointsToConstantMemory(MemoryLocation::getForSource(M))) return false; LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M diff --git a/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll --- a/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll @@ -4,6 +4,9 @@ %T = type { i8, i32 } +; A global constant of %T +@C = external constant %T + ; Ensure load-store forwarding of an aggregate is interpreted as ; a memmove when the source and dest may alias define void @test_memmove(%T* align 8 %a, %T* align 16 %b) { @@ -32,6 +35,17 @@ ret void } +define void @test_memcpy_constant(%T* %d) { +; CHECK-LABEL: @test_memcpy_constant( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast %T* [[D:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP1]], i8* align 8 getelementptr inbounds ([[T:%.*]], %T* @C, i32 0, i32 0), i64 8, i1 false) +; CHECK-NEXT: ret void +; + %val = load %T, %T* @C, align 8 + store %T %val, %T* %d, align 16 + ret void +} + ; memcpy(%d, %a) should not be generated since store2 may-aliases load 
%a. define void @f(%T* %a, %T* %b, %T* %c, %T* %d) { ; CHECK-LABEL: @f( diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll --- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -8,6 +8,8 @@ %0 = type { x86_fp80, x86_fp80 } %1 = type { i32, i32 } +@C = external constant [0 x i8] + declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i1) nounwind declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind @@ -22,7 +24,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = alloca [[TMP0:%.*]], align 16 ; CHECK-NEXT: [[MEMTMP:%.*]] = alloca [[TMP0]], align 16 ; CHECK-NEXT: [[TMP5:%.*]] = fsub x86_fp80 0xK80000000000000000000, [[Z_1:%.*]] -; CHECK-NEXT: call void @ccoshl(%0* sret([[TMP0]]) [[TMP2]], x86_fp80 [[TMP5]], x86_fp80 [[Z_0:%.*]]) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: call void @ccoshl(%0* sret([[TMP0]]) [[TMP2]], x86_fp80 [[TMP5]], x86_fp80 [[Z_0:%.*]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[TMP219:%.*]] = bitcast %0* [[TMP2]] to i8* ; CHECK-NEXT: [[MEMTMP20:%.*]] = bitcast %0* [[MEMTMP]] to i8* ; CHECK-NEXT: [[AGG_RESULT21:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8* @@ -64,6 +66,23 @@ } +; The intermediate alloca and one of the memcpy's should be eliminated, the +; other should be replaced with a memcpy. 
+define void @test2_constant(i8* %Q) nounwind { +; CHECK-LABEL: @test2_constant( +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [0 x i8], [0 x i8]* @C, i64 0, i64 undef +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P]], i32 32, i1 false) +; CHECK-NEXT: ret void +; + %memtmp = alloca %0, align 16 + %R = bitcast %0* %memtmp to i8* + %P = getelementptr inbounds [0 x i8], [0 x i8]* @C, i64 0, i64 undef + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false) + ret void + +} + ; The intermediate alloca and one of the memcpy's should be eliminated, the ; other should be related with a memcpy. define void @test2_memcpy(i8* noalias %P, i8* noalias %Q) nounwind { @@ -299,7 +318,7 @@ define i32 @test7(%struct.p* nocapture align 8 byval(%struct.p) %q) nounwind ssp { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = call i32 @g(%struct.p* byval([[STRUCT_P:%.*]]) align 8 [[Q:%.*]]) #[[ATTR0]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @g(%struct.p* byval([[STRUCT_P:%.*]]) align 8 [[Q:%.*]]) #[[ATTR2]] ; CHECK-NEXT: ret i32 [[CALL]] ; entry: diff --git a/llvm/test/Transforms/MemCpyOpt/memmove.ll b/llvm/test/Transforms/MemCpyOpt/memmove.ll --- a/llvm/test/Transforms/MemCpyOpt/memmove.ll +++ b/llvm/test/Transforms/MemCpyOpt/memmove.ll @@ -6,6 +6,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-apple-darwin9.0" +@C = external constant [0 x i8] + declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind define i8* @test1(i8* nocapture %src) nounwind { @@ -54,3 +56,14 @@ tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %add.ptr, i64 17, i1 false) ret void } + +define void @test4(i8* %P) nounwind { +; CHECK-LABEL: @test4( +; 
CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds [0 x i8], [0 x i8]* @C, i64 0, i64 undef +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[ADD_PTR]], i64 17, i1 false) +; CHECK-NEXT: ret void +; + %add.ptr = getelementptr inbounds [0 x i8], [0 x i8]* @C, i64 0, i64 undef + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %add.ptr, i64 17, i1 false) + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll --- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll @@ -4,6 +4,25 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +@C = external constant [0 x i8] + +define void @test_constant(i64 %src_size, i8* %dst, i64 %dst_size, i8 %c) { +; CHECK-LABEL: @test_constant( +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds [0 x i8], [0 x i8]* @C, i64 0, i64 undef +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]] +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 [[SRC_SIZE]] +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP4]], i8 [[C:%.*]], i64 [[TMP3]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 [[SRC_SIZE]], i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.memset.p0i8.i64(i8* %dst, i8 %c, i64 %dst_size, i1 false) + %src = getelementptr inbounds [0 x i8], [0 x i8]* @C, i64 0, i64 undef + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false) + ret void +} + define void @test(i8* %src, i64 %src_size, i8* noalias %dst, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]]