Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -133,6 +133,16 @@
   uint64_t Size = MemOpLength->getLimitedValue();
   assert(Size && "0-sized memory transferring should be removed already.");
 
+  // Do not expand an 8-byte copy into loads/stores when optimising for minimum
+  // code size. The copy could, for example, expand into 2 word loads and 2
+  // word stores, and when unaligned data access is not supported it is a lot
+  // worse: 8 single-byte loads and 8 single-byte stores. Keeping the memcpy
+  // call results in just 2 instructions: the call and a mov imm to an
+  // argument register for the number of bytes to copy.
+  const Function *F = MI->getFunction();
+  if (Size > 4 && F->hasFnAttribute(Attribute::MinSize))
+    return nullptr;
+
   if (Size > 8 || (Size&(Size-1)))
     return nullptr;  // If not 1/2/4/8 bytes, exit.
 
Index: test/Transforms/InstCombine/memcpy-to-load.ll
===================================================================
--- test/Transforms/InstCombine/memcpy-to-load.ll
+++ test/Transforms/InstCombine/memcpy-to-load.ll
@@ -76,6 +76,16 @@
   ret void
 }
 
+; Do not expand the 8-byte memcpy call when the function is optimised for minsize.
+define void @copy_8_bytes_minsize(i8* %A, i8* %B) #0 {
+; ALL-LABEL: @copy_8_bytes_minsize(
+; ALL: tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %A, i8* align 1 %B, i32 8, i1 false)
+; ALL-NEXT: ret void
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %A, i8* align 1 %B, i32 8, i1 false)
+  ret void
+}
+
 define void @copy_16_bytes(i8* %d, i8* %s) {
 ; ALL-LABEL: @copy_16_bytes(
 ; ALL-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 16, i1 false)
@@ -85,3 +95,5 @@
   ret void
 }
 
+attributes #0 = { minsize nounwind optsize }
+