Index: libc/src/string/CMakeLists.txt
===================================================================
--- libc/src/string/CMakeLists.txt
+++ libc/src/string/CMakeLists.txt
@@ -215,6 +215,11 @@
 if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
   set(LIBC_STRING_TARGET_ARCH "x86")
   set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/x86/memcpy.cpp)
+elseif(${LIBC_TARGET_MACHINE} STREQUAL "aarch64")
+  set(LIBC_STRING_TARGET_ARCH "aarch64")
+  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/aarch64/memcpy.cpp)
+  # Disable tail merging as it leads to lower memcpy performance on aarch64.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mllvm --tail-merge-threshold=0")
 else()
   set(LIBC_STRING_TARGET_ARCH ${LIBC_TARGET_MACHINE})
   set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp)
Index: libc/src/string/aarch64/CMakeLists.txt
===================================================================
--- /dev/null
+++ libc/src/string/aarch64/CMakeLists.txt
@@ -0,0 +1 @@
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}")
Index: libc/src/string/aarch64/memcpy.cpp
===================================================================
--- /dev/null
+++ libc/src/string/aarch64/memcpy.cpp
@@ -0,0 +1,67 @@
+//===-- Implementation of memcpy ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// We have used __builtin_expect to tell the compiler to favor lower sizes as
+// that will reduce the branching overhead where that would hurt most
+// proportional to total cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+
+// Dispatches on `count`: exact small sizes use a fixed-width CopyBlock,
+// mid sizes use two possibly-overlapping block copies, and large sizes
+// (>= 128) fall through to an aligned bulk-copy loop.
+static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
+                           size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return CopyBlock<1>(dst, src);
+  if (count == 2)
+    return CopyBlock<2>(dst, src);
+  if (count == 3)
+    return CopyBlock<3>(dst, src);
+  if (count == 4)
+    return CopyBlock<4>(dst, src);
+  if (count < 8)
+    return CopyBlockOverlap<4>(dst, src, count);
+  if (count < 16)
+    return CopyBlockOverlap<8>(dst, src, count);
+  if (count < 32)
+    return CopyBlockOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyBlockOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyBlockOverlap<64>(dst, src, count);
+  return CopyAlignedBlocks<64, 16>(dst, src, count);
+}
+
+LLVM_LIBC_FUNCTION(void *, memcpy,
+                   (void *__restrict dst, const void *__restrict src,
+                    size_t size)) {
+  memcpy_aarch64(reinterpret_cast<char *>(dst),
+                 reinterpret_cast<const char *>(src), size);
+  return dst;
+}
+
+} // namespace __llvm_libc