diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -81,6 +81,9 @@
 if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
   set(LIBC_STRING_TARGET_ARCH "x86")
   set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/x86/memcpy.cpp)
+elseif(${LIBC_TARGET_MACHINE} STREQUAL "aarch64")
+  set(LIBC_STRING_TARGET_ARCH "aarch64")
+  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/aarch64/memcpy.cpp)
 else()
   set(LIBC_STRING_TARGET_ARCH ${LIBC_TARGET_MACHINE})
   set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp)
diff --git a/libc/src/string/aarch64/CMakeLists.txt b/libc/src/string/aarch64/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/src/string/aarch64/CMakeLists.txt
@@ -0,0 +1 @@
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}")
diff --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/string/aarch64/memcpy.cpp
@@ -0,0 +1,113 @@
+//===-- Implementation of memcpy ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memory_utils/memcpy_utils.h"
+
+#include <stdint.h> // int64_t, intptr_t
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found that most operations act on a small number of
+// bytes. This makes it important to favor small sizes.
+//
+// We use __builtin_expect to tell the compiler to favor the small-size
+// branches, since that reduces branching overhead where it hurts most,
+// i.e., where the branch cost is large relative to the total cost of the
+// copy.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code; this is useful when performing
+//   Profile Guided Optimization, as the optimized code can take advantage
+//   of branching probabilities.
+// - It allows for easier customization and makes it easy to test multiple
+//   implementation parameters.
+// - As compilers and processors improve, the generated code improves with
+//   little or no change to the source.
+static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
+                           size_t count) {
+  char *dst_m = dst + count;
+  const char *src_m = src + count;
+  if (__builtin_expect(count < 128, 1)) {
+    if (__builtin_expect(count > 32, 0)) {
+      CopyBlock<32>(dst, src);
+      CopyBlock<32>(dst_m - 32, src_m - 32);
+      if (__builtin_expect(count > 64, 0)) {
+        CopyBlock<32>(dst + 32, src + 32);
+        if (__builtin_expect(count > 96, 0)) {
+          CopyBlock<32>(dst + 64, src + 64);
+        }
+      }
+      return;
+    } else if (__builtin_expect(count < 16, 1)) {
+      if (__builtin_expect((count & 0x8) != 0, 0)) {
+        CopyBlock<8>(dst, src);
+        return CopyBlock<8>(dst_m - 8, src_m - 8);
+      } else if (__builtin_expect((count & 0x4) != 0, 0)) {
+        CopyBlock<4>(dst, src);
+        return CopyBlock<4>(dst_m - 4, src_m - 4);
+      } else {
+        if (count == 0)
+          return;
+        if (count == 1)
+          return CopyBlock<1>(dst, src);
+        if (count == 2)
+          return CopyBlock<2>(dst, src);
+        if (count == 3)
+          return CopyBlock<3>(dst, src);
+      }
+    } else {
+      CopyBlock<16>(dst, src);
+      return CopyBlock<16>(dst_m - 16, src_m - 16);
+    }
+  }
+  // Large copy (count >= 128).
+  // Copy the first 16 bytes, then align src (or dst) to a 16-byte boundary.
+  CopyBlock<16>(dst, src);
+
+  // Align to either source or destination depending on the target.
+  // The default aligns to source; define 'ALIGN_DST' to align to destination.
+#if ALIGN_DST
+#define ALIGN_SRCDST dst
+#else
+#define ALIGN_SRCDST src
+#endif
+  size_t misalign = ((intptr_t)ALIGN_SRCDST) % 16;
+  dst -= misalign;
+  src -= misalign;
+
+  // Copy 64 bytes from the aligned src/dst.
+  CopyBlock<32>(dst + 16, src + 16);
+  CopyBlock<32>(dst + 48, src + 48);
+
+  // Since we copy the last 64 bytes unconditionally and have already copied
+  // 64 + 16 - misalign bytes, we only need to copy the remaining bytes.
+  // Since the difference may be negative, we must use a signed comparison.
+  int64_t count2 = count - (144 - misalign);
+  while (count2 > 0) {
+    CopyBlock<32>(dst + 80, src + 80);
+    CopyBlock<32>(dst + 112, src + 112);
+    count2 -= 64;
+    dst += 64;
+    src += 64;
+  }
+  // Copy the last 64 bytes.
+  return CopyBlock<64>(dst_m - 64, src_m - 64);
+}
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_aarch64(reinterpret_cast<char *>(dst),
+                 reinterpret_cast<const char *>(src), size);
+  return dst;
+}
+
+} // namespace __llvm_libc
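
The dispatch above is built entirely out of `CopyBlock<N>` calls from `memcpy_utils.h`. As a rough mental model, such a helper only needs to perform a fixed-size copy so the compiler can lower it to straight-line loads and stores (LDP/STP pairs on AArch64). Below is a minimal sketch, assuming the helper simply forwards to a fixed-size `__builtin_memcpy`; the actual helper in the tree may be written differently.

```cpp
#include <stddef.h> // size_t

// Sketch only: the real helper lives in src/string/memory_utils/memcpy_utils.h
// and may differ. Because the size is a compile-time constant, the compiler
// can emit straight-line loads/stores instead of a run-time loop or a call
// back into memcpy.
template <size_t kBlockSize>
static void CopyBlock(char *dst, const char *src) {
  __builtin_memcpy(dst, src, kBlockSize);
}
```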
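To sanity-check the size classes and the misalignment handling in `memcpy_aarch64`, a small host-side harness can compare any memcpy-compatible function against the host `memcpy` across sizes 0..512 and all 16 source/destination offsets. This is a hypothetical check, not part of the patch; `CheckMemcpyImpl` and `MemcpyFn` are names introduced here for illustration, and in a real build you would pass `__llvm_libc::memcpy` (the entrypoint defined above) as the function under test.

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

using MemcpyFn = void *(*)(void *, const void *, std::size_t);

// Compares `fn` against the host memcpy for every size up to kMaxSize and
// every buffer offset up to kMaxOffset, covering the 0-3, 4-7, 8-15, 16-31,
// 32-127 and >=128 byte paths as well as all 16-byte misalignments.
static void CheckMemcpyImpl(MemcpyFn fn) {
  constexpr std::size_t kMaxSize = 512;
  constexpr std::size_t kMaxOffset = 16;
  std::vector<unsigned char> src(kMaxSize + kMaxOffset);
  for (std::size_t i = 0; i < src.size(); ++i)
    src[i] = static_cast<unsigned char>(i * 7 + 1);
  for (std::size_t size = 0; size <= kMaxSize; ++size) {
    for (std::size_t offset = 0; offset < kMaxOffset; ++offset) {
      std::vector<unsigned char> expected(kMaxSize + kMaxOffset, 0);
      std::vector<unsigned char> actual(kMaxSize + kMaxOffset, 0);
      std::memcpy(expected.data() + offset, src.data() + offset, size);
      void *ret = fn(actual.data() + offset, src.data() + offset, size);
      assert(ret == actual.data() + offset); // memcpy returns dst.
      assert(expected == actual);            // Copied bytes and padding match.
    }
  }
}

int main() {
  // Sanity-check the harness against the host memcpy itself; swap in the
  // implementation under test (e.g. __llvm_libc::memcpy) when available.
  CheckMemcpyImpl(+[](void *dst, const void *src, std::size_t n) -> void * {
    return std::memcpy(dst, src, n);
  });
  return 0;
}
```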