diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -21,6 +21,7 @@ include(CMakeParseArguments) include(LLVMLibCRules) +include(LLVMLibCCheckCpuFeatures) add_subdirectory(src) add_subdirectory(config) diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake --- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake +++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake @@ -1,99 +1,129 @@ -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ # Cpu features definition and flags -# -# Declare a list of all supported cpu features in ALL_CPU_FEATURES. -# -# Declares associated flags to enable/disable individual feature of the form: -# - CPU_FEATURE__ENABLE_FLAG -# - CPU_FEATURE__DISABLE_FLAG -# -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ if(${LIBC_TARGET_MACHINE} MATCHES "x86|x86_64") - set(ALL_CPU_FEATURES SSE SSE2 AVX AVX512F) + set(ALL_CPU_FEATURES SSE SSE2 AVX AVX2 AVX512F) endif() -function(_define_cpu_feature_flags feature) - if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") - string(TOLOWER ${feature} lowercase_feature) - set(CPU_FEATURE_${feature}_ENABLE_FLAG "-m${lowercase_feature}" PARENT_SCOPE) - set(CPU_FEATURE_${feature}_DISABLE_FLAG "-mno-${lowercase_feature}" PARENT_SCOPE) +list(SORT ALL_CPU_FEATURES) + +# Function to check whether the host supports the provided set of features. +# Usage: +# host_supports( +# <output variable: TRUE if every feature is in HOST_CPU_FEATURES, else unset> +# <semicolon separated list of cpu features> +# ) +function(host_supports output_var features) + _intersection(a "${features}" "${HOST_CPU_FEATURES}") # Iterate over `features` so the result keeps their order. + if("${a}" STREQUAL "${features}") + set(${output_var} TRUE PARENT_SCOPE) + else() + unset(${output_var} PARENT_SCOPE) + endif() +endfunction() + +# Function to compute the flags to pass down to the compiler.
+# Usage: +# compute_flags( +# <output variable: receives the list of compiler flags> +# MARCH <value passed as -march=> +# REQUIRE <list of features to enable with -m> +# REJECT <list of features to disable with -mno-> +# ) +function(compute_flags output_var) + cmake_parse_arguments( + "COMPUTE_FLAGS" + "" # Optional arguments + "MARCH" # Single value arguments + "REQUIRE;REJECT" # Multi value arguments + ${ARGN}) + # Check that features are not required and rejected at the same time. + if(COMPUTE_FLAGS_REQUIRE AND COMPUTE_FLAGS_REJECT) + _intersection(var "${COMPUTE_FLAGS_REQUIRE}" "${COMPUTE_FLAGS_REJECT}") # Quoted so each list stays a single argument. + if(var) + message(FATAL_ERROR "Cpu Features REQUIRE and REJECT ${var}") + endif() + endif() + # Generate the compiler flags in `current`. + if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang|GNU") + if(COMPUTE_FLAGS_MARCH) + list(APPEND current "-march=${COMPUTE_FLAGS_MARCH}") + endif() + foreach(feature IN LISTS COMPUTE_FLAGS_REQUIRE) + string(TOLOWER ${feature} lowercase_feature) + list(APPEND current "-m${lowercase_feature}") + endforeach() + foreach(feature IN LISTS COMPUTE_FLAGS_REJECT) + string(TOLOWER ${feature} lowercase_feature) + list(APPEND current "-mno-${lowercase_feature}") + endforeach() else() # In future, we can extend for other compilers. message(FATAL_ERROR "Unkown compiler ${CMAKE_CXX_COMPILER_ID}.") endif() + # Export the list of flags. + set(${output_var} "${current}" PARENT_SCOPE) endfunction() -# Defines cpu features flags -foreach(feature IN LISTS ALL_CPU_FEATURES) - _define_cpu_feature_flags(${feature}) -endforeach() - -#------------------------------------------------------------------------------ -# Optimization level flags -# -# Generates the set of flags needed to compile for a up to a particular -# optimization level. -# -# Creates variables of the form `CPU_FEATURE_OPT__FLAGS`. -# CPU_FEATURE_OPT_NONE_FLAGS is a special flag for which no feature is needed. -# -# e.g.
-# CPU_FEATURE_OPT_NONE_FLAGS : -mno-sse;-mno-sse2;-mno-avx;-mno-avx512f -# CPU_FEATURE_OPT_SSE_FLAGS : -msse;-mno-sse2;-mno-avx;-mno-avx512f -# CPU_FEATURE_OPT_SSE2_FLAGS : -msse;-msse2;-mno-avx;-mno-avx512f -# CPU_FEATURE_OPT_AVX_FLAGS : -msse;-msse2;-mavx;-mno-avx512f -# CPU_FEATURE_OPT_AVX512F_FLAGS : -msse;-msse2;-mavx;-mavx512f -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Internal helpers and utilities. +# ------------------------------------------------------------------------------ -# Helper function to concatenate flags needed to support optimization up to -# a particular feature. -function(_generate_flags_for_up_to feature flag_variable) - list(FIND ALL_CPU_FEATURES ${feature} feature_index) - foreach(current_feature IN LISTS ALL_CPU_FEATURES) - list(FIND ALL_CPU_FEATURES ${current_feature} current_feature_index) - if(${current_feature_index} GREATER ${feature_index}) - list(APPEND flags ${CPU_FEATURE_${current_feature}_DISABLE_FLAG}) - else() - list(APPEND flags ${CPU_FEATURE_${current_feature}_ENABLE_FLAG}) +# Computes the intersection between two lists. +function(_intersection output_var list1 list2) + foreach(element IN LISTS list1) + if("${list2}" MATCHES "(^|;)${element}(;|$)") + list(APPEND tmp "${element}") endif() endforeach() - set(${flag_variable} ${flags} PARENT_SCOPE) + set(${output_var} ${tmp} PARENT_SCOPE) endfunction() -function(_generate_opt_levels) - set(opt_levels NONE) - list(APPEND opt_levels ${ALL_CPU_FEATURES}) - foreach(feature IN LISTS opt_levels) - set(flag_name "CPU_FEATURE_OPT_${feature}_FLAGS") - _generate_flags_for_up_to(${feature} ${flag_name}) - set(${flag_name} ${${flag_name}} PARENT_SCOPE) +# Generates a cpp file to introspect the compiler defined flags. 
+function(_generate_check_code) + foreach(feature IN LISTS ALL_CPU_FEATURES) + set(DEFINITIONS + "${DEFINITIONS} +#ifdef __${feature}__ + \"${feature}\", +#endif") endforeach() + configure_file( + "${LIBC_SOURCE_DIR}/cmake/modules/cpu_features/check_cpu_features.cpp.in" + "cpu_features/check_cpu_features.cpp" @ONLY) endfunction() +_generate_check_code() -_generate_opt_levels() - -#------------------------------------------------------------------------------ -# Host cpu feature introspection -# -# Populates a HOST_CPU_FEATURES list containing the available CPU_FEATURE. -#------------------------------------------------------------------------------ -function(_check_host_cpu_feature feature) - string(TOLOWER ${feature} lowercase_feature) +# Compiles and runs the code generated above with the specified requirements. +# This is helpful to infer which features a particular target supports or if +# a specific features implies other features (e.g. BMI2 implies SSE2 and SSE). +function(_check_defined_cpu_feature output_var) + cmake_parse_arguments( + "CHECK_DEFINED" + "" # Optional arguments + "MARCH" # Single value arguments + "REQUIRE;REJECT" # Multi value arguments + ${ARGN}) + compute_flags( + flags + MARCH ${CHECK_DEFINED_MARCH} + REQUIRE ${CHECK_DEFINED_REQUIRE} + REJECT ${CHECK_DEFINED_REJECT}) try_run( - run_result - compile_result - "${CMAKE_CURRENT_BINARY_DIR}/check_${lowercase_feature}" - "${CMAKE_MODULE_PATH}/cpu_features/check_${lowercase_feature}.cpp" - COMPILE_DEFINITIONS ${CPU_FEATURE_${feature}_ENABLE_FLAG} - OUTPUT_VARIABLE compile_output - ) + run_result compile_result "${CMAKE_CURRENT_BINARY_DIR}/check_cpu_features" + "${CMAKE_CURRENT_BINARY_DIR}/cpu_features/check_cpu_features.cpp" + COMPILE_DEFINITIONS ${flags} + COMPILE_OUTPUT_VARIABLE compile_output + RUN_OUTPUT_VARIABLE run_output) if(${compile_result} AND ("${run_result}" EQUAL 0)) - list(APPEND HOST_CPU_FEATURES ${feature}) - set(HOST_CPU_FEATURES ${HOST_CPU_FEATURES} PARENT_SCOPE) + 
set(${output_var} + "${run_output}" + PARENT_SCOPE) + else() + message(FATAL_ERROR "Failed to compile or run the cpu feature check (run result: '${run_result}'): ${compile_output}") endif() endfunction() -foreach(feature IN LISTS ALL_CPU_FEATURES) - _check_host_cpu_feature(${feature}) -endforeach() +# Populates the HOST_CPU_FEATURES list. +_check_defined_cpu_feature(HOST_CPU_FEATURES MARCH native) diff --git a/libc/cmake/modules/LLVMLibCRules.cmake b/libc/cmake/modules/LLVMLibCRules.cmake --- a/libc/cmake/modules/LLVMLibCRules.cmake +++ b/libc/cmake/modules/LLVMLibCRules.cmake @@ -308,6 +308,7 @@ # SRCS # HDRS # DEPENDS +# COMPILE_OPTIONS # ) function(add_libc_unittest target_name) if(NOT LLVM_INCLUDE_TESTS) @@ -318,7 +319,7 @@ "LIBC_UNITTEST" "" # No optional arguments "SUITE" # Single value arguments - "SRCS;HDRS;DEPENDS" # Multi-value arguments + "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi-value arguments ${ARGN} ) if(NOT LIBC_UNITTEST_SRCS) @@ -356,6 +357,12 @@ ${LIBC_BUILD_DIR} ${LIBC_BUILD_DIR}/include ) + if(LIBC_UNITTEST_COMPILE_OPTIONS) + target_compile_options( + ${target_name} + PRIVATE ${LIBC_UNITTEST_COMPILE_OPTIONS} + ) + endif() if(library_deps) target_link_libraries(${target_name} PRIVATE ${library_deps}) diff --git a/libc/cmake/modules/cpu_features/check_avx.cpp b/libc/cmake/modules/cpu_features/check_avx.cpp deleted file mode 100644 --- a/libc/cmake/modules/cpu_features/check_avx.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#if !defined __AVX__ -#error "missing __AVX__" -#endif -#include -int main() { - (void)_mm256_set1_epi8('0'); - return 0; -} diff --git a/libc/cmake/modules/cpu_features/check_avx512f.cpp b/libc/cmake/modules/cpu_features/check_avx512f.cpp deleted file mode 100644 --- a/libc/cmake/modules/cpu_features/check_avx512f.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#if !defined __AVX512F__ -#error "missing __AVX512F__" -#endif -#include -int main() { - (void)_mm512_undefined(); - return 0; -} diff --git a/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in b/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in new file
mode 100644 --- /dev/null +++ b/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in @@ -0,0 +1,29 @@ +#include +#include + +// This file is instantiated by CMake. +// DEFINITIONS below is replaced with a set of lines like so: +// #ifdef __SSE2__ +// "SSE2", +// #endif +// +// This allows for introspection of compiler definitions. +// The output of the program is a single line of semi colon separated feature +// names. + +// MSVC is using a different set of preprocessor definitions for +// SSE and SSE2, see _M_IX86_FP in +// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros + +int main(int, char **) { + const char *strings[] = { + @DEFINITIONS@ + }; + const size_t size = sizeof(strings) / sizeof(strings[0]); + for (size_t i = 0; i < size; ++i) { + if (i) + putchar(';'); + fputs(strings[i], stdout); + } + return EXIT_SUCCESS; +} diff --git a/libc/cmake/modules/cpu_features/check_sse.cpp b/libc/cmake/modules/cpu_features/check_sse.cpp deleted file mode 100644 --- a/libc/cmake/modules/cpu_features/check_sse.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#if !defined __SSE__ -#error "missing __SSE__" -#endif -#include -int main() { - (void)_mm_set_ss(1.0f); - return 0; -} diff --git a/libc/cmake/modules/cpu_features/check_sse2.cpp b/libc/cmake/modules/cpu_features/check_sse2.cpp deleted file mode 100644 --- a/libc/cmake/modules/cpu_features/check_sse2.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#if !defined __SSE2__ -#error "missing __SSE2__" -#endif -#include -int main() { - (void)_mm_set1_epi8('0'); - return 0; -} diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt --- a/libc/lib/CMakeLists.txt +++ b/libc/lib/CMakeLists.txt @@ -11,6 +11,7 @@ # string.h entrypoints strcpy strcat + memcpy # sys/mman.h entrypoints mmap diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(memory_utils) + add_entrypoint_object( strcat SRCS @@ 
-19,4 +21,63 @@ string_h ) -add_subdirectory(memory_utils) +# ------------------------------------------------------------------------------ +# memcpy +# ------------------------------------------------------------------------------ + +# include the relevant architecture specific implementations +if(${LIBC_TARGET_MACHINE} STREQUAL "x86") + set(LIBC_MEMCPY_IMPL_FOLDER "x86_64") +else() + set(LIBC_MEMCPY_IMPL_FOLDER ${LIBC_TARGET_MACHINE}) +endif() + +add_gen_header( + memcpy_arch_specific + DEF_FILE + memcpy_arch_specific.h.def + GEN_HDR + memcpy_arch_specific.h + PARAMS + memcpy_arch_specific=${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc + DATA_FILES + ${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc +) + +# Helper to define an implementation of memcpy. +# - Computes flags to satisfy required/rejected features and arch, +# - Declares an entry point, +# - Attach the REQUIRE_CPU_FEATURES property to the target, +# - Add the target to `memcpy_implementations` global property for tests. 
+function(add_memcpy memcpy_name) + cmake_parse_arguments( + "ADD_MEMCPY" + "" # Optional arguments + "MARCH" # Single value arguments + "REQUIRE;REJECT" # Multi value arguments + ${ARGN}) + compute_flags(flags + MARCH ${ADD_MEMCPY_MARCH} + REQUIRE ${ADD_MEMCPY_REQUIRE} + REJECT ${ADD_MEMCPY_REJECT} + ) + add_entrypoint_object( + ${memcpy_name} + SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp + HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h + DEPENDS + string_h + memory_utils + memcpy_arch_specific + COMPILE_OPTIONS + -fno-builtin-memcpy + ${flags} + ) + set_target_properties(${memcpy_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_MEMCPY_REQUIRE}") + get_property(all GLOBAL PROPERTY memcpy_implementations) + list(APPEND all ${memcpy_name}) + set_property(GLOBAL PROPERTY memcpy_implementations "${all}") +endfunction() + +add_subdirectory(${LIBC_MEMCPY_IMPL_FOLDER}) +add_memcpy(memcpy MARCH native) diff --git a/libc/src/string/memcpy.h b/libc/src/string/memcpy.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memcpy.h @@ -0,0 +1,21 @@ +//===----------------- Implementation header for memcpy -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMCPY_H +#define LLVM_LIBC_SRC_STRING_MEMCPY_H + +#include "include/string.h" +#include // size_t + +namespace __llvm_libc { + +void *memcpy(void *__restrict, const void *__restrict, size_t); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STRING_MEMCPY_H diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp new file mode 100644 --- /dev/null +++ b/libc/src/string/memcpy.cpp @@ -0,0 +1,22 @@ +//===--------------------- Implementation of memcpy -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/memcpy.h" +#include "src/__support/common.h" +#include "src/string/memcpy_arch_specific.h" + +namespace __llvm_libc { + +void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst, + const void *__restrict src, size_t size) { + memcpy_no_return(reinterpret_cast(dst), + reinterpret_cast(src), size); + return dst; +} + +} // namespace __llvm_libc diff --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def new file mode 100644 --- /dev/null +++ b/libc/src/string/memcpy_arch_specific.h.def @@ -0,0 +1,65 @@ +//===-------------- Implementation of arch specific memcpy ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H +#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H + +%%include_file(${memcpy_arch_specific}) + +namespace __llvm_libc { + +// Design rationale +// ================ +// +// Using a profiler to observe size distributions for calls into libc +// functions, it was found most operations act on a small number of bytes. +// This makes it important to favor small sizes. +// +// The tests for `count` are in ascending order so the cost of branching is +// proportional to the cost of copying. +// +// The function is written in C++ for several reasons: +// - The compiler can __see__ the code, this is useful when performing Profile +// Guided Optimization as the optimized code can take advantage of branching +// probabilities. +// - It also allows for easier customization and favors testing multiple +// implementation parameters. +// - As compilers and processors get better, the generated code is improved +// with little change on the code side. 
+static void memcpy_no_return(char *__restrict dst, const char *__restrict src, + size_t count) { + if (count == 0) + return; + if (count == 1) + return Copy<1>(dst, src); + if (count == 2) + return Copy<2>(dst, src); + if (count == 3) + return Copy<3>(dst, src); + if (count == 4) + return Copy<4>(dst, src); + if (count < 8) + return CopyOverlap<4>(dst, src, count); + if (count == 8) + return Copy<8>(dst, src); + if (count < 16) + return CopyOverlap<8>(dst, src, count); + if (count == 16) + return Copy<16>(dst, src); + if (count < 32) + return CopyOverlap<16>(dst, src, count); + if (count < 64) + return CopyOverlap<32>(dst, src, count); + if (count < 128) + return CopyOverlap<64>(dst, src, count); + CopyGE128(dst, src, count); +} + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -12,6 +12,9 @@ add_header_library( memory_utils - HDRS utils.h - DEPENDS cacheline_size + HDRS + utils.h + memcpy_utils.h + DEPENDS + cacheline_size ) diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/memcpy_utils.h @@ -0,0 +1,100 @@ +//===---------------------------- Memcpy utils ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H +#define LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H + +#include "src/string/memory_utils/utils.h" +#include // size_t + +// __builtin_memcpy_inline guarantees to never call external functions. +// Unfortunately it is not widely available. +#if defined(__clang__) && __has_builtin(__builtin_memcpy_inline) +#define USE_BUILTIN_MEMCPY_INLINE +#elif defined(__GNUC__) +#define USE_BUILTIN_MEMCPY +#endif + +// This is useful for testing. +#if defined(LLVM_LIBC_MEMCPY_MONITOR) +extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict, + const char *__restrict, size_t); +#endif + +namespace __llvm_libc { + +// Copies `kBlockSize` bytes from `src` to `dst`. +template +static void Copy(char *__restrict dst, const char *__restrict src) { +#if defined(LLVM_LIBC_MEMCPY_MONITOR) + LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize); +#elif defined(USE_BUILTIN_MEMCPY_INLINE) + __builtin_memcpy_inline(dst, src, kBlockSize); +#elif defined(USE_BUILTIN_MEMCPY) + __builtin_memcpy(dst, src, kBlockSize); +#else + for (size_t i = 0; i < kBlockSize; ++i) + dst[i] = src[i]; +#endif +} + +// Copies `kBlockSize` bytes from `src + count - kBlockSize` to +// `dst + count - kBlockSize`. +// Precondition: `count >= kBlockSize`. +template +static void CopyLastBlock(char *__restrict dst, const char *__restrict src, + size_t count) { + const size_t offset = count - kBlockSize; + Copy(dst + offset, src + offset); +} + +// Copies `kBlockSize` bytes twice with an overlap between the two. +// +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [__XXXXXXXX_________] +// [________XXXXXXXX___] +// +// Precondition: `count >= kBlockSize && count <= 2 * kBlockSize`.
+template +static void CopyOverlap(char *__restrict dst, const char *__restrict src, + size_t count) { + Copy(dst, src); + CopyLastBlock(dst, src, count); +} + +// Copies `count` bytes by blocks of `kBlockSize` bytes. +// Copies at the start and end of the buffer are unaligned. +// Copies in the middle of the buffer are aligned to `kBlockSize`. +// +// e.g. with +// [12345678123456781234567812345678] +// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___] +// [__XXXXXXXX______________________] +// [________XXXXXXXX________________] +// [________________XXXXXXXX________] +// [_____________________XXXXXXXX___] +// +// Precondition: `count > 2 * kBlockSize` for efficiency. +// `count >= kBlockSize` for correctness. +template +static void CopyAligned(char *__restrict dst, const char *__restrict src, + size_t count) { + Copy(dst, src); // Copy first block + + // Copy aligned blocks + size_t offset = kBlockSize - offset_from_last_aligned(dst); + for (; offset + kBlockSize < count; offset += kBlockSize) + Copy(dst + offset, src + offset); + + CopyLastBlock(dst, src, count); // Copy last block +} + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -43,6 +43,11 @@ return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1); } +template intptr_t offset_from_last_aligned(const void *ptr) { + static_assert(is_power2(alignment), "alignment must be a power of 2"); + return reinterpret_cast(ptr) & (alignment - 1U); +} + template intptr_t offset_to_next_aligned(const void *ptr) { static_assert(is_power2(alignment), "alignment must be a power of 2"); // The logic is not straightforward and involves unsigned modulo arithmetic @@ -51,7 +56,7 @@ } // Returns the offset from `ptr` to the next cache line. 
-static intptr_t offset_to_next_cache_line(const void *ptr) { +static inline intptr_t offset_to_next_cache_line(const void *ptr) { return offset_to_next_aligned(ptr); } diff --git a/libc/src/string/x86_64/CMakeLists.txt b/libc/src/string/x86_64/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/libc/src/string/x86_64/CMakeLists.txt @@ -0,0 +1,4 @@ +add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_none" REJECT "${ALL_CPU_FEATURES}") +add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_sse" REQUIRE "SSE" REJECT "SSE2") +add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx" REQUIRE "AVX" REJECT "AVX2") +add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx512f" REQUIRE "AVX512F") diff --git a/libc/src/string/x86_64/memcpy_arch_specific.h.inc b/libc/src/string/x86_64/memcpy_arch_specific.h.inc new file mode 100644 --- /dev/null +++ b/libc/src/string/x86_64/memcpy_arch_specific.h.inc @@ -0,0 +1,35 @@ +#include "src/string/memory_utils/memcpy_utils.h" + +namespace __llvm_libc { + +static void CopyRepMovsb(char *__restrict dst, const char *__restrict src, + size_t count) { + // FIXME: Add MSVC support with + // #include + // __movsb(reinterpret_cast(dst), + // reinterpret_cast(src), count); + asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); +} + +#if defined(__AVX__) +#define BEST_SIZE 64 +#else +#define BEST_SIZE 32 +#endif + +static void CopyGE128(char *__restrict dst, const char *__restrict src, + size_t count) { +#if defined(__AVX__) + if (count < 256) + return CopyOverlap<128>(dst, src, count); +#endif + // kRepMovsBSize == -1 : Only CopyAligned is used. + // kRepMovsBSize == 0 : Only RepMovsb is used. + // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
+ constexpr size_t kRepMovsBSize = -1; + if (count <= kRepMovsBSize) + return CopyAligned(dst, src, count); + CopyRepMovsb(dst, src, count); +} + +} // namespace __llvm_libc diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt --- a/libc/test/src/string/CMakeLists.txt +++ b/libc/test/src/string/CMakeLists.txt @@ -22,3 +22,25 @@ DEPENDS strcpy ) + +# Tests all implementations of memcpy that can run on the host. +get_property(memcpy_implementations GLOBAL PROPERTY memcpy_implementations) +foreach(memcpy_config_name IN LISTS memcpy_implementations) + get_target_property(require_cpu_features ${memcpy_config_name} REQUIRE_CPU_FEATURES) + host_supports(can_run "${require_cpu_features}") + if(can_run) + compute_flags(flags MARCH native) + add_libc_unittest( + ${memcpy_config_name}_test + SUITE + libc_string_unittests + SRCS + memcpy_test.cpp + DEPENDS + ${memcpy_config_name} + ) + else() + message(STATUS "Skipping test for '${memcpy_config_name}' insufficient host cpu features") + endif() +endforeach() + diff --git a/libc/test/src/string/memcpy_test.cpp b/libc/test/src/string/memcpy_test.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/string/memcpy_test.cpp @@ -0,0 +1,53 @@ +//===----------------------- Unittests for memcpy -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "utils/CPP/ArrayRef.h" +#include "utils/UnitTest/Test.h" +#include "src/string/memcpy.h" + +using __llvm_libc::cpp::Array; +using __llvm_libc::cpp::ArrayRef; +using __llvm_libc::cpp::MutableArrayRef; +using Data = Array; + +static const ArrayRef kNumbers("0123456789", 10); +static const ArrayRef kDeadcode("DEADC0DE", 8); + +// Returns a Data object filled with a repetition of `filler`. +Data getData(ArrayRef filler) { + Data out; + for (size_t i = 0; i < out.size(); ++i) + out[i] = filler[i % filler.size()]; + return out; +} + +TEST(MemcpyTest, Thorough) { + const Data groundtruth = getData(kNumbers); + const Data dirty = getData(kDeadcode); + for (size_t count = 0; count < 1024; ++count) { + for (size_t align = 0; align < 64; ++align) { + auto buffer = dirty; + const char *const src = groundtruth.data(); + char *const dst = &buffer[align]; + __llvm_libc::memcpy(dst, src, count); + // Everything before copy is untouched. + for (size_t i = 0; i < align; ++i) + ASSERT_EQ(buffer[i], dirty[i]); + // Everything in between is copied. + for (size_t i = 0; i < count; ++i) + ASSERT_EQ(buffer[align + i], groundtruth[i]); + // Everything after copy is untouched. + for (size_t i = align + count; i < dirty.size(); ++i) + ASSERT_EQ(buffer[i], dirty[i]); + } + } +} + +// FIXME: Add tests with reads and writes on the boundary of a read/write +// protected page to check we're not reading nor writing prior/past the allowed +// regions. 
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -4,7 +4,14 @@ libc_string_unittests SRCS utils_test.cpp + memcpy_utils_test.cpp DEPENDS memory_utils standalone_cpp ) + +target_compile_definitions( + utils_test + PRIVATE + LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor +) diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp @@ -0,0 +1,208 @@ +//===-------------------- Unittests for memory_utils ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/memory_utils/memcpy_utils.h" +#include "utils/CPP/Array.h" +#include "utils/UnitTest/Test.h" + +#include +#include // uintptr_t + +#ifndef LLVM_LIBC_MEMCPY_MONITOR +#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test. 
+#endif + +namespace __llvm_libc { + +struct Buffer { + static constexpr size_t kMaxBuffer = 1024; + char buffer[kMaxBuffer + 1]; + size_t last = 0; + + void Clear() { + last = 0; + for (size_t i = 0; i < kMaxBuffer; ++i) + buffer[i] = '0'; + buffer[kMaxBuffer] = '\0'; + } + + void Increment(const void *ptr) { + const auto offset = reinterpret_cast(ptr); + assert(offset < kMaxBuffer); + ++buffer[offset]; + if (offset > last) + last = offset; + } + + char *Finish() { + assert(last < kMaxBuffer); + buffer[last + 1] = '\0'; + return buffer; + } +}; + +struct Trace { + Buffer read; + Buffer write; + + void Add(char *__restrict dst, const char *__restrict src, size_t count) { + for (size_t i = 0; i < count; ++i) + read.Increment(src + i); + for (size_t i = 0; i < count; ++i) + write.Increment(dst + i); + } + + void Clear() { + read.Clear(); + write.Clear(); + } + + char *Read() { return read.Finish(); } + char *Write() { return write.Finish(); } +}; + +static Trace &GetTrace() { + static thread_local Trace events; + return events; +} + +extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst, + const char *__restrict src, + size_t count) { + GetTrace().Add(dst, src, count); +} + +char *I(uintptr_t offset) { return reinterpret_cast(offset); } + +TEST(MemcpyUtilsTest, CopyTrivial) { + auto &trace = GetTrace(); + + trace.Clear(); + Copy<1>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "1"); + EXPECT_STREQ(trace.Read(), "1"); + + trace.Clear(); + Copy<2>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "11"); + EXPECT_STREQ(trace.Read(), "11"); + + trace.Clear(); + Copy<4>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "1111"); + EXPECT_STREQ(trace.Read(), "1111"); + + trace.Clear(); + Copy<8>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "11111111"); + EXPECT_STREQ(trace.Read(), "11111111"); + + trace.Clear(); + Copy<16>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "1111111111111111"); + EXPECT_STREQ(trace.Read(), "1111111111111111"); + + trace.Clear(); + Copy<32>(I(0), I(0)); + 
EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111"); + EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111"); + + trace.Clear(); + Copy<64>(I(0), I(0)); + EXPECT_STREQ( + trace.Write(), + "1111111111111111111111111111111111111111111111111111111111111111"); + EXPECT_STREQ( + trace.Read(), + "1111111111111111111111111111111111111111111111111111111111111111"); +} + +TEST(MemcpyUtilsTest, CopyOffset) { + auto &trace = GetTrace(); + + trace.Clear(); + Copy<1>(I(3), I(1)); + EXPECT_STREQ(trace.Write(), "0001"); + EXPECT_STREQ(trace.Read(), "01"); + + trace.Clear(); + Copy<1>(I(2), I(1)); + EXPECT_STREQ(trace.Write(), "001"); + EXPECT_STREQ(trace.Read(), "01"); +} + +TEST(MemcpyUtilsTest, CopyOverlap) { + auto &trace = GetTrace(); + + trace.Clear(); + CopyOverlap<2>(I(0), I(0), 2); + EXPECT_STREQ(trace.Write(), "22"); + EXPECT_STREQ(trace.Read(), "22"); + + trace.Clear(); + CopyOverlap<2>(I(0), I(0), 3); + EXPECT_STREQ(trace.Write(), "121"); + EXPECT_STREQ(trace.Read(), "121"); + + trace.Clear(); + CopyOverlap<2>(I(0), I(0), 4); + EXPECT_STREQ(trace.Write(), "1111"); + EXPECT_STREQ(trace.Read(), "1111"); + + trace.Clear(); + CopyOverlap<4>(I(2), I(1), 7); + EXPECT_STREQ(trace.Write(), "001112111"); + EXPECT_STREQ(trace.Read(), "01112111"); +} + +TEST(MemcpyUtilsTest, CopyAligned) { + auto &trace = GetTrace(); + // Destination is aligned already. 
+ // "1111000000000" + // + "0000111100000" + // + "0000000011110" + // + "0000000001111" + // = "1111111112221" + trace.Clear(); + CopyAligned<4>(I(0), I(0), 13); + EXPECT_STREQ(trace.Write(), "1111111112221"); + EXPECT_STREQ(trace.Read(), "1111111112221"); + + // Misaligned destination + // "01111000000000" + // + "00001111000000" + // + "00000000111100" + // + "00000000001111" + // = "01112111112211" + trace.Clear(); + CopyAligned<4>(I(1), I(0), 13); + EXPECT_STREQ(trace.Write(), "01112111112211"); + EXPECT_STREQ(trace.Read(), "1112111112211"); +} + +TEST(MemcpyUtilsTest, MaxReloads) { + auto &trace = GetTrace(); + for (size_t alignment = 0; alignment < 32; ++alignment) { + for (size_t count = 64; count < 768; ++count) { + trace.Clear(); + // We should never reload more than twice when copying from count = 2x32. + CopyAligned<32>(I(alignment), I(0), count); + const char *const written = trace.Write(); + // First bytes are untouched. + for (size_t i = 0; i < alignment; ++i) + EXPECT_EQ(written[i], '0'); + // Next bytes are loaded once or twice but no more. 
+ for (size_t i = alignment; i < count; ++i) { + EXPECT_GE(written[i], '1'); + EXPECT_LE(written[i], '2'); + } + } + } +} + +} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp --- a/libc/test/src/string/memory_utils/utils_test.cpp +++ b/libc/test/src/string/memory_utils/utils_test.cpp @@ -87,6 +87,14 @@ EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16)); } +TEST(UtilsTest, OffsetFromLastAligned) { + EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15)); + EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16)); +} + TEST(UtilsTest, OffsetToNextCacheLine) { EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0); EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));