diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -112,9 +112,14 @@ EXCLUDE_FROM_ALL LibcMemoryBenchmark.cpp LibcMemoryBenchmark.h + LibcFunctionPrototypes.h MemorySizeDistributions.cpp MemorySizeDistributions.h ) +target_include_directories(libc-memory-benchmark + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} +) target_link_libraries(libc-memory-benchmark PUBLIC libc-benchmark @@ -196,3 +201,5 @@ libc.src.string.bzero_opt_host benchmark_main ) + +add_subdirectory(automemcpy) diff --git a/libc/benchmarks/automemcpy/CMakeLists.txt b/libc/benchmarks/automemcpy/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/CMakeLists.txt @@ -0,0 +1,12 @@ +if(NOT LIBC_BUILD_AUTOMEMCPY) + return () +endif() + +if(NOT LLVM_WITH_Z3) + MESSAGE(FATAL_ERROR "Building llvm-libc automemcpy requires Z3") +endif() + +set(LIBC_AUTOMEMCPY_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) + +add_subdirectory(lib) +add_subdirectory(unittests) diff --git a/libc/benchmarks/automemcpy/README.md b/libc/benchmarks/automemcpy/README.md new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/README.md @@ -0,0 +1,111 @@ +This folder contains an implementation of [automemcpy: A framework for automatic generation of fundamental memory operations](https://research.google/pubs/pub50338/). + +It uses the [Z3 theorem prover](https://github.com/Z3Prover/z3) to enumerate a subset of valid memory function implementations. These implementations are then materialized as C++ code and can be [benchmarked](../) against various [size distributions](../distributions). This process helps the design of efficient implementations for a particular environnement (size distribution, processor or custom compilation options). + +This is not enabled by default, as it is mostly useful when working on tuning the library implementation. To build it, use `LIBC_BUILD_AUTOMEMCPY=ON` (see below). + +## Prerequisites + +You may need to install `Z3` from source if it's not available on your system. +Here we show instructions to install it into ``. +You may need to `sudo` to `make install`. + +```shell +mkdir -p ~/git +cd ~/git +git clone https://github.com/Z3Prover/z3.git +python scripts/mk_make.py --prefix= +cd build +make -j +make install +``` + +## Configuration + +```shell +mkdir -p +cd /llvm +cmake -DCMAKE_C_COMPILER=/usr/bin/clang \ + -DCMAKE_CXX_COMPILER=/usr/bin/clang++ \ + -DLLVM_ENABLE_PROJECTS="libc" \ + -DLLVM_ENABLE_Z3_SOLVER=ON \ + -DLLVM_Z3_INSTALL_DIR= \ + -DLIBC_BUILD_AUTOMEMCPY=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -B +``` + +## Targets and compilation + +There are three main CMake targets + 1. `automemcpy_implementations` + - runs `Z3` and materializes valid memory functions as C++ code, a message will display its ondisk location. + - the source code is then compiled using the native host optimizations (i.e. `-march=native` or `-mcpu=native` depending on the architecture). + 2. `automemcpy` + - the binary that benchmarks the autogenerated implementations. + 3. `automemcpy_result_analyzer` + - the binary that analyses the benchmark results. + +You may only compile the binaries as they both pull the autogenerated code as a dependency. + +```shell +make -C -j automemcpy automemcpy_result_analyzer +``` + +## Running the benchmarks + +Make sure to save the results of the benchmark as a json file. + +```shell +/bin/automemcpy --benchmark_out_format=json --benchmark_out=/results.json +``` + +### Additional useful options + + + - `--benchmark_min_time=.2` + + By default, each function is benchmarked for at least one second, here we lower it to 200ms. + + - `--benchmark_filter="BM_Memset|BM_Bzero"` + + By default, all functions are benchmarked, here we restrict them to `memset` and `bzero`. + +Other options might be useful, use `--help` for more information. + +## Analyzing the benchmarks + +Analysis is performed by running `automemcpy_result_analyzer` on one or more json result files. + +```shell +/bin/automemcpy_result_analyzer /results.json +``` + +What it does: + 1. Gathers all throughput values for each function / distribution pair and picks the median one.\ + This allows picking a representative value over many runs of the benchmark. Please make sure all the runs happen under similar circumstances. + + 2. For each distribution, look at the span of throughputs for functions of the same type (e.g. For distribution `A`, memcpy throughput spans from 2GiB/s to 5GiB/s). + + 3. For each distribution, give a normalized score to each function (e.g. For distribution `A`, function `M` scores 0.65).\ + This score is then turned into a grade `EXCELLENT`, `VERY_GOOD`, `GOOD`, `PASSABLE`, `INADEQUATE`, `MEDIOCRE`, `BAD` - so that each distribution categorizes how function perform according to them. + + 4. A [Majority Judgement](https://en.wikipedia.org/wiki/Majority_judgment) process is then used to categorize each function. This enables finer analysis of how distributions agree on which function is better. In the following example, `Function_1` and `Function_2` are rated `EXCELLENT` but looking at the grade's distribution might help decide which is best. + +| | EXCELLENT | VERY_GOOD | GOOD | PASSABLE | INADEQUATE | MEDIOCRE | BAD | +|------------|:---------:|:---------:|:----:|:--------:|:----------:|:--------:|:---:| +| Function_1 | 7 | 1 | 2 | | | | | +| Function_2 | 6 | 4 | | | | | | + +The tool outputs the histogram of grades for each function. In case of tie, other dimensions might help decide (e.g. code size, performance on other microarchitectures). + +``` +EXCELLENT |█▁▂ | Function_0 +EXCELLENT |█▅ | Function_1 +VERY_GOOD |▂█▁ ▁ | Function_2 +GOOD | ▁█▄ | Function_3 +PASSABLE | ▂▆▄█ | Function_4 +INADEQUATE | ▃▃█▁ | Function_5 +MEDIOCRE | █▆▁| Function_6 +BAD | ▁▁█| Function_7 +``` diff --git a/libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h b/libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h @@ -0,0 +1,26 @@ +//===-- C++ code generation from NamedFunctionDescriptors -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LIBC_BENCHMARKS_AUTOMEMCPY_CODEGEN_H +#define LIBC_BENCHMARKS_AUTOMEMCPY_CODEGEN_H + +#include "automemcpy/FunctionDescriptor.h" +#include +#include +#include + +namespace llvm { +namespace automemcpy { + +// This function serializes the array of FunctionDescriptors as a C++ file. +void Serialize(raw_ostream &Stream, ArrayRef FD); + +} // namespace automemcpy +} // namespace llvm + +#endif // LIBC_BENCHMARKS_AUTOMEMCPY_CODEGEN_H diff --git a/libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h b/libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h @@ -0,0 +1,159 @@ +//===-- Pod structs to describe a memory function----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H +#define LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H + +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +namespace automemcpy { + +// Boilerplate code to be able to sort and hash types. +#define COMPARABLE_AND_HASHABLE(T, ...) \ + inline auto asTuple() const { return std::tie(__VA_ARGS__); } \ + bool operator==(const T &O) const { return asTuple() == O.asTuple(); } \ + bool operator<(const T &O) const { return asTuple() < O.asTuple(); } \ + struct Hasher { \ + std::size_t operator()(const T &K) const { \ + return llvm::hash_value(K.asTuple()); \ + } \ + }; + +// Represents the maximum value for the size parameter of a memory function. +// This is an `int` so we can use it as an expression in Z3. +// It also allows for a more readable and compact representation when storing +// the SizeSpan in the autogenerated C++ file. +static constexpr int kMaxSize = INT_MAX; + +// This mimics the `Arg` type in libc/src/string/memory_utils/elements.h without +// having to depend on it. +enum class AlignArg { _1, _2, ARRAY_SIZE }; + +// Describes a range of sizes. +// We use the begin/end representation instead of first/last to allow for empty +// range (i.e. Begin == End) +struct SizeSpan { + size_t Begin = 0; + size_t End = 0; + + COMPARABLE_AND_HASHABLE(SizeSpan, Begin, End) +}; + +// Describes a contiguous region. +// In such a region all sizes are handled individually. +// e.g. with Span = {0, 2}; +// if(size == 0) return Handle<0>(); +// if(size == 1) return Handle<1>(); +struct Contiguous { + SizeSpan Span; + + COMPARABLE_AND_HASHABLE(Contiguous, Span) +}; + +// This struct represents a range of sizes over which to use an overlapping +// strategy. An overlapping strategy of size N handles all sizes from N to 2xN. +// The span may represent several contiguous overlaps. +// e.g. with Span = {16, 128}; +// if(size >= 16 and size < 32) return Handle>(); +// if(size >= 32 and size < 64) return Handle>(); +// if(size >= 64 and size < 128) return Handle>(); +struct Overlap { + SizeSpan Span; + + COMPARABLE_AND_HASHABLE(Overlap, Span) +}; + +// Describes a region using a loop handling BlockSize bytes at a time. The +// remaining bytes of the loop are handled with an overlapping operation. +struct Loop { + SizeSpan Span; + size_t BlockSize = 0; + + COMPARABLE_AND_HASHABLE(Loop, Span, BlockSize) +}; + +// Same as `Loop` but starts by aligning a buffer on `Alignment` bytes. +// A first operation handling 'Alignment` bytes is performed followed by a +// sequence of Loop.BlockSize bytes operation. The Loop starts processing from +// the next aligned byte in the chosen buffer. The remaining bytes of the loop +// are handled with an overlapping operation. +struct AlignedLoop { + Loop Loop; + size_t Alignment = 0; // Size of the alignment. + AlignArg AlignTo = AlignArg::_1; // Which buffer to align. + + COMPARABLE_AND_HASHABLE(AlignedLoop, Loop, Alignment, AlignTo) +}; + +// Some processors offer special instruction to handle the memory function +// completely, we refer to such instructions as accelerators. +struct Accelerator { + SizeSpan Span; + + COMPARABLE_AND_HASHABLE(Accelerator, Span) +}; + +// The memory functions are assembled out of primitives that can be implemented +// with regular scalar operations (SCALAR), with the help of vector or bitcount +// instructions (NATIVE) or by deferring it to the compiler (BUILTIN). +enum class ElementTypeClass { + SCALAR, + NATIVE, + BUILTIN, +}; + +// A simple enum to categorize which function is being implemented. +enum class FunctionType { + MEMCPY, + MEMCMP, + BCMP, + MEMSET, + BZERO, +}; + +// This struct describes the skeleton of the implementation, it does not go into +// every detail but is enough to uniquely identify the implementation. +struct FunctionDescriptor { + FunctionType Type; + Optional Contiguous; + Optional Overlap; + Optional Loop; + Optional AlignedLoop; + Optional Accelerator; + ElementTypeClass ElementClass; + + COMPARABLE_AND_HASHABLE(FunctionDescriptor, Type, Contiguous, Overlap, Loop, + AlignedLoop, Accelerator, ElementClass) + + inline size_t id() const { return llvm::hash_value(asTuple()); } +}; + +// Same as above but with the function name. +struct NamedFunctionDescriptor { + StringRef Name; + FunctionDescriptor Desc; +}; + +template llvm::hash_code hash_value(const ArrayRef &V) { + return llvm::hash_combine_range(V.begin(), V.end()); +} +template llvm::hash_code hash_value(const T &O) { + return llvm::hash_value(O.asTuple()); +} + +} // namespace automemcpy +} // namespace llvm + +#endif /* LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H */ diff --git a/libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h b/libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h @@ -0,0 +1,62 @@ +//===-- Generate random but valid function descriptors ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_RANDOM_FUNCTION_GENERATOR_H +#define LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_RANDOM_FUNCTION_GENERATOR_H + +#include "automemcpy/FunctionDescriptor.h" +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +namespace automemcpy { + +// Holds the state for the constraint solver. +// It implements a single method that returns the next valid description. +struct RandomFunctionGenerator { + RandomFunctionGenerator(); + + // Get the next valid FunctionDescriptor or llvm::None. + Optional next(); + +private: + // Returns an expression where `Variable` is forced to be one of the `Values`. + z3::expr inSetConstraint(z3::expr &Variable, ArrayRef Values) const; + // Add constaints to `Begin` and `End` so that they are: + // - between 0 and kMaxSize (inclusive) + // - ordered (begin<=End) + // - amongst a set of predefined values. + void addBoundsAndAnchors(z3::expr &Begin, z3::expr &End); + // Add constraints to make sure that the loop block size is amongst a set of + // predefined values. Also makes sure that the loop that the loop is iterated + // at least `LoopMinIter` times. + void addLoopConstraints(const z3::expr &LoopBegin, const z3::expr &LoopEnd, + z3::expr &LoopBlockSize, int LoopMinIter); + + z3::context Context; + z3::solver Solver; + + z3::expr Type; + z3::expr ContiguousBegin, ContiguousEnd; + z3::expr OverlapBegin, OverlapEnd; + z3::expr LoopBegin, LoopEnd, LoopBlockSize; + z3::expr AlignedLoopBegin, AlignedLoopEnd, AlignedLoopBlockSize, + AlignedAlignment, AlignedArg; + z3::expr AcceleratorBegin, AcceleratorEnd; + z3::expr ElementClass; +}; + +} // namespace automemcpy +} // namespace llvm + +#endif /* LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_RANDOM_FUNCTION_GENERATOR_H */ diff --git a/libc/benchmarks/automemcpy/lib/CMakeLists.txt b/libc/benchmarks/automemcpy/lib/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/lib/CMakeLists.txt @@ -0,0 +1,29 @@ +add_library(automemcpy_codegen CodeGen.cpp) +target_link_libraries(automemcpy_codegen PUBLIC LLVMSupport) +target_compile_options(automemcpy_codegen PUBLIC -fno-rtti) +target_include_directories(automemcpy_codegen PUBLIC ${LIBC_AUTOMEMCPY_INCLUDE_DIR}) + +add_executable(automemcpy_codegen_main CodeGenMain.cpp RandomFunctionGenerator.cpp) +target_link_libraries(automemcpy_codegen_main PUBLIC automemcpy_codegen ${Z3_LIBRARIES}) +target_compile_options(automemcpy_codegen_main PUBLIC -fno-rtti) + +set(Implementations "${CMAKE_CURRENT_BINARY_DIR}/Implementations.cpp") +add_custom_command( + OUTPUT ${Implementations} + COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/automemcpy_codegen_main" > "${Implementations}" + COMMAND echo "automemcpy implementations generated in ${Implementations}" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + DEPENDS automemcpy_codegen_main +) + +add_library(automemcpy_implementations "${Implementations}") +target_link_libraries(automemcpy_implementations PUBLIC LLVMSupport libc-memory-benchmark) +target_include_directories(automemcpy_implementations PRIVATE ${LIBC_SOURCE_DIR} ${LIBC_AUTOMEMCPY_INCLUDE_DIR}) +target_compile_options(automemcpy_implementations PUBLIC -fno-rtti PRIVATE ${LIBC_COMPILE_OPTIONS_NATIVE} "SHELL:-mllvm -combiner-global-alias-analysis" -fno-builtin) + +add_executable(automemcpy EXCLUDE_FROM_ALL ${LIBC_SOURCE_DIR}/benchmarks/LibcMemoryGoogleBenchmarkMain.cpp) +target_link_libraries(automemcpy PRIVATE libc-memory-benchmark benchmark_main automemcpy_implementations) + +add_executable(automemcpy_result_analyzer EXCLUDE_FROM_ALL ResultAnalyzer.cpp) +target_link_libraries(automemcpy_result_analyzer PRIVATE LLVMSupport automemcpy_implementations) +target_include_directories(automemcpy_result_analyzer PRIVATE ${LIBC_AUTOMEMCPY_INCLUDE_DIR}) diff --git a/libc/benchmarks/automemcpy/lib/CodeGen.cpp b/libc/benchmarks/automemcpy/lib/CodeGen.cpp new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/lib/CodeGen.cpp @@ -0,0 +1,646 @@ +//===-- C++ code generation from NamedFunctionDescriptors -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This code is responsible for generating the "Implementation.cpp" file. +// The file is composed like this: +// +// 1. Includes +// 2. Using statements to help readability. +// 3. Source code for all the mem function implementations. +// 4. The function to retrieve all the function descriptors with their name. +// llvm::ArrayRef getFunctionDescriptors(); +// 5. The functions for the benchmarking infrastructure: +// llvm::ArrayRef getMemcpyConfigurations(); +// llvm::ArrayRef getMemcmpConfigurations(); +// llvm::ArrayRef getBcmpConfigurations(); +// llvm::ArrayRef getMemsetConfigurations(); +// llvm::ArrayRef getBzeroConfigurations(); +// +// +// Sections 3, 4 and 5 are handled by the following namespaces: +// - codegen::functions +// - codegen::descriptors +// - codegen::configurations +// +// The programming style is functionnal. In each of these namespace, the +// original `NamedFunctionDescriptor` object is turned into a different type. We +// make use of overloaded stream operators to format the resulting type into +// either a function, a descriptor or a configuration. The entry point of each +// namespace is the Serialize function. +// +// Note the code here is better understood by starting from the `Serialize` +// function at the end of the file. + +#include "automemcpy/CodeGen.h" +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +namespace automemcpy { +namespace codegen { + +// The indentation string. +static constexpr StringRef kIndent = " "; + +// The codegen namespace handles the serialization of a NamedFunctionDescriptor +// into source code for the function, the descriptor and the configuration. + +namespace functions { + +// This namespace turns a NamedFunctionDescriptor into an actual implementation. +// ----------------------------------------------------------------------------- +// e.g. +// static void memcpy_0xB20D4702493C397E(char *__restrict dst, +// const char *__restrict src, +// size_t size) { +// using namespace __llvm_libc::x86; +// if(size == 0) return; +// if(size == 1) return Copy<_1>(dst, src); +// if(size < 4) return Copy>(dst, src, size); +// if(size < 8) return Copy>(dst, src, size); +// if(size < 16) return Copy>(dst, src, size); +// if(size < 32) return Copy>(dst, src, size); +// return Copy(dst, src, size); +// } + +// The `Serialize` method turns a `NamedFunctionDescriptor` into a +// `FunctionImplementation` which holds all the information needed to produce +// the C++ source code. + +// An Element with its size (e.g. `_16` in the example above). +struct ElementType { + size_t Size; +}; +// The case `if(size == 0)` is encoded as a the Zero type. +struct Zero { + StringRef DefaultReturnValue; +}; +// An individual size `if(size == X)` is encoded as an Individual type. +struct Individual { + size_t IfEq; + ElementType Element; +}; +// An overlap strategy is encoded as an Overlap type. +struct Overlap { + size_t IfLt; + ElementType Element; +}; +// A loop strategy is encoded as a Loop type. +struct Loop { + size_t IfLt; + ElementType Element; +}; +// An aligned loop strategy is encoded as an AlignedLoop type. +struct AlignedLoop { + size_t IfLt; + ElementType Element; + ElementType Alignment; + StringRef AlignTo; +}; +// The accelerator strategy. +struct Accelerator { + size_t IfLt; +}; +// The Context stores data about the function type. +struct Context { + StringRef FunctionReturnType; // e.g. void* or int + StringRef FunctionArgs; + StringRef ElementOp; // Copy, ThreeWayCompare, SplatSet, ... + StringRef FixedSizeArgs; + StringRef RuntimeSizeArgs; + StringRef AlignArg1; + StringRef AlignArg2; + StringRef DefaultReturnValue; +}; +// A detailed representation of the function implementation mapped from the +// NamedFunctionDescriptor. +struct FunctionImplementation { + Context Ctx; + StringRef Name; + std::vector Individuals; + std::vector Overlaps; + Optional Loop; + Optional AlignedLoop; + Optional Accelerator; + ElementTypeClass ElementClass; +}; + +// Returns the Context for each FunctionType. +static Context getCtx(FunctionType FT) { + switch (FT) { + case FunctionType::MEMCPY: + return {"void", + "(char *__restrict dst, const char *__restrict src, size_t size)", + "Copy", + "(dst, src)", + "(dst, src, size)", + "Arg::Dst", + "Arg::Src", + ""}; + case FunctionType::MEMCMP: + return {"int", + "(const char * lhs, const char * rhs, size_t size)", + "ThreeWayCompare", + "(lhs, rhs)", + "(lhs, rhs, size)", + "Arg::Lhs", + "Arg::Rhs", + "0"}; + case FunctionType::MEMSET: + return {"void", + "(char * dst, int value, size_t size)", + "SplatSet", + "(dst, value)", + "(dst, value, size)", + "Arg::Dst", + "Arg::Src", + ""}; + case FunctionType::BZERO: + return {"void", "(char * dst, size_t size)", + "SplatSet", "(dst, 0)", + "(dst, 0, size)", "Arg::Dst", + "Arg::Src", ""}; + default: + report_fatal_error("Not yet implemented"); + } +} + +static StringRef getAligntoString(const Context &Ctx, const AlignArg &AlignTo) { + switch (AlignTo) { + case AlignArg::_1: + return Ctx.AlignArg1; + case AlignArg::_2: + return Ctx.AlignArg2; + case AlignArg::ARRAY_SIZE: + report_fatal_error("logic error"); + } +} + +static raw_ostream &operator<<(raw_ostream &Stream, const ElementType &E) { + return Stream << '_' << E.Size; +} +static raw_ostream &operator<<(raw_ostream &Stream, const Individual &O) { + return Stream << O.Element; +} +static raw_ostream &operator<<(raw_ostream &Stream, const Overlap &O) { + return Stream << "HeadTail<" << O.Element << '>'; +} +static raw_ostream &operator<<(raw_ostream &Stream, const Loop &O) { + return Stream << "Loop<" << O.Element << '>'; +} +static raw_ostream &operator<<(raw_ostream &Stream, const AlignedLoop &O) { + return Stream << "Align<" << O.Alignment << ',' << O.AlignTo << ">::Then<" + << Loop{O.IfLt, O.Element} << ">"; +} +static raw_ostream &operator<<(raw_ostream &Stream, const Accelerator &O) { + return Stream << "Accelerator"; +} + +template struct IfEq { + StringRef Op; + StringRef Args; + const T ∈ +}; + +template struct IfLt { + StringRef Op; + StringRef Args; + const T ∈ +}; + +static raw_ostream &operator<<(raw_ostream &Stream, const Zero &O) { + Stream << kIndent << "if(size == 0) return"; + if (!O.DefaultReturnValue.empty()) + Stream << ' ' << O.DefaultReturnValue; + return Stream << ";\n"; +} + +template +static raw_ostream &operator<<(raw_ostream &Stream, const IfEq &O) { + return Stream << kIndent << "if(size == " << O.Element.IfEq << ") return " + << O.Op << '<' << O.Element << '>' << O.Args << ";\n"; +} + +template +static raw_ostream &operator<<(raw_ostream &Stream, const IfLt &O) { + Stream << kIndent; + if (O.Element.IfLt != kMaxSize) + Stream << "if(size < " << O.Element.IfLt << ") "; + return Stream << "return " << O.Op << '<' << O.Element << '>' << O.Args + << ";\n"; +} + +static raw_ostream &operator<<(raw_ostream &Stream, + const ElementTypeClass &Class) { + switch (Class) { + case ElementTypeClass::SCALAR: + return Stream << "scalar"; + case ElementTypeClass::BUILTIN: + return Stream << "builtin"; + case ElementTypeClass::NATIVE: + // FIXME: the framework should provide a `native` namespace that redirect to + // x86, arm or other architectures. + return Stream << "x86"; + } +} + +static raw_ostream &operator<<(raw_ostream &Stream, + const FunctionImplementation &FI) { + const auto &Ctx = FI.Ctx; + Stream << "static " << Ctx.FunctionReturnType << ' ' << FI.Name + << Ctx.FunctionArgs << " {\n"; + Stream << kIndent << "using namespace __llvm_libc::" << FI.ElementClass + << ";\n"; + for (const auto &I : FI.Individuals) + if (I.Element.Size == 0) + Stream << Zero{Ctx.DefaultReturnValue}; + else + Stream << IfEq{Ctx.ElementOp, Ctx.FixedSizeArgs, I}; + for (const auto &O : FI.Overlaps) + Stream << IfLt{Ctx.ElementOp, Ctx.RuntimeSizeArgs, O}; + if (const auto &C = FI.Loop) + Stream << IfLt{Ctx.ElementOp, Ctx.RuntimeSizeArgs, *C}; + if (const auto &C = FI.AlignedLoop) + Stream << IfLt{Ctx.ElementOp, Ctx.RuntimeSizeArgs, *C}; + if (const auto &C = FI.Accelerator) + Stream << IfLt{Ctx.ElementOp, Ctx.RuntimeSizeArgs, *C}; + return Stream << "}\n"; +} + +// Turns a `NamedFunctionDescriptor` into a `FunctionImplementation` unfolding +// the contiguous and overlap region into several statements. The zero case is +// also mapped to its own type. +static FunctionImplementation +getImplementation(const NamedFunctionDescriptor &NamedFD) { + const FunctionDescriptor &FD = NamedFD.Desc; + FunctionImplementation Impl; + Impl.Ctx = getCtx(FD.Type); + Impl.Name = NamedFD.Name; + Impl.ElementClass = FD.ElementClass; + if (auto C = FD.Contiguous) + for (size_t I = C->Span.Begin; I < C->Span.End; ++I) + Impl.Individuals.push_back(Individual{I, ElementType{I}}); + if (auto C = FD.Overlap) + for (size_t I = C->Span.Begin; I < C->Span.End; I *= 2) + Impl.Overlaps.push_back(Overlap{2 * I, ElementType{I}}); + if (const auto &L = FD.Loop) + Impl.Loop = Loop{L->Span.End, ElementType{L->BlockSize}}; + if (const auto &AL = FD.AlignedLoop) + Impl.AlignedLoop = AlignedLoop{ + AL->Loop.Span.End, ElementType{AL->Loop.BlockSize}, + ElementType{AL->Alignment}, getAligntoString(Impl.Ctx, AL->AlignTo)}; + if (const auto &A = FD.Accelerator) + Impl.Accelerator = Accelerator{A->Span.End}; + return Impl; +} + +static void Serialize(raw_ostream &Stream, + ArrayRef Descriptors) { + + for (const auto &FD : Descriptors) + Stream << getImplementation(FD); +} + +} // namespace functions + +namespace descriptors { + +// This namespace generates the getFunctionDescriptors function: +// ------------------------------------------------------------- +// e.g. +// ArrayRef getFunctionDescriptors() { +// static constexpr NamedFunctionDescriptor kDescriptors[] = { +// {"memcpy_0xE00E29EE73994E2B",{FunctionType::MEMCPY,llvm::None,llvm::None,llvm::None,llvm::None,Accelerator{{0,kMaxSize}},ElementTypeClass::NATIVE}}, +// {"memcpy_0x8661D80472487AB5",{FunctionType::MEMCPY,Contiguous{{0,1}},llvm::None,llvm::None,llvm::None,Accelerator{{1,kMaxSize}},ElementTypeClass::NATIVE}}, +// ... +// }; +// return makeArrayRef(kDescriptors); +// } + +static raw_ostream &operator<<(raw_ostream &Stream, const SizeSpan &SS) { + Stream << "{" << SS.Begin << ','; + if (SS.End == kMaxSize) + Stream << "kMaxSize"; + else + Stream << SS.End; + return Stream << '}'; +} +static raw_ostream &operator<<(raw_ostream &Stream, const Contiguous &O) { + return Stream << "Contiguous{" << O.Span << '}'; +} +static raw_ostream &operator<<(raw_ostream &Stream, const Overlap &O) { + return Stream << "Overlap{" << O.Span << '}'; +} +static raw_ostream &operator<<(raw_ostream &Stream, const Loop &O) { + return Stream << "Loop{" << O.Span << ',' << O.BlockSize << '}'; +} +static raw_ostream &operator<<(raw_ostream &Stream, const AlignArg &O) { + switch (O) { + case AlignArg::_1: + return Stream << "AlignArg::_1"; + case AlignArg::_2: + return Stream << "AlignArg::_2"; + case AlignArg::ARRAY_SIZE: + report_fatal_error("logic error"); + } +} +static raw_ostream &operator<<(raw_ostream &Stream, const AlignedLoop &O) { + return Stream << "AlignedLoop{" << O.Loop << ',' << O.Alignment << ',' + << O.AlignTo << '}'; +} +static raw_ostream &operator<<(raw_ostream &Stream, const Accelerator &O) { + return Stream << "Accelerator{" << O.Span << '}'; +} +static raw_ostream &operator<<(raw_ostream &Stream, const ElementTypeClass &O) { + switch (O) { + case ElementTypeClass::SCALAR: + return Stream << "ElementTypeClass::SCALAR"; + case ElementTypeClass::BUILTIN: + return Stream << "ElementTypeClass::BUILTIN"; + case ElementTypeClass::NATIVE: + return Stream << "ElementTypeClass::NATIVE"; + } +} +static raw_ostream &operator<<(raw_ostream &Stream, const FunctionType &T) { + switch (T) { + case FunctionType::MEMCPY: + return Stream << "FunctionType::MEMCPY"; + case FunctionType::MEMCMP: + return Stream << "FunctionType::MEMCMP"; + case FunctionType::BCMP: + return Stream << "FunctionType::BCMP"; + case FunctionType::MEMSET: + return Stream << "FunctionType::MEMSET"; + case FunctionType::BZERO: + return Stream << "FunctionType::BZERO"; + } +} +template +static raw_ostream &operator<<(raw_ostream &Stream, + const llvm::Optional &MaybeT) { + if (MaybeT) + return Stream << *MaybeT; + return Stream << "llvm::None"; +} +static raw_ostream &operator<<(raw_ostream &Stream, + const FunctionDescriptor &FD) { + return Stream << '{' << FD.Type << ',' << FD.Contiguous << ',' << FD.Overlap + << ',' << FD.Loop << ',' << FD.AlignedLoop << ',' + << FD.Accelerator << ',' << FD.ElementClass << '}'; +} +static raw_ostream &operator<<(raw_ostream &Stream, + const NamedFunctionDescriptor &NFD) { + return Stream << '{' << '"' << NFD.Name << '"' << ',' << NFD.Desc << '}'; +} +template +static raw_ostream &operator<<(raw_ostream &Stream, + const std::vector &VectorT) { + Stream << '{'; + bool First = true; + for (const auto &Obj : VectorT) { + if (!First) + Stream << ','; + Stream << Obj; + First = false; + } + return Stream << '}'; +} + +static void Serialize(raw_ostream &Stream, + ArrayRef Descriptors) { + Stream << R"(ArrayRef getFunctionDescriptors() { + static constexpr NamedFunctionDescriptor kDescriptors[] = { +)"; + for (size_t I = 0, E = Descriptors.size(); I < E; ++I) { + Stream << kIndent << kIndent << Descriptors[I] << ",\n"; + } + Stream << R"( }; + return makeArrayRef(kDescriptors); +} +)"; +} + +} // namespace descriptors + +namespace configurations { + +// This namespace generates the getXXXConfigurations functions: +// ------------------------------------------------------------ +// e.g. +// llvm::ArrayRef getMemcpyConfigurations() { +// using namespace __llvm_libc; +// static constexpr MemcpyConfiguration kConfigurations[] = { +// {Wrap, "memcpy_0xE00E29EE73994E2B"}, +// {Wrap, "memcpy_0x8661D80472487AB5"}, +// ... +// }; +// return llvm::makeArrayRef(kConfigurations); +// } + +// The `Wrap` template function is provided in the `Main` function below. +// It is used to adapt the gnerated code to the prototype of the C function. +// For instance, the generated code for a `memcpy` takes `char*` pointers and +// returns nothing but the original C `memcpy` function take and returns `void*` +// pointers. + +struct FunctionName { + FunctionType ForType; +}; + +struct ReturnType { + FunctionType ForType; +}; + +struct Configuration { + FunctionName Name; + ReturnType Type; + std::vector Descriptors; +}; + +static raw_ostream &operator<<(raw_ostream &Stream, const FunctionName &FN) { + switch (FN.ForType) { + case FunctionType::MEMCPY: + return Stream << "getMemcpyConfigurations"; + case FunctionType::MEMCMP: + return Stream << "getMemcmpConfigurations"; + case FunctionType::BCMP: + return Stream << "getBcmpConfigurations"; + case FunctionType::MEMSET: + return Stream << "getMemsetConfigurations"; + case FunctionType::BZERO: + return Stream << "getBzeroConfigurations"; + } +} + +static raw_ostream &operator<<(raw_ostream &Stream, const ReturnType &RT) { + switch (RT.ForType) { + case FunctionType::MEMCPY: + return Stream << "MemcpyConfiguration"; + case FunctionType::MEMCMP: + case FunctionType::BCMP: + return Stream << "MemcmpOrBcmpConfiguration"; + case FunctionType::MEMSET: + return Stream << "MemsetConfiguration"; + case FunctionType::BZERO: + return Stream << "BzeroConfiguration"; + } +} + +static raw_ostream &operator<<(raw_ostream &Stream, + const NamedFunctionDescriptor *FD) { + return Stream << formatv("{Wrap<{0}>, \"{0}\"}", FD->Name); +} + +static raw_ostream & +operator<<(raw_ostream &Stream, + const std::vector &Descriptors) { + for (size_t I = 0, E = Descriptors.size(); I < E; ++I) + Stream << kIndent << kIndent << Descriptors[I] << ",\n"; + return Stream; +} + +static raw_ostream &operator<<(raw_ostream &Stream, const Configuration &C) { + Stream << "llvm::ArrayRef<" << C.Type << "> " << C.Name << "() {\n"; + if (C.Descriptors.empty()) + Stream << kIndent << "return {};\n"; + else { + Stream << kIndent << "using namespace __llvm_libc;\n"; + Stream << kIndent << "static constexpr " << C.Type + << " kConfigurations[] = {\n"; + Stream << C.Descriptors; + Stream << kIndent << "};\n"; + Stream << kIndent << "return llvm::makeArrayRef(kConfigurations);\n"; + } + Stream << "}\n"; + return Stream; +} + +static void Serialize(raw_ostream &Stream, FunctionType FT, + ArrayRef Descriptors) { + Configuration Conf; + Conf.Name = {FT}; + Conf.Type = {FT}; + for (const auto &FD : Descriptors) + if (FD.Desc.Type == FT) + Conf.Descriptors.push_back(&FD); + Stream << Conf; +} + +} // namespace configurations +static void Serialize(raw_ostream &Stream, + ArrayRef Descriptors) { + Stream << "// This file is auto-generated by libc/benchmarks/automemcpy.\n"; + Stream << "// Functions : " << Descriptors.size() << "\n"; + Stream << "\n"; + Stream << "#include \"LibcFunctionPrototypes.h\"\n"; + Stream << "#include \"automemcpy/FunctionDescriptor.h\"\n"; + Stream << "#include \"src/string/memory_utils/elements.h\"\n"; + Stream << "\n"; + Stream << "using llvm::libc_benchmarks::BzeroConfiguration;\n"; + Stream << "using llvm::libc_benchmarks::MemcmpOrBcmpConfiguration;\n"; + Stream << "using llvm::libc_benchmarks::MemcpyConfiguration;\n"; + Stream << "using llvm::libc_benchmarks::MemsetConfiguration;\n"; + Stream << "\n"; + Stream << "namespace __llvm_libc {\n"; + Stream << "\n"; + codegen::functions::Serialize(Stream, Descriptors); + Stream << "\n"; + Stream << "} // namespace __llvm_libc\n"; + Stream << "\n"; + Stream << "namespace llvm {\n"; + Stream << "namespace automemcpy {\n"; + Stream << "\n"; + codegen::descriptors::Serialize(Stream, Descriptors); + Stream << "\n"; + Stream << "} // namespace automemcpy\n"; + Stream << "} // namespace llvm\n"; + Stream << "\n"; + Stream << R"( +using MemcpyStub = void (*)(char *__restrict, const char *__restrict, size_t); +template +void *Wrap(void *__restrict dst, const void *__restrict src, size_t size) { + Foo(reinterpret_cast(dst), + reinterpret_cast(src), size); + return dst; +} +)"; + codegen::configurations::Serialize(Stream, FunctionType::MEMCPY, Descriptors); + Stream << R"( +using MemcmpStub = int (*)(const char *, const char *, size_t); +template +int Wrap(const void *lhs, const void *rhs, size_t size) { + return Foo(reinterpret_cast(lhs), + reinterpret_cast(rhs), size); +} +)"; + codegen::configurations::Serialize(Stream, FunctionType::MEMCMP, Descriptors); + codegen::configurations::Serialize(Stream, FunctionType::BCMP, Descriptors); + Stream << R"( +using MemsetStub = void (*)(char *, int, size_t); +template void *Wrap(void *dst, int value, size_t size) { + Foo(reinterpret_cast(dst), value, size); + return dst; +} +)"; + codegen::configurations::Serialize(Stream, FunctionType::MEMSET, Descriptors); + Stream << R"( +using BzeroStub = void (*)(char *, size_t); +template void Wrap(void *dst, size_t size) { + Foo(reinterpret_cast(dst), size); +} +)"; + codegen::configurations::Serialize(Stream, FunctionType::BZERO, Descriptors); + Stream << "// Functions : " << Descriptors.size() << "\n"; +} + +} // namespace codegen + +// Stores `VolatileStr` into a cache and returns a StringRef of the cached +// version. +StringRef getInternalizedString(std::string VolatileStr) { + static llvm::StringSet<> StringCache; + return StringCache.insert(std::move(VolatileStr)).first->getKey(); +} + +static StringRef getString(FunctionType FT) { + switch (FT) { + case FunctionType::MEMCPY: + return "memcpy"; + case FunctionType::MEMCMP: + return "memcmp"; + case FunctionType::BCMP: + return "bcmp"; + case FunctionType::MEMSET: + return "memset"; + case FunctionType::BZERO: + return "bzero"; + } +} + +void Serialize(raw_ostream &Stream, ArrayRef Descriptors) { + std::vector FunctionDescriptors; + FunctionDescriptors.reserve(Descriptors.size()); + for (auto &FD : Descriptors) { + FunctionDescriptors.emplace_back(); + FunctionDescriptors.back().Name = getInternalizedString( + formatv("{0}_{1:X16}", getString(FD.Type), FD.id())); + FunctionDescriptors.back().Desc = std::move(FD); + } + // Sort functions so they are easier to spot in the generated C++ file. + std::sort(FunctionDescriptors.begin(), FunctionDescriptors.end(), + [](const NamedFunctionDescriptor &A, + const NamedFunctionDescriptor &B) { return A.Desc < B.Desc; }); + codegen::Serialize(Stream, FunctionDescriptors); +} + +} // namespace automemcpy +} // namespace llvm diff --git a/libc/benchmarks/automemcpy/lib/CodeGenMain.cpp b/libc/benchmarks/automemcpy/lib/CodeGenMain.cpp new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/lib/CodeGenMain.cpp @@ -0,0 +1,28 @@ +#include "automemcpy/CodeGen.h" +#include "automemcpy/RandomFunctionGenerator.h" +#include + +namespace llvm { +namespace automemcpy { + +std::vector generateFunctionDescriptors() { + std::unordered_set Seen; + std::vector FunctionDescriptors; + RandomFunctionGenerator P; + while (Optional MaybeFD = P.next()) { + FunctionDescriptor FD = *MaybeFD; + if (Seen.count(FD)) // FIXME: Z3 sometimes returns twice the same object. + continue; + Seen.insert(FD); + FunctionDescriptors.push_back(std::move(FD)); + } + return FunctionDescriptors; +} + +} // namespace automemcpy +} // namespace llvm + +int main(int, char **) { + llvm::automemcpy::Serialize(llvm::outs(), + llvm::automemcpy::generateFunctionDescriptors()); +} diff --git a/libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp b/libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp @@ -0,0 +1,279 @@ +//===-- Generate random but valid function descriptors -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "automemcpy/RandomFunctionGenerator.h" + +#include +#include +#include + +#include + +namespace llvm { +namespace automemcpy { + +// Exploration parameters +// ---------------------- +// Here we define a set of values that will contraint the exploration and +// prevent combinatorial explosion. + +// We limit the number of cases for individual sizes to sizes up to 4. +// More individual sizes don't bring much over the overlapping strategy. +static constexpr int kMaxIndividualSize = 4; + +// We limit Overlapping Strategy to sizes up to 256. +// An overlap of 256B means accessing 128B at once which is usually not +// feasible by current CPUs. We rely on the compiler to generate multiple +// loads/stores if needed but higher sizes are unlikely to benefit from hardware +// acceleration. +static constexpr int kMaxOverlapSize = 256; + +// For the loop strategies, we make sure that they iterate at least a certain +// number of times to amortize the cost of looping. +static constexpr int kLoopMinIter = 3; +static constexpr int kAlignedLoopMinIter = 2; + +// We restrict the size of the block of data to handle in a loop. +// Generally speaking block size <= 16 perform poorly. +static constexpr int kLoopBlockSize[] = {16, 32, 64}; + +// We restrict alignment to the following values. +static constexpr int kLoopAlignments[] = {16, 32, 64}; + +// We make sure that the region bounds are one of the following values. +static constexpr int kAnchors[] = {0, 1, 2, 4, 8, 16, 32, 48, + 64, 96, 128, 256, 512, 1024, kMaxSize}; + +// We also allow disabling loops, aligned loops and accelerators. +static constexpr bool kDisableLoop = false; +static constexpr bool kDisableAlignedLoop = false; +static constexpr bool kDisableAccelerator = false; + +// For memcpy, we can also explore the whether aligning on source or destination +// has an effect. +static constexpr bool kExploreAlignmentArg = true; + +// The function we generate code for. +// BCMP is specifically disabled for now. +static constexpr int kFunctionTypes[] = { + (int)FunctionType::MEMCPY, + (int)FunctionType::MEMCMP, + // (int)FunctionType::BCMP, + (int)FunctionType::MEMSET, + (int)FunctionType::BZERO, +}; + +// The actual implementation of each function can be handled via primitive types +// (SCALAR), vector types where available (NATIVE) or by the compiler (BUILTIN). +// We want to move toward delegating the code generation entirely to the +// compiler but for now we have to make use of -per microarchitecture- custom +// implementations. Scalar being more portable but also less performant, we +// remove it as well. +static constexpr int kElementClasses[] = { + // (int)ElementTypeClass::SCALAR, + (int)ElementTypeClass::NATIVE, + // (int)ElementTypeClass::BUILTIN +}; + +RandomFunctionGenerator::RandomFunctionGenerator() + : Solver(Context), Type(Context.int_const("Type")), + ContiguousBegin(Context.int_const("ContiguousBegin")), + ContiguousEnd(Context.int_const("ContiguousEnd")), + OverlapBegin(Context.int_const("OverlapBegin")), + OverlapEnd(Context.int_const("OverlapEnd")), + LoopBegin(Context.int_const("LoopBegin")), + LoopEnd(Context.int_const("LoopEnd")), + LoopBlockSize(Context.int_const("LoopBlockSize")), + AlignedLoopBegin(Context.int_const("AlignedLoopBegin")), + AlignedLoopEnd(Context.int_const("AlignedLoopEnd")), + AlignedLoopBlockSize(Context.int_const("AlignedLoopBlockSize")), + AlignedAlignment(Context.int_const("AlignedAlignment")), + AlignedArg(Context.int_const("AlignedArg")), + AcceleratorBegin(Context.int_const("AcceleratorBegin")), + AcceleratorEnd(Context.int_const("AcceleratorEnd")), + ElementClass(Context.int_const("ElementClass")) { + // All possible functions. + Solver.add(inSetConstraint(Type, kFunctionTypes)); + + // Add constraints for region bounds. + addBoundsAndAnchors(ContiguousBegin, ContiguousEnd); + addBoundsAndAnchors(OverlapBegin, OverlapEnd); + addBoundsAndAnchors(LoopBegin, LoopEnd); + addBoundsAndAnchors(AlignedLoopBegin, AlignedLoopEnd); + addBoundsAndAnchors(AcceleratorBegin, AcceleratorEnd); + // We always consider strategies in this order, and we + // always end with the `Accelerator` strategy, as it's typically more + // efficient for large sizes. + // Contiguous <= Overlap <= Loop <= AlignedLoop <= Accelerator + Solver.add(ContiguousEnd == OverlapBegin); + Solver.add(OverlapEnd == LoopBegin); + Solver.add(LoopEnd == AlignedLoopBegin); + Solver.add(AlignedLoopEnd == AcceleratorBegin); + // Fix endpoints: The minimum size that we want to copy is 0, and we always + // start with the `Contiguous` strategy. The max size is `kMaxSize`. + Solver.add(ContiguousBegin == 0); + Solver.add(AcceleratorEnd == kMaxSize); + // Contiguous + Solver.add(ContiguousEnd <= kMaxIndividualSize + 1); + // Overlap + Solver.add(OverlapEnd <= kMaxOverlapSize + 1); + // Overlap only ever makes sense when accessing multiple bytes at a time. + // i.e. Overlap<1> is useless. + Solver.add(OverlapBegin == OverlapEnd || OverlapBegin >= 2); + // Loop + addLoopConstraints(LoopBegin, LoopEnd, LoopBlockSize, kLoopMinIter); + // Aligned Loop + addLoopConstraints(AlignedLoopBegin, AlignedLoopEnd, AlignedLoopBlockSize, + kAlignedLoopMinIter); + Solver.add(inSetConstraint(AlignedAlignment, kLoopAlignments)); + Solver.add(AlignedLoopBegin == AlignedLoopEnd || AlignedLoopBegin >= 64); + Solver.add(AlignedLoopBlockSize >= AlignedAlignment); + Solver.add(AlignedLoopBlockSize >= LoopBlockSize); + z3::expr IsMemcpy = Type == (int)FunctionType::MEMCPY; + z3::expr ExploreAlignment = IsMemcpy && kExploreAlignmentArg; + Solver.add( + (ExploreAlignment && + inSetConstraint(AlignedArg, {(int)AlignArg::_1, (int)AlignArg::_2})) || + (!ExploreAlignment && AlignedArg == (int)AlignArg::_1)); + // Accelerator + Solver.add(IsMemcpy || + (AcceleratorBegin == + AcceleratorEnd)); // Only Memcpy has accelerator for now. + // Element classes + Solver.add(inSetConstraint(ElementClass, kElementClasses)); + + if (kDisableLoop) + Solver.add(LoopBegin == LoopEnd); + if (kDisableAlignedLoop) + Solver.add(AlignedLoopBegin == AlignedLoopEnd); + if (kDisableAccelerator) + Solver.add(AcceleratorBegin == AcceleratorEnd); +} + +// Creates SizeSpan from Begin/End values. +// Returns llvm::None if Begin==End. +static Optional AsSizeSpan(size_t Begin, size_t End) { + if (Begin == End) + return None; + SizeSpan SS; + SS.Begin = Begin; + SS.End = End; + return SS; +} + +// Generic method to create a `Region` struct with a Span or None if span is +// empty. +template +static Optional As(size_t Begin, size_t End) { + if (auto Span = AsSizeSpan(Begin, End)) { + Region Output; + Output.Span = *Span; + return Output; + } + return None; +} + +// Returns a Loop struct or None if span is empty. +static Optional AsLoop(size_t Begin, size_t End, size_t BlockSize) { + if (auto Span = AsSizeSpan(Begin, End)) { + Loop Output; + Output.Span = *Span; + Output.BlockSize = BlockSize; + return Output; + } + return None; +} + +// Returns an AlignedLoop struct or None if span is empty. +static Optional AsAlignedLoop(size_t Begin, size_t End, + size_t BlockSize, size_t Alignment, + AlignArg AlignTo) { + if (auto Loop = AsLoop(Begin, End, BlockSize)) { + AlignedLoop Output; + Output.Loop = *Loop; + Output.Alignment = Alignment; + Output.AlignTo = AlignTo; + return Output; + } + return None; +} + +Optional RandomFunctionGenerator::next() { + if (Solver.check() != z3::sat) + return {}; + + z3::model m = Solver.get_model(); + + // Helper method to get the current numerical value of a z3::expr. + const auto E = [&m](z3::expr &V) -> int { + return m.eval(V).get_numeral_int(); + }; + + // Fill is the function descriptor to return. + FunctionDescriptor R; + R.Type = FunctionType(E(Type)); + R.Contiguous = As(E(ContiguousBegin), E(ContiguousEnd)); + R.Overlap = As(E(OverlapBegin), E(OverlapEnd)); + R.Loop = AsLoop(E(LoopBegin), E(LoopEnd), E(LoopBlockSize)); + R.AlignedLoop = AsAlignedLoop(E(AlignedLoopBegin), E(AlignedLoopEnd), + E(AlignedLoopBlockSize), E(AlignedAlignment), + AlignArg(E(AlignedArg))); + R.Accelerator = As(E(AcceleratorBegin), E(AcceleratorEnd)); + R.ElementClass = ElementTypeClass(E(ElementClass)); + + // Express current state as a set of constraints. + z3::expr CurrentLayout = + (Type == E(Type)) && (ContiguousBegin == E(ContiguousBegin)) && + (ContiguousEnd == E(ContiguousEnd)) && + (OverlapBegin == E(OverlapBegin)) && (OverlapEnd == E(OverlapEnd)) && + (LoopBegin == E(LoopBegin)) && (LoopEnd == E(LoopEnd)) && + (LoopBlockSize == E(LoopBlockSize)) && + (AlignedLoopBegin == E(AlignedLoopBegin)) && + (AlignedLoopEnd == E(AlignedLoopEnd)) && + (AlignedLoopBlockSize == E(AlignedLoopBlockSize)) && + (AlignedAlignment == E(AlignedAlignment)) && + (AlignedArg == E(AlignedArg)) && + (AcceleratorBegin == E(AcceleratorBegin)) && + (AcceleratorEnd == E(AcceleratorEnd)) && + (ElementClass == E(ElementClass)); + + // Ask solver to never show this configuration ever again. + Solver.add(!CurrentLayout); + return R; +} + +// Make sure `Variable` is one of the provided values. +z3::expr RandomFunctionGenerator::inSetConstraint(z3::expr &Variable, + ArrayRef Values) const { + z3::expr_vector Args(Variable.ctx()); + for (int Value : Values) + Args.push_back(Variable == Value); + return z3::mk_or(Args); +} + +void RandomFunctionGenerator::addBoundsAndAnchors(z3::expr &Begin, + z3::expr &End) { + // Begin and End are picked amongst a set of predefined values. + Solver.add(inSetConstraint(Begin, kAnchors)); + Solver.add(inSetConstraint(End, kAnchors)); + Solver.add(Begin >= 0); + Solver.add(Begin <= End); + Solver.add(End <= kMaxSize); +} + +void RandomFunctionGenerator::addLoopConstraints(const z3::expr &LoopBegin, + const z3::expr &LoopEnd, + z3::expr &LoopBlockSize, + int LoopMinIter) { + Solver.add(inSetConstraint(LoopBlockSize, kLoopBlockSize)); + Solver.add(LoopBegin == LoopEnd || + (LoopBegin > (LoopMinIter * LoopBlockSize))); +} + +} // namespace automemcpy +} // namespace llvm diff --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp @@ -0,0 +1,345 @@ +//===-- JSON serialization routines ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This code analyzes the json file produced by the `automemcpy` binary. +// It works as follows: +// - Reads one or more json files. +// - If there are several runs for the same function and distribution, picks the +// median throughput (aka `BytesPerSecond`). +// - Aggregates the throughput per distributions and scores them from worst (0) +// to best (1). +// - Each distribution categorizes each function into one of the following +// categories: EXCELLENT, VERY_GOOD, GOOD, PASSABLE, INADEQUATE, MEDIOCRE, +// BAD. +// - A process similar to the Majority Judgment voting system is used to `elect` +// the best function. The histogram of grades is returned so we can +// distinguish between functions with the same final grade. In the following +// example both functions grade EXCELLENT but we may prefer the second one. +// +// | | EXCELLENT | VERY_GOOD | GOOD | PASSABLE | ... +// |------------|-----------|-----------|------|----------| ... +// | Function_1 | 7 | 1 | 2 | | ... +// | Function_2 | 6 | 4 | | | ... + +#include "LibcFunctionPrototypes.h" +#include "automemcpy/FunctionDescriptor.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/MemoryBuffer.h" +#include +#include +#include +#include +#include + +namespace llvm { + +static cl::list InputFilenames(cl::Positional, cl::OneOrMore, + cl::desc("")); + +namespace automemcpy { + +extern ArrayRef getFunctionDescriptors(); + +static StringMap createFunctionDescriptorMap() { + StringMap Descriptors; + for (const NamedFunctionDescriptor &FD : getFunctionDescriptors()) + Descriptors.insert_or_assign(FD.Name, &FD.Desc); + return Descriptors; +} + +static const FunctionDescriptor &getFunctionDescriptor(StringRef FunctionName) { + static StringMap Descriptors = + createFunctionDescriptorMap(); + const auto *FD = Descriptors.lookup(FunctionName); + if (!FD) + report_fatal_error( + Twine("No FunctionDescriptor for ").concat(FunctionName)); + return *FD; +} + +struct FunctionId { + StringRef Name; + FunctionType Type; + COMPARABLE_AND_HASHABLE(FunctionId, Type, Name) +}; + +struct DistributionId { + StringRef Name; + COMPARABLE_AND_HASHABLE(DistributionId, Name) +}; + +struct SampleId { + FunctionId Function; + DistributionId Distribution; + COMPARABLE_AND_HASHABLE(SampleId, Function.Type, Function.Name, + Distribution.Name) +}; + +struct Sample { + SampleId Id; + double BytesPerSecond = 0; +}; + +struct Grade { + enum GradeEnum { + EXCELLENT, + VERY_GOOD, + GOOD, + PASSABLE, + INADEQUATE, + MEDIOCRE, + BAD, + ARRAY_SIZE, + }; + + static StringRef getString(const GradeEnum &GE) { + switch (GE) { + case EXCELLENT: + return "EXCELLENT"; + case VERY_GOOD: + return "VERY_GOOD"; + case GOOD: + return "GOOD"; + case PASSABLE: + return "PASSABLE"; + case INADEQUATE: + return "INADEQUATE"; + case MEDIOCRE: + return "MEDIOCRE"; + case BAD: + return "BAD"; + case ARRAY_SIZE: + report_fatal_error("logic error"); + } + } + + static GradeEnum judge(double Score) { + if (Score >= 6. / 7) + return EXCELLENT; + if (Score >= 5. / 7) + return VERY_GOOD; + if (Score >= 4. / 7) + return GOOD; + if (Score >= 3. / 7) + return PASSABLE; + if (Score >= 2. / 7) + return INADEQUATE; + if (Score >= 1. / 7) + return MEDIOCRE; + return BAD; + } +}; + +// A GradeEnum indexed array with counts for each grade. +using GradeHistogram = std::array; + +void PrintUTF8(raw_ostream &Stream, const GradeHistogram &GH) { + static constexpr std::array kCharacters = { + " ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"}; + + const size_t Max = *std::max_element(GH.begin(), GH.end()); + for (size_t I = 0; I < GH.size(); ++I) { + size_t Index = (float(GH[I]) / Max) * (kCharacters.size() - 1); + Stream << kCharacters.at(Index); + } +} + +struct FunctionData { + FunctionId Id; + StringMap Throughputs; // Median of samples per distribution + StringMap Scores; // Normalized score per distribution + StringMap GradeAttribution; // Grade per distribution + GradeHistogram GradeHisto = {}; // GradeEnum indexed array + Grade::GradeEnum FinalGrade = Grade::BAD; // Overall grade for this function +}; + +static std::vector getThroughputs(ArrayRef Samples) { + std::unordered_map, SampleId::Hasher> + BucketedSamples; + for (const auto &S : Samples) + BucketedSamples[S.Id].push_back(S.BytesPerSecond); + std::unordered_map, FunctionId::Hasher> + Throughputs; + for (auto &Pair : BucketedSamples) { + const auto &Id = Pair.first; + auto &Values = Pair.second; + std::sort(Values.begin(), Values.end()); + const double MedianValue = Values[Values.size() / 2]; + Throughputs[Id.Function][Id.Distribution.Name] = MedianValue; + } + std::vector Output; + for (auto &Pair : Throughputs) { + FunctionData Data; + Data.Id = Pair.first; + Data.Throughputs = std::move(Pair.second); + Output.push_back(std::move(Data)); + } + return Output; +} + +static void fillScores(MutableArrayRef Functions) { + // A key to bucket throughput per function type and distribution. + struct Key { + FunctionType Type; + StringRef Distribution; + + COMPARABLE_AND_HASHABLE(Key, Type, Distribution) + }; + // Tracks minimum and maximum values. + struct MinMax { + double Min = std::numeric_limits::max(); + double Max = std::numeric_limits::min(); + void update(double Value) { + if (Value < Min) + Min = Value; + if (Value > Max) + Max = Value; + } + double normalize(double Value) const { return (Value - Min) / (Max - Min); } + }; + + std::unordered_map ThroughputMinMax; + for (const auto &Function : Functions) { + const FunctionType Type = Function.Id.Type; + for (const auto &Pair : Function.Throughputs) { + const auto &Distribution = Pair.getKey(); + const double Throughput = Pair.getValue(); + const Key K{Type, Distribution}; + ThroughputMinMax[K].update(Throughput); + } + } + for (auto &Function : Functions) { + const FunctionType Type = Function.Id.Type; + for (const auto &Pair : Function.Throughputs) { + const auto &Distribution = Pair.getKey(); + const double Throughput = Pair.getValue(); + const Key K{Type, Distribution}; + Function.Scores[Distribution] = ThroughputMinMax[K].normalize(Throughput); + } + } +} + +static void castVotes(MutableArrayRef Functions) { + for (FunctionData &Function : Functions) + for (const auto &Pair : Function.Scores) { + const StringRef Distribution = Pair.getKey(); + const double Score = Pair.getValue(); + const auto G = Grade::judge(Score); + ++(Function.GradeHisto[G]); + Function.GradeAttribution[Distribution] = G; + } + + for (FunctionData &Function : Functions) { + const auto &GradeHisto = Function.GradeHisto; + const size_t Votes = + std::accumulate(GradeHisto.begin(), GradeHisto.end(), 0U); + const size_t MedianVote = Votes / 2; + size_t CountedVotes = 0; + Grade::GradeEnum MedianGrade = Grade::BAD; + for (size_t I = 0; I < GradeHisto.size(); ++I) { + CountedVotes += GradeHisto[I]; + if (CountedVotes > MedianVote) { + MedianGrade = Grade::GradeEnum(I); + break; + } + } + Function.FinalGrade = MedianGrade; + } +} + +struct JsonFile { + std::vector Samples; +}; + +static StringRef getInternalizedString(StringRef VolatileStr) { + static llvm::StringSet<> StringCache; + return StringCache.insert(VolatileStr).first->getKey(); +} + +bool fromJSON(const json::Value &V, Sample &Out, json::Path P) { + std::string Label; + json::ObjectMapper O(V, P); + if (O && O.map("bytes_per_second", Out.BytesPerSecond) && + O.map("label", Label)) { + const auto LabelPair = StringRef(Label).split(','); + Out.Id.Function.Name = getInternalizedString(LabelPair.first); + Out.Id.Function.Type = getFunctionDescriptor(LabelPair.first).Type; + Out.Id.Distribution.Name = getInternalizedString(LabelPair.second); + return true; + } + return false; +} + +bool fromJSON(const json::Value &V, JsonFile &JF, json::Path P) { + json::ObjectMapper O(V, P); + return O && O.map("benchmarks", JF.Samples); +} + +static ExitOnError ExitOnErr; + +JsonFile parseJsonResults(StringRef Filename) { + auto Buf = ExitOnErr(errorOrToExpected( + MemoryBuffer::getFile(Filename, /*bool IsText=*/true, + /*RequiresNullTerminator=*/false))); + auto JsonValue = ExitOnErr(json::parse(Buf->getBuffer())); + json::Path::Root Root; + JsonFile JF; + if (!fromJSON(JsonValue, JF, Root)) + ExitOnErr(Root.getError()); + return JF; +} + +int Main(int argc, char **argv) { + ExitOnErr.setBanner(std::string(argv[0]) + ": "); + cl::ParseCommandLineOptions(argc, argv, "Automemcpy Json Results Analyzer\n"); + std::vector Samples; + + for (const auto &Filename : InputFilenames) { + auto Result = parseJsonResults(Filename); + llvm::append_range(Samples, Result.Samples); + } + + std::vector Functions = getThroughputs(Samples); + fillScores(Functions); + castVotes(Functions); + + // TODO: Implement tie breaking algorithm. + std::sort(Functions.begin(), Functions.end(), + [](const FunctionData &A, const FunctionData &B) { + return A.FinalGrade < B.FinalGrade; + }); + + // Present data by function type. + std::stable_sort(Functions.begin(), Functions.end(), + [](const FunctionData &A, const FunctionData &B) { + return A.Id.Type < B.Id.Type; + }); + + for (const FunctionData &Function : Functions) { + outs() << formatv("{0,-10}", Grade::getString(Function.FinalGrade)); + outs() << " |"; + PrintUTF8(outs(), Function.GradeHisto); + outs() << "| "; + outs().resetColor(); + outs() << formatv("{0,+25}", Function.Id.Name); + outs() << "\n"; + } + + return EXIT_SUCCESS; +} + +} // namespace automemcpy +} // namespace llvm + +int main(int argc, char **argv) { return llvm::automemcpy::Main(argc, argv); } diff --git a/libc/benchmarks/automemcpy/unittests/CMakeLists.txt b/libc/benchmarks/automemcpy/unittests/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/unittests/CMakeLists.txt @@ -0,0 +1,4 @@ +add_libc_benchmark_unittest(libc-automemcpy-codegen-test + SRCS CodeGenTest.cpp + DEPENDS automemcpy_codegen +) diff --git a/libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp b/libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp @@ -0,0 +1,219 @@ +//===-- Automemcpy CodeGen Test -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "automemcpy/CodeGen.h" +#include "automemcpy/RandomFunctionGenerator.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using testing::AllOf; +using testing::AnyOf; +using testing::ElementsAre; +using testing::Ge; +using testing::Gt; +using testing::Le; +using testing::Lt; + +namespace llvm { +namespace automemcpy { +namespace { + +TEST(Automemcpy, Codegen) { + static constexpr FunctionDescriptor kDescriptors[] = { + {FunctionType::MEMCPY, llvm::None, llvm::None, llvm::None, llvm::None, + Accelerator{{0, kMaxSize}}, ElementTypeClass::NATIVE}, + {FunctionType::MEMCPY, Contiguous{{0, 4}}, Overlap{{4, 256}}, + Loop{{256, kMaxSize}, 64}, llvm::None, llvm::None, + ElementTypeClass::NATIVE}, + {FunctionType::MEMCMP, Contiguous{{0, 2}}, Overlap{{2, 64}}, llvm::None, + AlignedLoop{Loop{{64, kMaxSize}, 16}, 16, AlignArg::_1}, llvm::None, + ElementTypeClass::NATIVE}, + {FunctionType::MEMSET, Contiguous{{0, 2}}, Overlap{{2, 256}}, llvm::None, + AlignedLoop{Loop{{256, kMaxSize}, 32}, 16, AlignArg::_1}, llvm::None, + ElementTypeClass::NATIVE}, + {FunctionType::MEMSET, Contiguous{{0, 2}}, Overlap{{2, 256}}, llvm::None, + AlignedLoop{Loop{{256, kMaxSize}, 32}, 32, AlignArg::_1}, llvm::None, + ElementTypeClass::NATIVE}, + {FunctionType::BZERO, Contiguous{{0, 4}}, Overlap{{4, 128}}, llvm::None, + AlignedLoop{Loop{{128, kMaxSize}, 32}, 32, AlignArg::_1}, llvm::None, + ElementTypeClass::NATIVE}, + }; + + std::string Output; + raw_string_ostream OutputStream(Output); + Serialize(OutputStream, kDescriptors); + + EXPECT_STREQ(OutputStream.str().c_str(), + R"(// This file is auto-generated by libc/benchmarks/automemcpy. +// Functions : 6 + +#include "LibcFunctionPrototypes.h" +#include "automemcpy/FunctionDescriptor.h" +#include "src/string/memory_utils/elements.h" + +using llvm::libc_benchmarks::BzeroConfiguration; +using llvm::libc_benchmarks::MemcmpOrBcmpConfiguration; +using llvm::libc_benchmarks::MemcpyConfiguration; +using llvm::libc_benchmarks::MemsetConfiguration; + +namespace __llvm_libc { + +static void memcpy_0xE00E29EE73994E2B(char *__restrict dst, const char *__restrict src, size_t size) { + using namespace __llvm_libc::x86; + return Copy(dst, src, size); +} +static void memcpy_0x7381B60C7BE75EF9(char *__restrict dst, const char *__restrict src, size_t size) { + using namespace __llvm_libc::x86; + if(size == 0) return; + if(size == 1) return Copy<_1>(dst, src); + if(size == 2) return Copy<_2>(dst, src); + if(size == 3) return Copy<_3>(dst, src); + if(size < 8) return Copy>(dst, src, size); + if(size < 16) return Copy>(dst, src, size); + if(size < 32) return Copy>(dst, src, size); + if(size < 64) return Copy>(dst, src, size); + if(size < 128) return Copy>(dst, src, size); + if(size < 256) return Copy>(dst, src, size); + return Copy>(dst, src, size); +} +static int memcmp_0x348D7BA6DB0EE033(const char * lhs, const char * rhs, size_t size) { + using namespace __llvm_libc::x86; + if(size == 0) return 0; + if(size == 1) return ThreeWayCompare<_1>(lhs, rhs); + if(size < 4) return ThreeWayCompare>(lhs, rhs, size); + if(size < 8) return ThreeWayCompare>(lhs, rhs, size); + if(size < 16) return ThreeWayCompare>(lhs, rhs, size); + if(size < 32) return ThreeWayCompare>(lhs, rhs, size); + if(size < 64) return ThreeWayCompare>(lhs, rhs, size); + return ThreeWayCompare::Then>>(lhs, rhs, size); +} +static void memset_0x71E761699B999863(char * dst, int value, size_t size) { + using namespace __llvm_libc::x86; + if(size == 0) return; + if(size == 1) return SplatSet<_1>(dst, value); + if(size < 4) return SplatSet>(dst, value, size); + if(size < 8) return SplatSet>(dst, value, size); + if(size < 16) return SplatSet>(dst, value, size); + if(size < 32) return SplatSet>(dst, value, size); + if(size < 64) return SplatSet>(dst, value, size); + if(size < 128) return SplatSet>(dst, value, size); + if(size < 256) return SplatSet>(dst, value, size); + return SplatSet::Then>>(dst, value, size); +} +static void memset_0x3DF0F44E2ED6A50F(char * dst, int value, size_t size) { + using namespace __llvm_libc::x86; + if(size == 0) return; + if(size == 1) return SplatSet<_1>(dst, value); + if(size < 4) return SplatSet>(dst, value, size); + if(size < 8) return SplatSet>(dst, value, size); + if(size < 16) return SplatSet>(dst, value, size); + if(size < 32) return SplatSet>(dst, value, size); + if(size < 64) return SplatSet>(dst, value, size); + if(size < 128) return SplatSet>(dst, value, size); + if(size < 256) return SplatSet>(dst, value, size); + return SplatSet::Then>>(dst, value, size); +} +static void bzero_0x475977492C218AD4(char * dst, size_t size) { + using namespace __llvm_libc::x86; + if(size == 0) return; + if(size == 1) return SplatSet<_1>(dst, 0); + if(size == 2) return SplatSet<_2>(dst, 0); + if(size == 3) return SplatSet<_3>(dst, 0); + if(size < 8) return SplatSet>(dst, 0, size); + if(size < 16) return SplatSet>(dst, 0, size); + if(size < 32) return SplatSet>(dst, 0, size); + if(size < 64) return SplatSet>(dst, 0, size); + if(size < 128) return SplatSet>(dst, 0, size); + return SplatSet::Then>>(dst, 0, size); +} + +} // namespace __llvm_libc + +namespace llvm { +namespace automemcpy { + +ArrayRef getFunctionDescriptors() { + static constexpr NamedFunctionDescriptor kDescriptors[] = { + {"memcpy_0xE00E29EE73994E2B",{FunctionType::MEMCPY,llvm::None,llvm::None,llvm::None,llvm::None,Accelerator{{0,kMaxSize}},ElementTypeClass::NATIVE}}, + {"memcpy_0x7381B60C7BE75EF9",{FunctionType::MEMCPY,Contiguous{{0,4}},Overlap{{4,256}},Loop{{256,kMaxSize},64},llvm::None,llvm::None,ElementTypeClass::NATIVE}}, + {"memcmp_0x348D7BA6DB0EE033",{FunctionType::MEMCMP,Contiguous{{0,2}},Overlap{{2,64}},llvm::None,AlignedLoop{Loop{{64,kMaxSize},16},16,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}}, + {"memset_0x71E761699B999863",{FunctionType::MEMSET,Contiguous{{0,2}},Overlap{{2,256}},llvm::None,AlignedLoop{Loop{{256,kMaxSize},32},16,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}}, + {"memset_0x3DF0F44E2ED6A50F",{FunctionType::MEMSET,Contiguous{{0,2}},Overlap{{2,256}},llvm::None,AlignedLoop{Loop{{256,kMaxSize},32},32,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}}, + {"bzero_0x475977492C218AD4",{FunctionType::BZERO,Contiguous{{0,4}},Overlap{{4,128}},llvm::None,AlignedLoop{Loop{{128,kMaxSize},32},32,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}}, + }; + return makeArrayRef(kDescriptors); +} + +} // namespace automemcpy +} // namespace llvm + + +using MemcpyStub = void (*)(char *__restrict, const char *__restrict, size_t); +template +void *Wrap(void *__restrict dst, const void *__restrict src, size_t size) { + Foo(reinterpret_cast(dst), + reinterpret_cast(src), size); + return dst; +} +llvm::ArrayRef getMemcpyConfigurations() { + using namespace __llvm_libc; + static constexpr MemcpyConfiguration kConfigurations[] = { + {Wrap, "memcpy_0xE00E29EE73994E2B"}, + {Wrap, "memcpy_0x7381B60C7BE75EF9"}, + }; + return llvm::makeArrayRef(kConfigurations); +} + +using MemcmpStub = int (*)(const char *, const char *, size_t); +template +int Wrap(const void *lhs, const void *rhs, size_t size) { + return Foo(reinterpret_cast(lhs), + reinterpret_cast(rhs), size); +} +llvm::ArrayRef getMemcmpConfigurations() { + using namespace __llvm_libc; + static constexpr MemcmpOrBcmpConfiguration kConfigurations[] = { + {Wrap, "memcmp_0x348D7BA6DB0EE033"}, + }; + return llvm::makeArrayRef(kConfigurations); +} +llvm::ArrayRef getBcmpConfigurations() { + return {}; +} + +using MemsetStub = void (*)(char *, int, size_t); +template void *Wrap(void *dst, int value, size_t size) { + Foo(reinterpret_cast(dst), value, size); + return dst; +} +llvm::ArrayRef getMemsetConfigurations() { + using namespace __llvm_libc; + static constexpr MemsetConfiguration kConfigurations[] = { + {Wrap, "memset_0x71E761699B999863"}, + {Wrap, "memset_0x3DF0F44E2ED6A50F"}, + }; + return llvm::makeArrayRef(kConfigurations); +} + +using BzeroStub = void (*)(char *, size_t); +template void Wrap(void *dst, size_t size) { + Foo(reinterpret_cast(dst), size); +} +llvm::ArrayRef getBzeroConfigurations() { + using namespace __llvm_libc; + static constexpr BzeroConfiguration kConfigurations[] = { + {Wrap, "bzero_0x475977492C218AD4"}, + }; + return llvm::makeArrayRef(kConfigurations); +} +// Functions : 6 +)"); +} +} // namespace +} // namespace automemcpy +} // namespace llvm diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h --- a/libc/src/string/memory_utils/elements.h +++ b/libc/src/string/memory_utils/elements.h @@ -151,6 +151,43 @@ static void SplatSet(char *dst, const unsigned char value) {} }; +// Overlap ElementA and ElementB so they span Size bytes. +template +struct Overlap { + static constexpr size_t kSize = Size; + static_assert(ElementB::kSize <= ElementA::kSize, "ElementB too big"); + static_assert(ElementA::kSize <= Size, "ElementA too big"); + static_assert((ElementA::kSize + ElementB::kSize) >= Size, + "Elements too small to overlap"); + static constexpr size_t kOffset = kSize - ElementB::kSize; + + static void Copy(char *__restrict dst, const char *__restrict src) { + ElementA::Copy(dst, src); + ElementB::Copy(dst + kOffset, src + kOffset); + } + + static bool Equals(const char *lhs, const char *rhs) { + if (!ElementA::Equals(lhs, rhs)) + return false; + if (!ElementB::Equals(lhs + kOffset, rhs + kOffset)) + return false; + return true; + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + if (!ElementA::Equals(lhs, rhs)) + return ElementA::ThreeWayCompare(lhs, rhs); + if (!ElementB::Equals(lhs + kOffset, rhs + kOffset)) + return ElementB::ThreeWayCompare(lhs + kOffset, rhs + kOffset); + return 0; + } + + static void SplatSet(char *dst, const unsigned char value) { + ElementA::SplatSet(dst, value); + ElementB::SplatSet(dst + kOffset, value); + } +}; + // Runtime-size Higher-Order Operations // ------------------------------------ // - Tail: Perform the operation on the last 'T::kSize' bytes of the buffer.