diff --git a/libc/benchmarks/automemcpy/FunctionDescriptor.h b/libc/benchmarks/automemcpy/FunctionDescriptor.h new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/FunctionDescriptor.h @@ -0,0 +1,159 @@ +//===-- Pod structs to describe a memory function----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H +#define LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H + +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +namespace automemcpy { + +// Boilerplate code to be able to sort and hash types. +#define COMPARABLE_AND_HASHABLE(T, ...) \ + inline auto asTuple() const { return std::tie(__VA_ARGS__); } \ + bool operator==(const T &O) const { return asTuple() == O.asTuple(); } \ + bool operator<(const T &O) const { return asTuple() < O.asTuple(); } \ + struct Hasher { \ + std::size_t operator()(const T &K) const { \ + return llvm::hash_value(K.asTuple()); \ + } \ + }; + +// Represents the maximum value for the size parameter of a memory function. +// This is an `int` so we can use it as an expression in Z3. +// It also allows for a more readable and compact representation when storing +// the SizeSpan in the autogenerated C++ file. +static constexpr int kMaxSize = INT_MAX; + +// This mimics the `Arg` type in libc/src/string/memory_utils/elements.h without +// having to depend on it. +enum class AlignArg { _1, _2, ARRAY_SIZE }; + +// Describes a range of sizes. +// We use the begin/end representation instead of first/last to allow for empty +// range (i.e. 
Begin == End) +struct SizeSpan { + size_t Begin = 0; + size_t End = 0; + + COMPARABLE_AND_HASHABLE(SizeSpan, Begin, End) +}; + +// Describes a contiguous region. +// In such a region all sizes are handled individually. +// e.g. with Span = {0, 2}; +// if(size == 0) return Handle<0>(); +// if(size == 1) return Handle<1>(); +struct Contiguous { + SizeSpan Span; + + COMPARABLE_AND_HASHABLE(Contiguous, Span) +}; + +// This struct represents a range of sizes over which to use an overlapping +// strategy. An overlapping strategy of size N handles all sizes from N to 2xN. +// The span may represent several contiguous overlaps. +// e.g. with Span = {16, 128}; +// if(size >= 16 and size < 32) return Handle>(); +// if(size >= 32 and size < 64) return Handle>(); +// if(size >= 64 and size < 128) return Handle>(); +struct Overlap { + SizeSpan Span; + + COMPARABLE_AND_HASHABLE(Overlap, Span) +}; + +// Describes a region using a loop handling BlockSize bytes at a time. The +// remaining bytes of the loop are handled with an overlapping operation. +struct Loop { + SizeSpan Span; + size_t BlockSize = 0; + + COMPARABLE_AND_HASHABLE(Loop, Span, BlockSize) +}; + +// Same as `Loop` but starts by aligning a buffer on `Alignment` bytes. +// A first operation handling 'Alignment` bytes is performed followed by a +// sequence of Loop.BlockSize bytes operation. The Loop starts processing from +// the next aligned byte in the chosen buffer. The remaining bytes of the loop +// are handled with an overlapping operation. +struct AlignedLoop { + Loop Loop; + size_t Alignment = 0; // Size of the alignment. + AlignArg AlignTo = AlignArg::_1; // Which buffer to align. + + COMPARABLE_AND_HASHABLE(AlignedLoop, Loop, Alignment, AlignTo) +}; + +// Some processors offer special instruction to handle the memory function +// completely, we refer to such instructions as accelerators. 
+struct Accelerator { + SizeSpan Span; + + COMPARABLE_AND_HASHABLE(Accelerator, Span) +}; + +// The memory functions are assembled out of primitives that can be implemented +// with regular scalar operations (SCALAR), with the help of vector or bitcount +// instructions (NATIVE) or by deferring it to the compiler (BUILTIN). +enum class ElementTypeClass { + SCALAR, + NATIVE, + BUILTIN, +}; + +// A simple enum to categorize which function is being implemented. +enum class FunctionType { + MEMCPY, + MEMCMP, + BCMP, + MEMSET, + BZERO, +}; + +// This struct describes the skeleton of the implementation, it does not go into +// every detail but is enough to uniquely identify the implementation. +struct FunctionDescriptor { + FunctionType Type; + Optional Contiguous; + Optional Overlap; + Optional Loop; + Optional AlignedLoop; + Optional Accelerator; + ElementTypeClass ElementClass; + + COMPARABLE_AND_HASHABLE(FunctionDescriptor, Type, Contiguous, Overlap, Loop, + AlignedLoop, Accelerator, ElementClass) + + inline size_t id() const { return llvm::hash_value(asTuple()); } +}; + +// Same as above but with the function name. +struct NamedFunctionDescriptor { + StringRef Name; + FunctionDescriptor Desc; +}; + +template llvm::hash_code hash_value(const ArrayRef &V) { + return llvm::hash_combine_range(V.begin(), V.end()); +} +template llvm::hash_code hash_value(const T &O) { + return llvm::hash_value(O.asTuple()); +} + +} // namespace automemcpy +} // namespace llvm + +#endif /* LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H */ diff --git a/libc/benchmarks/automemcpy/README.md b/libc/benchmarks/automemcpy/README.md new file mode 100644 --- /dev/null +++ b/libc/benchmarks/automemcpy/README.md @@ -0,0 +1,111 @@ +This folder contains an implementation of [automemcpy: A framework for automatic generation of fundamental memory operations](https://research.google/pubs/pub50338/). 
+ +It uses the [Z3 theorem prover](https://github.com/Z3Prover/z3) to enumerate a subset of valid memory function implementations. These implementations are then materialized as C++ code and can be [benchmarked](../) against various [size distributions](../distributions). This process helps the design of efficient implementations for a particular environment (size distribution, processor or custom compilation options). + +This is not enabled by default, as it is mostly useful when working on tuning the library implementation. To build it, use `LIBC_BUILD_AUTOMEMCPY=ON` (see below). + +## Prerequisites + +You may need to install `Z3` from source if it's not available on your system. +Here we show instructions to install it into `<install dir>`. +You may need to `sudo` to `make install`. + +```shell +mkdir -p ~/git +cd ~/git +git clone https://github.com/Z3Prover/z3.git +cd z3 && python scripts/mk_make.py --prefix=<install dir> +cd build +make -j +make install +``` + +## Configuration + +```shell +mkdir -p <build dir> +cd <llvm-project dir>/llvm +cmake -DCMAKE_C_COMPILER=/usr/bin/clang \ + -DCMAKE_CXX_COMPILER=/usr/bin/clang++ \ + -DLLVM_ENABLE_PROJECTS="libc" \ + -DLLVM_ENABLE_Z3_SOLVER=ON \ + -DLLVM_Z3_INSTALL_DIR=<install dir> \ + -DLIBC_BUILD_AUTOMEMCPY=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -B<build dir> +``` + +## Targets and compilation + +There are three main CMake targets + 1. `automemcpy_implementations` + - runs `Z3` and materializes valid memory functions as C++ code, a message will display its on-disk location. + - the source code is then compiled using the native host optimizations (i.e. `-march=native` or `-mcpu=native` depending on the architecture). + 2. `automemcpy` + - the binary that benchmarks the autogenerated implementations. + 3. `automemcpy_result_analyser` + - the binary that analyses the benchmark results. + +You only need to compile the two binaries, as they both pull the autogenerated code as a dependency. 
+ +```shell +make -C <build dir> -j automemcpy automemcpy_result_analyser +``` + +## Running the benchmarks + +Make sure to save the results of the benchmark as a JSON file. + +```shell +<build dir>/bin/automemcpy --benchmark_out_format=json --benchmark_out=<results dir>/results.json +``` + +### Additional useful options + + + - `--benchmark_min_time=.2` + + By default, each function is benchmarked for at least one second; here we lower it to 200ms. + + - `--benchmark_filter="BM_Memset|BM_Bzero"` + + By default, all functions are benchmarked; here we restrict them to `memset` and `bzero`. + +Other options might be useful; use `--help` for more information. + +## Analyzing the benchmarks + +Analysis is performed by running `automemcpy_result_analyser` on one or more JSON result files. + +```shell +<build dir>/bin/automemcpy_result_analyser <results dir>/results.json +``` + +What it does: + 1. Gathers all throughput values for each function / distribution pair and picks the median one.\ + This allows picking a representative value over many runs of the benchmark. Please make sure all the runs happen under similar circumstances. + + 2. For each distribution, look at the span of throughputs for functions of the same type (e.g. For distribution `A`, memcpy throughput spans from 2GiB/s to 5GiB/s). + + 3. For each distribution, give a normalized score to each function (e.g. For distribution `A`, function `M` scores 0.65).\ + This score is then turned into a grade `EXCELLENT`, `VERY_GOOD`, `GOOD`, `PASSABLE`, `INADEQUATE`, `MEDIOCRE`, `BAD` - so that each distribution categorizes how functions perform according to them. + + 4. A [Majority Judgement](https://en.wikipedia.org/wiki/Majority_judgment) process is then used to categorize each function. This enables finer analysis of how distributions agree on which function is better. In the following example, `Function_1` and `Function_2` are rated `EXCELLENT` but looking at the grade's distribution might help decide which is best. 
+ +| | EXCELLENT | VERY_GOOD | GOOD | PASSABLE | INADEQUATE | MEDIOCRE | BAD | +|------------|:---------:|:---------:|:----:|:--------:|:----------:|:--------:|:---:| +| Function_1 | 7 | 1 | 2 | | | | | +| Function_2 | 6 | 4 | | | | | | + +The tool outputs the histogram of grades for each function. In case of a tie, other dimensions might help decide (e.g. code size, performance on other microarchitectures). + +``` +EXCELLENT |█▁▂ | Function_0 +EXCELLENT |█▅ | Function_1 +VERY_GOOD |▂█▁ ▁ | Function_2 +GOOD | ▁█▄ | Function_3 +PASSABLE | ▂▆▄█ | Function_4 +INADEQUATE | ▃▃█▁ | Function_5 +MEDIOCRE | █▆▁| Function_6 +BAD | ▁▁█| Function_7 +```