diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -145,8 +145,8 @@ RUNTIME_OUTPUT_DIRECTORY "${BENCHMARK_OUTPUT_DIR}" COMPILE_FLAGS "${BENCHMARK_TEST_LIBCXX_COMPILE_FLAGS}" LINK_FLAGS "${BENCHMARK_TEST_LIBCXX_LINK_FLAGS}" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED YES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED NO CXX_EXTENSIONS NO) cxx_link_system_libraries(${libcxx_target}) if (LIBCXX_BENCHMARK_NATIVE_STDLIB) @@ -175,8 +175,8 @@ INCLUDE_DIRECTORIES "" COMPILE_FLAGS "${BENCHMARK_TEST_NATIVE_COMPILE_FLAGS}" LINK_FLAGS "${BENCHMARK_TEST_NATIVE_LINK_FLAGS}" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED YES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED NO CXX_EXTENSIONS NO) endif() endfunction() diff --git a/libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp b/libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp new file mode 100644 --- /dev/null +++ b/libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp @@ -0,0 +1,223 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// TODO FMT Remove once libc++ requires C++20 support. +#if __cplusplus > 201703L && !defined(_LIBCPP_HAS_NO_UNICODE) + +#include +#include + +#include "benchmark/benchmark.h" + +#include "test_macros.h" + +#ifdef _LIBCPP_HAS_NO_UNICODE +#error The benchmark requires Unicode support enabled. +#endif + +// Always enable asserts since they are used compile-time not run-time. 
+#ifdef NDEBUG +#undef NDEBUG +#endif +#include + +using namespace std::__format_spec; + +template +class tester { + static constexpr size_t size_ = N - 1; + std::array data_; + + constexpr void validate(auto it) const noexcept { + assert(it == end()); + auto result = __get_string_alignment(begin(), end(), 1'000'000, 1'000'000); + assert(result.__last == end() && + static_cast(result.__size) == + __detail::__estimate_column_width(begin(), end(), -1).__width && + result.__align); + } + +public: + explicit constexpr tester(const CharT (&input)[N]) { + auto it = data_.begin(); + for (int i = 0; i < 100; ++i) + it = std::copy_n(input, size_, it); + + validate(it); + } + + constexpr size_t size() const noexcept { return data_.size(); } + constexpr const CharT* begin() const noexcept { return data_.begin(); } + constexpr const CharT* end() const noexcept { return data_.end(); } + + void test(benchmark::State& state) const { + for (auto _ : state) + benchmark::DoNotOptimize( + __get_string_alignment(begin(), end(), 1'000'000, 1'000'000)); + state.SetItemsProcessed(state.iterations() * size()); + } +}; + +#define TEST(u8) \ + if constexpr (std::same_as) { \ + constexpr auto p = tester{u8}; \ + p.test(state); \ + } else if constexpr (std::same_as) { \ + constexpr auto p = tester{TEST_CONCAT(u, u8)}; \ + p.test(state); \ + } else { \ + constexpr auto p = tester{TEST_CONCAT(U, u8)}; \ + p.test(state); \ + } + +template +static void BM_EstimateLengthNoMultiByte(benchmark::State& state) { + TEST("The quick brown fox jumps over the lazy dog"); +} + +template +static void BM_EstimateLengthTwoByteDE(benchmark::State& state) { + static_assert( + sizeof( + "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich") == + 67); + + // https://en.wikipedia.org/wiki/Pangram + TEST("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich"); +} + +template +static void BM_EstimateLengthTwoBytePL(benchmark::State& state) { + static_assert(sizeof("Stróż pchnął kość w quiz gędźb 
vel fax myjń") == 53); + + // https://en.wikipedia.org/wiki/Pangram + TEST("Stróż pchnął kość w quiz gędźb vel fax myjń"); +} + +// All values are below 0x1100, which is the first multi column sequence. +template +static void BM_EstimateLengthThreeByteSingleColumnLow(benchmark::State& state) { + static_assert(sizeof("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807" + "\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f") == + 49); + + TEST("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807" + "\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f"); +} + +template +static void +BM_EstimateLengthThreeByteSingleColumnHigh(benchmark::State& state) { + static_assert(sizeof("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807" + "\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f") == + 49); + + TEST("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807" + "\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f"); +} + +template +static void BM_EstimateLengthThreeByteDoubleColumn(benchmark::State& state) { + static_assert(sizeof("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807" + "\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f") == + 49); + + TEST("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807" + "\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f"); +} + +template +static void BM_EstimateLengthThreeByte(benchmark::State& state) { + static_assert(sizeof("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000" + "\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e") == + 49); + + TEST("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000" + "\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e"); +} + +template +static void BM_EstimateLengthFourByteSingleColumn(benchmark::State& state) { + static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003" + "\U00010004\U00010005\U00010006\U00010007" + "\U00010008\U00010009\U0001000a\U0001000b" + "\U0001000c\U0001000d\U0001000e\U0001000f") == 65); + + TEST("\U00010000\U00010001\U00010002\U00010003" + "\U00010004\U00010005\U00010006\U00010007" + 
"\U00010008\U00010009\U0001000a\U0001000b" + "\U0001000c\U0001000d\U0001000e\U0001000f"); +} + +template +static void BM_EstimateLengthFourByteDoubleColumn(benchmark::State& state) { + static_assert(sizeof("\U00020000\U00020002\U00020002\U00020003" + "\U00020004\U00020005\U00020006\U00020007" + "\U00020008\U00020009\U0002000a\U0002000b" + "\U0002000c\U0002000d\U0002000e\U0002000f") == 65); + + TEST("\U00020000\U00020002\U00020002\U00020003" + "\U00020004\U00020005\U00020006\U00020007" + "\U00020008\U00020009\U0002000a\U0002000b" + "\U0002000c\U0002000d\U0002000e\U0002000f"); +} + +template +static void BM_EstimateLengthFourByte(benchmark::State& state) { + static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003" + "\U00020004\U00020005\U00020006\U00020007" + "\U00010008\U00010009\U0001000a\U0001000b" + "\U0002000c\U0002000d\U0002000e\U0002000f") == 65); + + TEST("\U00010000\U00010001\U00010002\U00010003" + "\U00020004\U00020005\U00020006\U00020007" + "\U00010008\U00010009\U0001000a\U0001000b" + "\U0002000c\U0002000d\U0002000e\U0002000f"); +} + +BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char); +BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char); +BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char); +BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char); +BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char); +BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char); + +BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char16_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char16_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char16_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char16_t); 
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char16_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char16_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char16_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char16_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char16_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char16_t); + +BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char32_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char32_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char32_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char32_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char32_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char32_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char32_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char32_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char32_t); +BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char32_t); + +int main(int argc, char** argv) { + benchmark::Initialize(&argc, argv); + if (benchmark::ReportUnrecognizedArguments(argc, argv)) + return 1; + + benchmark::RunSpecifiedBenchmarks(); +} +#else +int main(int, char**) { return 0; } +#endif diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -171,7 +171,7 @@ "`P1460 `__","LWG","Mandating the Standard Library: Clause 20 - Utilities library","Prague","* *","" "`P1739 `__","LWG","Avoid template bloat for safe_ranges in combination with ""subrange-y"" view adaptors","Prague","* *","" "`P1831 `__","LWG","Deprecating volatile: library","Prague","* *","" -"`P1868 `__","LWG","width: clarifying units of width and precision in std::format","Prague","* *","" +"`P1868 `__","LWG","width: clarifying units of width and precision in std::format","Prague","|In Progress|","" 
"`P1908 `__","CWG","Reserving Attribute Namespaces for Future Use","Prague","* *","" "`P1937 `__","CWG","Fixing inconsistencies between constexpr and consteval functions","Prague","* *","" "`P1956 `__","LWG","On the names of low-level bit manipulation functions","Prague","|Complete|","12.0" diff --git a/libcxx/include/__format/parser_std_format_spec.h b/libcxx/include/__format/parser_std_format_spec.h --- a/libcxx/include/__format/parser_std_format_spec.h +++ b/libcxx/include/__format/parser_std_format_spec.h @@ -10,12 +10,15 @@ #ifndef _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H #define _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H +#include <__algorithm/find_if.h> +#include <__algorithm/min.h> #include <__config> #include <__debug> #include <__format/format_arg.h> #include <__format/format_error.h> #include <__format/format_string.h> #include <__variant/monostate.h> +#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -708,6 +711,463 @@ // TODO FMT Add a parser for floating-point values. // TODO FMT Add a parser for pointer values. +/** Helper struct returned from @ref __get_string_alignment. */ +template +struct _LIBCPP_TEMPLATE_VIS __string_alignment { + /** Points beyond the last character to write to the output. */ + const _CharT* __last; + /** + * The estimated number of columns in the output or 0. + * + * Only when the output needs to be aligned it's required to know the exact + * number of columns in the output. So if the formatted output has only a + * minimum width the exact size isn't important. It's only important to know + * the minimum has been reached. The minimum width is the width specified in + * the format-spec. + * + * For example in this code @code std::format("{:10}", MyString); @endcode + * the width estimation can stop once the algorithm has determined the output + * width is 10 columns. + * + * So if: + * * @ref __align == @c true the @ref __size is the estimated number of + * columns required. 
+ * * @ref __align == @c false the @ref __size is the estimated number of + * columns required or 0 when the estimation algorithm stopped prematurely. + */ + ptrdiff_t __size; + /** + * Does the output need to be aligned. + * + * When alignment is needed the output algorithm needs to add the proper + * padding. Else the output algorithm just needs to copy the input up to + * @ref __last. + */ + bool __align; +}; + +#ifndef _LIBCPP_HAS_NO_UNICODE +namespace __detail { + +/** + * Unicode column width estimates. + * + * Unicode can be stored in several formats: UTF-8, UTF-16, and UTF-32. + * Depending on format the relation between the number of code units stored and + * the number of output columns differs. The first relation is the number of + * code units forming a code point. (The text assumes the code units are + * unsigned.) + * - UTF-8: The number of code units is between one and four. The first 128 + * Unicode code points match the ASCII character set. When the highest bit is + * set it means the code point has more than one code unit. + * - UTF-16: The number of code units is between 1 and 2. When the first + * code unit is in the range [0xd800,0xdfff] it means the code point uses two + * code units. + * - UTF-32: The number of code units is always one. + * + * The code point to the number of columns isn't well defined. The code uses the + * estimations defined in [format.string.std]/11. This list might change in the + * future. + * + * The algorithm of @ref __get_string_alignment uses two different scanners: + * - The simple scanner @ref __estimate_column_width_fast. This scanner assumes + * 1 code unit is 1 column. This scanner stops when it can't be sure the + * assumption is valid: + * - UTF-8 when the code point is encoded in more than 1 code unit. + * - UTF-16 and UTF-32 when the first multi-column code point is encountered. + * (The code unit's value is lower than 0xd800 so the 2 code unit encoding + * is irrelevant for this scanner.) 
+ * Due to these assumptions the scanner is faster than the full scanner. It + * can process all text only containing ASCII. For UTF-16/32 it can process + * most (all?) European languages. (Note the set it can process might be + * reduced in the future, due to updates in the scanning rules.) + * - The full scanner @ref __estimate_column_width. This scanner, if needed, + * converts multiple code units into one code point then converts the code + * point to a column width. + * + * See also: + * - [format.string.std]/11 + * - https://en.wikipedia.org/wiki/UTF-8#Encoding + * - https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF + */ + +/** + * The first 2 column code point. + * + * This is the point where the fast UTF-16/32 scanner needs to stop processing. + */ +inline constexpr uint32_t __two_column_code_point = 0x1100; + +/** Helper concept for a UTF-8 character type. */ +template +concept __utf8 = same_as<_CharT, char> || same_as<_CharT, char8_t>; + +/** Helper concept for a UTF-16 character type. */ +template +concept __utf16 = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) || + same_as<_CharT, char16_t>; + +/** Helper concept for a UTF-32 character type. */ +template +concept __utf32 = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) || + same_as<_CharT, char32_t>; + +/** Helper concept for a UTF-16 or UTF-32 character type. */ +template +concept __utf16_32 = __utf16<_CharT> || __utf32<_CharT>; + +/** + * Converts a code point to the column width. + * + * The estimations are conforming to [format.string.std]/11 + * + * This version expects a value less than 0x1'0000, which is at most a + * 3-byte UTF-8 character. 
+ */ +_LIBCPP_HIDE_FROM_ABI constexpr int __column_width_3(uint32_t __c) noexcept { + _LIBCPP_ASSERT(__c < 0x1'0000, + "Use __column_width_4 or __column_width for larger values"); + + // clang-format off + return 1 + (__c >= 0x1100 && (__c <= 0x115f || + (__c >= 0x2329 && (__c <= 0x232a || + (__c >= 0x2e80 && (__c <= 0x303e || + (__c >= 0x3040 && (__c <= 0xa4cf || + (__c >= 0xac00 && (__c <= 0xd7a3 || + (__c >= 0xf900 && (__c <= 0xfaff || + (__c >= 0xfe10 && (__c <= 0xfe19 || + (__c >= 0xfe30 && (__c <= 0xfe6f || + (__c >= 0xff00 && (__c <= 0xff60 || + (__c >= 0xffe0 && (__c <= 0xffe6 + )))))))))))))))))))); + // clang-format on +} + +/** + * @overload + * + * This version expects a value greater than or equal to 0x1'0000, which is a + * 4-byte UTF-8 character. + */ +_LIBCPP_HIDE_FROM_ABI constexpr int __column_width_4(uint32_t __c) noexcept { + _LIBCPP_ASSERT(__c >= 0x1'0000, + "Use __column_width_3 or __column_width for smaller values"); + + // clang-format off + return 1 + (__c >= 0x1'f300 && (__c <= 0x1'f64f || + (__c >= 0x1'f900 && (__c <= 0x1'f9ff || + (__c >= 0x2'0000 && (__c <= 0x2'fffd || + (__c >= 0x3'0000 && (__c <= 0x3'fffd + )))))))); + // clang-format on +} + +/** + * @overload + * + * The general case, accepting all values. + */ +_LIBCPP_HIDE_FROM_ABI constexpr int __column_width(uint32_t __c) noexcept { + if (__c < 0x1'0000) + return __column_width_3(__c); + + return __column_width_4(__c); +} + +/** + * Estimate the column width for the UTF-8 sequence using the fast algorithm. + */ +template <__utf8 _CharT> +_LIBCPP_HIDE_FROM_ABI constexpr const _CharT* +__estimate_column_width_fast(const _CharT* __first, + const _CharT* __last) noexcept { + return _VSTD::find_if(__first, __last, + [](unsigned char __c) { return __c & 0x80; }); +} + +/** + * @overload + * + * The implementation for UTF-16/32. 
+ */ +template <__utf16_32 _CharT> +_LIBCPP_HIDE_FROM_ABI constexpr const _CharT* +__estimate_column_width_fast(const _CharT* __first, + const _CharT* __last) noexcept { + return _VSTD::find_if(__first, __last, + [](uint32_t __c) { return __c >= __two_column_code_point; }); +} + +template +struct _LIBCPP_TEMPLATE_VIS __column_width_result { + /** The number of output columns. */ + size_t __width; + /** + * The last parsed element. + * + * This limits the original output to fit in the wanted number of columns. + */ + const _CharT* __ptr; +}; + +/** + * Small helper to determine the width of malformed Unicode. + * Each remaining code unit is one column; the total is capped at __maximum. + * @note This function's only needed for UTF-8. During scanning UTF-8 there + * are multiple places where it can be detected that the Unicode is malformed. + * UTF-16 only requires 1 test and UTF-32 requires no testing. + */ +template <__utf8 _CharT> +_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> +__estimate_column_width_malformed(const _CharT* __first, const _CharT* __last, + size_t __maximum, size_t __result) noexcept { + size_t __size = __last - __first; + size_t __n = _VSTD::min(__size, __maximum - __result); + return {__result + __n, __first + __n}; +} + +/** + * Determines the number of output columns needed to render the input. + * + * @note When the scanner encounters malformed Unicode it acts as-if every code + * unit at the end of the input is one output column. It's expected the output + * terminal will replace these malformed code units with a one column + * replacement character. + * + * @param __first Points to the first element of the input range. + * @param __last Points beyond the last element of the input range. + * @param __maximum The maximum number of output columns. The returned number + * of estimated output columns will not exceed this value. 
+ */ +template <__utf8 _CharT> +_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> +__estimate_column_width(const _CharT* __first, const _CharT* __last, + size_t __maximum) noexcept { + size_t __result = 0; + + while (__first != __last) { + // Based on the number of leading 1 bits the number of code units in the + // code point can be determined. See + // https://en.wikipedia.org/wiki/UTF-8#Encoding + switch (_VSTD::countl_one(static_cast(*__first))) { + case 0: // 1-code unit encoding: all 1 column + ++__result; + ++__first; + break; + + case 2: // 2-code unit encoding: all 1 column + // Malformed Unicode. + if (__last - __first < 2) [[unlikely]] + return __estimate_column_width_malformed(__first, __last, __maximum, + __result); + __first += 2; + ++__result; + break; + + case 3: // 3-code unit encoding: either 1 or 2 columns + // Malformed Unicode. + if (__last - __first < 3) [[unlikely]] + return __estimate_column_width_malformed(__first, __last, __maximum, + __result); + { + uint32_t __c = static_cast(*__first++) & 0x0f; + __c <<= 6; + __c |= static_cast(*__first++) & 0x3f; + __c <<= 6; + __c |= static_cast(*__first++) & 0x3f; + __result += __column_width_3(__c); + if (__result > __maximum) + return {__result - 2, __first - 3}; + } + break; + case 4: // 4-code unit encoding: either 1 or 2 columns + // Malformed Unicode. + if (__last - __first < 4) [[unlikely]] + return __estimate_column_width_malformed(__first, __last, __maximum, + __result); + { + uint32_t __c = static_cast(*__first++) & 0x07; + __c <<= 6; + __c |= static_cast(*__first++) & 0x3f; + __c <<= 6; + __c |= static_cast(*__first++) & 0x3f; + __c <<= 6; + __c |= static_cast(*__first++) & 0x3f; + __result += __column_width_4(__c); + if (__result > __maximum) + return {__result - 2, __first - 4}; + } + break; + default: + // Malformed Unicode. 
+ return __estimate_column_width_malformed(__first, __last, __maximum, + __result); + } + + if (__result >= __maximum) + return {__result, __first}; + } + return {__result, __first}; +} + +template <__utf16 _CharT> +_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> +__estimate_column_width(const _CharT* __first, const _CharT* __last, + size_t __maximum) noexcept { + size_t __result = 0; + + while (__first != __last) { + uint32_t __c = *__first; + // Is the code unit part of a surrogate pair? See + // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF + if (__c >= 0xd800 && __c <= 0xdfff) { + // Malformed Unicode. + if (__last - __first < 2) [[unlikely]] + return {__result + 1, __first + 1}; + + __c -= 0xd800; + __c <<= 10; + __c += (*(__first + 1) - 0xdc00); + __c += 0x1'0000; + + __result += __column_width_4(__c); + if (__result > __maximum) + return {__result - 2, __first}; + __first += 2; + } else { + __result += __column_width_3(__c); + if (__result > __maximum) + return {__result - 2, __first}; + ++__first; + } + + if (__result >= __maximum) + return {__result, __first}; + } + + return {__result, __first}; +} + +template <__utf32 _CharT> +_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> +__estimate_column_width(const _CharT* __first, const _CharT* __last, + size_t __maximum) noexcept { + size_t __result = 0; + + while (__first != __last) { + uint32_t __c = *__first; + __result += __column_width(__c); + + if (__result > __maximum) + return {__result - 2, __first}; + + ++__first; + if (__result >= __maximum) + return {__result, __first}; + } + + return {__result, __first}; +} + +} // namespace __detail + +template +_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT> +__get_string_alignment(const _CharT* __first, const _CharT* __last, + ptrdiff_t __width, ptrdiff_t __precision) noexcept { + _LIBCPP_ASSERT(__width != 0 || __precision != -1, + "The function has no effect and shouldn't be used"); + + // TODO FMT There might be more 
optimizations possible: + // If __precision == __format::__number_max and the encoding is: + // * UTF-8 : (__last - __first) >= 4 * __width + // * UTF-16 : (__last - __first) >= 2 * __width + // * UTF-32 : (__last - __first) >= __width + // In these cases it's certain the output is at least the requested width. + // It's unknown how often this happens in practice. For now the improvement + // isn't implemented. + + /* + * First assume there are no special Unicode code units in the input. + * - Apply the precision (this may reduce the size of the input). When + * __precision == -1 this step is omitted. + * - Scan for special code units in the input. + * If our assumption was correct the __pos will be at the end of the input. + */ + const ptrdiff_t __length = __last - __first; + const _CharT* __limit = + __first + + (__precision == -1 ? __length : _VSTD::min(__length, __precision)); + ptrdiff_t __size = __limit - __first; + const _CharT* __pos = + __detail::__estimate_column_width_fast(__first, __limit); + + if (__pos == __limit) + return {__limit, __size, __size < __width}; + + /* + * Our assumption was wrong, there are special Unicode code units. + * The range [__first, __pos) contains a set of code units with the + * following property: + * Every _CharT in the range will be rendered in 1 column. + * + * If there's no maximum width and the parsed size already reaches the + * minimum required width, the real size isn't important. So bail out. + */ + if (__precision == -1 && (__pos - __first) >= __width) + return {__last, 0, false}; + + /* If there's a __precision, truncate the output to that width. 
*/ + ptrdiff_t __prefix = __pos - __first; + if (__precision != -1) { + _LIBCPP_ASSERT(__precision > __prefix, "Logic error."); + auto __length_info = __detail::__estimate_column_width( + __pos, __last, __precision - __prefix); + __size = __length_info.__width + __prefix; + return {__length_info.__ptr, __size, __size < __width}; + } + + /* Else use __width to determine the number of required padding characters. */ + _LIBCPP_ASSERT(__width > __prefix, "Logic error."); + /* + * The column width is always one or two columns. For the precision the wanted + * column width is the maximum, for the width it's the minimum. Using the + * width estimation with its truncating behavior will result in the wrong + * result in the following case: + * - The last code unit processed requires two columns and exceeds the + * maximum column width. + * Increasing the __maximum by one avoids this issue. (It means it may + * pass one code point more than required to determine the proper result; + * that however isn't a problem for the algorithm.) + */ + size_t __maximum = 1 + __width - __prefix; + auto __length_info = + __detail::__estimate_column_width(__pos, __last, __maximum); + if (__length_info.__ptr != __last) { + // Consumed the width number of code units. The exact size of the string + // is unknown. We only know we don't need to align the output. + _LIBCPP_ASSERT(static_cast(__length_info.__width + __prefix) >= + __width, + "Logic error"); + return {__last, 0, false}; + } + + __size = __length_info.__width + __prefix; + return {__last, __size, __size < __width}; +} +#else // _LIBCPP_HAS_NO_UNICODE +template +_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT> +__get_string_alignment(const _CharT* __first, const _CharT* __last, + ptrdiff_t __width, ptrdiff_t __precision) noexcept { + const ptrdiff_t __length = __last - __first; + const ptrdiff_t __size = + // Number of code units to copy, clamped by the precision when given. + (__precision == -1 ? 
__length : _VSTD::min(__length, __precision)); + return {__first + __size, __size, __size < __width}; +} +#endif // _LIBCPP_HAS_NO_UNICODE + } // namespace __format_spec #endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/std_format_spec_string_unicode.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/std_format_spec_string_unicode.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/std_format_spec_string_unicode.pass.cpp @@ -0,0 +1,307 @@ +//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts + +// UTF-32 doesn't work properly +// XFAIL: windows + +// + +// Tests the Unicode width support of the standard format specifiers. +// It tests [format.string.std]/8 - 11: +// - Properly determining the estimated width of a Unicode string. +// - Properly truncating to the wanted maximum width. 
+ +#include +#include + +#include "test_macros.h" +#include "make_string.h" + +#define CSTR(S) MAKE_CSTRING(CharT, S) + +using namespace std::__format_spec; + +template +constexpr bool operator==(const __string_alignment& lhs, + const __string_alignment& rhs) noexcept { + return lhs.__last == rhs.__last && lhs.__size == rhs.__size && + lhs.__align == rhs.__align; +} + +template +constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align, + const CharT* str, size_t width, + size_t precision) { + std::basic_string_view sv{str}; + __string_alignment expected{sv.begin() + offset, size, align}; + __string_alignment traits = + __get_string_alignment(sv.begin(), sv.end(), width, precision); + assert(traits == expected); +} + +#ifndef _LIBCPP_HAS_NO_UNICODE +template +constexpr void estimate_column_width_fast(size_t expected, const CharT* str) { + std::basic_string_view sv{str}; + const CharT* out = + __detail::__estimate_column_width_fast(sv.begin(), sv.end()); + assert(out == sv.begin() + expected); +} + +template +constexpr void estimate_column_width_fast() { + + // No unicode + estimate_column_width_fast(3, CSTR("abc")); + estimate_column_width_fast(3, CSTR("a\u007fc")); + + if constexpr (sizeof(CharT) == 1) { + // UTF-8 stop at the first multi-byte character. + estimate_column_width_fast(0, CSTR("\u0080bc")); + estimate_column_width_fast(1, CSTR("a\u0080c")); + estimate_column_width_fast(2, CSTR("ab\u0080")); + estimate_column_width_fast(1, CSTR("aßc")); + + estimate_column_width_fast(1, CSTR("a\u07ffc")); + estimate_column_width_fast(1, CSTR("a\u0800c")); + + estimate_column_width_fast(1, CSTR("a\u10ffc")); + } else { + // UTF-16/32 stop at the first multi-column character. 
+ estimate_column_width_fast(3, CSTR("\u0080bc")); + estimate_column_width_fast(3, CSTR("a\u0080c")); + estimate_column_width_fast(3, CSTR("ab\u0080")); + estimate_column_width_fast(3, CSTR("aßc")); + + estimate_column_width_fast(3, CSTR("a\u07ffc")); + estimate_column_width_fast(3, CSTR("a\u0800c")); + + estimate_column_width_fast(3, CSTR("a\u10ffc")); + } + // First 2-column character + estimate_column_width_fast(1, CSTR("a\u1100c")); + + estimate_column_width_fast(1, CSTR("a\U0000ffffc")); + estimate_column_width_fast(1, CSTR("a\U00010000c")); + estimate_column_width_fast(1, CSTR("a\U0010FFFFc")); +} + +template +constexpr void estimate_column_width(size_t expected, const CharT* str) { + std::basic_string_view sv{str}; + std::__format_spec::__detail::__column_width_result column_info = + __detail::__estimate_column_width(sv.begin(), sv.end(), -1); + assert(column_info.__width == expected); +} + +template +constexpr void estimate_column_width() { + //*** 1-byte code points *** + estimate_column_width(1, CSTR(" ")); + estimate_column_width(1, CSTR("~")); + + //*** 2-byte code points *** + estimate_column_width(1, CSTR("\u00a1")); // INVERTED EXCLAMATION MARK + estimate_column_width(1, CSTR("\u07ff")); // NKO TAMAN SIGN + + //*** 3-byte code points *** + estimate_column_width(1, CSTR("\u0800")); // SAMARITAN LETTER ALAF + estimate_column_width(1, CSTR("\ufffd")); // REPLACEMENT CHARACTER + + // 2 column ranges + estimate_column_width(2, CSTR("\u1100")); // HANGUL CHOSEONG KIYEOK + estimate_column_width(2, CSTR("\u115f")); // HANGUL CHOSEONG FILLER + + estimate_column_width(2, CSTR("\u2329")); // LEFT-POINTING ANGLE BRACKET + estimate_column_width(2, CSTR("\u232a")); // RIGHT-POINTING ANGLE BRACKET + + estimate_column_width(2, CSTR("\u2e80")); // CJK RADICAL REPEAT + estimate_column_width(2, CSTR("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR + + estimate_column_width(2, CSTR("\u3040")); // U+3041 HIRAGANA LETTER SMALL A + estimate_column_width(2, CSTR("\ua4cf")); 
// U+A4D0 LISU LETTER BA + + estimate_column_width(2, CSTR("\uac00")); // + estimate_column_width(2, CSTR("\ud7a3")); // Hangul Syllable Hih + + estimate_column_width(2, CSTR("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900 + estimate_column_width(2, CSTR("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF + + estimate_column_width(2, + CSTR("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA + estimate_column_width( + 2, CSTR("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS + + estimate_column_width( + 2, CSTR("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER + estimate_column_width(2, + CSTR("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM + + estimate_column_width(2, CSTR("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK + estimate_column_width(2, CSTR("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS + + estimate_column_width(2, CSTR("\uffe0")); // FULLWIDTH CENT SIGN + estimate_column_width(2, CSTR("\uffe6")); // FULLWIDTH WON SIGN + + //*** 4-byte code points *** + estimate_column_width(1, CSTR("\U00010000")); // LINEAR B SYLLABLE B008 A + estimate_column_width(1, CSTR("\U0010FFFF")); // Undefined Character + + // 2 column ranges + estimate_column_width(2, CSTR("\U0001f300")); // CYCLONE + estimate_column_width(2, CSTR("\U0001f64f")); // PERSON WITH FOLDED HANDS + estimate_column_width( + 2, CSTR("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS + estimate_column_width(2, CSTR("\U0001f9ff")); // NAZAR AMULET + estimate_column_width( + 2, CSTR("\U00020000")); // + estimate_column_width(2, CSTR("\U0002fffd")); // Undefined Character + estimate_column_width( + 2, CSTR("\U00030000")); // + estimate_column_width(2, CSTR("\U0003fffd")); // Undefined Character +} + +template +constexpr void get_string_alignment() { + // Truncate the input. + get_string_alignment(2, 2, false, CSTR("abc"), 0, 2); + + // The 2-column character gets entirely rejected. 
+ get_string_alignment(1, 1, false, CSTR("a\u115f"), 0, 2); + + // Due to the requested width extra alignment is required. + get_string_alignment(1, 1, true, CSTR("a\u115f"), 2, 2); + + // Same but for a 2-column 4-byte UTF-8 sequence + get_string_alignment(1, 1, false, CSTR("a\U0001f300"), 0, 2); + get_string_alignment(1, 1, true, CSTR("a\U0001f300"), 2, 2); + + // No alignment required. + get_string_alignment(3, 3, false, CSTR("abc"), 2, -1); + get_string_alignment(3, 3, false, CSTR("abc"), 3, -1); + + // Special case, we have a special character already parsed and have enough + // withd to satisfy the minumum required width. + get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 0, false, CSTR("ab\u1111"), + 2, -1); + + // Evaluates all so size ->4 + get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 4, false, + CSTR("a\u115fc") /* 2-column character */, 3, -1); + // Evaluates all so size ->4 + get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 4, false, + CSTR("a\u115fc") /* 2-column character */, 4, -1); + + // Evaluates all so size ->5 + get_string_alignment(4 + 2 * (sizeof(CharT) == 1), 5, false, + CSTR("a\u115fcd") /* 2-column character */, 4, -1); + + // Evaluates all so size ->5 + get_string_alignment(4 + 2 * (sizeof(CharT) == 1), 5, false, + CSTR("a\u115fcd") /* 2-column character */, 5, -1); + + // Extend width + get_string_alignment(3, 3, true, CSTR("abc"), 4, -1); + get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 3, true, + CSTR("a\u1160c") /* 1-column character */, 4, -1); + + // In this case the threshold where the width is still determined. + get_string_alignment(2 + 2 * (sizeof(CharT) == 1), 3, false, CSTR("i\u1110"), + 2, -1); + + // The width is no longer exactly determined. + get_string_alignment(2 + 2 * (sizeof(CharT) == 1), 0, false, CSTR("i\u1110"), + 1, -1); + + // Extend width and truncate input. + get_string_alignment(1, 1, true, CSTR("abc"), 3, 1); + + if constexpr (sizeof(CharT) == 1) { + // Corrupt UTF-8 sequence. 
+ get_string_alignment(2, 2, false, CSTR("a\xc0"), 0, 3); + get_string_alignment(2, 2, false, CSTR("a\xe0"), 0, 3); + get_string_alignment(2, 2, false, CSTR("a\xf0"), 0, 3); + } else if constexpr (sizeof(CharT) == 2) { + // Corrupt UTF-16 sequence. + if constexpr (std::same_as) + get_string_alignment(2, 2, false, u"a\xdddd", 0, 3); + else + // Corrupt UTF-16 wchar_t seqence. + get_string_alignment(2, 2, false, L"a\xdddd", 0, 3); + } + // UTF-32 doesn't combine characters, thus no corruption tests. +} + +template +constexpr void test() { + estimate_column_width_fast(); + estimate_column_width(); + get_string_alignment(); +} +#else // _LIBCPP_HAS_NO_UNICODE +template +constexpr void get_string_alignment() { + // Truncate the input. + get_string_alignment(2, 2, false, CSTR("abc"), 0, 2); + + // The 2-column character gets half accepted. + get_string_alignment(2, 2, false, CSTR("a\u115f"), 0, 2); + + // No alignment since the number of characters fits. + get_string_alignment(2, 2, false, CSTR("a\u115f"), 2, 2); + + // Same but for a 2-column 4-byte UTF-8 sequence + get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 0, 2); + get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 2, 2); + + // No alignment required. 
+ get_string_alignment(3, 3, false, CSTR("abc"), 2, -1); + get_string_alignment(3, 3, false, CSTR("abc"), 3, -1); + + get_string_alignment(3 + 2 * (sizeof(CharT) == 1), + 3 + 2 * (sizeof(CharT) == 1), false, CSTR("ab\u1111"), 2, + -1); + + // Doesn't evaluate 'c' so size -> 0 + get_string_alignment(3 + 2 * (sizeof(CharT) == 1), + 3 + 2 * (sizeof(CharT) == 1), false, + CSTR("a\u115fc") /* 2-column character */, 3, -1); + // Extend width + get_string_alignment(3, 3, true, CSTR("abc"), 4, -1); + get_string_alignment(3 + 2 * (sizeof(CharT) == 1), + 3 + 2 * (sizeof(CharT) == 1), true, + CSTR("a\u1160c") /* 1-column character */, 6, -1); +} + +template +constexpr void test() { + get_string_alignment(); +} +#endif // _LIBCPP_HAS_NO_UNICODE + +constexpr bool test() { + test(); + test(); +#ifndef _LIBCPP_HAS_NO_CHAR8_T + test(); +#endif +#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS + test(); + test(); +#endif + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +}