Index: libcxx/src/locale.cpp =================================================================== --- libcxx/src/locale.cpp +++ libcxx/src/locale.cpp @@ -2022,10 +2022,9 @@ } else if (c1 < 0xF0) { - if (frm_end-frm_nxt < 3) + if (frm_end-frm_nxt < 2) return codecvt_base::partial; uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; switch (c1) { case 0xE0: @@ -2041,6 +2040,9 @@ return codecvt_base::error; break; } + if (frm_end-frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; if ((c3 & 0xC0) != 0x80) return codecvt_base::error; uint16_t t = static_cast(((c1 & 0x0F) << 12) @@ -2053,11 +2055,9 @@ } else if (c1 < 0xF5) { - if (frm_end-frm_nxt < 4) + if (frm_end-frm_nxt < 2) return codecvt_base::partial; uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; - uint8_t c4 = frm_nxt[3]; switch (c1) { case 0xF0: @@ -2073,8 +2073,16 @@ return codecvt_base::error; break; } - if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) - return codecvt_base::error; + if (frm_end-frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; + if ((c3 & 0xC0) != 0x80) + return codecvt_base::error; + if (frm_end-frm_nxt < 4) + return codecvt_base::partial; + uint8_t c4 = frm_nxt[3]; + if ((c4 & 0xC0) != 0x80) + return codecvt_base::error; if (to_end-to_nxt < 2) return codecvt_base::partial; if ((((c1 & 7UL) << 18) + @@ -2143,10 +2151,9 @@ } else if (c1 < 0xF0) { - if (frm_end-frm_nxt < 3) + if (frm_end-frm_nxt < 2) return codecvt_base::partial; uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; switch (c1) { case 0xE0: @@ -2162,6 +2169,9 @@ return codecvt_base::error; break; } + if (frm_end-frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; if ((c3 & 0xC0) != 0x80) return codecvt_base::error; uint16_t t = static_cast(((c1 & 0x0F) << 12) @@ -2174,11 +2184,9 @@ } else if (c1 < 0xF5) { - if (frm_end-frm_nxt < 4) + if (frm_end-frm_nxt < 2) return codecvt_base::partial; uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; - uint8_t c4 = 
frm_nxt[3]; switch (c1) { case 0xF0: @@ -2194,8 +2202,16 @@ return codecvt_base::error; break; } - if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) - return codecvt_base::error; + if (frm_end-frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; + if ((c3 & 0xC0) != 0x80) + return codecvt_base::error; + if (frm_end-frm_nxt < 4) + return codecvt_base::partial; + uint8_t c4 = frm_nxt[3]; + if ((c4 & 0xC0) != 0x80) + return codecvt_base::error; if (to_end-to_nxt < 2) return codecvt_base::partial; if ((((c1 & 7UL) << 18) + @@ -2421,10 +2437,9 @@ } else if (c1 < 0xF0) { - if (frm_end-frm_nxt < 3) + if (frm_end-frm_nxt < 2) return codecvt_base::partial; uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; switch (c1) { case 0xE0: @@ -2440,6 +2455,9 @@ return codecvt_base::error; break; } + if (frm_end-frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; if ((c3 & 0xC0) != 0x80) return codecvt_base::error; uint32_t t = static_cast(((c1 & 0x0F) << 12) @@ -2452,11 +2470,9 @@ } else if (c1 < 0xF5) { - if (frm_end-frm_nxt < 4) + if (frm_end-frm_nxt < 2) return codecvt_base::partial; uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; - uint8_t c4 = frm_nxt[3]; switch (c1) { case 0xF0: @@ -2472,8 +2488,16 @@ return codecvt_base::error; break; } - if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) - return codecvt_base::error; + if (frm_end-frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; + if ((c3 & 0xC0) != 0x80) + return codecvt_base::error; + if (frm_end-frm_nxt < 4) + return codecvt_base::partial; + uint8_t c4 = frm_nxt[3]; + if ((c4 & 0xC0) != 0x80) + return codecvt_base::error; uint32_t t = static_cast(((c1 & 0x07) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) @@ -2679,10 +2703,9 @@ } else if (c1 < 0xF0) { - if (frm_end-frm_nxt < 3) + if (frm_end-frm_nxt < 2) return codecvt_base::partial; uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; switch (c1) { case 0xE0: @@ -2698,6 +2721,9 @@ return codecvt_base::error; 
break; } + if (frm_end-frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; if ((c3 & 0xC0) != 0x80) return codecvt_base::error; uint16_t t = static_cast(((c1 & 0x0F) << 12) Index: libcxx/test/std/localization/codecvt_unicode.pass.cpp =================================================================== --- /dev/null +++ libcxx/test/std/localization/codecvt_unicode.pass.cpp @@ -0,0 +1,1381 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0|12.0|13.0}} + +#include +#include +#include +#include +#include + +#include "test_macros.h" + +struct test_offsets_ok { + size_t in_size; + size_t out_size; +}; +struct test_offsets_partial { + size_t in_size; + size_t out_size; + size_t expected_in_next; + size_t expected_out_next; +}; + +template +struct test_offsets_error { + size_t in_size; + size_t out_size; + size_t expected_in_next; + size_t expected_out_next; + CharT replace_char; + size_t replace_pos; +}; + +#define array_size(x) (sizeof(x) / sizeof(x)[0]) + +template +void utf8_to_utf32_in_ok(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; + const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0}; + static_assert(array_size(input) == 11, ""); + static_assert(array_size(expected) == 5, ""); + + ExternT in[array_size(input)]; + InternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + 
std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 10); + assert(std::char_traits::length(exp) == 4); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}}; + for (auto t : offsets) { + InternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(std::char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } + + // Similar tests to above, but we always pass the full output buffer + for (auto t : offsets) { + InternT out[array_size(exp)] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, std::end(out), out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(std::char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf8_to_utf32_in_partial(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; + const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0}; + static_assert(array_size(input) == 11, ""); + static_assert(array_size(expected) == 5, ""); + + ExternT in[array_size(input)]; + InternT exp[array_size(expected)]; + 
std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 10); + assert(std::char_traits::length(exp) == 4); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + + {10, 3, 6, 3}, // no space for fourth CP + {7, 4, 6, 3}, // incomplete fourth CP + {8, 4, 6, 3}, // incomplete fourth CP + {9, 4, 6, 3}, // incomplete fourth CP + {7, 3, 6, 3}, // incomplete fourth CP, and no space for it + {8, 3, 6, 3}, // incomplete fourth CP, and no space for it + {9, 3, 6, 3}, // incomplete fourth CP, and no space for it + }; + + for (auto t : offsets) { + InternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf8_to_utf32_in_error(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const unsigned char input[] = 
"b\u0448\uD700\U0010AAAA"; + const char32_t expected[] = {'b', 0x0448, 0xD700, 0x10AAAA, 0}; + static_assert(array_size(input) == 11, ""); + static_assert(array_size(expected) == 5, ""); + + ExternT in[array_size(input)]; + InternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 10); + assert(std::char_traits::length(exp) == 4); + + // There are 5 classes of errors in UTF-8 decoding + // 1. Missing leading byte + // 2. Missing trailing byte + // 3. Surrogate CP + // 4. Overlong sequence + // 5. CP out of Unicode range + test_offsets_error offsets[] = { + + // 1. Missing leading byte. We will replace the leading byte with + // non-leading byte, such as a byte that is always invalid or a trailing + // byte. + + // replace leading byte with invalid byte + {1, 4, 0, 0, 0xFF, 0}, + {3, 4, 1, 1, 0xFF, 1}, + {6, 4, 3, 2, 0xFF, 3}, + {10, 4, 6, 3, 0xFF, 6}, + + // replace leading byte with trailing byte + {1, 4, 0, 0, 0b10101010, 0}, + {3, 4, 1, 1, 0b10101010, 1}, + {6, 4, 3, 2, 0b10101010, 3}, + {10, 4, 6, 3, 0b10101010, 6}, + + // 2. Missing trailing byte. We will replace the trailing byte with + // non-trailing byte, such as a byte that is always invalid or a leading + // byte (simple ASCII byte in our case). 
+ + // replace first trailing byte with ASCII byte + {3, 4, 1, 1, 'z', 2}, + {6, 4, 3, 2, 'z', 4}, + {10, 4, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 4, 1, 1, 0xFF, 2}, + {6, 4, 3, 2, 0xFF, 4}, + {10, 4, 6, 3, 0xFF, 7}, + + // replace second trailing byte with ASCII byte + {6, 4, 3, 2, 'z', 5}, + {10, 4, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 4, 3, 2, 0xFF, 5}, + {10, 4, 6, 3, 0xFF, 8}, + + // replace third trailing byte + {10, 4, 6, 3, 'z', 9}, + {10, 4, 6, 3, 0xFF, 9}, + + // 2.1 The following test-cases raise doubt whether error or partial should + // be returned. For example, we have 4-byte sequence with valid leading + // byte. If we hide the last byte we need to return partial. But, if the + // second or third byte, which are visible to the call to codecvt, are + // malformed then error should be returned. + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 4, 3, 2, 'z', 4}, + {8, 4, 6, 3, 'z', 7}, + {9, 4, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 4, 3, 2, 0xFF, 4}, + {8, 4, 6, 3, 0xFF, 7}, + {9, 4, 6, 3, 0xFF, 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 4, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 4, 6, 3, 0xFF, 8}, + + // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte + // CP U+D700 + {6, 4, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800 + {6, 4, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00 + {6, 4, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00 + {6, 4, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00 + + // 4. Overlong sequence. The CPs in the input are chosen such that modifying + // just the leading byte is enough to make them overlong, i.e. for the + // 3-byte and 4-byte CP the second byte (first trailing) has enough leading + // zeroes. 
+ {3, 4, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong + {3, 4, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong + {6, 4, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong + {10, 4, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong + + // 5. CP above range + // turn U+10AAAA into U+14AAAA by changing its leading byte + {10, 4, 6, 3, 0b11110101, 6}, + // turn U+10AAAA into U+11AAAA by changing its 2nd byte + {10, 4, 6, 3, 0b10011010, 7}, + }; + for (auto t : offsets) { + InternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto old_char = in[t.replace_pos]; + in[t.replace_pos] = t.replace_char; + + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + + in[t.replace_pos] = old_char; + } +} + +template +void utf8_to_utf32_in(const std::codecvt& cvt) { + utf8_to_utf32_in_ok(cvt); + utf8_to_utf32_in_partial(cvt); + utf8_to_utf32_in_error(cvt); +} + +template +void utf32_to_utf8_out_ok(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0}; + const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; + static_assert(array_size(input) == 5, ""); + static_assert(array_size(expected) == 11, ""); + + InternT in[array_size(input)]; + ExternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + 
std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 4); + assert(std::char_traits::length(exp) == 10); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}}; + for (auto t : offsets) { + ExternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + mbstate_t state = {}; + const InternT* in_next = nullptr; + ExternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(std::char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf32_to_utf8_out_partial(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0}; + const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; + static_assert(array_size(input) == 5, ""); + static_assert(array_size(expected) == 11, ""); + + InternT in[array_size(input)]; + ExternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 4); + assert(std::char_traits::length(exp) == 10); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + + {4, 6, 3, 6}, // no space for fourth CP + {4, 7, 3, 6}, // no space for fourth CP + {4, 8, 3, 6}, // no space for fourth CP + {4, 9, 3, 6}, // no space for 
fourth CP + }; + for (auto t : offsets) { + ExternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + mbstate_t state = {}; + const InternT* in_next = nullptr; + ExternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf32_to_utf8_out_error(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0}; + const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; + static_assert(array_size(input) == 5, ""); + static_assert(array_size(expected) == 11, ""); + + InternT in[array_size(input)]; + ExternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 4); + assert(std::char_traits::length(exp) == 10); + + test_offsets_error offsets[] = { + + // Surrogate CP + {4, 10, 0, 0, 0xD800, 0}, + {4, 10, 1, 1, 0xDBFF, 1}, + {4, 10, 2, 3, 0xDC00, 2}, + {4, 10, 3, 6, 0xDFFF, 3}, + + // CP out of range + {4, 10, 0, 0, 0x00110000, 0}, + {4, 10, 1, 1, 0x00110000, 1}, + {4, 10, 2, 3, 0x00110000, 2}, + {4, 10, 3, 6, 0x00110000, 3}}; + + for (auto t : offsets) { + ExternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= 
t.out_size); + auto old_char = in[t.replace_pos]; + in[t.replace_pos] = t.replace_char; + + mbstate_t state = {}; + const InternT* in_next = nullptr; + ExternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + + in[t.replace_pos] = old_char; + } +} + +template +void utf32_to_utf8_out(const std::codecvt& cvt) { + utf32_to_utf8_out_ok(cvt); + utf32_to_utf8_out_partial(cvt); + utf32_to_utf8_out_error(cvt); +} + +template +void test_utf8_utf32_cvt(const std::codecvt& cvt) { + utf8_to_utf32_in(cvt); + utf32_to_utf8_out(cvt); +} + +template +void utf8_to_utf16_in_ok(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; + const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + static_assert(array_size(input) == 11, ""); + static_assert(array_size(expected) == 6, ""); + + ExternT in[array_size(input)]; + InternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 10); + assert(std::char_traits::length(exp) == 5); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}}; + for (auto t : offsets) { + InternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, 
in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(std::char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } + + for (auto t : offsets) { + InternT out[array_size(exp)] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, std::end(out), out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(std::char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf8_to_utf16_in_partial(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; + const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + static_assert(array_size(input) == 11, ""); + static_assert(array_size(expected) == 6, ""); + + ExternT in[array_size(input)]; + InternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 10); + assert(std::char_traits::length(exp) == 5); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + 
{5, 2, 3, 2}, // incomplete third CP, and no space for it + + {10, 3, 6, 3}, // no space for fourth CP + {10, 4, 6, 3}, // no space for fourth CP + {7, 5, 6, 3}, // incomplete fourth CP + {8, 5, 6, 3}, // incomplete fourth CP + {9, 5, 6, 3}, // incomplete fourth CP + {7, 3, 6, 3}, // incomplete fourth CP, and no space for it + {8, 3, 6, 3}, // incomplete fourth CP, and no space for it + {9, 3, 6, 3}, // incomplete fourth CP, and no space for it + {7, 4, 6, 3}, // incomplete fourth CP, and no space for it + {8, 4, 6, 3}, // incomplete fourth CP, and no space for it + {9, 4, 6, 3}, // incomplete fourth CP, and no space for it + + }; + + for (auto t : offsets) { + InternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf8_to_utf16_in_error(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const unsigned char input[] = "b\u0448\uD700\U0010AAAA"; + const char16_t expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; + static_assert(array_size(input) == 11, ""); + static_assert(array_size(expected) == 6, ""); + + ExternT in[array_size(input)]; + InternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + 
assert(std::char_traits::length(in) == 10); + assert(std::char_traits::length(exp) == 5); + + // There are 5 classes of errors in UTF-8 decoding + // 1. Missing leading byte + // 2. Missing trailing byte + // 3. Surrogate CP + // 4. Overlong sequence + // 5. CP out of Unicode range + test_offsets_error offsets[] = { + + // 1. Missing leading byte. We will replace the leading byte with + // non-leading byte, such as a byte that is always invalid or a trailing + // byte. + + // replace leading byte with invalid byte + {1, 5, 0, 0, 0xFF, 0}, + {3, 5, 1, 1, 0xFF, 1}, + {6, 5, 3, 2, 0xFF, 3}, + {10, 5, 6, 3, 0xFF, 6}, + + // replace leading byte with trailing byte + {1, 5, 0, 0, 0b10101010, 0}, + {3, 5, 1, 1, 0b10101010, 1}, + {6, 5, 3, 2, 0b10101010, 3}, + {10, 5, 6, 3, 0b10101010, 6}, + + // 2. Missing trailing byte. We will replace the trailing byte with + // non-trailing byte, such as a byte that is always invalid or a leading + // byte (simple ASCII byte in our case). + + // replace first trailing byte with ASCII byte + {3, 5, 1, 1, 'z', 2}, + {6, 5, 3, 2, 'z', 4}, + {10, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 5, 1, 1, 0xFF, 2}, + {6, 5, 3, 2, 0xFF, 4}, + {10, 5, 6, 3, 0xFF, 7}, + + // replace second trailing byte with ASCII byte + {6, 5, 3, 2, 'z', 5}, + {10, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 5, 3, 2, 0xFF, 5}, + {10, 5, 6, 3, 0xFF, 8}, + + // replace third trailing byte + {10, 5, 6, 3, 'z', 9}, + {10, 5, 6, 3, 0xFF, 9}, + + // 2.1 The following test-cases raise doubt whether error or partial should + // be returned. For example, we have 4-byte sequence with valid leading + // byte. If we hide the last byte we need to return partial. But, if the + // second or third byte, which are visible to the call to codecvt, are + // malformed then error should be returned. 
+ + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 5, 3, 2, 'z', 4}, + {8, 5, 6, 3, 'z', 7}, + {9, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 5, 3, 2, 0xFF, 4}, + {8, 5, 6, 3, 0xFF, 7}, + {9, 5, 6, 3, 0xFF, 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 5, 6, 3, 0xFF, 8}, + + // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte + // CP U+D700 + {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800 + {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00 + {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00 + {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00 + + // 4. Overlong sequence. The CPs in the input are chosen such that modifying + // just the leading byte is enough to make them overlong, i.e. for the + // 3-byte and 4-byte CP the second byte (first trailing) has enough leading + // zeroes. + {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong + {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong + {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong + {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong + + // 5. 
CP above range + // turn U+10AAAA into U+14AAAA by changing its leading byte + {10, 5, 6, 3, 0b11110101, 6}, + // turn U+10AAAA into U+11AAAA by changing its 2nd byte + {10, 5, 6, 3, 0b10011010, 7}, + }; + for (auto t : offsets) { + InternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto old_char = in[t.replace_pos]; + in[t.replace_pos] = t.replace_char; + + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + + in[t.replace_pos] = old_char; + } +} + +template +void utf8_to_utf16_in(const std::codecvt& cvt) { + utf8_to_utf16_in_ok(cvt); + utf8_to_utf16_in_partial(cvt); + utf8_to_utf16_in_error(cvt); +} + +template +void utf16_to_utf8_out_ok(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; + static_assert(array_size(input) == 6, ""); + static_assert(array_size(expected) == 11, ""); + + InternT in[array_size(input)]; + ExternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 5); + assert(std::char_traits::length(exp) == 10); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}}; + for (auto t 
: offsets) { + ExternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + mbstate_t state = {}; + const InternT* in_next = nullptr; + ExternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(std::char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf16_to_utf8_out_partial(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; + static_assert(array_size(input) == 6, ""); + static_assert(array_size(expected) == 11, ""); + + InternT in[array_size(input)]; + ExternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 5); + assert(std::char_traits::length(exp) == 10); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + + {5, 6, 3, 6}, // no space for fourth CP + {5, 7, 3, 6}, // no space for fourth CP + {5, 8, 3, 6}, // no space for fourth CP + {5, 9, 3, 6}, // no space for fourth CP + + {4, 10, 3, 6}, // incomplete fourth CP + + {4, 6, 3, 6}, // incomplete fourth CP, and no space for it + {4, 7, 3, 6}, // incomplete fourth CP, and no space for it + {4, 8, 3, 6}, // incomplete fourth CP, and no space for it + {4, 9, 3, 6}, 
// incomplete fourth CP, and no space for it + }; + for (auto t : offsets) { + ExternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + mbstate_t state = {}; + const InternT* in_next = nullptr; + ExternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf16_to_utf8_out_error(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP + const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; + static_assert(array_size(input) == 6, ""); + static_assert(array_size(expected) == 11, ""); + + InternT in[array_size(input)]; + ExternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 5); + assert(std::char_traits::length(exp) == 10); + + // The only possible error in UTF-16 is unpaired surrogate code units. + // So we replace valid code points (scalar values) with lone surrogate CU. 
+ test_offsets_error offsets[] = { + {5, 10, 0, 0, 0xD800, 0}, + {5, 10, 0, 0, 0xDBFF, 0}, + {5, 10, 0, 0, 0xDC00, 0}, + {5, 10, 0, 0, 0xDFFF, 0}, + + {5, 10, 1, 1, 0xD800, 1}, + {5, 10, 1, 1, 0xDBFF, 1}, + {5, 10, 1, 1, 0xDC00, 1}, + {5, 10, 1, 1, 0xDFFF, 1}, + + {5, 10, 2, 3, 0xD800, 2}, + {5, 10, 2, 3, 0xDBFF, 2}, + {5, 10, 2, 3, 0xDC00, 2}, + {5, 10, 2, 3, 0xDFFF, 2}, + + // make the leading surrogate a trailing one + {5, 10, 3, 6, 0xDC00, 3}, + {5, 10, 3, 6, 0xDFFF, 3}, + + // make the trailing surrogate a leading one + {5, 10, 3, 6, 0xD800, 4}, + {5, 10, 3, 6, 0xDBFF, 4}, + + // make the trailing surrogate a BMP char + {5, 10, 3, 6, 'z', 4}, + }; + + for (auto t : offsets) { + ExternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto old_char = in[t.replace_pos]; + in[t.replace_pos] = t.replace_char; + + mbstate_t state = {}; + const InternT* in_next = nullptr; + ExternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + + in[t.replace_pos] = old_char; + } +} + +template +void utf16_to_utf8_out(const std::codecvt& cvt) { + utf16_to_utf8_out_ok(cvt); + utf16_to_utf8_out_partial(cvt); + utf16_to_utf8_out_error(cvt); +} + +template +void test_utf8_utf16_cvt(const std::codecvt& cvt) { + utf8_to_utf16_in(cvt); + utf16_to_utf8_out(cvt); +} + +template +void utf8_to_ucs2_in_ok(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP and 3-byte CP + const unsigned char input[] = "b\u0448\uAAAA"; + 
const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0}; + static_assert(array_size(input) == 7, ""); + static_assert(array_size(expected) == 4, ""); + + ExternT in[array_size(input)]; + InternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 6); + assert(std::char_traits::length(exp) == 3); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}}; + for (auto t : offsets) { + InternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(std::char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } + + for (auto t : offsets) { + InternT out[array_size(exp)] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, std::end(out), out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(std::char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf8_to_ucs2_in_partial(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP and 3-byte CP + const unsigned char input[] = "b\u0448\uAAAA"; + const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0}; + 
static_assert(array_size(input) == 7, ""); + static_assert(array_size(expected) == 4, ""); + + ExternT in[array_size(input)]; + InternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 6); + assert(std::char_traits::length(exp) == 3); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + }; + + for (auto t : offsets) { + InternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf8_to_ucs2_in_error(const std::codecvt& cvt) { + const unsigned char input[] = "b\u0448\uD700\U0010AAAA"; + const char16_t expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; + static_assert(array_size(input) == 11, ""); + static_assert(array_size(expected) == 6, ""); + + ExternT in[array_size(input)]; + InternT exp[array_size(expected)]; + 
std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 10); + assert(std::char_traits::length(exp) == 5); + + // There are 5 classes of errors in UTF-8 decoding + // 1. Missing leading byte + // 2. Missing trailing byte + // 3. Surrogate CP + // 4. Overlong sequence + // 5. CP out of Unicode range + test_offsets_error offsets[] = { + + // 1. Missing leading byte. We will replace the leading byte with + // non-leading byte, such as a byte that is always invalid or a trailing + // byte. + + // replace leading byte with invalid byte + {1, 5, 0, 0, 0xFF, 0}, + {3, 5, 1, 1, 0xFF, 1}, + {6, 5, 3, 2, 0xFF, 3}, + {10, 5, 6, 3, 0xFF, 6}, + + // replace leading byte with trailing byte + {1, 5, 0, 0, 0b10101010, 0}, + {3, 5, 1, 1, 0b10101010, 1}, + {6, 5, 3, 2, 0b10101010, 3}, + {10, 5, 6, 3, 0b10101010, 6}, + + // 2. Missing trailing byte. We will replace the trailing byte with + // non-trailing byte, such as a byte that is always invalid or a leading + // byte (simple ASCII byte in our case). + + // replace first trailing byte with ASCII byte + {3, 5, 1, 1, 'z', 2}, + {6, 5, 3, 2, 'z', 4}, + {10, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 5, 1, 1, 0xFF, 2}, + {6, 5, 3, 2, 0xFF, 4}, + {10, 5, 6, 3, 0xFF, 7}, + + // replace second trailing byte with ASCII byte + {6, 5, 3, 2, 'z', 5}, + {10, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 5, 3, 2, 0xFF, 5}, + {10, 5, 6, 3, 0xFF, 8}, + + // replace third trailing byte + {10, 5, 6, 3, 'z', 9}, + {10, 5, 6, 3, 0xFF, 9}, + + // 2.1 The following test-cases raise doubt whether error or partial should + // be returned. For example, we have 4-byte sequence with valid leading + // byte. If we hide the last byte we need to return partial.
But, if the + // second or third byte, which are visible to the call to codecvt, are + // malformed then error should be returned. + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 5, 3, 2, 'z', 4}, + {8, 5, 6, 3, 'z', 7}, + {9, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 5, 3, 2, 0xFF, 4}, + {8, 5, 6, 3, 0xFF, 7}, + {9, 5, 6, 3, 0xFF, 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 5, 6, 3, 0xFF, 8}, + + // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte + // CP U+D700 + {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800 + {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00 + {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00 + {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00 + + // 4. Overlong sequence. The CPs in the input are chosen such that modifying + // just the leading byte is enough to make them overlong, i.e. for the + // 3-byte and 4-byte CP the second byte (first trailing) has enough leading + // zeroes. + {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong + {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong + {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong + {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong + + // 5. CP above range + // turn U+10AAAA into U+14AAAA by changing its leading byte + {10, 5, 6, 3, 0b11110101, 6}, + // turn U+10AAAA into U+11AAAA by changing its 2nd byte + {10, 5, 6, 3, 0b10011010, 7}, + // Don't replace anything, show full 4-byte CP U+10AAAA + {10, 4, 6, 3, 'b', 0}, + {10, 5, 6, 3, 'b', 0}, + // Don't replace anything, show incomplete 4-byte CP at the end. It's still + // out of UCS2 range just by seeing the first byte.
+ {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP + {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP + {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP + }; + for (auto t : offsets) { + InternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto old_char = in[t.replace_pos]; + in[t.replace_pos] = t.replace_char; + + mbstate_t state = {}; + const ExternT* in_next = nullptr; + InternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + + in[t.replace_pos] = old_char; + } +} + +template +void utf8_to_ucs2_in(const std::codecvt& cvt) { + utf8_to_ucs2_in_ok(cvt); + utf8_to_ucs2_in_partial(cvt); + utf8_to_ucs2_in_error(cvt); +} + +template +void ucs2_to_utf8_out_ok(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP and 3-byte CP + const char16_t input[] = {'b', 0x0448, 0xAAAA, 0}; + const unsigned char expected[] = "b\u0448\uAAAA"; + static_assert(array_size(input) == 4, ""); + static_assert(array_size(expected) == 7, ""); + + InternT in[array_size(input)]; + ExternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 3); + assert(std::char_traits::length(exp) == 6); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 
1}, {2, 3}, {3, 6}}; + for (auto t : offsets) { + ExternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + mbstate_t state = {}; + const InternT* in_next = nullptr; + ExternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(std::char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void ucs2_to_utf8_out_partial(const std::codecvt& cvt) { + // UTF-8 string of 1-byte code point (CP), 2-byte CP and 3-byte CP + const char16_t input[] = {'b', 0x0448, 0xAAAA, 0}; + const unsigned char expected[] = "b\u0448\uAAAA"; + static_assert(array_size(input) == 4, ""); + static_assert(array_size(expected) == 7, ""); + + InternT in[array_size(input)]; + ExternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 3); + assert(std::char_traits::length(exp) == 6); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + }; + for (auto t : offsets) { + ExternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + mbstate_t state = {}; + const InternT* in_next = nullptr; + ExternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.out(state, in, 
in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void ucs2_to_utf8_out_error(const std::codecvt& cvt) { + const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; + static_assert(array_size(input) == 6, ""); + static_assert(array_size(expected) == 11, ""); + + InternT in[array_size(input)]; + ExternT exp[array_size(expected)]; + std::copy(std::begin(input), std::end(input), std::begin(in)); + std::copy(std::begin(expected), std::end(expected), std::begin(exp)); + assert(std::char_traits::length(in) == 5); + assert(std::char_traits::length(exp) == 10); + + test_offsets_error offsets[] = { + {5, 10, 0, 0, 0xD800, 0}, + {5, 10, 0, 0, 0xDBFF, 0}, + {5, 10, 0, 0, 0xDC00, 0}, + {5, 10, 0, 0, 0xDFFF, 0}, + + {5, 10, 1, 1, 0xD800, 1}, + {5, 10, 1, 1, 0xDBFF, 1}, + {5, 10, 1, 1, 0xDC00, 1}, + {5, 10, 1, 1, 0xDFFF, 1}, + + {5, 10, 2, 3, 0xD800, 2}, + {5, 10, 2, 3, 0xDBFF, 2}, + {5, 10, 2, 3, 0xDC00, 2}, + {5, 10, 2, 3, 0xDFFF, 2}, + + // don't replace anything, just show the surrogate pair + {5, 10, 3, 6, 'b', 0}, + + // make the leading surrogate a trailing one + {5, 10, 3, 6, 0xDC00, 3}, + {5, 10, 3, 6, 0xDFFF, 3}, + + // make the trailing surrogate a leading one + {5, 10, 3, 6, 0xD800, 4}, + {5, 10, 3, 6, 0xDBFF, 4}, + + // make the trailing surrogate a BMP char + {5, 10, 3, 6, 'z', 4}, + + {5, 7, 3, 6, 'b', 0}, // no space for fourth CP + {5, 8, 3, 6, 'b', 0}, // no space for fourth CP + {5, 9, 3, 6, 'b', 0}, // no space for fourth CP + + {4, 10, 3, 6, 'b', 0}, // incomplete fourth CP + {4, 7, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it + {4, 8, 3, 6, 'b', 0}, // incomplete
fourth CP, and no space for it + {4, 9, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it + + }; + + for (auto t : offsets) { + ExternT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto old_char = in[t.replace_pos]; + in[t.replace_pos] = t.replace_char; + + mbstate_t state = {}; + const InternT* in_next = nullptr; + ExternT* out_next = nullptr; + std::codecvt_base::result res = std::codecvt_base::ok; + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(std::char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + + in[t.replace_pos] = old_char; + } +} + +template +void ucs2_to_utf8_out(const std::codecvt& cvt) { + ucs2_to_utf8_out_ok(cvt); + ucs2_to_utf8_out_partial(cvt); + ucs2_to_utf8_out_error(cvt); +} + +template +void test_utf8_ucs2_cvt(const std::codecvt& cvt) { + utf8_to_ucs2_in(cvt); + ucs2_to_utf8_out(cvt); +} + +void test_utf8_utf32_codecvts() { + using codecvt_c32 = std::codecvt; + const std::locale& loc = std::locale::classic(); + assert(std::has_facet(loc)); + + const codecvt_c32& cvt = std::use_facet(loc); + test_utf8_utf32_cvt(cvt); + + std::codecvt_utf8 cvt2; + test_utf8_utf32_cvt(cvt2); + +#if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR) + std::codecvt_utf8 cvt3; + test_utf8_utf32_cvt(cvt3); +#endif + +#ifndef TEST_HAS_NO_CHAR8_T + using codecvt_c32_c8 = std::codecvt; + assert(std::has_facet(loc)); + const codecvt_c32_c8& cvt4 = std::use_facet(loc); + test_utf8_utf32_cvt(cvt4); +#endif +} + +void test_utf8_utf16_codecvts() { + using codecvt_c16 = std::codecvt; + const std::locale& loc = std::locale::classic(); 
+ assert(std::has_facet(loc)); + + const codecvt_c16& cvt = std::use_facet(loc); + test_utf8_utf16_cvt(cvt); + + std::codecvt_utf8_utf16 cvt2; + test_utf8_utf16_cvt(cvt2); + + std::codecvt_utf8_utf16 cvt3; + test_utf8_utf16_cvt(cvt3); + +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + std::codecvt_utf8_utf16 cvt4; + test_utf8_utf16_cvt(cvt4); +#endif + +#ifndef TEST_HAS_NO_CHAR8_T + using codecvt_c16_c8 = std::codecvt; + assert(std::has_facet(loc)); + const codecvt_c16_c8& cvt5 = std::use_facet(loc); + test_utf8_utf16_cvt(cvt5); +#endif +} + +void test_utf8_ucs2_codecvts() { + std::codecvt_utf8 cvt; + test_utf8_ucs2_cvt(cvt); + +#if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR) + std::codecvt_utf8 cvt2; + test_utf8_ucs2_cvt(cvt2); +#endif +} + +int main() { + test_utf8_utf32_codecvts(); + test_utf8_utf16_codecvts(); + test_utf8_ucs2_codecvts(); +}