Index: libcxx/src/locale.cpp =================================================================== --- libcxx/src/locale.cpp +++ libcxx/src/locale.cpp @@ -2022,25 +2022,26 @@ } else if (c1 < 0xF0) { - if (frm_end-frm_nxt < 3) - return codecvt_base::partial; - uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; - switch (c1) - { - case 0xE0: - if ((c2 & 0xE0) != 0xA0) - return codecvt_base::error; - break; - case 0xED: - if ((c2 & 0xE0) != 0x80) - return codecvt_base::error; - break; - default: - if ((c2 & 0xC0) != 0x80) - return codecvt_base::error; - break; + if (frm_end - frm_nxt < 2) + return codecvt_base::partial; + uint8_t c2 = frm_nxt[1]; + switch (c1) { + case 0xE0: + if ((c2 & 0xE0) != 0xA0) + return codecvt_base::error; + break; + case 0xED: + if ((c2 & 0xE0) != 0x80) + return codecvt_base::error; + break; + default: + if ((c2 & 0xC0) != 0x80) + return codecvt_base::error; + break; } + if (frm_end - frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; if ((c3 & 0xC0) != 0x80) return codecvt_base::error; uint16_t t = static_cast(((c1 & 0x0F) << 12) @@ -2053,28 +2054,33 @@ } else if (c1 < 0xF5) { - if (frm_end-frm_nxt < 4) - return codecvt_base::partial; - uint8_t c2 = frm_nxt[1]; + if (frm_end - frm_nxt < 2) + return codecvt_base::partial; + uint8_t c2 = frm_nxt[1]; + switch (c1) { + case 0xF0: + if (!(0x90 <= c2 && c2 <= 0xBF)) + return codecvt_base::error; + break; + case 0xF4: + if ((c2 & 0xF0) != 0x80) + return codecvt_base::error; + break; + default: + if ((c2 & 0xC0) != 0x80) + return codecvt_base::error; + break; + } + if (frm_end - frm_nxt < 3) + return codecvt_base::partial; uint8_t c3 = frm_nxt[2]; + if ((c3 & 0xC0) != 0x80) + return codecvt_base::error; + if (frm_end - frm_nxt < 4) + return codecvt_base::partial; uint8_t c4 = frm_nxt[3]; - switch (c1) - { - case 0xF0: - if (!(0x90 <= c2 && c2 <= 0xBF)) - return codecvt_base::error; - break; - case 0xF4: - if ((c2 & 0xF0) != 0x80) - return codecvt_base::error; - break; - default: - if ((c2 & 0xC0) != 0x80) - return codecvt_base::error; - break; - } - if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) - return codecvt_base::error; + if ((c4 & 0xC0) != 0x80) + return codecvt_base::error; if (to_end-to_nxt < 2) return codecvt_base::partial; if ((((c1 & 7UL) << 18) + @@ -2143,25 +2149,26 @@ } else if (c1 < 0xF0) { - if (frm_end-frm_nxt < 3) - return codecvt_base::partial; - uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; - switch (c1) - { - case 0xE0: - if ((c2 & 0xE0) != 0xA0) - return codecvt_base::error; - break; - case 0xED: - if ((c2 & 0xE0) != 0x80) - return codecvt_base::error; - break; - default: - if ((c2 & 0xC0) != 0x80) - return codecvt_base::error; - break; + if (frm_end - frm_nxt < 2) + return codecvt_base::partial; + uint8_t c2 = frm_nxt[1]; + switch (c1) { + case 0xE0: + if ((c2 & 0xE0) != 0xA0) + return codecvt_base::error; + break; + case 0xED: + if ((c2 & 0xE0) != 0x80) + return codecvt_base::error; + break; + default: + if ((c2 & 0xC0) != 0x80) + return codecvt_base::error; + break; } + if (frm_end - frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; if ((c3 & 0xC0) != 0x80) return codecvt_base::error; uint16_t t = static_cast(((c1 & 0x0F) << 12) @@ -2174,28 +2181,33 @@ } else if (c1 < 0xF5) { - if (frm_end-frm_nxt < 4) - return codecvt_base::partial; - uint8_t c2 = frm_nxt[1]; + if (frm_end - frm_nxt < 2) + return codecvt_base::partial; + uint8_t c2 = frm_nxt[1]; + switch (c1) { + case 0xF0: + if (!(0x90 <= c2 && c2 <= 0xBF)) + return codecvt_base::error; + break; + case 0xF4: + if ((c2 & 0xF0) != 0x80) + return codecvt_base::error; + break; + default: + if ((c2 & 0xC0) != 0x80) + return codecvt_base::error; + break; + } + if (frm_end - frm_nxt < 3) + return codecvt_base::partial; uint8_t c3 = frm_nxt[2]; + if ((c3 & 0xC0) != 0x80) + return codecvt_base::error; + if (frm_end - frm_nxt < 4) + return codecvt_base::partial; uint8_t c4 = frm_nxt[3]; - switch (c1) - { - case 0xF0: - if (!(0x90 <= c2 && c2 <= 0xBF)) - return codecvt_base::error; - break; - case 0xF4: - if ((c2 & 0xF0) != 0x80) - return codecvt_base::error; - break; - default: - if ((c2 & 0xC0) != 0x80) - return codecvt_base::error; - break; - } - if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) - return codecvt_base::error; + if ((c4 & 0xC0) != 0x80) + return codecvt_base::error; if (to_end-to_nxt < 2) return codecvt_base::partial; if ((((c1 & 7UL) << 18) + @@ -2421,25 +2433,26 @@ } else if (c1 < 0xF0) { - if (frm_end-frm_nxt < 3) - return codecvt_base::partial; - uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; - switch (c1) - { - case 0xE0: - if ((c2 & 0xE0) != 0xA0) - return codecvt_base::error; - break; - case 0xED: - if ((c2 & 0xE0) != 0x80) - return codecvt_base::error; - break; - default: - if ((c2 & 0xC0) != 0x80) - return codecvt_base::error; - break; + if (frm_end - frm_nxt < 2) + return codecvt_base::partial; + uint8_t c2 = frm_nxt[1]; + switch (c1) { + case 0xE0: + if ((c2 & 0xE0) != 0xA0) + return codecvt_base::error; + break; + case 0xED: + if ((c2 & 0xE0) != 0x80) + return codecvt_base::error; + break; + default: + if ((c2 & 0xC0) != 0x80) + return codecvt_base::error; + break; } + if (frm_end - frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; if ((c3 & 0xC0) != 0x80) return codecvt_base::error; uint32_t t = static_cast(((c1 & 0x0F) << 12) @@ -2452,28 +2465,33 @@ } else if (c1 < 0xF5) { - if (frm_end-frm_nxt < 4) - return codecvt_base::partial; - uint8_t c2 = frm_nxt[1]; + if (frm_end - frm_nxt < 2) + return codecvt_base::partial; + uint8_t c2 = frm_nxt[1]; + switch (c1) { + case 0xF0: + if (!(0x90 <= c2 && c2 <= 0xBF)) + return codecvt_base::error; + break; + case 0xF4: + if ((c2 & 0xF0) != 0x80) + return codecvt_base::error; + break; + default: + if ((c2 & 0xC0) != 0x80) + return codecvt_base::error; + break; + } + if (frm_end - frm_nxt < 3) + return codecvt_base::partial; uint8_t c3 = frm_nxt[2]; + if ((c3 & 0xC0) != 0x80) + return codecvt_base::error; + if (frm_end - frm_nxt < 4) + return codecvt_base::partial; uint8_t c4 = frm_nxt[3]; - switch (c1) - { - case 0xF0: - if (!(0x90 <= c2 && c2 <= 0xBF)) - return codecvt_base::error; - break; - case 0xF4: - if ((c2 & 0xF0) != 0x80) - return codecvt_base::error; - break; - default: - if ((c2 & 0xC0) != 0x80) - return codecvt_base::error; - break; - } - if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) - return codecvt_base::error; + if ((c4 & 0xC0) != 0x80) + return codecvt_base::error; uint32_t t = static_cast(((c1 & 0x07) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) @@ -2679,25 +2697,26 @@ } else if (c1 < 0xF0) { - if (frm_end-frm_nxt < 3) - return codecvt_base::partial; - uint8_t c2 = frm_nxt[1]; - uint8_t c3 = frm_nxt[2]; - switch (c1) - { - case 0xE0: - if ((c2 & 0xE0) != 0xA0) - return codecvt_base::error; - break; - case 0xED: - if ((c2 & 0xE0) != 0x80) - return codecvt_base::error; - break; - default: - if ((c2 & 0xC0) != 0x80) - return codecvt_base::error; - break; + if (frm_end - frm_nxt < 2) + return codecvt_base::partial; + uint8_t c2 = frm_nxt[1]; + switch (c1) { + case 0xE0: + if ((c2 & 0xE0) != 0xA0) + return codecvt_base::error; + break; + case 0xED: + if ((c2 & 0xE0) != 0x80) + return codecvt_base::error; + break; + default: + if ((c2 & 0xC0) != 0x80) + return codecvt_base::error; + break; } + if (frm_end - frm_nxt < 3) + return codecvt_base::partial; + uint8_t c3 = frm_nxt[2]; if ((c3 & 0xC0) != 0x80) return codecvt_base::error; uint16_t t = static_cast(((c1 & 0x0F) << 12) Index: libcxx/test/std/localization/codecvt_unicode.h =================================================================== --- /dev/null +++ libcxx/test/std/localization/codecvt_unicode.h @@ -0,0 +1,1155 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +struct test_offsets_ok { + size_t in_size, out_size; +}; +struct test_offsets_partial { + size_t in_size, out_size, expected_in_next, expected_out_next; +}; + +template +struct test_offsets_error { + size_t in_size, out_size, expected_in_next, expected_out_next; + CharT replace_char; + size_t replace_pos; +}; + +template +auto constexpr array_size(const T (&)[N]) -> size_t { + return N; +} + +template +void utf8_to_utf32_in_ok(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char32_t exp_literal[] = U"bш\uAAAA\U0010AAAA"; + CharT exp[array_size(exp_literal)] = {}; + std::copy(begin(exp_literal), end(exp_literal), begin(exp)); + + static_assert(array_size(in) == 11, ""); + static_assert(array_size(exp_literal) == 5, ""); + static_assert(array_size(exp) == 5, ""); + assert(char_traits::length(in) == 10); + assert(char_traits::length(exp_literal) == 4); + assert(char_traits::length(exp) == 4); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}}; + for (auto t : offsets) { + CharT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } + + for (auto t : offsets) { + CharT out[array_size(exp)] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf8_to_utf32_in_partial(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char32_t exp_literal[] = U"bш\uAAAA\U0010AAAA"; + CharT exp[array_size(exp_literal)] = {}; + std::copy(begin(exp_literal), end(exp_literal), begin(exp)); + + static_assert(array_size(in) == 11, ""); + static_assert(array_size(exp_literal) == 5, ""); + static_assert(array_size(exp) == 5, ""); + assert(char_traits::length(in) == 10); + assert(char_traits::length(exp_literal) == 4); + assert(char_traits::length(exp) == 4); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + + {10, 3, 6, 3}, // no space for fourth CP + {7, 4, 6, 3}, // incomplete fourth CP + {8, 4, 6, 3}, // incomplete fourth CP + {9, 4, 6, 3}, // incomplete fourth CP + {7, 3, 6, 3}, // incomplete fourth CP, and no space for it + {8, 3, 6, 3}, // incomplete fourth CP, and no space for it + {9, 3, 6, 3}, // incomplete fourth CP, and no space for it + }; + + for (auto t : offsets) { + CharT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf8_to_utf32_in_error(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char valid_in[] = "bш\uAAAA\U0010AAAA"; + const char32_t exp_literal[] = U"bш\uAAAA\U0010AAAA"; + CharT exp[array_size(exp_literal)] = {}; + std::copy(begin(exp_literal), end(exp_literal), begin(exp)); + + static_assert(array_size(valid_in) == 11, ""); + static_assert(array_size(exp_literal) == 5, ""); + static_assert(array_size(exp) == 5, ""); + assert(char_traits::length(valid_in) == 10); + assert(char_traits::length(exp_literal) == 4); + assert(char_traits::length(exp) == 4); + + test_offsets_error offsets[] = { + + // replace leading byte with invalid byte + {1, 4, 0, 0, '\xFF', 0}, + {3, 4, 1, 1, '\xFF', 1}, + {6, 4, 3, 2, '\xFF', 3}, + {10, 4, 6, 3, '\xFF', 6}, + + // replace first trailing byte with ASCII byte + {3, 4, 1, 1, 'z', 2}, + {6, 4, 3, 2, 'z', 4}, + {10, 4, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 4, 1, 1, '\xFF', 2}, + {6, 4, 3, 2, '\xFF', 4}, + {10, 4, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte + {6, 4, 3, 2, 'z', 5}, + {10, 4, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 4, 3, 2, '\xFF', 5}, + {10, 4, 6, 3, '\xFF', 8}, + + // replace third trailing byte + {10, 4, 6, 3, 'z', 9}, + {10, 4, 6, 3, '\xFF', 9}, + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 4, 3, 2, 'z', 4}, + {8, 4, 6, 3, 'z', 7}, + {9, 4, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 4, 3, 2, '\xFF', 4}, + {8, 4, 6, 3, '\xFF', 7}, + {9, 4, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 4, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 4, 6, 3, '\xFF', 8}, + }; + for (auto t : offsets) { + char in[array_size(valid_in)] = {}; + CharT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + char_traits::copy(in, valid_in, array_size(valid_in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf8_to_utf32_in(const std::codecvt& cvt) { + utf8_to_utf32_in_ok(cvt); + utf8_to_utf32_in_partial(cvt); + utf8_to_utf32_in_error(cvt); +} + +template +void utf32_to_utf8_out_ok(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char32_t in_literal[] = U"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size(in_literal)] = {}; + copy(begin(in_literal), end(in_literal), begin(in)); + + static_assert(array_size(in_literal) == 5, ""); + static_assert(array_size(in) == 5, ""); + static_assert(array_size(exp) == 11, ""); + assert(char_traits::length(in_literal) == 4); + assert(char_traits::length(in) == 4); + assert(char_traits::length(exp) == 10); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}}; + for (auto t : offsets) { + char out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + auto state = mbstate_t{}; + auto in_next = (const CharT*)nullptr; + auto out_next = (char*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf32_to_utf8_out_partial(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char32_t in_literal[] = U"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size(in_literal)] = {}; + copy(begin(in_literal), end(in_literal), begin(in)); + + static_assert(array_size(in_literal) == 5, ""); + static_assert(array_size(in) == 5, ""); + static_assert(array_size(exp) == 11, ""); + assert(char_traits::length(in_literal) == 4); + assert(char_traits::length(in) == 4); + assert(char_traits::length(exp) == 10); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + + {4, 6, 3, 6}, // no space for fourth CP + {4, 7, 3, 6}, // no space for fourth CP + {4, 8, 3, 6}, // no space for fourth CP + {4, 9, 3, 6}, // no space for fourth CP + }; + for (auto t : offsets) { + char out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const CharT*)nullptr; + auto out_next = (char*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf32_to_utf8_out_error(const std::codecvt& cvt) { + using namespace std; + const char32_t valid_in[] = U"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + + static_assert(array_size(valid_in) == 5, ""); + static_assert(array_size(exp) == 11, ""); + assert(char_traits::length(valid_in) == 4); + assert(char_traits::length(exp) == 10); + + test_offsets_error offsets[] = { + {4, 10, 0, 0, 0x00110000, 0}, + {4, 10, 1, 1, 0x00110000, 1}, + {4, 10, 2, 3, 0x00110000, 2}, + {4, 10, 3, 6, 0x00110000, 3}}; + + for (auto t : offsets) { + CharT in[array_size(valid_in)] = {}; + char out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + copy(begin(valid_in), end(valid_in), begin(in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const CharT*)nullptr; + auto out_next = (char*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf32_to_utf8_out(const std::codecvt& cvt) { + utf32_to_utf8_out_ok(cvt); + utf32_to_utf8_out_partial(cvt); + utf32_to_utf8_out_error(cvt); +} + +template +void test_utf8_utf32_codecvts(const std::codecvt& cvt) { + utf8_to_utf32_in(cvt); + utf32_to_utf8_out(cvt); +} + +template +void utf8_to_utf16_in_ok(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size(exp_literal)] = {}; + copy(begin(exp_literal), end(exp_literal), begin(exp)); + + static_assert(array_size(in) == 11, ""); + static_assert(array_size(exp_literal) == 6, ""); + static_assert(array_size(exp) == 6, ""); + assert(char_traits::length(in) == 10); + assert(char_traits::length(exp_literal) == 5); + assert(char_traits::length(exp) == 5); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}}; + for (auto t : offsets) { + CharT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } + + for (auto t : offsets) { + CharT out[array_size(exp)] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf8_to_utf16_in_partial(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size(exp_literal)] = {}; + copy(begin(exp_literal), end(exp_literal), begin(exp)); + + static_assert(array_size(in) == 11, ""); + static_assert(array_size(exp_literal) == 6, ""); + static_assert(array_size(exp) == 6, ""); + assert(char_traits::length(in) == 10); + assert(char_traits::length(exp_literal) == 5); + assert(char_traits::length(exp) == 5); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + + {10, 3, 6, 3}, // no space for fourth CP + {10, 4, 6, 3}, // no space for fourth CP + {7, 5, 6, 3}, // incomplete fourth CP + {8, 5, 6, 3}, // incomplete fourth CP + {9, 5, 6, 3}, // incomplete fourth CP + {7, 3, 6, 3}, // incomplete fourth CP, and no space for it + {8, 3, 6, 3}, // incomplete fourth CP, and no space for it + {9, 3, 6, 3}, // incomplete fourth CP, and no space for it + {7, 4, 6, 3}, // incomplete fourth CP, and no space for it + {8, 4, 6, 3}, // incomplete fourth CP, and no space for it + {9, 4, 6, 3}, // incomplete fourth CP, and no space for it + + }; + + for (auto t : offsets) { + CharT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf8_to_utf16_in_error(const std::codecvt& cvt) { + using namespace std; + const char valid_in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size(exp_literal)] = {}; + copy(begin(exp_literal), end(exp_literal), begin(exp)); + + static_assert(array_size(valid_in) == 11, ""); + static_assert(array_size(exp_literal) == 6, ""); + static_assert(array_size(exp) == 6, ""); + assert(char_traits::length(valid_in) == 10); + assert(char_traits::length(exp_literal) == 5); + assert(char_traits::length(exp) == 5); + + test_offsets_error offsets[] = { + + // replace leading byte with invalid byte + {1, 5, 0, 0, '\xFF', 0}, + {3, 5, 1, 1, '\xFF', 1}, + {6, 5, 3, 2, '\xFF', 3}, + {10, 5, 6, 3, '\xFF', 6}, + + // replace first trailing byte with ASCII byte + {3, 5, 1, 1, 'z', 2}, + {6, 5, 3, 2, 'z', 4}, + {10, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 5, 1, 1, '\xFF', 2}, + {6, 5, 3, 2, '\xFF', 4}, + {10, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte + {6, 5, 3, 2, 'z', 5}, + {10, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 5, 3, 2, '\xFF', 5}, + {10, 5, 6, 3, '\xFF', 8}, + + // replace third trailing byte + {10, 5, 6, 3, 'z', 9}, + {10, 5, 6, 3, '\xFF', 9}, + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 5, 3, 2, 'z', 4}, + {8, 5, 6, 3, 'z', 7}, + {9, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 5, 3, 2, '\xFF', 4}, + {8, 5, 6, 3, '\xFF', 7}, + {9, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 5, 6, 3, '\xFF', 8}, + }; + for (auto t : offsets) { + char in[array_size(valid_in)] = {}; + CharT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + char_traits::copy(in, valid_in, array_size(valid_in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf8_to_utf16_in(const std::codecvt& cvt) { + utf8_to_utf16_in_ok(cvt); + utf8_to_utf16_in_partial(cvt); + utf8_to_utf16_in_error(cvt); +} + +template +void utf16_to_utf8_out_ok(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char16_t in_literal[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size(in_literal)]; + copy(begin(in_literal), end(in_literal), begin(in)); + + static_assert(array_size(in_literal) == 6, ""); + static_assert(array_size(exp) == 11, ""); + static_assert(array_size(in) == 6, ""); + assert(char_traits::length(in_literal) == 5); + assert(char_traits::length(exp) == 10); + assert(char_traits::length(in) == 5); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}}; + for (auto t : offsets) { + char out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + auto state = mbstate_t{}; + auto in_next = (const CharT*)nullptr; + auto out_next = (char*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf16_to_utf8_out_partial(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char16_t in_literal[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size(in_literal)]; + copy(begin(in_literal), end(in_literal), begin(in)); + + static_assert(array_size(in_literal) == 6, ""); + static_assert(array_size(exp) == 11, ""); + static_assert(array_size(in) == 6, ""); + assert(char_traits::length(in_literal) == 5); + assert(char_traits::length(exp) == 10); + assert(char_traits::length(in) == 5); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + + {5, 6, 3, 6}, // no space for fourth CP + {5, 7, 3, 6}, // no space for fourth CP + {5, 8, 3, 6}, // no space for fourth CP + {5, 9, 3, 6}, // no space for fourth CP + + {4, 10, 3, 6}, // incomplete fourth CP + + {4, 6, 3, 6}, // incomplete fourth CP, and no space for it + {4, 7, 3, 6}, // incomplete fourth CP, and no space for it + {4, 8, 3, 6}, // incomplete fourth CP, and no space for it + {4, 9, 3, 6}, // incomplete fourth CP, and no space for it + }; + for (auto t : offsets) { + char out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const CharT*)nullptr; + auto out_next = (char*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf16_to_utf8_out_error(const std::codecvt& cvt) { + using namespace std; + const char16_t valid_in[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + + static_assert(array_size(valid_in) == 6, ""); + static_assert(array_size(exp) == 11, ""); + assert(char_traits::length(valid_in) == 5); + assert(char_traits::length(exp) == 10); + + test_offsets_error offsets[] = { + {5, 10, 0, 0, 0xD800, 0}, + {5, 10, 0, 0, 0xDBFF, 0}, + {5, 10, 0, 0, 0xDC00, 0}, + {5, 10, 0, 0, 0xDFFF, 0}, + + {5, 10, 1, 1, 0xD800, 1}, + {5, 10, 1, 1, 0xDBFF, 1}, + {5, 10, 1, 1, 0xDC00, 1}, + {5, 10, 1, 1, 0xDFFF, 1}, + + {5, 10, 2, 3, 0xD800, 2}, + {5, 10, 2, 3, 0xDBFF, 2}, + {5, 10, 2, 3, 0xDC00, 2}, + {5, 10, 2, 3, 0xDFFF, 2}, + + // make the leading surrogate a trailing one + {5, 10, 3, 6, 0xDC00, 3}, + {5, 10, 3, 6, 0xDFFF, 3}, + + // make the trailing surrogate a leading one + {5, 10, 3, 6, 0xD800, 4}, + {5, 10, 3, 6, 0xDBFF, 4}, + + // make the trailing surrogate a BMP char + {5, 10, 3, 6, u'z', 4}, + }; + + for (auto t : offsets) { + CharT in[array_size(valid_in)] = {}; + char out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + copy(begin(valid_in), end(valid_in), begin(in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const CharT*)nullptr; + auto out_next = (char*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf16_to_utf8_out(const std::codecvt& cvt) { + utf16_to_utf8_out_ok(cvt); + utf16_to_utf8_out_partial(cvt); + utf16_to_utf8_out_error(cvt); +} + +template +void test_utf8_utf16_cvts(const std::codecvt& cvt) { + utf8_to_utf16_in(cvt); + utf16_to_utf8_out(cvt); +} + +template +void utf8_to_ucs2_in_ok(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char in[] = "bш\uAAAA"; + const char16_t exp_literal[] = u"bш\uAAAA"; + CharT exp[array_size(exp_literal)] = {}; + copy(begin(exp_literal), end(exp_literal), begin(exp)); + + static_assert(array_size(in) == 7, ""); + static_assert(array_size(exp_literal) == 4, ""); + static_assert(array_size(exp) == 4, ""); + assert(char_traits::length(in) == 6); + assert(char_traits::length(exp_literal) == 3); + assert(char_traits::length(exp) == 3); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}}; + for (auto t : offsets) { + CharT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } + + for (auto t : offsets) { + CharT out[array_size(exp)] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void utf8_to_ucs2_in_partial(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char in[] = "bш\uAAAA"; + const char16_t exp_literal[] = u"bш\uAAAA"; + CharT exp[array_size(exp_literal)] = {}; + copy(begin(exp_literal), end(exp_literal), begin(exp)); + + static_assert(array_size(in) == 7, ""); + static_assert(array_size(exp_literal) == 4, ""); + static_assert(array_size(exp) == 4, ""); + assert(char_traits::length(in) == 6); + assert(char_traits::length(exp_literal) == 3); + assert(char_traits::length(exp) == 3); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + }; + + for (auto t : offsets) { + CharT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf8_to_ucs2_in_error(const std::codecvt& cvt) { + using namespace std; + const char valid_in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size(exp_literal)] = {}; + copy(begin(exp_literal), end(exp_literal), begin(exp)); + + static_assert(array_size(valid_in) == 11, ""); + static_assert(array_size(exp_literal) == 6, ""); + static_assert(array_size(exp) == 6, ""); + assert(char_traits::length(valid_in) == 10); + assert(char_traits::length(exp_literal) == 5); + assert(char_traits::length(exp) == 5); + + test_offsets_error offsets[] = { + + // replace leading byte with invalid byte + {1, 5, 0, 0, '\xFF', 0}, + {3, 5, 1, 1, '\xFF', 1}, + {6, 5, 3, 2, '\xFF', 3}, + {10, 5, 6, 3, '\xFF', 6}, + + // replace first trailing byte with ASCII byte + {3, 5, 1, 1, 'z', 2}, + {6, 5, 3, 2, 'z', 4}, + {10, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 5, 1, 1, '\xFF', 2}, + {6, 5, 3, 2, '\xFF', 4}, + {10, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte + {6, 5, 3, 2, 'z', 5}, + {10, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 5, 3, 2, '\xFF', 5}, + {10, 5, 6, 3, '\xFF', 8}, + + // replace third trailing byte + {10, 5, 6, 3, 'z', 9}, + {10, 5, 6, 3, '\xFF', 9}, + + // When we see a leading byte of 4-byte CP, we should return error, no + // matter if it is incomplete at the end or has errors in the trailing + // bytes. + + // Don't replace anything, show full 4-byte CP + {10, 4, 6, 3, 'b', 0}, + {10, 5, 6, 3, 'b', 0}, + + // Don't replace anything, show incomplete 4-byte CP at the end + {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP + {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP + {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 5, 3, 2, 'z', 4}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 5, 3, 2, '\xFF', 4}, + + // replace first trailing byte with ASCII byte, also incomplete at end + {8, 5, 6, 3, 'z', 7}, + {9, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {8, 5, 6, 3, '\xFF', 7}, + {9, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 5, 6, 3, '\xFF', 8}, + }; + for (auto t : offsets) { + char in[array_size(valid_in)] = {}; + CharT out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + char_traits::copy(in, valid_in, array_size(valid_in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const char*)nullptr; + auto out_next = (CharT*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void utf8_to_ucs2_in(const std::codecvt& cvt) { + utf8_to_ucs2_in_ok(cvt); + utf8_to_ucs2_in_partial(cvt); + utf8_to_ucs2_in_error(cvt); +} + +template +void ucs2_to_utf8_out_ok(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char16_t in_literal[] = u"bш\uAAAA"; + const char exp[] = "bш\uAAAA"; + CharT in[array_size(in_literal)] = {}; + copy(begin(in_literal), end(in_literal), begin(in)); + + static_assert(array_size(in_literal) == 4, ""); + static_assert(array_size(exp) == 7, ""); + static_assert(array_size(in) == 4, ""); + assert(char_traits::length(in_literal) == 3); + assert(char_traits::length(exp) == 6); + assert(char_traits::length(in) == 3); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}}; + for (auto t : offsets) { + char out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + auto state = mbstate_t{}; + auto in_next = (const CharT*)nullptr; + auto out_next = (char*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.ok); + assert(in_next == in + t.in_size); + assert(out_next == out + t.out_size); + assert(char_traits::compare(out, exp, t.out_size) == 0); + if (t.out_size < array_size(out)) + assert(out[t.out_size] == 0); + } +} + +template +void ucs2_to_utf8_out_partial(const std::codecvt& cvt) { + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char16_t in_literal[] = u"bш\uAAAA"; + const char exp[] = "bш\uAAAA"; + CharT in[array_size(in_literal)] = {}; + copy(begin(in_literal), end(in_literal), begin(in)); + + static_assert(array_size(in_literal) == 4, ""); + static_assert(array_size(exp) == 7, ""); + static_assert(array_size(in) == 4, ""); + assert(char_traits::length(in_literal) == 3); + assert(char_traits::length(exp) == 6); + assert(char_traits::length(in) == 3); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + }; + for (auto t : offsets) { + char out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const CharT*)nullptr; + auto out_next = (char*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.partial); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void ucs2_to_utf8_out_error(const std::codecvt& cvt) { + using namespace std; + const char16_t valid_in[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + + static_assert(array_size(valid_in) == 6, ""); + static_assert(array_size(exp) == 11, ""); + assert(char_traits::length(valid_in) == 5); + assert(char_traits::length(exp) == 10); + + test_offsets_error offsets[] = { + {5, 10, 0, 0, 0xD800, 0}, + {5, 10, 0, 0, 0xDBFF, 0}, + {5, 10, 0, 0, 0xDC00, 0}, + {5, 10, 0, 0, 0xDFFF, 0}, + + {5, 10, 1, 1, 0xD800, 1}, + {5, 10, 1, 1, 0xDBFF, 1}, + {5, 10, 1, 1, 0xDC00, 1}, + {5, 10, 1, 1, 0xDFFF, 1}, + + {5, 10, 2, 3, 0xD800, 2}, + {5, 10, 2, 3, 0xDBFF, 2}, + {5, 10, 2, 3, 0xDC00, 2}, + {5, 10, 2, 3, 0xDFFF, 2}, + + // dont replace anything, just show the surrogate pair + {5, 10, 3, 6, u'b', 0}, + + // make the leading surrogate a trailing one + {5, 10, 3, 6, 0xDC00, 3}, + {5, 10, 3, 6, 0xDFFF, 3}, + + // make the trailing surrogate a leading one + {5, 10, 3, 6, 0xD800, 4}, + {5, 10, 3, 6, 0xDBFF, 4}, + + // make the trailing surrogate a BMP char + {5, 10, 3, 6, u'z', 4}, + + {5, 7, 3, 6, u'b', 0}, // no space for fourth CP + {5, 8, 3, 6, u'b', 0}, // no space for fourth CP + {5, 9, 3, 6, u'b', 0}, // no space for fourth CP + + {4, 10, 3, 6, u'b', 0}, // incomplete fourth CP + {4, 7, 3, 6, u'b', 0}, // incomplete fourth CP, and no space for it + {4, 8, 3, 6, u'b', 0}, // incomplete fourth CP, and no space for it + {4, 9, 3, 6, u'b', 0}, // incomplete fourth CP, and no space for it + + }; + + for (auto t : offsets) { + CharT in[array_size(valid_in)] = {}; + char out[array_size(exp) - 1] = {}; + assert(t.in_size <= array_size(in)); + assert(t.out_size <= array_size(out)); + assert(t.expected_in_next <= t.in_size); + assert(t.expected_out_next <= t.out_size); + copy(begin(valid_in), end(valid_in), begin(in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const CharT*)nullptr; + auto out_next = (char*)nullptr; + auto res = codecvt_base::result(); + + res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next); + assert(res == cvt.error); + assert(in_next == in + t.expected_in_next); + assert(out_next == out + t.expected_out_next); + assert(char_traits::compare(out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size(out)) + assert(out[t.expected_out_next] == 0); + } +} + +template +void ucs2_to_utf8_out(const std::codecvt& cvt) { + ucs2_to_utf8_out_ok(cvt); + ucs2_to_utf8_out_partial(cvt); + ucs2_to_utf8_out_error(cvt); +} + +template +void test_utf8_ucs2_cvts(const std::codecvt& cvt) { + utf8_to_ucs2_in(cvt); + ucs2_to_utf8_out(cvt); +} Index: libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_in.pass.cpp =================================================================== --- libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_in.pass.cpp +++ libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_in.pass.cpp @@ -27,6 +27,7 @@ #include #include +#include "../../../../codecvt_unicode.h" #include "test_macros.h" typedef std::codecvt F; @@ -47,5 +48,7 @@ for (unsigned i = 0; i < 9; ++i) assert(to[i] == from[i]); - return 0; + utf8_to_utf16_in(f); + + return 0; } Index: libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_out.pass.cpp =================================================================== --- libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_out.pass.cpp +++ libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_out.pass.cpp @@ -29,6 +29,7 @@ #include +#include "../../../../codecvt_unicode.h" #include "test_macros.h" typedef std::codecvt F; @@ -51,6 +52,7 @@ for (unsigned i = 0; i < 9; ++i) assert(to[i] == from[i]); } + utf16_to_utf8_out(f); - return 0; + return 0; } Index: libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_in.pass.cpp =================================================================== --- libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_in.pass.cpp +++ libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_in.pass.cpp @@ -27,6 +27,7 @@ #include #include +#include "../../../../codecvt_unicode.h" #include "test_macros.h" typedef std::codecvt F; @@ -47,5 +48,7 @@ for (unsigned i = 0; i < 9; ++i) assert(to[i] == static_cast(from[i])); - return 0; + utf8_to_utf32_in(f); + + return 0; } Index: libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_out.pass.cpp =================================================================== --- libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_out.pass.cpp +++ libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_out.pass.cpp @@ -29,6 +29,7 @@ #include +#include "../../../../codecvt_unicode.h" #include "test_macros.h" typedef std::codecvt F; @@ -51,6 +52,7 @@ for (unsigned i = 0; i < 9; ++i) assert(static_cast(to[i]) == from[i]); } + utf32_to_utf8_out(f); - return 0; + return 0; } Index: libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_in.pass.cpp =================================================================== --- libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_in.pass.cpp +++ libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_in.pass.cpp @@ -26,6 +26,7 @@ #include #include +#include "../codecvt_unicode.h" #include "test_macros.h" int main(int, char**) @@ -270,6 +271,11 @@ assert(np == n+1); assert(w == 0x56); } + { + typedef std::codecvt_utf8 C; + C c; + utf8_to_utf32_in(c); + } { typedef std::codecvt_utf8 C; C c; @@ -360,6 +366,22 @@ assert(np == n+1); assert(w == 0x56); } + { + typedef std::codecvt_utf8 C; + C c; + utf8_to_ucs2_in(c); + } +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + { + typedef std::codecvt_utf8 C; + C c; +# if __SIZEOF_WCHAR_T__ == 2 + utf8_to_ucs2_in(c); +# elif __SIZEOF_WCHAR_T__ == 4 + utf8_to_utf32_in(c); +# endif + } +#endif - return 0; + return 0; } Index: libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_out.pass.cpp =================================================================== --- libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_out.pass.cpp +++ libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_out.pass.cpp @@ -26,6 +26,7 @@ #include #include +#include "../codecvt_unicode.h" #include "test_macros.h" template @@ -163,6 +164,11 @@ assert(n[5] == char(0x85)); assert(n[6] == char(0)); } + { + typedef std::codecvt_utf8 C; + C c; + ucs2_to_utf8_out(c); + } } template @@ -320,6 +326,11 @@ assert(n[5] == char(0x85)); assert(n[6] == char(0x83)); } + { + typedef std::codecvt_utf8 C; + C c; + utf32_to_utf8_out(c); + } } int main(int, char**) { Index: libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_utf16_in.pass.cpp =================================================================== --- libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_utf16_in.pass.cpp +++ libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_utf16_in.pass.cpp @@ -26,6 +26,7 @@ #include #include +#include "../codecvt_unicode.h" #include "test_macros.h" template @@ -117,6 +118,11 @@ assert(np == n + 1); assert(w[0] == 0x0056); } + { + typedef std::codecvt_utf8_utf16 C; + C c; + utf8_to_utf16_in(c); + } } template @@ -236,6 +242,11 @@ assert(np == n + 1); assert(w[0] == 0x0056); } + { + typedef std::codecvt_utf8_utf16 C; + C c; + utf8_to_utf16_in(c); + } } int main(int, char**) { Index: libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_utf16_out.pass.cpp =================================================================== --- libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_utf16_out.pass.cpp +++ libcxx/test/std/localization/locale.stdcvt/codecvt_utf8_utf16_out.pass.cpp @@ -26,6 +26,7 @@ #include #include +#include "../codecvt_unicode.h" #include "test_macros.h" template @@ -169,6 +170,11 @@ assert(n[2] == char(0xBF)); assert(n[3] == char(0x56)); } + { + typedef std::codecvt_utf8_utf16 C; + C c; + utf16_to_utf8_out(c); + } } template @@ -301,6 +307,11 @@ assert(n[2] == char(0xBF)); assert(n[3] == char(0x56)); } + { + typedef std::codecvt_utf8_utf16 C; + C c; + utf16_to_utf8_out(c); + } } int main(int, char**) {