diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst --- a/libcxx/docs/UsingLibcxx.rst +++ b/libcxx/docs/UsingLibcxx.rst @@ -552,3 +552,26 @@ * You are using allocator, which does not call destructor during deallocation. * You are aware that memory allocated with an allocator may be accessed, even when unused by container. + +Platform specific behavior +========================== + +Windows +------- + +The ``stdout``, ``stderr``, and ``stdin`` file streams can be placed in +Unicode mode by a suitable call to ``_setmode()``. When in this mode, +the sequence of bytes read from, or written to, these streams is interpreted +as a sequence of little-endian ``wchar_t`` elements. Thus, +``std::cout``, ``std::cerr``, and ``std::cin`` used on streams in Unicode mode +will not behave as they usually do, since bytes read or written won't be +interpreted as individual ``char`` elements. However, ``std::wcout``, +``std::wcerr``, and ``std::wcin`` will behave as expected. + +Wide character streams such as ``std::wcin`` or ``std::wcout`` imbued with a +locale behave differently than they otherwise do. By default, wide character +streams don't convert wide characters but input/output them as is. If a +specific locale is imbued, the IO with the underlying stream happens with +regular ``char`` elements, which are converted to/from wide characters +according to the locale. Note that this doesn't behave as expected if the +stream has been set in Unicode mode. 
diff --git a/libcxx/src/std_stream.h b/libcxx/src/std_stream.h --- a/libcxx/src/std_stream.h +++ b/libcxx/src/std_stream.h @@ -60,6 +60,12 @@ bool __last_consumed_is_next_; bool __always_noconv_; +#if defined(_LIBCPP_WIN32API) + static constexpr bool __is_win32api_wide_char = !is_same_v<_CharT, char>; +#else + static constexpr bool __is_win32api_wide_char = false; +#endif + __stdinbuf(const __stdinbuf&); __stdinbuf& operator=(const __stdinbuf&); @@ -74,6 +80,12 @@ __last_consumed_is_next_(false) { imbue(this->getloc()); + // On Windows, in wchar_t mode, ignore the codecvt from the locale by + // default and assume noconv; this passes wchar_t through unmodified from + // getwc. If the user sets a custom locale with imbue(), that gets honored: + // the IO is then done with getc() and converted with the provided codecvt. + if constexpr (__is_win32api_wide_char) + __always_noconv_ = true; } template @@ -101,6 +113,36 @@ return __getchar(true); } +static bool __do_getc(FILE *__fp, char *__pbuf) { + int __c = getc(__fp); + if (__c == EOF) + return false; + *__pbuf = static_cast(__c); + return true; +} +#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +static bool __do_getc(FILE *__fp, wchar_t *__pbuf) { + wint_t __c = getwc(__fp); + if (__c == WEOF) + return false; + *__pbuf = static_cast(__c); + return true; +} +#endif + +static bool __do_ungetc(int __c, FILE *__fp, char __dummy) { + if (ungetc(__c, __fp) == EOF) + return false; + return true; +} +#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +static bool __do_ungetc(std::wint_t __c, FILE *__fp, wchar_t __dummy) { + if (ungetwc(__c, __fp) == WEOF) + return false; + return true; +} +#endif + template typename __stdinbuf<_CharT>::int_type __stdinbuf<_CharT>::__getchar(bool __consume) @@ -115,6 +157,20 @@ } return __result; } + if (__always_noconv_) { + char_type __1buf; + if (!__do_getc(__file_, &__1buf)) + return traits_type::eof(); + if (!__consume) + { + if (!__do_ungetc(traits_type::to_int_type(__1buf), __file_, __1buf)) + return 
+ traits_type::eof(); + } + else + __last_consumed_ = traits_type::to_int_type(__1buf); + return traits_type::to_int_type(__1buf); + } + char __extbuf[__limit]; int __nread = _VSTD::max(1, __encoding_); for (int __i = 0; __i < __nread; ++__i) @@ -125,42 +181,37 @@ __extbuf[__i] = static_cast(__c); } char_type __1buf; - if (__always_noconv_) - __1buf = static_cast(__extbuf[0]); - else + const char* __enxt; + char_type* __inxt; + codecvt_base::result __r; + do { - const char* __enxt; - char_type* __inxt; - codecvt_base::result __r; - do + state_type __sv_st = *__st_; + __r = __cv_->in(*__st_, __extbuf, __extbuf + __nread, __enxt, + &__1buf, &__1buf + 1, __inxt); + switch (__r) { - state_type __sv_st = *__st_; - __r = __cv_->in(*__st_, __extbuf, __extbuf + __nread, __enxt, - &__1buf, &__1buf + 1, __inxt); - switch (__r) + case _VSTD::codecvt_base::ok: + break; + case codecvt_base::partial: + *__st_ = __sv_st; + if (__nread == sizeof(__extbuf)) + return traits_type::eof(); { - case _VSTD::codecvt_base::ok: - break; - case codecvt_base::partial: - *__st_ = __sv_st; - if (__nread == sizeof(__extbuf)) + int __c = getc(__file_); + if (__c == EOF) return traits_type::eof(); - { - int __c = getc(__file_); - if (__c == EOF) - return traits_type::eof(); - __extbuf[__nread] = static_cast(__c); - } - ++__nread; - break; - case codecvt_base::error: - return traits_type::eof(); - case _VSTD::codecvt_base::noconv: - __1buf = static_cast(__extbuf[0]); - break; + __extbuf[__nread] = static_cast(__c); } - } while (__r == _VSTD::codecvt_base::partial); - } + ++__nread; + break; + case codecvt_base::error: + return traits_type::eof(); + case _VSTD::codecvt_base::noconv: + __1buf = static_cast(__extbuf[0]); + break; + } + } while (__r == _VSTD::codecvt_base::partial); if (!__consume) { for (int __i = __nread; __i > 0;) @@ -188,8 +239,11 @@ } return __c; } - if (__last_consumed_is_next_) - { + if (__always_noconv_ && __last_consumed_is_next_) { + if (!__do_ungetc(__last_consumed_, __file_, 
+ traits_type::to_char_type(__last_consumed_))) + return traits_type::eof(); + } else if (__last_consumed_is_next_) { char __extbuf[__limit]; char* __enxt; const char_type __ci = traits_type::to_char_type(__last_consumed_); @@ -244,6 +298,12 @@ state_type* __st_; bool __always_noconv_; +#if defined(_LIBCPP_WIN32API) + static constexpr bool __is_win32api_wide_char = !is_same_v<_CharT, char>; +#else + static constexpr bool __is_win32api_wide_char = false; +#endif + __stdoutbuf(const __stdoutbuf&); __stdoutbuf& operator=(const __stdoutbuf&); }; @@ -255,7 +315,30 @@ __st_(__st), __always_noconv_(__cv_->always_noconv()) { + // On Windows, in wchar_t mode, ignore the codecvt from the locale by + // default and assume noconv; this passes wchar_t through unmodified to + // fputwc, which handles it correctly depending on the actual mode of the + // output stream. If the user sets a custom locale with imbue(), that + // gets honored. + if constexpr (__is_win32api_wide_char) + __always_noconv_ = true; +} + +static bool __do_fputc(char __c, FILE* __fp) { + if (fwrite(&__c, sizeof(__c), 1, __fp) != 1) + return false; + return true; } +#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +static bool __do_fputc(wchar_t __c, FILE* __fp) { + // fputwc works regardless of wide/narrow mode of stdout, while + // fwrite of wchar_t only works if the stream actually has been set + // into wide mode. 
+ if (fputwc(__c, __fp) == WEOF) + return false; + return true; +} +#endif template typename __stdoutbuf<_CharT>::int_type @@ -268,7 +351,7 @@ __1buf = traits_type::to_char_type(__c); if (__always_noconv_) { - if (fwrite(&__1buf, sizeof(char_type), 1, __file_) != 1) + if (!__do_fputc(__1buf, __file_)) return traits_type::eof(); } else @@ -313,7 +396,10 @@ streamsize __stdoutbuf<_CharT>::xsputn(const char_type* __s, streamsize __n) { - if (__always_noconv_) + // For wchar_t on Windows, don't call fwrite(), but write characters one + // at a time with fputwc(); that works both when stdout is in the default + // mode and when it is set to Unicode mode. + if (__always_noconv_ && !__is_win32api_wide_char) return fwrite(__s, sizeof(char_type), __n, __file_); streamsize __i = 0; for (; __i < __n; ++__i, ++__s) diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stderr.sh b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stderr.sh new file mode 100644 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stderr.sh @@ -0,0 +1,5 @@ +# Check that the stderr of the executed program matches a reference file. +program=${1} +expected_file=${2} +${program} 2>stderr.log >stdout.log +cmp stderr.log "${expected_file}" diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stdout.sh b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stdout.sh new file mode 100644 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stdout.sh @@ -0,0 +1,5 @@ +# Check that the stdout of the executed program matches a reference file. 
+program=${1} +expected_file=${2} +${program} 2>stderr.log >stdout.log +cmp stdout.log "${expected_file}" diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/send-stdin.sh b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/send-stdin.sh new file mode 100644 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/send-stdin.sh @@ -0,0 +1,4 @@ +# Pass a reference file as stdin to a test executable. +program=${1} +input=${2} +cat ${input} | ${program} diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/test.dat b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/test.dat new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ + +// wostream wcerr; + +// UNSUPPORTED: no-wide-characters + +// UNSUPPORTED: executor-has-no-bash +// FILE_DEPENDENCIES: ../check-stderr.sh +// RUN: %{build} +// RUN: %{exec} bash check-stderr.sh "%t.exe" "zzzz" + +#include + +struct custom_codecvt : std::codecvt { + using base = std::codecvt; +protected: + result do_out(std::mbstate_t&, const wchar_t *from, const wchar_t *from_end, + const wchar_t *&from_next, char *to, char *to_end, char *&to_next) const { + while (from != from_end && to != to_end) { + ++from; + *to++ = 'z'; + } + from_next = from; + to_next = to; + return ok; + } +}; + +int main(int, char**) { + std::locale loc(std::locale::classic(), new custom_codecvt); + std::wcerr.imbue(loc); + std::wcerr << L"1234"; + return 0; +} diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-wide-mode.sh.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-wide-mode.sh.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-wide-mode.sh.cpp @@ -0,0 +1,32 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// <iostream> + +// wostream wcerr; + +// UNSUPPORTED: no-wide-characters +// REQUIRES: target={{.+}}-windows-{{.+}} + +// UNSUPPORTED: executor-has-no-bash +// FILE_DEPENDENCIES: check-stderr.sh, test.dat +// RUN: %{build} +// RUN: %{exec} bash check-stderr.sh "%t.exe" "test.dat" + +// Check that wcerr works, preserving the Unicode characters, after switching +// stderr to wide mode. + +#include +#include +#include + +int main(int, char**) { + _setmode(_fileno(stderr), _O_WTEXT); + std::wcerr << L"1234\u20ac\u00e5\u00e4\u00f6"; + return 0; +} diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-imbue.sh.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-imbue.sh.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-imbue.sh.cpp @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// <iostream> + +// wistream wcin; + +// UNSUPPORTED: no-wide-characters + +// UNSUPPORTED: executor-has-no-bash +// FILE_DEPENDENCIES: ../send-stdin.sh +// RUN: %{build} +// RUN: %{exec} bash send-stdin.sh "%t.exe" "1234" + +#include +#include + +struct custom_codecvt : std::codecvt { + using base = std::codecvt; +protected: + result do_in(std::mbstate_t&, const char *from, const char *from_end, + const char *&from_next, wchar_t *to, wchar_t *to_end, wchar_t *&to_next) const { + while (from != from_end && to != to_end) { + ++from; + *to++ = L'z'; + } + from_next = from; + to_next = to; + return ok; + } +}; + +int main(int, char**) { + std::locale loc(std::locale::classic(), new custom_codecvt); + std::wcin.imbue(loc); + std::wstring str; + std::wcin >> str; + assert(str == L"zzzz"); + return 0; +} diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-wide-mode.sh.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-wide-mode.sh.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-wide-mode.sh.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// <iostream> + +// wistream wcin; + +// UNSUPPORTED: no-wide-characters +// REQUIRES: target={{.+}}-windows-{{.+}} + +// UNSUPPORTED: executor-has-no-bash +// FILE_DEPENDENCIES: send-stdin.sh, test.dat +// RUN: %{build} +// RUN: %{exec} bash send-stdin.sh "%t.exe" "test.dat" + +// Check that wcin works, preserving the Unicode characters, after switching +// stdin to wide mode. + +#include +#include +#include +#include + +int main(int, char**) { + _setmode(_fileno(stdin), _O_WTEXT); + std::wstring str; + std::wcin >> str; + assert(str == L"1234\u20ac\u00e5\u00e4\u00f6"); + return 0; +} diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-imbue.sh.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-imbue.sh.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-imbue.sh.cpp @@ -0,0 +1,42 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// <iostream> + +// wostream wcout; + +// UNSUPPORTED: no-wide-characters + +// UNSUPPORTED: executor-has-no-bash +// FILE_DEPENDENCIES: ../check-stdout.sh +// RUN: %{build} +// RUN: %{exec} bash check-stdout.sh "%t.exe" "zzzz" + +#include + +struct custom_codecvt : std::codecvt { + using base = std::codecvt; +protected: + result do_out(std::mbstate_t&, const wchar_t *from, const wchar_t *from_end, + const wchar_t *&from_next, char *to, char *to_end, char *&to_next) const { + while (from != from_end && to != to_end) { + ++from; + *to++ = 'z'; + } + from_next = from; + to_next = to; + return ok; + } +}; + +int main(int, char**) { + std::locale loc(std::locale::classic(), new custom_codecvt); + std::wcout.imbue(loc); + std::wcout << L"1234"; + return 0; +} diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-wide-mode.sh.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-wide-mode.sh.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-wide-mode.sh.cpp @@ -0,0 +1,32 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// <iostream> + +// wostream wcout; + +// UNSUPPORTED: no-wide-characters +// REQUIRES: target={{.+}}-windows-{{.+}} + +// UNSUPPORTED: executor-has-no-bash +// FILE_DEPENDENCIES: check-stdout.sh, test.dat +// RUN: %{build} +// RUN: %{exec} bash check-stdout.sh "%t.exe" "test.dat" + +// Check that wcout works, preserving the Unicode characters, after switching +// stdout to wide mode. 
+ +#include +#include +#include + +int main(int, char**) { + _setmode(_fileno(stdout), _O_WTEXT); + std::wcout << L"1234\u20ac\u00e5\u00e4\u00f6"; + return 0; +}