Index: include/llvm/Support/Program.h =================================================================== --- include/llvm/Support/Program.h +++ include/llvm/Support/Program.h @@ -126,6 +126,40 @@ /// argument length limits. bool argumentsFitWithinSystemLimits(ArrayRef Args); + /// File encoding options when writing contents that a non-UTF8 tool will + /// read (on Windows systems). For UNIX, we always use UTF-8. + enum WindowsEncodingMethod { + /// UTF-8 is the LLVM native encoding, being the same as "do not perform + /// encoding conversion". + WEM_UTF8, + WEM_CurrentCodePage, + WEM_UTF16 + }; + + /// Saves the UTF8-encoded \p contents string into the file \p FileName + /// using a specific encoding. + /// + /// This write file function adds the possibility to choose which encoding + /// to use when writing a text file. On Windows, this is important when + /// writing files with internationalization support with an encoding that is + /// different from the one used in LLVM (UTF-8). We use this when writing + /// response files, since GCC tools on MinGW only understand legacy code + /// pages, and VisualStudio tools only understand UTF-16. + /// For UNIX, using different encodings is silently ignored, since all tools + /// work well with UTF-8. + /// This function assumes that you only use UTF-8 *text* data and will convert + /// it to your desired encoding before writing to the file. + /// + /// FIXME: We use EM_CurrentCodePage to write response files for GNU tools in + /// a MinGW/MinGW-w64 environment, which has serious flaws but currently is + /// our best shot to make gcc/ld understand international characters. This + /// should be changed as soon as binutils fix this to support UTF16 on mingw. + /// + /// \returns non-zero error_code if failed + std::error_code + writeFileWithEncoding(StringRef FileName, StringRef Contents, + WindowsEncodingMethod Encoding = WEM_UTF8); + /// This function waits for the process specified by \p PI to finish. /// \returns A \see ProcessInfo struct with Pid set to: /// \li The process id of the child process if the child process has changed Index: lib/Support/Unix/Program.inc =================================================================== --- lib/Support/Unix/Program.inc +++ lib/Support/Unix/Program.inc @@ -19,6 +19,7 @@ #include "Unix.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" #include #if HAVE_SYS_STAT_H #include @@ -440,6 +441,23 @@ return std::error_code(); } +std::error_code +llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents, + WindowsEncodingMethod Encoding /*unused*/) { + std::error_code EC; + llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text); + + if (EC) + return EC; + + OS << Contents; + + if (OS.has_error()) + return std::make_error_code(std::errc::io_error); + + return EC; +} + bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef Args) { static long ArgMax = sysconf(_SC_ARG_MAX); Index: lib/Support/Windows/Path.inc =================================================================== --- lib/Support/Windows/Path.inc +++ lib/Support/Windows/Path.inc @@ -919,11 +919,13 @@ return std::error_code(); } -std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, - llvm::SmallVectorImpl &utf8) { +static +std::error_code UTF16ToCodePage(unsigned codepage, const wchar_t *utf16, + size_t utf16_len, + llvm::SmallVectorImpl &utf8) { if (utf16_len) { // Get length. - int len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.begin(), + int len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.begin(), 0, NULL, NULL); if (len == 0) @@ -933,7 +935,7 @@ utf8.set_size(len); // Now do the actual conversion. - len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.data(), + len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.data(), utf8.size(), NULL, NULL); if (len == 0) @@ -946,6 +948,16 @@ return std::error_code(); } + +std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, + llvm::SmallVectorImpl &utf8) { + return UTF16ToCodePage(CP_UTF8, utf16, utf16_len, utf8); +} + +std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len, + llvm::SmallVectorImpl &utf8) { + return UTF16ToCodePage(CP_ACP, utf16, utf16_len, utf8); +} } // end namespace windows } // end namespace sys } // end namespace llvm Index: lib/Support/Windows/Program.inc =================================================================== --- lib/Support/Windows/Program.inc +++ lib/Support/Windows/Program.inc @@ -12,7 +12,9 @@ //===----------------------------------------------------------------------===// #include "WindowsSupport.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -440,6 +442,50 @@ return std::error_code(); } +std::error_code +llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents, + WindowsEncodingMethod Encoding) { + std::error_code EC; + llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text); + if (EC) + return EC; + + if (Encoding == WEM_UTF8) { + OS << Contents; + } else if (Encoding == WEM_CurrentCodePage) { + SmallVector ArgsUTF16; + SmallVector ArgsCurCP; + + if ((EC = windows::UTF8ToUTF16(Contents, ArgsUTF16))) + return EC; + + if ((EC = windows::UTF16ToCurCP( + ArgsUTF16.data(), ArgsUTF16.size(), ArgsCurCP))) + return EC; + + OS.write(ArgsCurCP.data(), ArgsCurCP.size()); + } else if (Encoding == WEM_UTF16) { + SmallVector ArgsUTF16; + + if ((EC = windows::UTF8ToUTF16(Contents, ArgsUTF16))) + return EC; + + // Endianness guessing + char BOM[2]; + uint16_t src = UNI_UTF16_BYTE_ORDER_MARK_NATIVE; + memcpy(BOM, &src, 2); + OS.write(BOM, 2); + OS.write((char *)ArgsUTF16.data(), ArgsUTF16.size() << 1); + } else { + llvm_unreachable("Unknown encoding"); + } + + if (OS.has_error()) + return std::make_error_code(std::errc::io_error); + + return EC; +} + bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef Args) { // The documented max length of the command line passed to CreateProcess. static const size_t MaxCommandStringLength = 32768; Index: lib/Support/Windows/WindowsSupport.h =================================================================== --- lib/Support/Windows/WindowsSupport.h +++ lib/Support/Windows/WindowsSupport.h @@ -166,6 +166,9 @@ std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl &utf16); std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, SmallVectorImpl &utf8); +/// Convert from UTF16 to the current code page used in the system +std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len, + SmallVectorImpl &utf8); } // end namespace windows } // end namespace sys } // end namespace llvm. Index: unittests/Support/ProgramTest.cpp =================================================================== --- unittests/Support/ProgramTest.cpp +++ unittests/Support/ProgramTest.cpp @@ -34,6 +34,16 @@ #error sleep_for is not implemented on your platform. #endif +#define ASSERT_NO_ERROR(x) \ + if (std::error_code ASSERT_NO_ERROR_ec = x) { \ + SmallString<128> MessageStorage; \ + raw_svector_ostream Message(MessageStorage); \ + Message << #x ": did not return errc::success.\n" \ + << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n" \ + << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n"; \ + GTEST_FATAL_FAILURE_(MessageStorage.c_str()); \ + } else { \ + } // From TestMain.cpp. extern const char *TestMainArgv0; @@ -220,4 +230,44 @@ } +#ifdef LLVM_ON_WIN32 +const char utf16le_text[] = + "\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61\x00"; +const char utf16be_text[] = + "\x00\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61"; +#endif +const char utf8_text[] = "\x6c\x69\x6e\x67\xc3\xbc\x69\xc3\xa7\x61"; + +TEST(ProgramTest, TestWriteWithSystemEncoding) { + SmallString<128> TestDirectory; + ASSERT_NO_ERROR(fs::createUniqueDirectory("program-test", TestDirectory)); + errs() << "Test Directory: " << TestDirectory << '\n'; + errs().flush(); + SmallString<128> file_pathname(TestDirectory); + path::append(file_pathname, "international-file.txt"); + // Only on Windows we should encode in UTF16. For other systems, use UTF8 + ASSERT_NO_ERROR(sys::writeFileWithEncoding(file_pathname.c_str(), utf8_text, + sys::WEM_UTF16)); + int fd = 0; + ASSERT_NO_ERROR(fs::openFileForRead(file_pathname.c_str(), fd)); +#if defined(LLVM_ON_WIN32) + char buf[18]; + ASSERT_EQ(::read(fd, buf, 18), 18); + if (strncmp(buf, "\xfe\xff", 2) == 0) { // UTF16-BE + ASSERT_EQ(strncmp(&buf[2], utf16be_text, 16), 0); + } else if (strncmp(buf, "\xff\xfe", 2) == 0) { // UTF16-LE + ASSERT_EQ(strncmp(&buf[2], utf16le_text, 16), 0); + } else { + FAIL() << "Invalid BOM in UTF-16 file"; + } +#else + char buf[10]; + ASSERT_EQ(::read(fd, buf, 10), 10); + ASSERT_EQ(strncmp(buf, utf8_text, 10), 0); +#endif + ::close(fd); + ASSERT_NO_ERROR(fs::remove(file_pathname.str())); + ASSERT_NO_ERROR(fs::remove(TestDirectory.str())); +} + } // end anonymous namespace