Index: include/llvm/Support/FileSystem.h =================================================================== --- include/llvm/Support/FileSystem.h +++ include/llvm/Support/FileSystem.h @@ -602,6 +602,34 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD); +/// File encoding options when writing contents that a non-UTF8 tool will +/// read (on Windows systems). For UNIX, we always use UTF-8. +enum WindowsEncodingMethod : unsigned { + /// UTF-8 is the LLVM native encoding, being the same as "do not perform + /// encoding conversion". + WEM_UTF8 = 0, + WEM_CurrentCodePage = 1, + WEM_UTF16 = 2 +}; + +/// Saves the UTF8-encoded \p Contents string to the file descriptor \p FD +/// using a specific encoding. This is necessary when writing files for +/// some Windows tools that do not understand UTF-8, e.g., when generating +/// response files that link.exe or gcc.exe will read. +/// FIXME: We use WEM_CurrentCodePage to write response files for GNU tools in +/// a MinGW/MinGW-w64 environment, which has serious flaws but currently is +/// our best shot to make gcc/ld understand international characters. This +/// should be changed as soon as binutils fix this to support UTF16 on mingw. +/// \returns a non-zero error_code on failure. +std::error_code writeFileWithEncoding(int FD, StringRef Contents, + WindowsEncodingMethod Encoding, + bool BeginOfFile, bool UseAtomicWrites); + +/// Calls ::write() repeatedly until the entire buffer is written, retrying +/// on recoverable errors. +std::error_code writeBufferToFile(int FD, const char *Buf, ssize_t Size, + bool UseAtomicWrites); + /// @brief Identify the type of a binary file based on how magical it is.
file_magic identify_magic(StringRef magic); Index: include/llvm/Support/raw_ostream.h =================================================================== --- include/llvm/Support/raw_ostream.h +++ include/llvm/Support/raw_ostream.h @@ -27,6 +27,7 @@ namespace sys { namespace fs { enum OpenFlags : unsigned; + enum WindowsEncodingMethod : unsigned; } } @@ -327,6 +328,14 @@ uint64_t pos; + /// Controls the target encoding to convert this stream to, if necessary + sys::fs::WindowsEncodingMethod Encoding; + + /// A buffer to avoid flushing incomplete UTF chars when working with a stream + /// that converts encoding. + char UTFBuf[4]; + char *UTFBufEnd; + /// write_impl - See raw_ostream::write_impl. void write_impl(const char *Ptr, size_t Size) override; @@ -355,6 +364,23 @@ raw_fd_ostream(StringRef Filename, std::error_code &EC, sys::fs::OpenFlags Flags); + /// This constructor variant adds the possibility to choose which encoding + /// to use when writing a text file. On Windows, this is important when + /// writing files with internationalization support with an encoding that is + /// different from the one used in LLVM (UTF-8). We use this when writing + /// response files, since GCC tools on MinGW only understand legacy code + /// pages, and VisualStudio tools only understand UTF-16. + /// For UNIX, using different encodings is silently ignored, since all tools + /// work well with UTF-8. + /// This mode assumes that you only use UTF-8 *text* data and will convert + /// it to your desired encoding before writing to the file. + /// + /// This variant does not accept the "-" special case for Filename. + raw_fd_ostream(StringRef Filename, std::error_code &EC, + sys::fs::OpenFlags Flags, + sys::fs::WindowsEncodingMethod Encoding); + + /// raw_fd_ostream ctor - FD is the file descriptor that this writes to. If /// ShouldClose is true, this closes the file when the stream is destroyed. 
raw_fd_ostream(int fd, bool shouldClose, bool unbuffered=false); Index: lib/Support/Path.cpp =================================================================== --- lib/Support/Path.cpp +++ lib/Support/Path.cpp @@ -1045,6 +1045,55 @@ return fs::status(Path, result); } +std::error_code writeBufferToFile(int FD, const char *Buf, ssize_t Size, + bool UseAtomicWrites) { + do { + ssize_t Ret; + + // Check whether we should attempt to use atomic writes. + if (LLVM_LIKELY(!UseAtomicWrites)) { + Ret = ::write(FD, Buf, Size); + } else { + // Use ::writev() where available. +#if defined(HAVE_WRITEV) + const void *Addr = static_cast(Buf); + struct iovec IOV = {const_cast(Addr), Size }; + Ret = ::writev(FD, &IOV, 1); +#else + Ret = ::write(FD, Buf, Size); +#endif + } + + if (Ret < 0) { + // If it's a recoverable error, swallow it and retry the write. + // + // Ideally we wouldn't ever see EAGAIN or EWOULDBLOCK here, since + // raw_ostream isn't designed to do non-blocking I/O. However, some + // programs, such as old versions of bjam, have mistakenly used + // O_NONBLOCK. For compatibility, emulate blocking semantics by + // spinning until the write succeeds. If you don't want spinning, + // don't use O_NONBLOCK file descriptors with raw_ostream. + if (errno == EINTR || errno == EAGAIN +#ifdef EWOULDBLOCK + || errno == EWOULDBLOCK +#endif + ) + continue; + + // Otherwise it's a non-recoverable error. + return std::make_error_code(std::errc::io_error); + } + + // The write may have written some or all of the data. Update the + // size and buffer pointer to reflect the remainder that needs + // to be written. If there are no bytes left, we're done. 
+ Buf += Ret; + Size -= Ret; + } while (Size > 0); + + return std::error_code(); +} + } // end namespace fs } // end namespace sys } // end namespace llvm Index: lib/Support/Unix/Path.inc =================================================================== --- lib/Support/Unix/Path.inc +++ lib/Support/Unix/Path.inc @@ -95,7 +95,7 @@ defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__) static int test_dir(char ret[PATH_MAX], const char *dir, const char *bin) -{ +{ struct stat sb; char fullpath[PATH_MAX]; @@ -635,6 +635,13 @@ return std::error_code(); } +std::error_code writeFileWithEncoding( + int FD, StringRef Contents, WindowsEncodingMethod Encoding /*ignored*/, + bool BeginOfFile /*ignored*/, bool UseAtomicWrites) { + return writeBufferToFile(FD, Contents.data(), Contents.size(), + UseAtomicWrites); +} + } // end namespace fs namespace path { Index: lib/Support/Windows/Path.inc =================================================================== --- lib/Support/Windows/Path.inc +++ lib/Support/Windows/Path.inc @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/WindowsError.h" #include #include @@ -829,6 +830,48 @@ ResultFD = FD; return std::error_code(); } + +std::error_code writeFileWithEncoding(int FD, StringRef Contents, + WindowsEncodingMethod Encoding, + bool BeginOfFile, bool UseAtomicWrites) { + if (Encoding == WEM_UTF8) { + return writeBufferToFile(FD, Contents.data(), Contents.size(), + UseAtomicWrites); + } else if (Encoding == WEM_CurrentCodePage) { + SmallVector ArgsUTF16; + SmallVector ArgsCurCP; + + if (std::error_code EC = windows::UTF8ToUTF16(Contents, ArgsUTF16)) + return EC; + + if (std::error_code EC = + windows::UTF16ToCurCP(ArgsUTF16.data(), ArgsUTF16.size(), ArgsCurCP)) + return EC; + + return writeBufferToFile(FD, ArgsCurCP.data(), ArgsCurCP.size(), + UseAtomicWrites); + } else if 
(Encoding == WEM_UTF16) { + SmallVector ArgsUTF16; + + if (std::error_code EC = windows::UTF8ToUTF16(Contents, ArgsUTF16)) + return EC; + + if (BeginOfFile) { + // Endianness guessing - Write BOM in the first write to this file. + char BOM[2]; + uint16_t src = UNI_UTF16_BYTE_ORDER_MARK_NATIVE; + memcpy(BOM, &src, 2); + if (std::error_code EC = writeBufferToFile(FD, BOM, 2, UseAtomicWrites)) + return EC; + } + + return writeBufferToFile(FD, (char *)ArgsUTF16.data(), ArgsUTF16.size() << 1, + UseAtomicWrites); + } + + llvm_unreachable("Unknown encoding"); + return std::make_error_code(std::errc::io_error); +} } // end namespace fs namespace path { @@ -919,11 +962,13 @@ return std::error_code(); } -std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, - llvm::SmallVectorImpl &utf8) { +static +std::error_code UTF16ToCodePage(unsigned codepage, const wchar_t *utf16, + size_t utf16_len, + llvm::SmallVectorImpl &utf8) { if (utf16_len) { // Get length. - int len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.begin(), + int len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.begin(), 0, NULL, NULL); if (len == 0) @@ -933,7 +978,7 @@ utf8.set_size(len); // Now do the actual conversion. 
- len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.data(), + len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.data(), utf8.size(), NULL, NULL); if (len == 0) @@ -946,6 +991,16 @@ return std::error_code(); } + +std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, + llvm::SmallVectorImpl &utf8) { + return UTF16ToCodePage(CP_UTF8, utf16, utf16_len, utf8); +} + +std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len, + llvm::SmallVectorImpl &utf8) { + return UTF16ToCodePage(CP_ACP, utf16, utf16_len, utf8); +} } // end namespace windows } // end namespace sys } // end namespace llvm Index: lib/Support/Windows/WindowsSupport.h =================================================================== --- lib/Support/Windows/WindowsSupport.h +++ lib/Support/Windows/WindowsSupport.h @@ -166,6 +166,9 @@ std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl &utf16); std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, SmallVectorImpl &utf8); +/// Convert from UTF16 to the current code page used in the system +std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len, + SmallVectorImpl &utf8); } // end namespace windows } // end namespace sys } // end namespace llvm. 
Index: lib/Support/raw_ostream.cpp =================================================================== --- lib/Support/raw_ostream.cpp +++ lib/Support/raw_ostream.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Config/config.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" @@ -428,7 +429,8 @@ raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC, sys::fs::OpenFlags Flags) - : Error(false), UseAtomicWrites(false), pos(0) { + : Error(false), UseAtomicWrites(false), pos(0), Encoding(sys::fs::WEM_UTF8), + UTFBufEnd(UTFBuf) { EC = std::error_code(); // Handle "-" as stdout. Note that when we do this, we consider ourself // the owner of stdout. This means that we can do things like close the @@ -455,11 +457,28 @@ ShouldClose = true; } +raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC, + sys::fs::OpenFlags Flags, + sys::fs::WindowsEncodingMethod _Encoding) + : Error(false), UseAtomicWrites(false), pos(0), Encoding(_Encoding), + UTFBufEnd(UTFBuf) { + EC = sys::fs::openFileForWrite(Filename, FD, Flags); + + if (EC) { + ShouldClose = false; + return; + } + + // Ok, we successfully opened the file, so it'll need to be closed. + ShouldClose = true; +} + /// raw_fd_ostream ctor - FD is the file descriptor that this writes to. If /// ShouldClose is true, this closes the file when the stream is destroyed. raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered) : raw_ostream(unbuffered), FD(fd), - ShouldClose(shouldClose), Error(false), UseAtomicWrites(false) { + ShouldClose(shouldClose), Error(false), UseAtomicWrites(false), + Encoding(sys::fs::WEM_UTF8), UTFBufEnd(UTFBuf) { #ifdef O_BINARY // Setting STDOUT to binary mode is necessary in Win32 // to avoid undesirable linefeed conversion. 
@@ -477,6 +496,7 @@ } raw_fd_ostream::~raw_fd_ostream() { + assert(UTFBuf == UTFBufEnd && "Should not hold incomplete UTF chars"); if (FD >= 0) { flush(); if (ShouldClose) @@ -503,55 +523,80 @@ report_fatal_error("IO failure on output stream.", /*GenCrashDiag=*/false); } +/// Analyzes the last 3 bytes of the buffer . +/// If it detects an incomplete UTF char at the end of the buffer, it copies +/// them to buffer Out and updates OutEnd (which points to the end of the out +/// buffer). +/// Returns the number of bytes copied. +static unsigned getIncompleteUTFBytes(const char *Ptr, size_t Size, + char *Out, char *&OutEnd) { + char Last3[3] = {'\0', '\0', '\0'}; + + // Copy last 3 bytes of the buffer, the largest possible incomplete UTF char + int Idx = 2; + const char *Cur = Ptr + Size - 1; + while ((Idx >= 0) && (Cur >= Ptr)) + Last3[Idx--] = *Cur--; + + // Are the last bytes of the buffer the begin of a UTF multibyte char? + if (getNumBytesForUTF8(Last3[2]) > 1) { + Out[0] = Last3[2]; + OutEnd = &Out[1]; + return 1; + } + if (getNumBytesForUTF8(Last3[1]) > 2) { + Out[0] = Last3[1]; + Out[1] = Last3[2]; + OutEnd = &Out[2]; + return 2; + } + if (getNumBytesForUTF8(Last3[0]) > 3) { + Out[0] = Last3[0]; + Out[1] = Last3[1]; + Out[2] = Last3[2]; + OutEnd = &Out[3]; + return 3; + } + OutEnd = Out; + return 0; +} void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) { assert(FD >= 0 && "File already closed."); - pos += Size; - do { - ssize_t ret; - - // Check whether we should attempt to use atomic writes. - if (LLVM_LIKELY(!UseAtomicWrites)) { - ret = ::write(FD, Ptr, Size); - } else { - // Use ::writev() where available. -#if defined(HAVE_WRITEV) - const void *Addr = static_cast(Ptr); - struct iovec IOV = {const_cast(Addr), Size }; - ret = ::writev(FD, &IOV, 1); -#else - ret = ::write(FD, Ptr, Size); -#endif + // If user requested a different encoding, we must manage incomplete UTF8 + // chars at the end of the buffer. 
+ if (Encoding != sys::fs::WEM_UTF8) { + std::vector<char> NewBuf; + const size_t SizeIncomplete = (size_t) (UTFBufEnd - UTFBuf); + size_t NewSize = Size + SizeIncomplete; + + // Prepend the incomplete UTF char deferred by the previous flush + if (SizeIncomplete > 0) { + NewBuf.reserve(NewSize); + NewBuf.insert(NewBuf.begin(), UTFBuf, UTFBufEnd); + NewBuf.insert(NewBuf.begin() + SizeIncomplete, Ptr, Ptr + Size); + Ptr = NewBuf.data(); } - if (ret < 0) { - // If it's a recoverable error, swallow it and retry the write. - // - // Ideally we wouldn't ever see EAGAIN or EWOULDBLOCK here, since - // raw_ostream isn't designed to do non-blocking I/O. However, some - // programs, such as old versions of bjam, have mistakenly used - // O_NONBLOCK. For compatibility, emulate blocking semantics by - // spinning until the write succeeds. If you don't want spinning, - // don't use O_NONBLOCK file descriptors with raw_ostream. - if (errno == EINTR || errno == EAGAIN -#ifdef EWOULDBLOCK - || errno == EWOULDBLOCK -#endif - ) - continue; + // Detect incomplete UTF char at the end of the buffer and defer it to + // the next flush + if (unsigned Num = getIncompleteUTFBytes(Ptr, NewSize, UTFBuf, UTFBufEnd)) + NewSize -= Num; - // Otherwise it's a non-recoverable error. Note it and quit. + if ((sys::fs::writeFileWithEncoding( + FD, StringRef(Ptr, NewSize), Encoding, /*BeginOfFile=*/pos == 0, + UseAtomicWrites))) error_detected(); - break; - } - // The write may have written some or all of the data. Update the - // size and buffer pointer to reflect the remainder that needs - // to be written. If there are no bytes left, we're done.
- Ptr += ret; - Size -= ret; - } while (Size > 0); + pos += Size; + return; + } + + pos += Size; + + if ((sys::fs::writeBufferToFile(FD, Ptr, Size, UseAtomicWrites))) + error_detected(); } void raw_fd_ostream::close() { Index: unittests/Support/Path.cpp =================================================================== --- unittests/Support/Path.cpp +++ unittests/Support/Path.cpp @@ -624,6 +624,57 @@ EXPECT_EQ(mfrrv.const_data(), Data); } +#ifdef LLVM_ON_WIN32 +const char UTF16LE_Text[] = + "\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61\x00"; +const char UTF16BE_Text[] = + "\x00\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61"; +#endif +const char UTF8Text[] = "\x6c\x69\x6e\x67\xc3\xbc\x69\xc3\xa7\x61"; + +TEST_F(FileSystemTest, TestWriteFileWithEncoding) { + // Create a temp file for writing + int FileDescriptor1 = 0; + SmallString<128> FilePathname(TestDirectory); + + path::append(FilePathname, "international-file.txt"); + ASSERT_NO_ERROR(fs::openFileForWrite(FilePathname.c_str(), FileDescriptor1, + fs::OpenFlags::F_Text)); + + // Only on Windows we should encode in UTF16. 
For other systems, use UTF8 + ASSERT_NO_ERROR(sys::fs::writeFileWithEncoding(FileDescriptor1, UTF8Text, + sys::fs::WEM_UTF16, true, + false)); + + ::close(FileDescriptor1); + + // Now open the file for reading and confirm the encoding + int FileDescriptor2 = 0; + ASSERT_NO_ERROR(fs::openFileForRead(FilePathname.c_str(), FileDescriptor2)); + + // On Windows, test for UTF16 variants (memcmp: the text has embedded NULs) +#if defined(LLVM_ON_WIN32) + char Buf[18]; + ASSERT_EQ(::read(FileDescriptor2, Buf, 18), 18); + if (memcmp(Buf, "\xfe\xff", 2) == 0) { // UTF16-BE + ASSERT_EQ(memcmp(&Buf[2], UTF16BE_Text, 16), 0); + } else if (memcmp(Buf, "\xff\xfe", 2) == 0) { // UTF16-LE + ASSERT_EQ(memcmp(&Buf[2], UTF16LE_Text, 16), 0); + } else { + FAIL() << "Invalid BOM in UTF-16 file"; + } +#else + // On UNIX, test for UTF8 + char Buf[10]; + ASSERT_EQ(::read(FileDescriptor2, Buf, 10), 10); + ASSERT_EQ(memcmp(Buf, UTF8Text, 10), 0); +#endif + + // Now close the file and delete it + ::close(FileDescriptor2); + ASSERT_NO_ERROR(fs::remove(FilePathname.str())); +} + TEST(Support, NormalizePath) { #if defined(LLVM_ON_WIN32) #define EXPECT_PATH_IS(path__, windows__, not_windows__) \ Index: unittests/Support/raw_ostream_test.cpp =================================================================== --- unittests/Support/raw_ostream_test.cpp +++ unittests/Support/raw_ostream_test.cpp @@ -9,11 +9,25 @@ #include "gtest/gtest.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define ASSERT_NO_ERROR(x) \ + if (std::error_code ASSERT_NO_ERROR_ec = x) { \ + SmallString<128> MessageStorage; \ + raw_svector_ostream Message(MessageStorage); \ + Message << #x ": did not return errc::success.\n" \ + << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n" \ + << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n"; \ + GTEST_FATAL_FAILURE_(MessageStorage.c_str()); \ + } else {
\ + } + + namespace { template std::string printToString(const T &Value) { @@ -117,6 +131,56 @@ EXPECT_EQ("1.20", printToString(format("%.2f", 1.2), 10)); } +#ifdef LLVM_ON_WIN32 +const char UTF16LE_Text[] = + "\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61\x00"; +const char UTF16BE_Text[] = + "\x00\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61"; +#endif +const char UTF8Text[] = "\x6c\x69\x6e\x67\xc3\xbc\x69\xc3\xa7\x61"; + +TEST(raw_ostreamTest, Encodedraw_fd_ostream) { + SmallString<128> TestDirectory; + ASSERT_NO_ERROR(llvm::sys::fs::createUniqueDirectory("raw_fd_ostream-test", + TestDirectory)); + errs() << "Test Directory: " << TestDirectory << '\n'; + errs().flush(); + SmallString<128> FilePathname(TestDirectory); + llvm::sys::path::append(FilePathname, "international-file.txt"); + // Only on Windows this should encode in UTF16. For other systems, it should + // ignore our request and encode in UTF8 + std::error_code EC; + raw_fd_ostream OS(FilePathname, EC, sys::fs::OpenFlags::F_Text, + sys::fs::WEM_UTF16); + ASSERT_FALSE(EC); + OS << "\x6c\x69\x6e\x67\xc3"; // interrupt the UTF8 char + OS.flush(); + OS << "\xbc\x69\xc3\xa7\x61"; // resume + OS.close(); + ASSERT_FALSE(OS.has_error()); + int FileDescriptor = 0; + ASSERT_NO_ERROR(llvm::sys::fs::openFileForRead(FilePathname.c_str(), + FileDescriptor)); +#if defined(LLVM_ON_WIN32) + char Buf[18]; + ASSERT_EQ(::read(FileDescriptor, Buf, 18), 18); + if (memcmp(Buf, "\xfe\xff", 2) == 0) { // UTF16-BE + ASSERT_EQ(memcmp(&Buf[2], UTF16BE_Text, 16), 0); // memcmp: embedded NULs + } else if (memcmp(Buf, "\xff\xfe", 2) == 0) { // UTF16-LE + ASSERT_EQ(memcmp(&Buf[2], UTF16LE_Text, 16), 0); + } else { + FAIL() << "Invalid BOM in UTF-16 file"; + } +#else + char Buf[10]; + ASSERT_EQ(::read(FileDescriptor, Buf, 10), 10); + ASSERT_EQ(memcmp(Buf, UTF8Text, 10), 0); +#endif + ::close(FileDescriptor); + ASSERT_NO_ERROR(llvm::sys::fs::remove(FilePathname.str())); +
ASSERT_NO_ERROR(llvm::sys::fs::remove(TestDirectory.str())); +} + TEST(raw_ostreamTest, TinyBuffer) { std::string Str; raw_string_ostream OS(Str);