Index: llvm/include/llvm/Support/ConvertUTF.h =================================================================== --- llvm/include/llvm/Support/ConvertUTF.h +++ llvm/include/llvm/Support/ConvertUTF.h @@ -243,6 +243,25 @@ bool hasUTF16ByteOrderMark(ArrayRef SrcBytes); /** + * Returns true if a blob of text starts with a UTF-8 byte order mark. + * The UTF-8 BOM is the fixed byte sequence 0xEF 0xBB 0xBF and is not affected + * by the host system's endianness. + */ +bool hasUTF8ByteOrderMark(ArrayRef SrcBytes); + +#ifdef LLVM_ON_WIN32 +/** + * Converts a stream of raw bytes assumed to be encoded in the ANSI code page + * (i.e. the Windows system default code page) into a UTF-8 std::string. + * + * \param [in] SrcBytes A buffer of what is assumed to be ANSI-encoded text. + * \param [out] Out Converted UTF-8 is stored here on success. + * \returns true on success, false if the conversion fails. + */ +bool convertANSIToUTF8String(ArrayRef SrcBytes, std::string &Out); +#endif + +/** * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string. * * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text. Index: llvm/lib/Support/CommandLine.cpp =================================================================== --- llvm/lib/Support/CommandLine.cpp +++ llvm/lib/Support/CommandLine.cpp @@ -674,6 +674,20 @@ return false; Str = StringRef(UTF8Buf); } + // If we see a UTF-8 BOM sequence at the beginning of the file, skip those + // three bytes before parsing. + // Reference: http://en.wikipedia.org/wiki/UTF-8#Byte_order_mark + else if (hasUTF8ByteOrderMark(BufRef)) + Str = StringRef(BufRef.data() + 3, BufRef.size() - 3); +#ifdef LLVM_ON_WIN32 + // Otherwise, assume this is a hand-written text file encoded in the system's + // default code page and convert it to UTF-8. + else { + if (!convertANSIToUTF8String(BufRef, UTF8Buf)) + return false; + Str = StringRef(UTF8Buf); + } +#endif // Tokenize the contents into NewArgv. 
Tokenizer(Str, Saver, NewArgv, MarkEOLs); Index: llvm/lib/Support/ConvertUTFWrapper.cpp =================================================================== --- llvm/lib/Support/ConvertUTFWrapper.cpp +++ llvm/lib/Support/ConvertUTFWrapper.cpp @@ -12,6 +12,10 @@ #include #include +#ifdef LLVM_ON_WIN32 +#include "Windows/WindowsSupport.h" +#endif + namespace llvm { bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, @@ -81,6 +85,56 @@ (S[0] == '\xfe' && S[1] == '\xff'))); } +// Despite being called a byte order mark, the UTF-8 BOM is a fixed byte +// sequence (EF BB BF) and is not affected by the host system's endianness. +bool hasUTF8ByteOrderMark(ArrayRef S) { + return (S.size() >= 3 && + S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf'); +} + +#ifdef LLVM_ON_WIN32 +// Convert a string encoded in the system ANSI code page (CP_ACP) to UTF-8. +bool convertANSIToUTF8String(ArrayRef SrcBytes, std::string &Out) { + assert(Out.empty()); + + if (SrcBytes.empty()) + return true; + + SmallVector utf16; + SmallVector utf8; + + int len = ::MultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, SrcBytes.data(), + SrcBytes.size(), utf16.begin(), 0); + if (len == 0) + return false; + + utf16.reserve(len + 1); + utf16.set_size(len); + + len = ::MultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, SrcBytes.data(), + SrcBytes.size(), utf16.begin(), utf16.size()); + if (len == 0) + return false; + + len = ::WideCharToMultiByte(CP_UTF8, 0, utf16.begin(), utf16.size(), + utf8.begin(), 0, nullptr, nullptr); + if (len == 0) + return false; + + utf8.reserve(len + 1); + utf8.set_size(len); + + len = ::WideCharToMultiByte(CP_UTF8, 0, utf16.begin(), utf16.size(), + utf8.data(), utf8.size(), nullptr, nullptr); + if (len == 0) + return false; + + Out.resize(utf8.size()); + std::copy(utf8.begin(), utf8.end(), Out.begin()); + return true; +} +#endif // LLVM_ON_WIN32 + bool convertUTF16ToUTF8String(ArrayRef SrcBytes, std::string &Out) { assert(Out.empty());