Index: llvm/include/llvm/Support/ConvertUTF.h
===================================================================
--- llvm/include/llvm/Support/ConvertUTF.h
+++ llvm/include/llvm/Support/ConvertUTF.h
@@ -243,6 +243,13 @@
 bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
 
 /**
+ * Returns true if a blob of text starts with a UTF-8 byte order mark.
+ * The UTF-8 BOM is a fixed sequence of bytes (EF BB BF), commonly seen on
+ * Windows, and is not affected by the host system's endianness.
+ */
+bool hasUTF8ByteOrderMark(ArrayRef<char> SrcBytes);
+
+/**
  * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
  *
  * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
Index: llvm/lib/Support/CommandLine.cpp
===================================================================
--- llvm/lib/Support/CommandLine.cpp
+++ llvm/lib/Support/CommandLine.cpp
@@ -674,6 +674,11 @@
       return false;
     Str = StringRef(UTF8Buf);
   }
+  // If the file starts with a UTF-8 BOM sequence, skip those three bytes
+  // (EF BB BF) so that they do not end up in the first parsed argument.
+  // Reference: http://en.wikipedia.org/wiki/UTF-8#Byte_order_mark
+  else if (hasUTF8ByteOrderMark(BufRef))
+    Str = StringRef(BufRef.data() + 3, BufRef.size() - 3);
 
   // Tokenize the contents into NewArgv.
   Tokenizer(Str, Saver, NewArgv, MarkEOLs);
Index: llvm/lib/Support/ConvertUTFWrapper.cpp
===================================================================
--- llvm/lib/Support/ConvertUTFWrapper.cpp
+++ llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -81,6 +81,13 @@
            (S[0] == '\xfe' && S[1] == '\xff')));
 }
 
+// It is called a byte order mark, but the UTF-8 BOM is actually not affected
+// by the host system's endianness: it is always the bytes EF BB BF.
+bool hasUTF8ByteOrderMark(ArrayRef<char> S) {
+  return (S.size() >= 3 &&
+          S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf');
+}
+
 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
   assert(Out.empty());
 
Index: llvm/test/Other/ResponseFile.ll
===================================================================
--- llvm/test/Other/ResponseFile.ll
+++ llvm/test/Other/ResponseFile.ll
@@ -6,6 +6,13 @@
 ; RUN: llvm-as @%t.list2 -o %t.bc
 ; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
 
+; When the response file begins with a UTF-8 BOM sequence, it shall be removed.
+; RUN: echo -e "\xef\xbb\xbf" > %t.list3
+; RUN: echo %s >> %t.list3
+; RUN: echo -e "\xef\xbb\xbf-time-passes @%t.list3" > %t.list4
+; RUN: llvm-as @%t.list4 -o %t.bc
+; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
+
 ; CHECK: T foobar
 
 define void @foobar() {
Index: llvm/unittests/Support/ConvertUTFTest.cpp
===================================================================
--- llvm/unittests/Support/ConvertUTFTest.cpp
+++ llvm/unittests/Support/ConvertUTFTest.cpp
@@ -66,6 +66,20 @@
   EXPECT_FALSE(HasBOM);
 }
 
+TEST(ConvertUTFTest, HasUTF8BOM) {
+  bool HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef\xbb\xbf", 3));
+  EXPECT_TRUE(HasBOM);
+  HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef\xbb\xbf ", 4));
+  EXPECT_TRUE(HasBOM);
+  HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef\xbb\xbf\x00asdf", 7));
+  EXPECT_TRUE(HasBOM);
+
+  HasBOM = hasUTF8ByteOrderMark(None);
+  EXPECT_FALSE(HasBOM);
+  HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef", 1));
+  EXPECT_FALSE(HasBOM);
+}
+
 struct ConvertUTFResultContainer {
   ConversionResult ErrorCode;
   std::vector<unsigned> UnicodeScalars;