Index: clang/include/clang/Basic/SourceManager.h
===================================================================
--- clang/include/clang/Basic/SourceManager.h
+++ clang/include/clang/Basic/SourceManager.h
@@ -226,6 +226,10 @@
     bool shouldFreeBuffer() const {
       return (Buffer.getInt() & DoNotFreeFlag) == 0;
     }
+
+    /// If BufStr starts with the BOM of an unsupported encoding, returns that
+    /// encoding's name; otherwise (including for a UTF-8 BOM) returns nullptr.
+    static const char *getInvalidBOM(StringRef BufStr);
   };
 
   // Assert that the \c ContentCache objects will always be 8-byte aligned so
Index: clang/lib/Basic/SourceManager.cpp
===================================================================
--- clang/lib/Basic/SourceManager.cpp
+++ clang/lib/Basic/SourceManager.cpp
@@ -95,6 +95,29 @@
   Buffer.setInt((B && DoNotFree) ? DoNotFreeFlag : 0);
 }
 
+const char *ContentCache::getInvalidBOM(StringRef BufStr) {
+  // Check whether the buffer starts with the Byte Order Mark (BOM) of an
+  // encoding we do not support. Only UTF-8, with or without a BOM, is
+  // supported right now. See http://en.wikipedia.org/wiki/Byte_order_mark.
+  const char *InvalidBOM =
+      llvm::StringSwitch<const char *>(BufStr)
+          .StartsWith(llvm::StringLiteral::withInnerNUL("\x00\x00\xFE\xFF"),
+                      "UTF-32 (BE)")
+          .StartsWith(llvm::StringLiteral::withInnerNUL("\xFF\xFE\x00\x00"),
+                      "UTF-32 (LE)")
+          .StartsWith("\xFE\xFF", "UTF-16 (BE)")
+          .StartsWith("\xFF\xFE", "UTF-16 (LE)")
+          .StartsWith("\x2B\x2F\x76", "UTF-7")
+          .StartsWith("\xF7\x64\x4C", "UTF-1")
+          .StartsWith("\xDD\x73\x66\x73", "UTF-EBCDIC")
+          .StartsWith("\x0E\xFE\xFF", "SCSU")
+          .StartsWith("\xFB\xEE\x28", "BOCU-1")
+          .StartsWith("\x84\x31\x95\x33", "GB-18030")
+          .Default(nullptr);
+
+  return InvalidBOM;
+}
+
 const llvm::MemoryBuffer *ContentCache::getBuffer(DiagnosticsEngine &Diag,
                                                   FileManager &FM,
                                                   SourceLocation Loc,
@@ -190,20 +213,7 @@
   // (BOM). We only support UTF-8 with and without a BOM right now. See
   // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
   StringRef BufStr = Buffer.getPointer()->getBuffer();
-  const char *InvalidBOM = llvm::StringSwitch<const char *>(BufStr)
-    .StartsWith(llvm::StringLiteral::withInnerNUL("\x00\x00\xFE\xFF"),
-                                                  "UTF-32 (BE)")
-    .StartsWith(llvm::StringLiteral::withInnerNUL("\xFF\xFE\x00\x00"),
-                                                  "UTF-32 (LE)")
-    .StartsWith("\xFE\xFF", "UTF-16 (BE)")
-    .StartsWith("\xFF\xFE", "UTF-16 (LE)")
-    .StartsWith("\x2B\x2F\x76", "UTF-7")
-    .StartsWith("\xF7\x64\x4C", "UTF-1")
-    .StartsWith("\xDD\x73\x66\x73", "UTF-EBCDIC")
-    .StartsWith("\x0E\xFE\xFF", "SCSU")
-    .StartsWith("\xFB\xEE\x28", "BOCU-1")
-    .StartsWith("\x84\x31\x95\x33", "GB-18030")
-    .Default(nullptr);
+  const char *InvalidBOM = getInvalidBOM(BufStr);
 
   if (InvalidBOM) {
     Diag.Report(Loc, diag::err_unsupported_bom)
Index: clang/tools/clang-format/ClangFormat.cpp
===================================================================
--- clang/tools/clang-format/ClangFormat.cpp
+++ clang/tools/clang-format/ClangFormat.cpp
@@ -290,31 +290,6 @@
   }
 }
 
-// If BufStr has an invalid BOM, returns the BOM name; otherwise, returns
-// nullptr.
-static const char *getInValidBOM(StringRef BufStr) {
-  // Check to see if the buffer has a UTF Byte Order Mark (BOM).
-  // We only support UTF-8 with and without a BOM right now. See
-  // https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
-  // for more information.
-  const char *InvalidBOM =
-      llvm::StringSwitch<const char *>(BufStr)
-          .StartsWith(llvm::StringLiteral::withInnerNUL("\x00\x00\xFE\xFF"),
-                      "UTF-32 (BE)")
-          .StartsWith(llvm::StringLiteral::withInnerNUL("\xFF\xFE\x00\x00"),
-                      "UTF-32 (LE)")
-          .StartsWith("\xFE\xFF", "UTF-16 (BE)")
-          .StartsWith("\xFF\xFE", "UTF-16 (LE)")
-          .StartsWith("\x2B\x2F\x76", "UTF-7")
-          .StartsWith("\xF7\x64\x4C", "UTF-1")
-          .StartsWith("\xDD\x73\x66\x73", "UTF-EBCDIC")
-          .StartsWith("\x0E\xFE\xFF", "SCSU")
-          .StartsWith("\xFB\xEE\x28", "BOCU-1")
-          .StartsWith("\x84\x31\x95\x33", "GB-18030")
-          .Default(nullptr);
-  return InvalidBOM;
-}
-
 static bool
 emitReplacementWarnings(const Replacements &Replaces, StringRef AssumedFileName,
                         const std::unique_ptr<llvm::MemoryBuffer> &Code) {
@@ -400,7 +375,7 @@
 
   StringRef BufStr = Code->getBuffer();
 
-  const char *InvalidBOM = getInValidBOM(BufStr);
+  const char *InvalidBOM = SrcMgr::ContentCache::getInvalidBOM(BufStr);
 
   if (InvalidBOM) {
     errs() << "error: encoding with unsupported byte order mark \""
Index: clang/unittests/Basic/SourceManagerTest.cpp
===================================================================
--- clang/unittests/Basic/SourceManagerTest.cpp
+++ clang/unittests/Basic/SourceManagerTest.cpp
@@ -200,6 +200,47 @@
             "");
 }
 
+TEST_F(SourceManagerTest, getInvalidBOM) {
+  ASSERT_EQ(SrcMgr::ContentCache::getInvalidBOM(""), nullptr);
+  ASSERT_EQ(SrcMgr::ContentCache::getInvalidBOM("\x00\x00\x00"), nullptr);
+  ASSERT_EQ(SrcMgr::ContentCache::getInvalidBOM("\xFF\xFF\xFF"), nullptr);
+  ASSERT_EQ(SrcMgr::ContentCache::getInvalidBOM("#include "),
+            nullptr);
+
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                "\xFE\xFF#include ")),
+            "UTF-16 (BE)");
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                "\xFF\xFE#include ")),
+            "UTF-16 (LE)");
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                "\x2B\x2F\x76#include ")),
+            "UTF-7");
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                "\xF7\x64\x4C#include ")),
+            "UTF-1");
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                "\xDD\x73\x66\x73#include ")),
+            "UTF-EBCDIC");
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                "\x0E\xFE\xFF#include ")),
+            "SCSU");
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                "\xFB\xEE\x28#include ")),
+            "BOCU-1");
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                "\x84\x31\x95\x33#include ")),
+            "GB-18030");
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                llvm::StringLiteral::withInnerNUL(
+                    "\x00\x00\xFE\xFF#include "))),
+            "UTF-32 (BE)");
+  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
+                llvm::StringLiteral::withInnerNUL(
+                    "\xFF\xFE\x00\x00#include "))),
+            "UTF-32 (LE)");
+}
+
 #if defined(LLVM_ON_UNIX)
 
 TEST_F(SourceManagerTest, getMacroArgExpandedLocation) {