Index: clang-tools-extra/clangd/SourceCode.cpp
===================================================================
--- clang-tools-extra/clangd/SourceCode.cpp
+++ clang-tools-extra/clangd/SourceCode.cpp
@@ -67,10 +67,20 @@
     }
     // This convenient property of UTF-8 holds for all non-ASCII characters.
     size_t UTF8Length = llvm::countLeadingOnes(C);
-    // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
-    // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug.
-    assert((UTF8Length >= 2 && UTF8Length <= 4) &&
-           "Invalid UTF-8, or transcoding bug?");
+    // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
+    // 11111xxx is not valid UTF-8 at all. Log it because it's probably our bug.
+    if (UTF8Length < 2 || UTF8Length > 4) {
+      vlog("File contains invalid UTF-8, or transcoding bug? "
+           "UTF8Length = {0}, string = {1}",
+           UTF8Length, U8);
+      // This byte cannot be decoded, so consume exactly one byte and report it
+      // as a single code unit. Advancing by UTF8Length here (up to 8 for 0xFF)
+      // could skip valid bytes or step past the end of the buffer.
+      ++I;
+      if (CB(1, 1))
+        return true;
+      continue;
+    }
     I += UTF8Length; // Skip over all trailing bytes.
     // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
     // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
Index: clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
+++ clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
@@ -57,6 +57,14 @@
   EXPECT_EQ(lspLength("¥"), 1UL);
   // astral
   EXPECT_EQ(lspLength("😂"), 2UL);
+  // Extended ASCII: a stray continuation byte counts as a single unit.
+  const char extendedASCIIStr[2] = {static_cast<char>(160U), '\0'};
+  EXPECT_EQ(lspLength(extendedASCIIStr), 1UL);
+  // Invalid UTF-8: each undecodable 0xFF byte counts as a single unit.
+  const char invalidUTF8[6] = {
+      static_cast<char>(0xFF), static_cast<char>(0xFF), static_cast<char>(0xFF),
+      static_cast<char>(0xFF), static_cast<char>(0xFF), '\0'};
+  EXPECT_EQ(lspLength(invalidUTF8), 5UL);
 
   WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
   EXPECT_EQ(lspLength(""), 0UL);
@@ -66,6 +74,10 @@
   EXPECT_EQ(lspLength("¥"), 2UL);
   // astral
   EXPECT_EQ(lspLength("😂"), 4UL);
+  // Extended ASCII
+  EXPECT_EQ(lspLength(extendedASCIIStr), 1UL);
+  // Invalid UTF-8
+  EXPECT_EQ(lspLength(invalidUTF8), 5UL);
 
   WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
   EXPECT_EQ(lspLength(""), 0UL);
@@ -75,6 +87,10 @@
   EXPECT_EQ(lspLength("¥"), 1UL);
   // astral
   EXPECT_EQ(lspLength("😂"), 1UL);
+  // Extended ASCII
+  EXPECT_EQ(lspLength(extendedASCIIStr), 1UL);
+  // Invalid UTF-8
+  EXPECT_EQ(lspLength(invalidUTF8), 5UL);
 }
 
 // The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).