Index: clang-tools-extra/trunk/clangd/Protocol.h =================================================================== --- clang-tools-extra/trunk/clangd/Protocol.h +++ clang-tools-extra/trunk/clangd/Protocol.h @@ -28,6 +28,7 @@ #include "clang/Index/IndexSymbol.h" #include "llvm/ADT/Optional.h" #include "llvm/Support/JSON.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -346,9 +347,12 @@ UTF16, // Length counts bytes of UTF-8 encoded text. (Clangd extension). UTF8, + // Length counts codepoints in unicode text. (Clangd extension). + UTF32, }; llvm::json::Value toJSON(const OffsetEncoding &); bool fromJSON(const llvm::json::Value &, OffsetEncoding &); +llvm::raw_ostream &operator<<(llvm::raw_ostream &, OffsetEncoding Enc); // This struct doesn't mirror LSP! // The protocol defines deeply nested structures for client capabilities. Index: clang-tools-extra/trunk/clangd/Protocol.cpp =================================================================== --- clang-tools-extra/trunk/clangd/Protocol.cpp +++ clang-tools-extra/trunk/clangd/Protocol.cpp @@ -938,16 +938,21 @@ return fromJSON(Params, Base); } -llvm::json::Value toJSON(const OffsetEncoding &OE) { +// Returns the string spelling of \p OE used by toJSON() and operator<<. +static const char *toString(OffsetEncoding OE) { switch (OE) { - case OffsetEncoding::UTF8: - return "utf-8"; - case OffsetEncoding::UTF16: - return "utf-16"; - case OffsetEncoding::UnsupportedEncoding: - return "unknown"; + case OffsetEncoding::UTF8: + return "utf-8"; + case OffsetEncoding::UTF16: + return "utf-16"; + case OffsetEncoding::UTF32: + return "utf-32"; + case OffsetEncoding::UnsupportedEncoding: + return "unknown"; } + llvm_unreachable("unknown OffsetEncoding"); } +llvm::json::Value toJSON(const OffsetEncoding &OE) { return toString(OE); } bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) { auto Str = V.getAsString(); if (!Str) @@ -955,9 +960,13 @@ OE = llvm::StringSwitch(*Str) .Case("utf-8", OffsetEncoding::UTF8) .Case("utf-16", OffsetEncoding::UTF16) + .Case("utf-32", OffsetEncoding::UTF32) 
.Default(OffsetEncoding::UnsupportedEncoding); return true; } +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, OffsetEncoding Enc) { + return OS << toString(Enc); +} } // namespace clangd } // namespace clang Index: clang-tools-extra/trunk/clangd/SourceCode.cpp =================================================================== --- clang-tools-extra/trunk/clangd/SourceCode.cpp +++ clang-tools-extra/trunk/clangd/SourceCode.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Path.h" namespace clang { @@ -30,6 +31,8 @@ // Returns true if CB returned true, false if we hit the end of string. template static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { + // A codepoint takes two UTF-16 code units if it's astral (outside BMP). + // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. for (size_t I = 0; I < U8.size();) { unsigned char C = static_cast(U8[I]); if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. @@ -53,46 +56,75 @@ return false; } -// Returns the offset into the string that matches \p Units UTF-16 code units. -// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back -// to UTF-8, and returns the length in bytes. -static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) { +// Returns the byte offset into the string that is an offset of \p Units in +// the specified encoding. +// Conceptually, this converts to the encoding, truncates to \p Units, +// converts back to UTF-8, and returns the length in bytes. 
+static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc, + bool &Valid) { + Valid = Units >= 0; + if (Units <= 0) + return 0; size_t Result = 0; - Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) { - Result += U8Len; - U16Units -= U16Len; - return U16Units <= 0; - }); - if (U16Units < 0) // Offset was into the middle of a surrogate pair. - Valid = false; + switch (Enc) { + case OffsetEncoding::UTF8: + Result = Units; + break; + case OffsetEncoding::UTF16: + Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { + Result += U8Len; + Units -= U16Len; + return Units <= 0; + }); + if (Units < 0) // Offset in the middle of a surrogate pair. + Valid = false; + break; + case OffsetEncoding::UTF32: + Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { + Result += U8Len; + Units--; + return Units <= 0; + }); + break; + case OffsetEncoding::UnsupportedEncoding: + llvm_unreachable("unsupported encoding"); + } // Don't return an out-of-range index if we overran. - return std::min(Result, U8.size()); + if (Result > U8.size()) { + Valid = false; + return U8.size(); + } + return Result; } Key kCurrentOffsetEncoding; -static bool useUTF16ForLSP() { +static OffsetEncoding lspEncoding() { auto *Enc = Context::current().get(kCurrentOffsetEncoding); - switch (Enc ? *Enc : OffsetEncoding::UTF16) { - case OffsetEncoding::UTF16: - return true; - case OffsetEncoding::UTF8: - return false; - case OffsetEncoding::UnsupportedEncoding: - llvm_unreachable("cannot use an unsupported encoding"); - } + return Enc ? *Enc : OffsetEncoding::UTF16; } // Like most strings in clangd, the input is UTF-8 encoded. size_t lspLength(llvm::StringRef Code) { - if (!useUTF16ForLSP()) - return Code.size(); - // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). - // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. 
size_t Count = 0; - iterateCodepoints(Code, [&](int U8Len, int U16Len) { - Count += U16Len; - return false; - }); + switch (lspEncoding()) { + case OffsetEncoding::UTF8: + Count = Code.size(); + break; + case OffsetEncoding::UTF16: + iterateCodepoints(Code, [&](int U8Len, int U16Len) { + Count += U16Len; + return false; + }); + break; + case OffsetEncoding::UTF32: + iterateCodepoints(Code, [&](int U8Len, int U16Len) { + ++Count; + return false; + }); + break; + case OffsetEncoding::UnsupportedEncoding: + llvm_unreachable("unsupported encoding"); + } return Count; } @@ -118,28 +150,15 @@ StringRef Line = Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); - if (!useUTF16ForLSP()) { - // Bounds-checking only. - if (P.character > int(Line.size())) { - if (AllowColumnsBeyondLineLength) - return StartOfLine + Line.size(); - else - return llvm::make_error( - llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character, - P.line), - llvm::errc::invalid_argument); - } - return StartOfLine + P.character; - } - // P.character is in UTF-16 code units, so we have to transcode. + // P.character is in units of lspEncoding(); convert it to a byte offset. 
bool Valid; - size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid); + size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid); if (!Valid && !AllowColumnsBeyondLineLength) return llvm::make_error( - llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character, - P.line), + llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(), + P.character, P.line), llvm::errc::invalid_argument); - return StartOfLine + ByteOffsetInLine; + return StartOfLine + ByteInLine; } Position offsetToPosition(llvm::StringRef Code, size_t Offset) { Index: clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp =================================================================== --- clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp +++ clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp @@ -58,6 +58,15 @@ EXPECT_EQ(lspLength("¥"), 2UL); // astral EXPECT_EQ(lspLength("😂"), 4UL); + + WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32); + EXPECT_EQ(lspLength(""), 0UL); + EXPECT_EQ(lspLength("ascii"), 5UL); + // BMP + EXPECT_EQ(lspLength("↓"), 1UL); + EXPECT_EQ(lspLength("¥"), 1UL); + // astral + EXPECT_EQ(lspLength("😂"), 1UL); } // The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes). @@ -131,6 +140,63 @@ EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed()); EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed()); + // Codepoints are similar, except near astral characters. 
+ WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32); + // line out of bounds + EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed()); + // first line + EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, -1)), + llvm::Failed()); // out of range + EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 0)), + llvm::HasValue(0)); // first character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 3)), + llvm::HasValue(3)); // middle character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 6)), + llvm::HasValue(6)); // last character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7)), + llvm::HasValue(7)); // the newline itself + EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7), false), + llvm::HasValue(7)); + EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8)), + llvm::HasValue(7)); // out of range + EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8), false), + llvm::Failed()); // out of range + // middle line + EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, -1)), + llvm::Failed()); // out of range + EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 0)), + llvm::HasValue(8)); // first character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3)), + llvm::HasValue(11)); // middle character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3), false), + llvm::HasValue(11)); + EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 6)), + llvm::HasValue(16)); // last character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 7)), + llvm::HasValue(17)); // the newline itself + EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8)), + llvm::HasValue(17)); // out of range + EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8), false), + llvm::Failed()); // out of range + // last line + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, -1)), + llvm::Failed()); // out of range + 
EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 0)), + llvm::HasValue(18)); // first character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 4)), + llvm::HasValue(22)); // Before astral character. + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 5), false), + llvm::HasValue(26)); // after astral character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 7)), + llvm::HasValue(28)); // last character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 8)), + llvm::HasValue(29)); // EOF + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 9), false), + llvm::Failed()); // out of range + // line out of bounds + EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed()); + EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed()); + // Test UTF-8, where transformations are trivial. WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8); EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed()); @@ -169,6 +235,27 @@ EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 9)) << "EOF"; EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 9)) << "out of bounds"; + // Codepoints are similar, except near astral characters. 
+ WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32); + EXPECT_THAT(offsetToPosition(File, 0), Pos(0, 0)) << "start of file"; + EXPECT_THAT(offsetToPosition(File, 3), Pos(0, 3)) << "in first line"; + EXPECT_THAT(offsetToPosition(File, 6), Pos(0, 6)) << "end of first line"; + EXPECT_THAT(offsetToPosition(File, 7), Pos(0, 7)) << "first newline"; + EXPECT_THAT(offsetToPosition(File, 8), Pos(1, 0)) << "start of second line"; + EXPECT_THAT(offsetToPosition(File, 12), Pos(1, 4)) << "before BMP char"; + EXPECT_THAT(offsetToPosition(File, 13), Pos(1, 5)) << "in BMP char"; + EXPECT_THAT(offsetToPosition(File, 15), Pos(1, 5)) << "after BMP char"; + EXPECT_THAT(offsetToPosition(File, 16), Pos(1, 6)) << "end of second line"; + EXPECT_THAT(offsetToPosition(File, 17), Pos(1, 7)) << "second newline"; + EXPECT_THAT(offsetToPosition(File, 18), Pos(2, 0)) << "start of last line"; + EXPECT_THAT(offsetToPosition(File, 21), Pos(2, 3)) << "in last line"; + EXPECT_THAT(offsetToPosition(File, 22), Pos(2, 4)) << "before astral char"; + EXPECT_THAT(offsetToPosition(File, 24), Pos(2, 5)) << "in astral char"; + EXPECT_THAT(offsetToPosition(File, 26), Pos(2, 5)) << "after astral char"; + EXPECT_THAT(offsetToPosition(File, 28), Pos(2, 7)) << "end of last line"; + EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 8)) << "EOF"; + EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 8)) << "out of bounds"; + WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8); for (Line L : FileLines) { for (unsigned I = 0; I <= L.Length; ++I)