Index: clangd/ClangdServer.cpp =================================================================== --- clangd/ClangdServer.cpp +++ clangd/ClangdServer.cpp @@ -232,14 +232,8 @@ RefactoringResultCollector ResultCollector; const SourceManager &SourceMgr = AST.getASTContext().getSourceManager(); - const FileEntry *FE = - SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()); - if (!FE) - return CB(llvm::make_error( - "rename called for non-added document", - llvm::errc::invalid_argument)); SourceLocation SourceLocationBeg = - clangd::getBeginningOfIdentifier(AST, Pos, FE); + clangd::getBeginningOfIdentifier(AST, Pos, SourceMgr.getMainFileID()); tooling::RefactoringRuleContext Context( AST.getASTContext().getSourceManager()); Context.setASTContext(AST.getASTContext()); Index: clangd/ClangdUnit.h =================================================================== --- clangd/ClangdUnit.h +++ clangd/ClangdUnit.h @@ -173,8 +173,9 @@ }; /// Get the beginning SourceLocation at a specified \p Pos. +/// May be invalid if Pos is, or if there's no identifier. SourceLocation getBeginningOfIdentifier(ParsedAST &Unit, const Position &Pos, - const FileEntry *FE); + const FileID FID); /// For testing/debugging purposes. Note that this method deserializes all /// unserialized Decls, so use with care. Index: clangd/ClangdUnit.cpp =================================================================== --- clangd/ClangdUnit.cpp +++ clangd/ClangdUnit.cpp @@ -215,19 +215,6 @@ std::move(IncLocations)); } -namespace { - -SourceLocation getMacroArgExpandedLocation(const SourceManager &Mgr, - const FileEntry *FE, Position Pos) { - // The language server protocol uses zero-based line and column numbers. - // Clang uses one-based numbers. - SourceLocation InputLoc = - Mgr.translateFileLineCol(FE, Pos.line + 1, Pos.character + 1); - return Mgr.getMacroArgExpandedLocation(InputLoc); -} - -} // namespace - void ParsedAST::ensurePreambleDeclsDeserialized() { if (PreambleDeclsDeserialized || !Preamble) return; @@ -470,40 +457,34 @@ SourceLocation clangd::getBeginningOfIdentifier(ParsedAST &Unit, const Position &Pos, - const FileEntry *FE) { + const FileID FID) { const ASTContext &AST = Unit.getASTContext(); const SourceManager &SourceMgr = AST.getSourceManager(); - - SourceLocation InputLocation = - getMacroArgExpandedLocation(SourceMgr, FE, Pos); - if (Pos.character == 0) { - return InputLocation; - } - - // This handle cases where the position is in the middle of a token or right - // after the end of a token. In theory we could just use GetBeginningOfToken - // to find the start of the token at the input position, but this doesn't - // work when right after the end, i.e. foo|. - // So try to go back by one and see if we're still inside an identifier - // token. If so, Take the beginning of this token. - // (It should be the same identifier because you can't have two adjacent - // identifiers without another token in between.) - Position PosCharBehind = Pos; - --PosCharBehind.character; - - SourceLocation PeekBeforeLocation = - getMacroArgExpandedLocation(SourceMgr, FE, PosCharBehind); - Token Result; - if (Lexer::getRawToken(PeekBeforeLocation, Result, SourceMgr, - AST.getLangOpts(), false)) { - // getRawToken failed, just use InputLocation. - return InputLocation; + auto Offset = positionToOffset(SourceMgr.getBufferData(FID), Pos); + if (!Offset) { + log("getBeginningOfIdentifier: " + toString(Offset.takeError())); + return SourceLocation(); } - - if (Result.is(tok::raw_identifier)) { - return Lexer::GetBeginningOfToken(PeekBeforeLocation, SourceMgr, - AST.getLangOpts()); - } - - return InputLocation; + SourceLocation InputLoc = SourceMgr.getComposedLoc(FID, *Offset); + + // GetBeginningOfToken(pos) is almost what we want, but does the wrong thing + // if the cursor is at the end of the identifier. + // Instead, we lex at GetBeginningOfToken(pos - 1). The cases are: + // 1) at the beginning of an identifier, we'll be looking at something + // that isn't an identifier. + // 2) at the middle or end of an identifier, we get the identifier. + // 3) anywhere outside an identifier, we'll get some non-identifier thing. + // We can't actually distinguish cases 1 and 3, but returning the original + // location is correct for both! + if (*Offset == 0) // Case 1 or 3. + return SourceMgr.getMacroArgExpandedLocation(InputLoc); + SourceLocation Before = + SourceMgr.getMacroArgExpandedLocation(InputLoc.getLocWithOffset(-1)); + Before = Lexer::GetBeginningOfToken(Before, SourceMgr, AST.getLangOpts()); + Token Tok; + if (Before.isValid() && + !Lexer::getRawToken(Before, Tok, SourceMgr, AST.getLangOpts(), false) && + Tok.is(tok::raw_identifier)) + return Before; // Case 2. + return SourceMgr.getMacroArgExpandedLocation(InputLoc); // Case 1 or 3. } Index: clangd/CodeComplete.cpp =================================================================== --- clangd/CodeComplete.cpp +++ clangd/CodeComplete.cpp @@ -729,8 +729,15 @@ FrontendOpts.SkipFunctionBodies = true; FrontendOpts.CodeCompleteOpts = Options; FrontendOpts.CodeCompletionAt.FileName = Input.FileName; - FrontendOpts.CodeCompletionAt.Line = Input.Pos.line + 1; - FrontendOpts.CodeCompletionAt.Column = Input.Pos.character + 1; + auto Offset = positionToOffset(Input.Contents, Input.Pos); + if (!Offset) { + log("Code completion position was invalid " + + llvm::toString(Offset.takeError())); + return false; + } + std::tie(FrontendOpts.CodeCompletionAt.Line, + FrontendOpts.CodeCompletionAt.Column) = + offsetToClangLineColumn(Input.Contents, *Offset); Clang->setCodeCompletionConsumer(Consumer.release()); Index: clangd/Protocol.h =================================================================== --- clangd/Protocol.h +++ clangd/Protocol.h @@ -91,6 +91,8 @@ int line = 0; /// Character offset on a line in a document (zero-based). + /// WARNING: this is in UTF-16 codepoints, not bytes or characters! + /// Use the functions in SourceCode.h to construct/interpret Positions. int character = 0; friend bool operator==(const Position &LHS, const Position &RHS) { Index: clangd/SourceCode.h =================================================================== --- clangd/SourceCode.h +++ clangd/SourceCode.h @@ -23,14 +23,9 @@ /// Turn a [line, column] pair into an offset in Code. /// -/// If the character value is greater than the line length, the behavior depends -/// on AllowColumnsBeyondLineLength: -/// -/// - if true: default back to the end of the line -/// - if false: return an error -/// -/// If the line number is greater than the number of lines in the document, -/// always return an error. +/// If P.character exceeds the line length, returns the offset at end-of-line. +/// (If !AllowColumnsBeyondLineLength, then returns an error instead). +/// If the line number is out of range, returns an error. /// /// The returned value is in the range [0, Code.size()]. llvm::Expected @@ -38,7 +33,7 @@ bool AllowColumnsBeyondLineLength = true); /// Turn an offset in Code into a [line, column] pair. -/// FIXME: This should return an error if the offset is invalid. +/// The offset must be in range [0, Code.size()]. Position offsetToPosition(llvm::StringRef Code, size_t Offset); /// Turn a SourceLocation into a [line, column] pair. @@ -49,6 +44,12 @@ // Note that clang also uses closed source ranges, which this can't handle! Range halfOpenToRange(const SourceManager &SM, CharSourceRange R); +// Converts an offset to a clang line/column (1-based, columns are bytes). +// The offset must be in range [0, Code.size()]. +// Prefer to use SourceManager if one is available. +std::pair offsetToClangLineColumn(llvm::StringRef Code, + size_t Offset); + /// From "a::b::c", return {"a::b::", "c"}. Scope is empty if there's no /// qualifier. std::pair Index: clangd/SourceCode.cpp =================================================================== --- clangd/SourceCode.cpp +++ clangd/SourceCode.cpp @@ -16,6 +16,66 @@ namespace clangd { using namespace llvm; +// Here be dragons. LSP positions use columns measured in *UTF-16 code units*! +// Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial. + +// Iterates over unicode codepoints in the (UTF-8) string. For each, +// invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true. +// Returns true if CB returned true, false if we hit the end of string. +template +bool iterateCodepoints(StringRef U8, const Callback &CB) { + for (size_t I = 0; I < U8.size();) { + unsigned char C = static_cast(U8[I]); + if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. + if (CB(1, 1)) + return true; + ++I; + continue; + } + // This convenient property of UTF-8 holds for all non-ASCII characters. + size_t UTF8Length = countLeadingOnes(C); + // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here. + // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug. + assert((UTF8Length >= 2 && UTF8Length <= 4) && + "Invalid UTF-8, or transcoding bug?"); + I += UTF8Length; // Skip over all trailing bytes. + // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). + // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...) + if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1)) + return true; + } + return false; +} + +// Returns the offset into the string that matches \p Units UTF-16 code units. +// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back +// to UTF-8, and returns the length in bytes. +static size_t measureUTF16(StringRef U8, int Units, bool &Valid) { + size_t Result = 0; + Valid = Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) { + Result += U8Len; + Units -= U16Len; + return Units <= 0; + }); + if (Units < 0) // Offset was into the middle of a surrogate pair. + Valid = false; + // Don't return an out-of-range index if we overran. + return std::min(Result, U8.size()); +} + +// Counts the number of UTF-16 code units needed to represent a string. +// Like most strings in clangd, the input is UTF-8 encoded. +static size_t utf16Len(StringRef U8) { + // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). + // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. + size_t Count = 0; + iterateCodepoints(U8, [&](int U8Len, int U16Len) { + Count += U16Len; + return false; + }); + return Count; +} + llvm::Expected positionToOffset(StringRef Code, Position P, bool AllowColumnsBeyondLineLength) { if (P.line < 0) @@ -40,12 +100,15 @@ if (NextNL == StringRef::npos) NextNL = Code.size(); - if (StartOfLine + P.character > NextNL && !AllowColumnsBeyondLineLength) + bool Valid; + size_t ByteOffsetInLine = measureUTF16( + Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid); + if (!Valid && !AllowColumnsBeyondLineLength) return llvm::make_error( - llvm::formatv("Character value is out of range ({0})", P.character), + llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character, + P.line), llvm::errc::invalid_argument); - // FIXME: officially P.character counts UTF-16 code units, not UTF-8 bytes! - return std::min(NextNL, StartOfLine + P.character); + return StartOfLine + ByteOffsetInLine; } Position offsetToPosition(StringRef Code, size_t Offset) { @@ -54,17 +117,25 @@ int Lines = Before.count('\n'); size_t PrevNL = Before.rfind('\n'); size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1); - // FIXME: officially character counts UTF-16 code units, not UTF-8 bytes! Position Pos; Pos.line = Lines; - Pos.character = static_cast(Offset - StartOfLine); + Pos.character = utf16Len(Before.substr(StartOfLine)); return Pos; } Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) { + // We use the SourceManager's line tables, but its column number is in bytes. + auto LocInfo = SM.getDecomposedSpellingLoc(Loc); Position P; - P.line = static_cast(SM.getSpellingLineNumber(Loc)) - 1; - P.character = static_cast(SM.getSpellingColumnNumber(Loc)) - 1; + P.line = + static_cast(SM.getLineNumber(LocInfo.first, LocInfo.second)) - 1; + bool Invalid = false; + StringRef Code = SM.getBufferData(LocInfo.first, &Invalid); + if (!Invalid) { + auto ColumnInBytes = SM.getColumnNumber(LocInfo.first, LocInfo.second) - 1; + P.character = + utf16Len(Code.substr(LocInfo.second - ColumnInBytes, ColumnInBytes)); + } return P; } @@ -76,6 +147,16 @@ return {Begin, End}; } +std::pair offsetToClangLineColumn(StringRef Code, + size_t Offset) { + Offset = std::min(Code.size(), Offset); + StringRef Before = Code.substr(0, Offset); + int Lines = Before.count('\n'); + size_t PrevNL = Before.rfind('\n'); + size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1); + return {Lines + 1, Offset - StartOfLine + 1}; +} + std::pair splitQualifiedName(llvm::StringRef QName) { size_t Pos = QName.rfind("::"); Index: clangd/XRefs.cpp =================================================================== --- clangd/XRefs.cpp +++ clangd/XRefs.cpp @@ -164,11 +164,8 @@ std::vector findDefinitions(ParsedAST &AST, Position Pos) { const SourceManager &SourceMgr = AST.getASTContext().getSourceManager(); - const FileEntry *FE = SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()); - if (!FE) - return {}; - - SourceLocation SourceLocationBeg = getBeginningOfIdentifier(AST, Pos, FE); + SourceLocation SourceLocationBeg = + getBeginningOfIdentifier(AST, Pos, SourceMgr.getMainFileID()); std::vector Result; // Handle goto definition for #include. @@ -280,11 +277,8 @@ std::vector findDocumentHighlights(ParsedAST &AST, Position Pos) { const SourceManager &SourceMgr = AST.getASTContext().getSourceManager(); - const FileEntry *FE = SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()); - if (!FE) - return {}; - - SourceLocation SourceLocationBeg = getBeginningOfIdentifier(AST, Pos, FE); + SourceLocation SourceLocationBeg = + getBeginningOfIdentifier(AST, Pos, SourceMgr.getMainFileID()); DeclarationAndMacrosFinder DeclMacrosFinder(llvm::errs(), SourceLocationBeg, AST.getASTContext(), @@ -413,11 +407,8 @@ Hover getHover(ParsedAST &AST, Position Pos) { const SourceManager &SourceMgr = AST.getASTContext().getSourceManager(); - const FileEntry *FE = SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()); - if (FE == nullptr) - return Hover(); - - SourceLocation SourceLocationBeg = getBeginningOfIdentifier(AST, Pos, FE); + SourceLocation SourceLocationBeg = + getBeginningOfIdentifier(AST, Pos, SourceMgr.getMainFileID()); DeclarationAndMacrosFinder DeclMacrosFinder(llvm::errs(), SourceLocationBeg, AST.getASTContext(), AST.getPreprocessor()); Index: test/clangd/rename.test =================================================================== --- test/clangd/rename.test +++ test/clangd/rename.test @@ -36,4 +36,4 @@ --- {"jsonrpc":"2.0","id":3,"method":"shutdown"} --- -{"jsonrpc":"2.0":"method":"exit"} +{"jsonrpc":"2.0","method":"exit"} Index: unittests/clangd/ClangdUnitTests.cpp =================================================================== --- unittests/clangd/ClangdUnitTests.cpp +++ unittests/clangd/ClangdUnitTests.cpp @@ -9,6 +9,7 @@ #include "Annotations.h" #include "ClangdUnit.h" +#include "SourceCode.h" #include "TestFS.h" #include "clang/Frontend/CompilerInvocation.h" #include "clang/Frontend/PCHContainerOperations.h" @@ -216,6 +217,28 @@ Pair(EqualToLSPDiag(NoteInMainLSP), IsEmpty()))); } +TEST(ClangdUnitTest, GetBeginningOfIdentifier) { + // First ^ is the expected beginning, last is the search position. + for (const char *Text : { + "int ^f^oo();", // inside identifier + "int ^foo();", // beginning of identifier + "int ^foo^();", // end of identifier + "int foo(^);", // non-identifier + "^int foo();", // beginning of file (can't back up) + "int ^f0^0();", // after a digit (lexing at N-1 is wrong) + "int ^λλ^λ();", // UTF-8 handled properly when backing up + }) { + Annotations TestCase(Text); + auto AST = build(TestCase.code()); + const auto &SourceMgr = AST.getASTContext().getSourceManager(); + SourceLocation Actual = getBeginningOfIdentifier( + AST, TestCase.points().back(), SourceMgr.getMainFileID()); + Position ActualPos = + offsetToPosition(TestCase.code(), SourceMgr.getFileOffset(Actual)); + EXPECT_EQ(TestCase.points().front(), ActualPos) << Text; + } +} + } // namespace } // namespace clangd } // namespace clang Index: unittests/clangd/DraftStoreTests.cpp =================================================================== --- unittests/clangd/DraftStoreTests.cpp +++ unittests/clangd/DraftStoreTests.cpp @@ -241,7 +241,7 @@ EXPECT_TRUE(!Result); EXPECT_EQ(llvm::toString(Result.takeError()), - "Character value is out of range (100)"); + "UTF-16 offset 100 is invalid for line 0"); } TEST(DraftStoreIncrementalUpdateTest, EndCharOutOfRange) { @@ -262,7 +262,7 @@ EXPECT_TRUE(!Result); EXPECT_EQ(llvm::toString(Result.takeError()), - "Character value is out of range (100)"); + "UTF-16 offset 100 is invalid for line 0"); } TEST(DraftStoreIncrementalUpdateTest, StartLineOutOfRange) { @@ -338,7 +338,7 @@ EXPECT_TRUE(!Result); EXPECT_EQ(llvm::toString(Result.takeError()), - "Character value is out of range (100)"); + "UTF-16 offset 100 is invalid for line 0"); llvm::Optional Contents = DS.getDraft(File); EXPECT_TRUE(Contents); Index: unittests/clangd/SourceCodeTests.cpp =================================================================== --- unittests/clangd/SourceCodeTests.cpp +++ unittests/clangd/SourceCodeTests.cpp @@ -24,9 +24,10 @@ return arg.line == Line && arg.character == Col; } +// The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes). const char File[] = R"(0:0 = 0 -1:0 = 8 -2:0 = 16)"; +1:0 → 8 +2:0 🡆 18)"; /// A helper to make tests easier to read. Position position(int line, int character) { @@ -66,25 +67,31 @@ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3), false), HasValue(11)); EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 6)), - HasValue(14)); // last character + HasValue(16)); // last character EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 7)), - HasValue(15)); // the newline itself + HasValue(17)); // the newline itself EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8)), - HasValue(15)); // out of range + HasValue(17)); // out of range EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8), false), Failed()); // out of range // last line EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, -1)), Failed()); // out of range EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 0)), - HasValue(16)); // first character + HasValue(18)); // first character EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 3)), - HasValue(19)); // middle character - EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 7)), - HasValue(23)); // last character + HasValue(21)); // middle character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 5), false), + Failed()); // middle of surrogate pair + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 5)), + HasValue(26)); // middle of surrogate pair + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 6), false), + HasValue(26)); // end of surrogate pair EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 8)), - HasValue(24)); // EOF - EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 9), false), + HasValue(28)); // last character + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 9)), + HasValue(29)); // EOF + EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 10), false), Failed()); // out of range // line out of bounds EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), Failed()); @@ -97,14 +104,19 @@ EXPECT_THAT(offsetToPosition(File, 6), Pos(0, 6)) << "end of first line"; EXPECT_THAT(offsetToPosition(File, 7), Pos(0, 7)) << "first newline"; EXPECT_THAT(offsetToPosition(File, 8), Pos(1, 0)) << "start of second line"; - EXPECT_THAT(offsetToPosition(File, 11), Pos(1, 3)) << "in second line"; - EXPECT_THAT(offsetToPosition(File, 14), Pos(1, 6)) << "end of second line"; - EXPECT_THAT(offsetToPosition(File, 15), Pos(1, 7)) << "second newline"; - EXPECT_THAT(offsetToPosition(File, 16), Pos(2, 0)) << "start of last line"; - EXPECT_THAT(offsetToPosition(File, 19), Pos(2, 3)) << "in last line"; - EXPECT_THAT(offsetToPosition(File, 23), Pos(2, 7)) << "end of last line"; - EXPECT_THAT(offsetToPosition(File, 24), Pos(2, 8)) << "EOF"; - EXPECT_THAT(offsetToPosition(File, 25), Pos(2, 8)) << "out of bounds"; + EXPECT_THAT(offsetToPosition(File, 12), Pos(1, 4)) << "before BMP char"; + EXPECT_THAT(offsetToPosition(File, 13), Pos(1, 5)) << "in BMP char"; + EXPECT_THAT(offsetToPosition(File, 15), Pos(1, 5)) << "after BMP char"; + EXPECT_THAT(offsetToPosition(File, 16), Pos(1, 6)) << "end of second line"; + EXPECT_THAT(offsetToPosition(File, 17), Pos(1, 7)) << "second newline"; + EXPECT_THAT(offsetToPosition(File, 18), Pos(2, 0)) << "start of last line"; + EXPECT_THAT(offsetToPosition(File, 21), Pos(2, 3)) << "in last line"; + EXPECT_THAT(offsetToPosition(File, 22), Pos(2, 4)) << "before astral char"; + EXPECT_THAT(offsetToPosition(File, 24), Pos(2, 6)) << "in astral char"; + EXPECT_THAT(offsetToPosition(File, 26), Pos(2, 6)) << "after astral char"; + EXPECT_THAT(offsetToPosition(File, 28), Pos(2, 8)) << "end of last line"; + EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 9)) << "EOF"; + EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 9)) << "out of bounds"; } } // namespace