Index: clangd/ClangdLSPServer.h =================================================================== --- clangd/ClangdLSPServer.h +++ clangd/ClangdLSPServer.h @@ -40,6 +40,7 @@ ClangdLSPServer(Transport &Transp, const FileSystemProvider &FSProvider, const clangd::CodeCompleteOptions &CCOpts, llvm::Optional CompileCommandsDir, bool UseDirBasedCDB, + llvm::Optional ForcedOffsetEncoding, const ClangdServer::Options &Opts); ~ClangdLSPServer(); @@ -165,6 +166,7 @@ // It is destroyed before run() returns, to ensure worker threads exit. ClangdServer::Options ClangdServerOpts; llvm::Optional Server; + llvm::Optional NegotiatedOffsetEncoding; }; } // namespace clangd } // namespace clang Index: clangd/ClangdLSPServer.cpp =================================================================== --- clangd/ClangdLSPServer.cpp +++ clangd/ClangdLSPServer.cpp @@ -13,6 +13,7 @@ #include "Trace.h" #include "URI.h" #include "clang/Tooling/Core/Replacement.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" @@ -93,6 +94,7 @@ MessageHandler(ClangdLSPServer &Server) : Server(Server) {} bool onNotify(llvm::StringRef Method, llvm::json::Value Params) override { + WithContext HandlerContext(handlerContext()); log("<-- {0}", Method); if (Method == "exit") return false; @@ -109,6 +111,7 @@ bool onCall(llvm::StringRef Method, llvm::json::Value Params, llvm::json::Value ID) override { + WithContext HandlerContext(handlerContext()); // Calls can be canceled by the client. Add cancellation context. WithContext WithCancel(cancelableRequestContext(ID)); trace::Span Tracer(Method); @@ -129,6 +132,7 @@ bool onReply(llvm::json::Value ID, llvm::Expected Result) override { + WithContext HandlerContext(handlerContext()); // We ignore replies, just log them. if (Result) log("<-- reply({0})", ID); @@ -259,6 +263,13 @@ if (It != RequestCancelers.end()) It->second.first(); // Invoke the canceler. 
} + + Context handlerContext() const { + return Context::current().derive( + kCurrentOffsetEncoding, + Server.NegotiatedOffsetEncoding.getValueOr(OffsetEncoding::UTF16)); + } + // We run cancelable requests in a context that does two things: // - allows cancellation using RequestCancelers[ID] // - cleans up the entry in RequestCancelers when it's no longer needed @@ -302,6 +313,20 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, Callback Reply) { + // Determine character encoding first as it affects constructed ClangdServer. + if (Params.capabilities.offsetEncoding && !NegotiatedOffsetEncoding) { + NegotiatedOffsetEncoding = OffsetEncoding::UTF16; // fallback + for (OffsetEncoding Supported : *Params.capabilities.offsetEncoding) + if (Supported != OffsetEncoding::UnsupportedEncoding) { + NegotiatedOffsetEncoding = Supported; + break; + } + } + llvm::Optional WithOffsetEncoding; + if (NegotiatedOffsetEncoding) + WithOffsetEncoding.emplace(kCurrentOffsetEncoding, + *NegotiatedOffsetEncoding); + if (Params.rootUri && *Params.rootUri) ClangdServerOpts.WorkspaceRoot = Params.rootUri->file(); else if (Params.rootPath && !Params.rootPath->empty()) @@ -331,7 +356,7 @@ SupportsHierarchicalDocumentSymbol = Params.capabilities.HierarchicalDocumentSymbol; SupportFileStatus = Params.initializationOptions.FileStatus; - Reply(llvm::json::Object{ + llvm::json::Object Result{ {{"capabilities", llvm::json::Object{ {"textDocumentSync", (int)TextDocumentSyncKind::Incremental}, @@ -369,7 +394,10 @@ ExecuteCommandParams::CLANGD_APPLY_TWEAK}}, }}, {"typeHierarchyProvider", true}, - }}}}); + }}}}; + if (NegotiatedOffsetEncoding) + Result["offsetEncoding"] = *NegotiatedOffsetEncoding; + Reply(std::move(Result)); } void ClangdLSPServer::onShutdown(const ShutdownParams &Params, @@ -875,19 +903,19 @@ std::move(Reply)); } -ClangdLSPServer::ClangdLSPServer(class Transport &Transp, - const FileSystemProvider &FSProvider, - const clangd::CodeCompleteOptions &CCOpts, - 
llvm::Optional CompileCommandsDir, - bool UseDirBasedCDB, - const ClangdServer::Options &Opts) +ClangdLSPServer::ClangdLSPServer( + class Transport &Transp, const FileSystemProvider &FSProvider, + const clangd::CodeCompleteOptions &CCOpts, + llvm::Optional CompileCommandsDir, bool UseDirBasedCDB, + llvm::Optional ForcedOffsetEncoding, + const ClangdServer::Options &Opts) : Transp(Transp), MsgHandler(new MessageHandler(*this)), FSProvider(FSProvider), CCOpts(CCOpts), SupportedSymbolKinds(defaultSymbolKinds()), SupportedCompletionItemKinds(defaultCompletionItemKinds()), UseDirBasedCDB(UseDirBasedCDB), - CompileCommandsDir(std::move(CompileCommandsDir)), - ClangdServerOpts(Opts) { + CompileCommandsDir(std::move(CompileCommandsDir)), ClangdServerOpts(Opts), + NegotiatedOffsetEncoding(ForcedOffsetEncoding) { // clang-format off MsgHandler->bind("initialize", &ClangdLSPServer::onInitialize); MsgHandler->bind("shutdown", &ClangdLSPServer::onShutdown); Index: clangd/Protocol.h =================================================================== --- clangd/Protocol.h +++ clangd/Protocol.h @@ -338,6 +338,18 @@ // https://github.com/Microsoft/language-server-protocol/issues/344 SymbolKind indexSymbolKindToSymbolKind(index::SymbolKind Kind); +// Determines the encoding used to measure offsets and lengths of source in LSP. +enum class OffsetEncoding { + // Any string is legal on the wire. Unrecognized encodings parse as this. + UnsupportedEncoding, + // Length counts code units of UTF-16 encoded text. (Standard LSP behavior). + UTF16, + // Length counts bytes of UTF-8 encoded text. (Clangd extension). + UTF8, +}; +llvm::json::Value toJSON(const OffsetEncoding &); +bool fromJSON(const llvm::json::Value &, OffsetEncoding &); + // This struct doesn't mirror LSP! // The protocol defines deeply nested structures for client capabilities. // Instead of mapping them all, this just parses out the bits we care about. 
@@ -369,6 +381,9 @@ /// Client supports CodeAction return value for textDocument/codeAction. /// textDocument.codeAction.codeActionLiteralSupport. bool CodeActionStructure = false; + + /// Supported encodings for LSP character offsets. (clangd extension). + llvm::Optional> offsetEncoding; }; bool fromJSON(const llvm::json::Value &, ClientCapabilities &); Index: clangd/Protocol.cpp =================================================================== --- clangd/Protocol.cpp +++ clangd/Protocol.cpp @@ -16,6 +16,7 @@ #include "clang/Basic/LLVM.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/JSON.h" @@ -311,6 +312,11 @@ } } } + if (auto *OffsetEncoding = O->get("offsetEncoding")) { + R.offsetEncoding.emplace(); + if (!fromJSON(*OffsetEncoding, *R.offsetEncoding)) + return false; + } return true; } @@ -932,5 +938,26 @@ return fromJSON(Params, Base); } +llvm::json::Value toJSON(const OffsetEncoding &OE) { + switch (OE) { + case OffsetEncoding::UTF8: + return "utf-8"; + case OffsetEncoding::UTF16: + return "utf-16"; + case OffsetEncoding::UnsupportedEncoding: + return "unknown"; + } +} +bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) { + auto Str = V.getAsString(); + if (!Str) + return false; + OE = llvm::StringSwitch(*Str) + .Case("utf-8", OffsetEncoding::UTF8) + .Case("utf-16", OffsetEncoding::UTF16) + .Default(OffsetEncoding::UnsupportedEncoding); + return true; +} + } // namespace clangd } // namespace clang Index: clangd/SourceCode.h =================================================================== --- clangd/SourceCode.h +++ clangd/SourceCode.h @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H +#include "Context.h" #include "Protocol.h" #include 
"clang/Basic/Diagnostic.h" #include "clang/Basic/LangOptions.h" @@ -34,8 +35,14 @@ FileDigest digest(StringRef Content); Optional digestFile(const SourceManager &SM, FileID FID); +// This context variable controls the behavior of functions in this file +// that convert between LSP offsets and native clang byte offsets. +// If not set, defaults to UTF-16 for backwards-compatibility. +extern Key kCurrentOffsetEncoding; + // Counts the number of UTF-16 code units needed to represent a string (LSP // specifies string lengths in UTF-16 code units). +// Use of UTF-16 may be overridden by kCurrentOffsetEncoding. size_t lspLength(StringRef Code); /// Turn a [line, column] pair into an offset in Code. Index: clangd/SourceCode.cpp =================================================================== --- clangd/SourceCode.cpp +++ clangd/SourceCode.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "SourceCode.h" +#include "Context.h" #include "Logger.h" +#include "Protocol.h" #include "clang/AST/ASTContext.h" #include "clang/Basic/SourceManager.h" #include "clang/Lex/Lexer.h" @@ -67,8 +69,23 @@ return std::min(Result, U8.size()); } +Key kCurrentOffsetEncoding; +static bool useUTF16ForLSP() { + auto *Enc = Context::current().get(kCurrentOffsetEncoding); + switch (Enc ? *Enc : OffsetEncoding::UTF16) { + case OffsetEncoding::UTF16: + return true; + case OffsetEncoding::UTF8: + return false; + case OffsetEncoding::UnsupportedEncoding: + llvm_unreachable("cannot use an unsupported encoding"); + } +} + // Like most strings in clangd, the input is UTF-8 encoded. size_t lspLength(llvm::StringRef Code) { + if (!useUTF16ForLSP()) + return Code.size(); // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. 
size_t Count = 0; @@ -98,14 +115,25 @@ llvm::errc::invalid_argument); StartOfLine = NextNL + 1; } + StringRef Line = + Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); - size_t NextNL = Code.find('\n', StartOfLine); - if (NextNL == llvm::StringRef::npos) - NextNL = Code.size(); - + if (!useUTF16ForLSP()) { + // Bounds-checking only. + if (P.character > int(Line.size())) { + if (AllowColumnsBeyondLineLength) + return StartOfLine + Line.size(); + else + return llvm::make_error( + llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character, + P.line), + llvm::errc::invalid_argument); + } + return StartOfLine + P.character; + } + // P.character is in UTF-16 code units, so we have to transcode. bool Valid; - size_t ByteOffsetInLine = measureUTF16( - Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid); + size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid); if (!Valid && !AllowColumnsBeyondLineLength) return llvm::make_error( llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character, Index: clangd/index/IndexAction.cpp =================================================================== --- clangd/index/IndexAction.cpp +++ clangd/index/IndexAction.cpp @@ -9,7 +9,6 @@ #include "IndexAction.h" #include "index/SymbolOrigin.h" #include "clang/Frontend/CompilerInstance.h" -#include "clang/Index/IndexDataConsumer.h" #include "clang/Index/IndexingAction.h" #include "clang/Tooling/Tooling.h" Index: clangd/index/SymbolLocation.h =================================================================== --- clangd/index/SymbolLocation.h +++ clangd/index/SymbolLocation.h @@ -20,6 +20,13 @@ // Specify a position (Line, Column) of symbol. Using Line/Column allows us to // build LSP responses without reading the file content. // + // clangd uses the following definitions, which differ slightly from LSP: + // - Line is the number of newline characters (\n) before the point. 
+ // - Column is (by default) the number of UTF-16 code units between the last \n + // (or start of file) and the point. + // If the `offsetEncoding` protocol extension is used to negotiate UTF-8, + // then it is instead the number of *bytes* since the last \n. + // // Position is encoded into 32 bits to save space. // If Line/Column overflow, the value will be their maximum value. struct Position { @@ -37,8 +44,7 @@ static constexpr uint32_t MaxColumn = (1 << 12) - 1; private: - uint32_t Line : 20; // 0-based - // Using UTF-16 code units. + uint32_t Line : 20; // 0-based uint32_t Column : 12; // 0-based }; Index: clangd/tool/ClangdMain.cpp =================================================================== --- clangd/tool/ClangdMain.cpp +++ clangd/tool/ClangdMain.cpp @@ -9,10 +9,12 @@ #include "Features.inc" #include "ClangdLSPServer.h" #include "Path.h" +#include "Protocol.h" #include "Trace.h" #include "Transport.h" #include "index/Serialization.h" #include "clang/Basic/Version.h" +#include "llvm/ADT/Optional.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -219,6 +221,16 @@ "includes using index."), llvm::cl::init(true)); +static llvm::cl::opt ForceOffsetEncoding( + "offset-encoding", + llvm::cl::desc("Force the offsetEncoding used for character positions. " + "This bypasses negotiation via client capabilities."), + llvm::cl::values(clEnumValN(OffsetEncoding::UTF8, "utf-8", + "Offsets are in UTF-8 bytes"), + clEnumValN(OffsetEncoding::UTF16, "utf-16", + "Offsets are in UTF-16 code units")), + llvm::cl::init(OffsetEncoding::UnsupportedEncoding)); + namespace { /// \brief Supports a test URI scheme with relaxed constraints for lit tests. 
@@ -458,9 +470,13 @@ } Opts.ClangTidyOptProvider = ClangTidyOptProvider.get(); Opts.SuggestMissingIncludes = SuggestMissingIncludes; + llvm::Optional OffsetEncodingFromFlag; + if (ForceOffsetEncoding != OffsetEncoding::UnsupportedEncoding) + OffsetEncodingFromFlag = ForceOffsetEncoding; ClangdLSPServer LSPServer( *TransportLayer, FSProvider, CCOpts, CompileCommandsDirPath, - /*UseDirBasedCDB=*/CompileArgsFrom == FilesystemCompileArgs, Opts); + /*UseDirBasedCDB=*/CompileArgsFrom == FilesystemCompileArgs, + OffsetEncodingFromFlag, Opts); llvm::set_thread_name("clangd.main"); return LSPServer.run() ? 0 : static_cast(ErrorResultCode::NoShutdownRequest); Index: test/clangd/utf8.test =================================================================== --- test/clangd/utf8.test +++ test/clangd/utf8.test @@ -0,0 +1,32 @@ +# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s +# This test verifies that we can negotiate UTF-8 offsets via protocol extension. +{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{"offsetEncoding":["utf-8","utf-16"]},"trace":"off"}} +# CHECK: "offsetEncoding": "utf-8" +--- +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"/*ö*/int x;\nint y=x;"}}} +--- +{"jsonrpc":"2.0","id":1,"method":"textDocument/definition","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":1,"character":6}}} +# /*ö*/int x; +# 01234567890 +# x is character (and utf-16) range [9,10) but byte range [10,11). 
+# CHECK: "id": 1, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": [ +# CHECK-NEXT: { +# CHECK-NEXT: "range": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 11, +# CHECK-NEXT: "line": 0 +# CHECK-NEXT: }, +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 10, +# CHECK-NEXT: "line": 0 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "uri": "file://{{.*}}/main.cpp" +# CHECK-NEXT: } +# CHECK-NEXT: ] +--- +{"jsonrpc":"2.0","id":10000,"method":"shutdown"} +--- +{"jsonrpc":"2.0","method":"exit"} Index: unittests/clangd/SourceCodeTests.cpp =================================================================== --- unittests/clangd/SourceCodeTests.cpp +++ unittests/clangd/SourceCodeTests.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// #include "Annotations.h" +#include "Context.h" +#include "Protocol.h" #include "SourceCode.h" #include "llvm/Support/Error.h" #include "llvm/Support/raw_os_ostream.h" @@ -21,14 +23,9 @@ using llvm::HasValue; MATCHER_P2(Pos, Line, Col, "") { - return arg.line == Line && arg.character == Col; + return arg.line == int(Line) && arg.character == int(Col); } -// The = → 👆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes). -const char File[] = R"(0:0 = 0 -1:0 → 8 -2:0 👆 18)"; - /// A helper to make tests easier to read. Position position(int line, int character) { Position Pos; @@ -52,8 +49,28 @@ EXPECT_EQ(lspLength("¥"), 1UL); // astral EXPECT_EQ(lspLength("😂"), 2UL); + + WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8); + EXPECT_EQ(lspLength(""), 0UL); + EXPECT_EQ(lspLength("ascii"), 5UL); + // BMP + EXPECT_EQ(lspLength("↓"), 3UL); + EXPECT_EQ(lspLength("¥"), 2UL); + // astral + EXPECT_EQ(lspLength("😂"), 4UL); } +// The = → 👆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes). 
+const char File[] = R"(0:0 = 0 +1:0 → 8 +2:0 👆 18)"; +struct Line { + unsigned Number; + unsigned Offset; + unsigned Length; +}; +Line FileLines[] = {Line{0, 0, 7}, Line{1, 8, 9}, Line{2, 18, 11}}; + TEST(SourceCodeTests, PositionToOffset) { // line out of bounds EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed()); @@ -113,6 +130,23 @@ // line out of bounds EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed()); EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed()); + + // Test UTF-8, where transformations are trivial. + WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8); + EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed()); + EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed()); + for (Line L : FileLines) { + EXPECT_THAT_EXPECTED(positionToOffset(File, position(L.Number, -1)), + llvm::Failed()); // out of range + for (unsigned I = 0; I <= L.Length; ++I) + EXPECT_THAT_EXPECTED(positionToOffset(File, position(L.Number, I)), + llvm::HasValue(L.Offset + I)); + EXPECT_THAT_EXPECTED(positionToOffset(File, position(L.Number, L.Length+1)), + llvm::HasValue(L.Offset + L.Length)); + EXPECT_THAT_EXPECTED( + positionToOffset(File, position(L.Number, L.Length + 1), false), + llvm::Failed()); // out of range + } } TEST(SourceCodeTests, OffsetToPosition) { @@ -134,6 +168,13 @@ EXPECT_THAT(offsetToPosition(File, 28), Pos(2, 8)) << "end of last line"; EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 9)) << "EOF"; EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 9)) << "out of bounds"; + + WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8); + for (Line L : FileLines) { + for (unsigned I = 0; I <= L.Length; ++I) + EXPECT_THAT(offsetToPosition(File, L.Offset + I), Pos(L.Number, I)); + } + EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 11)) << "out of bounds"; } TEST(SourceCodeTests, IsRangeConsecutive) {