Index: clangd/ClangdLSPServer.h
===================================================================
--- clangd/ClangdLSPServer.h
+++ clangd/ClangdLSPServer.h
@@ -15,6 +15,7 @@
 #include "GlobalCompilationDatabase.h"
 #include "Path.h"
 #include "Protocol.h"
+#include "SourceCode.h"
 #include "Transport.h"
 #include "clang/Tooling/Core/Replacement.h"
 #include "llvm/ADT/Optional.h"
@@ -159,6 +160,9 @@
   // It is destroyed before run() returns, to ensure worker threads exit.
   ClangdServer::Options ClangdServerOpts;
   llvm::Optional<ClangdServer> Server;
+  // Offset encoding used for LSP positions. onInitialize() fills this in when
+  // the client advertises "offsetEncoding" and no value has been set yet.
+  llvm::Optional<OffsetEncoding> NegotiatedOffsetEncoding;
 };
 } // namespace clangd
 } // namespace clang
Index: clangd/ClangdLSPServer.cpp
===================================================================
--- clangd/ClangdLSPServer.cpp
+++ clangd/ClangdLSPServer.cpp
@@ -13,6 +13,7 @@
 #include "Trace.h"
 #include "URI.h"
 #include "clang/Tooling/Core/Replacement.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
@@ -93,6 +94,7 @@
   MessageHandler(ClangdLSPServer &Server) : Server(Server) {}
 
   bool onNotify(llvm::StringRef Method, llvm::json::Value Params) override {
+    WithContext HandlerContext(handlerContext());
     log("<-- {0}", Method);
     if (Method == "exit")
       return false;
@@ -109,6 +111,7 @@
 
   bool onCall(llvm::StringRef Method, llvm::json::Value Params,
               llvm::json::Value ID) override {
+    WithContext HandlerContext(handlerContext());
     // Calls can be canceled by the client. Add cancellation context.
     WithContext WithCancel(cancelableRequestContext(ID));
     trace::Span Tracer(Method);
@@ -129,6 +132,7 @@
 
   bool onReply(llvm::json::Value ID,
                llvm::Expected<llvm::json::Value> Result) override {
+    WithContext HandlerContext(handlerContext());
     // We ignore replies, just log them.
     if (Result)
       log("<-- reply({0})", ID);
@@ -259,6 +263,16 @@
     if (It != RequestCancelers.end())
       It->second.first(); // Invoke the canceler.
   }
+
+  // Context in which each incoming message is handled: the current context
+  // with kCurrentOffsetEncoding set to the negotiated encoding, or to UTF-16
+  // if none was negotiated.
+  Context handlerContext() const {
+    return Context::current().derive(
+        kCurrentOffsetEncoding,
+        Server.NegotiatedOffsetEncoding.getValueOr(OffsetEncoding::UTF16));
+  }
+
   // We run cancelable requests in a context that does two things:
   // - allows cancellation using RequestCancelers[ID]
   // - cleans up the entry in RequestCancelers when it's no longer needed
@@ -302,6 +316,20 @@
 
 void ClangdLSPServer::onInitialize(const InitializeParams &Params,
                                    Callback<llvm::json::Value> Reply) {
+  // Determine character encoding first as it affects constructed ClangdServer.
+  if (Params.capabilities.offsetEncoding && !NegotiatedOffsetEncoding) {
+    NegotiatedOffsetEncoding = OffsetEncoding::UTF16; // fallback
+    for (OffsetEncoding Supported : *Params.capabilities.offsetEncoding)
+      if (Supported != OffsetEncoding::UnsupportedEncoding) {
+        NegotiatedOffsetEncoding = Supported;
+        break;
+      }
+  }
+  llvm::Optional<WithContextValue> WithOffsetEncoding;
+  if (NegotiatedOffsetEncoding)
+    WithOffsetEncoding.emplace(kCurrentOffsetEncoding,
+                               *NegotiatedOffsetEncoding);
+
   if (Params.rootUri && *Params.rootUri)
     ClangdServerOpts.WorkspaceRoot = Params.rootUri->file();
   else if (Params.rootPath && !Params.rootPath->empty())
@@ -331,7 +359,7 @@
   SupportsHierarchicalDocumentSymbol =
       Params.capabilities.HierarchicalDocumentSymbol;
   SupportFileStatus = Params.initializationOptions.FileStatus;
-  Reply(llvm::json::Object{
+  llvm::json::Object Result{
       {{"capabilities",
         llvm::json::Object{
             {"textDocumentSync", (int)TextDocumentSyncKind::Incremental},
@@ -368,7 +396,10 @@
                  {ExecuteCommandParams::CLANGD_APPLY_FIX_COMMAND,
                   ExecuteCommandParams::CLANGD_APPLY_TWEAK}},
              }},
-        }}}});
+        }}}};
+  if (NegotiatedOffsetEncoding)
+    Result["offsetEncoding"] = *NegotiatedOffsetEncoding;
+  Reply(std::move(Result));
 }
 
 void ClangdLSPServer::onShutdown(const ShutdownParams &Params,
Index: clangd/Protocol.h
===================================================================
--- clangd/Protocol.h
+++ clangd/Protocol.h
@@ -331,6 +331,17 @@
 SymbolKind adjustKindToCapability(SymbolKind Kind,
                                   SymbolKindBitset &supportedSymbolKinds);
 
+// Determines the encoding used to measure offsets and lengths of source in LSP.
+enum class OffsetEncoding {
+  UnsupportedEncoding,
+  // Length counts code units of UTF-16 encoded text. (Standard LSP behavior).
+  UTF16,
+  // Length counts bytes of UTF-8 encoded text. (Clangd extension).
+  UTF8,
+};
+llvm::json::Value toJSON(const OffsetEncoding &);
+bool fromJSON(const llvm::json::Value &, OffsetEncoding &);
+
 // This struct doesn't mirror LSP!
 // The protocol defines deeply nested structures for client capabilities.
 // Instead of mapping them all, this just parses out the bits we care about.
@@ -362,6 +373,9 @@
   /// Client supports CodeAction return value for textDocument/codeAction.
   /// textDocument.codeAction.codeActionLiteralSupport.
   bool CodeActionStructure = false;
+
+  /// Supported encodings for LSP character offsets. (clangd extension).
+  llvm::Optional<std::vector<OffsetEncoding>> offsetEncoding;
 };
 bool fromJSON(const llvm::json::Value &, ClientCapabilities &);
 
Index: clangd/Protocol.cpp
===================================================================
--- clangd/Protocol.cpp
+++ clangd/Protocol.cpp
@@ -12,11 +12,13 @@
 
 #include "Protocol.h"
 #include "Logger.h"
+#include "SourceCode.h"
 #include "URI.h"
 #include "index/Index.h"
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/JSON.h"
@@ -257,6 +259,11 @@
       }
     }
   }
+  if (auto *OffsetEncoding = O->get("offsetEncoding")) {
+    R.offsetEncoding.emplace();
+    if (!fromJSON(*OffsetEncoding, *R.offsetEncoding))
+      return false;
+  }
   return true;
 }
 
@@ -818,5 +825,32 @@
   return fromJSON(Params, Base);
 }
 
+// Serializes an OffsetEncoding as its wire name ("utf-8" or "utf-16").
+llvm::json::Value toJSON(const OffsetEncoding &OE) {
+  switch (OE) {
+  case OffsetEncoding::UTF8:
+    return "utf-8";
+  case OffsetEncoding::UTF16:
+    return "utf-16";
+  case OffsetEncoding::UnsupportedEncoding:
+    break;
+  }
+  // Reached for UnsupportedEncoding and for out-of-range enum values, so that
+  // control never falls off the end of this value-returning function.
+  return "unknown";
+}
+// Parses an encoding name. Returns false only when V is not a JSON string;
+// strings other than "utf-8"/"utf-16" yield UnsupportedEncoding.
+bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) {
+  auto Str = V.getAsString();
+  if (!Str)
+    return false;
+  OE = llvm::StringSwitch<OffsetEncoding>(*Str)
+           .Case("utf-8", OffsetEncoding::UTF8)
+           .Case("utf-16", OffsetEncoding::UTF16)
+           .Default(OffsetEncoding::UnsupportedEncoding);
+  return true;
+}
+
 } // namespace clangd
 } // namespace clang
Index: clangd/SourceCode.h
===================================================================
--- clangd/SourceCode.h
+++ clangd/SourceCode.h
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H
+#include "Context.h"
 #include "Protocol.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LangOptions.h"
@@ -34,8 +35,14 @@
 FileDigest digest(StringRef Content);
 Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID);
 
+// This context variable controls the behavior of functions in this file
+// that convert between LSP offsets and native clang byte offsets.
+// If not set, defaults to UTF-16 for backwards-compatibility.
+extern Key<OffsetEncoding> kCurrentOffsetEncoding;
+
 // Counts the number of UTF-16 code units needed to represent a string (LSP
 // specifies string lengths in UTF-16 code units).
+// Use of UTF-16 may be overridden by kCurrentOffsetEncoding.
 size_t lspLength(StringRef Code);
 
 /// Turn a [line, column] pair into an offset in Code.
Index: clangd/SourceCode.cpp
===================================================================
--- clangd/SourceCode.cpp
+++ clangd/SourceCode.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 #include "SourceCode.h"
 
+#include "Context.h"
 #include "Logger.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/Basic/SourceManager.h"
@@ -20,6 +21,8 @@
 namespace clang {
 namespace clangd {
 
+Key<OffsetEncoding> kCurrentOffsetEncoding;
+
 // Here be dragons. LSP positions use columns measured in *UTF-16 code units*!
 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial.
 
@@ -67,8 +70,19 @@
   return std::min(Result, U8.size());
 }
 
+// Returns true only when the current Context carries kCurrentOffsetEncoding
+// and its value is UTF8; an unset key therefore selects the UTF-16 path.
+static bool useUTF8ForLSP() {
+  if (auto *Enc = Context::current().get(kCurrentOffsetEncoding))
+    if (*Enc == OffsetEncoding::UTF8)
+      return true;
+  return false;
+}
+
 // Like most strings in clangd, the input is UTF-8 encoded.
 size_t lspLength(llvm::StringRef Code) {
+  if (useUTF8ForLSP())
+    return Code.size();
   // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
   // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
   size_t Count = 0;
@@ -98,14 +112,24 @@
           llvm::errc::invalid_argument);
     StartOfLine = NextNL + 1;
   }
-
-  size_t NextNL = Code.find('\n', StartOfLine);
-  if (NextNL == llvm::StringRef::npos)
-    NextNL = Code.size();
-
+  StringRef Line =
+      Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
+
+  // If the encoding is UTF-8, we do bounds-checking only.
+  if (useUTF8ForLSP()) {
+    if (P.character > int(Line.size())) {
+      if (AllowColumnsBeyondLineLength)
+        return StartOfLine + Line.size();
+      return llvm::make_error<llvm::StringError>(
+          llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character,
+                        P.line),
+          llvm::errc::invalid_argument);
+    }
+    return StartOfLine + P.character;
+  }
+  // Otherwise P.character is in UTF-16 code units, so we have to transcode.
   bool Valid;
-  size_t ByteOffsetInLine = measureUTF16(
-      Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid);
+  size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid);
   if (!Valid && !AllowColumnsBeyondLineLength)
     return llvm::make_error<llvm::StringError>(
         llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
Index: test/clangd/utf8.test
===================================================================
--- /dev/null
+++ test/clangd/utf8.test
@@ -0,0 +1,32 @@
+# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s
+# This test verifies that we can negotiate UTF-8 offsets via protocol extension.
+{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{"offsetEncoding":["utf-8","utf-16"]},"trace":"off"}}
+# CHECK: "offsetEncoding": "utf-8"
+---
+{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"/*ö*/int x;\nint y=x;"}}}
+---
+{"jsonrpc":"2.0","id":1,"method":"textDocument/definition","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":1,"character":6}}}
+# /*ö*/int x;
+# 01234567890
+# x is character (and utf-16) range [9,10) but byte range [10,11).
+#      CHECK:  "id": 1,
+# CHECK-NEXT:  "jsonrpc": "2.0",
+# CHECK-NEXT:  "result": [
+# CHECK-NEXT:    {
+# CHECK-NEXT:      "range": {
+# CHECK-NEXT:        "end": {
+# CHECK-NEXT:          "character": 11,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        },
+# CHECK-NEXT:        "start": {
+# CHECK-NEXT:          "character": 10,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        }
+# CHECK-NEXT:      },
+# CHECK-NEXT:      "uri": "file://{{.*}}/main.cpp"
+# CHECK-NEXT:    }
+# CHECK-NEXT:  ]
+---
+{"jsonrpc":"2.0","id":10000,"method":"shutdown"}
+---
+{"jsonrpc":"2.0","method":"exit"}