diff --git a/clang-tools-extra/clangd/CodeCompletionStrings.cpp b/clang-tools-extra/clangd/CodeCompletionStrings.cpp --- a/clang-tools-extra/clangd/CodeCompletionStrings.cpp +++ b/clang-tools-extra/clangd/CodeCompletionStrings.cpp @@ -12,6 +12,7 @@ #include "clang/AST/RawCommentList.h" #include "clang/Basic/SourceManager.h" #include "clang/Sema/CodeCompleteConsumer.h" +#include "llvm/Support/JSON.h" #include #include @@ -86,7 +87,12 @@ assert(!Ctx.getSourceManager().isLoadedSourceLocation(RC->getBeginLoc())); std::string Doc = RC->getFormattedText(Ctx.getSourceManager(), Ctx.getDiagnostics()); - return looksLikeDocComment(Doc) ? Doc : ""; + if (!looksLikeDocComment(Doc)) + return ""; + // Clang requires source to be UTF-8, but doesn't enforce this in comments. + if (!llvm::json::isUTF8(Doc)) + Doc = llvm::json::fixUTF8(Doc); + return Doc; } void getSignature(const CodeCompletionString &CCS, std::string *Signature, diff --git a/clang-tools-extra/clangd/unittests/CodeCompletionStringsTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompletionStringsTests.cpp --- a/clang-tools-extra/clangd/unittests/CodeCompletionStringsTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompletionStringsTests.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "CodeCompletionStrings.h" +#include "TestTU.h" #include "clang/Sema/CodeCompleteConsumer.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -56,6 +57,14 @@ "Annotation: Ano\n\nIs this brief?"); } +TEST_F(CompletionStringTest, GetDeclCommentBadUTF8) { + // <ff> is not a valid byte here, should be replaced by encoded <U+FFFD>. 
+ auto TU = TestTU::withCode("/*x\xffy*/ struct X;"); + auto AST = TU.build(); + EXPECT_EQ("x\xef\xbf\xbdy", + getDeclComment(AST.getASTContext(), findDecl(AST, "X"))); +} + TEST_F(CompletionStringTest, MultipleAnnotations) { Builder.AddAnnotation("Ano1"); Builder.AddAnnotation("Ano2"); diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp --- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp +++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp @@ -1606,11 +1606,11 @@ // Extracted from boost/spirit/home/support/char_encoding/iso8859_1.hpp // This looks like UTF-8 and fools clang, but has high-ISO-8859-1 comments. const char *Header = "int PUNCT = 0;\n" - "int types[] = { /* \xa1 */PUNCT };"; + "/* \xa1 */ int types[] = { /* \xa1 */PUNCT };"; CollectorOpts.RefFilter = RefKind::All; CollectorOpts.RefsInHeaders = true; runSymbolCollector(Header, ""); - EXPECT_THAT(Symbols, Contains(QName("types"))); + EXPECT_THAT(Symbols, Contains(AllOf(QName("types"), Doc("\xef\xbf\xbd ")))); EXPECT_THAT(Symbols, Contains(QName("PUNCT"))); // Reference is stored, although offset within line is not reliable. EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "PUNCT").ID, _)));