Index: include/clang/AST/CommentLexer.h =================================================================== --- include/clang/AST/CommentLexer.h +++ include/clang/AST/CommentLexer.h @@ -281,6 +281,11 @@ /// command, including command marker. SmallString<16> VerbatimBlockEndCommandName; + /// If true, the commands, html tags, etc will be parsed and reported as + /// separate tokens inside the comment body. If false, the comment text will + /// be parsed into text and newline tokens. + bool ParseCommands; + /// Given a character reference name (e.g., "lt"), return the character that /// it stands for (e.g., "<"). StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; @@ -315,12 +320,11 @@ /// Eat string matching regexp \code \s*\* \endcode. void skipLineStartingDecorations(); - /// Lex stuff inside comments. CommentEnd should be set correctly. + /// Lex comment text, including commands if ParseCommands is set to true. void lexCommentText(Token &T); - void setupAndLexVerbatimBlock(Token &T, - const char *TextBegin, - char Marker, const CommandInfo *Info); + void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, + const CommandInfo *Info); void lexVerbatimBlockFirstLine(Token &T); @@ -343,14 +347,13 @@ public: Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, - const CommandTraits &Traits, - SourceLocation FileLoc, - const char *BufferStart, const char *BufferEnd); + const CommandTraits &Traits, SourceLocation FileLoc, + const char *BufferStart, const char *BufferEnd, + bool ParseCommands = true); void lex(Token &T); - StringRef getSpelling(const Token &Tok, - const SourceManager &SourceMgr, + StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr, bool *Invalid = nullptr) const; }; Index: include/clang/AST/RawCommentList.h =================================================================== --- include/clang/AST/RawCommentList.h +++ include/clang/AST/RawCommentList.h @@ -111,6 +111,33 @@ return extractBriefText(Context); } + /// Returns sanitized comment text, suitable for presentation in editor UIs. + /// E.g. will transform: + /// // This is a long multiline comment. + /// // Parts of it might be indented. + /// /* The comments styles might be mixed. */ + /// into + /// "This is a long multiline comment.\n" + /// " Parts of it might be indented.\n" + /// "The comments styles might be mixed." + /// Also removes leading indentation and sanitizes some common cases: + /// /* This is a first line. + /// * This is a second line. It is indented. + /// * This is a third line. */ + /// and + /// /* This is a first line. + /// This is a second line. It is indented. + /// This is a third line. */ + /// will both turn into: + /// "This is a first line.\n" + /// " This is a second line. It is indented.\n" + /// "This is a third line." + std::string getFormattedText(const ASTContext &Ctx) const; + /// An overload with more fine-grained parameters. Only for unit tests, use + /// the overload with ASTContext in the rest of the code. + std::string getFormattedText(const SourceManager &SourceMgr, + DiagnosticsEngine &Diags) const; + /// Parse the comment, assuming it is attached to decl \c D. comments::FullComment *parse(const ASTContext &Context, const Preprocessor *PP, const Decl *D) const; Index: lib/AST/CommentLexer.cpp =================================================================== --- lib/AST/CommentLexer.cpp +++ lib/AST/CommentLexer.cpp @@ -294,6 +294,39 @@ assert(CommentState == LCS_InsideBCPLComment || CommentState == LCS_InsideCComment); + // Handles lexing non-command text, i.e. text and newline. + auto HandleNonCommandToken = [&]() -> void { + assert(State == LS_Normal); + + const char *TokenPtr = BufferPtr; + assert(TokenPtr < CommentEnd); + switch (*TokenPtr) { + case '\n': + case '\r': + TokenPtr = skipNewline(TokenPtr, CommentEnd); + formTokenWithChars(T, TokenPtr, tok::newline); + + if (CommentState == LCS_InsideCComment) + skipLineStartingDecorations(); + return; + + default: { + StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r"; + size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr) + .find_first_of(TokStartSymbols); + if (End != StringRef::npos) + TokenPtr += End; + else + TokenPtr = CommentEnd; + formTextToken(T, TokenPtr); + return; + } + } + }; + + if (!ParseCommands) + return HandleNonCommandToken(); + switch (State) { case LS_Normal: break; @@ -315,136 +348,116 @@ } assert(State == LS_Normal); - const char *TokenPtr = BufferPtr; assert(TokenPtr < CommentEnd); - while (TokenPtr != CommentEnd) { - switch(*TokenPtr) { - case '\\': - case '@': { - // Commands that start with a backslash and commands that start with - // 'at' have equivalent semantics. But we keep information about the - // exact syntax in AST for comments. - tok::TokenKind CommandKind = - (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; + switch(*TokenPtr) { + case '\\': + case '@': { + // Commands that start with a backslash and commands that start with + // 'at' have equivalent semantics. But we keep information about the + // exact syntax in AST for comments. + tok::TokenKind CommandKind = + (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return; + } + char C = *TokenPtr; + switch (C) { + default: + break; + + case '\\': case '@': case '&': case '$': + case '#': case '<': case '>': case '%': + case '\"': case '.': case ':': + // This is one of \\ \@ \& \$ etc escape sequences. TokenPtr++; - if (TokenPtr == CommentEnd) { - formTextToken(T, TokenPtr); - return; - } - char C = *TokenPtr; - switch (C) { - default: - break; - - case '\\': case '@': case '&': case '$': - case '#': case '<': case '>': case '%': - case '\"': case '.': case ':': - // This is one of \\ \@ \& \$ etc escape sequences. + if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { + // This is the \:: escape sequence. TokenPtr++; - if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { - // This is the \:: escape sequence. - TokenPtr++; - } - StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(UnescapedText); - return; } + StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(UnescapedText); + return; + } - // Don't make zero-length commands. - if (!isCommandNameStartCharacter(*TokenPtr)) { - formTextToken(T, TokenPtr); - return; - } + // Don't make zero-length commands. + if (!isCommandNameStartCharacter(*TokenPtr)) { + formTextToken(T, TokenPtr); + return; + } - TokenPtr = skipCommandName(TokenPtr, CommentEnd); - unsigned Length = TokenPtr - (BufferPtr + 1); - - // Hardcoded support for lexing LaTeX formula commands - // \f$ \f[ \f] \f{ \f} as a single command. - if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { - C = *TokenPtr; - if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { - TokenPtr++; - Length++; - } - } + TokenPtr = skipCommandName(TokenPtr, CommentEnd); + unsigned Length = TokenPtr - (BufferPtr + 1); - StringRef CommandName(BufferPtr + 1, Length); - - const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); - if (!Info) { - if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { - StringRef CorrectedName = Info->Name; - SourceLocation Loc = getSourceLocation(BufferPtr); - SourceLocation EndLoc = getSourceLocation(TokenPtr); - SourceRange FullRange = SourceRange(Loc, EndLoc); - SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); - Diag(Loc, diag::warn_correct_comment_command_name) - << FullRange << CommandName << CorrectedName - << FixItHint::CreateReplacement(CommandRange, CorrectedName); - } else { - formTokenWithChars(T, TokenPtr, tok::unknown_command); - T.setUnknownCommandName(CommandName); - Diag(T.getLocation(), diag::warn_unknown_comment_command_name) - << SourceRange(T.getLocation(), T.getEndLocation()); - return; - } - } - if (Info->IsVerbatimBlockCommand) { - setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); - return; - } - if (Info->IsVerbatimLineCommand) { - setupAndLexVerbatimLine(T, TokenPtr, Info); - return; + // Hardcoded support for lexing LaTeX formula commands + // \f$ \f[ \f] \f{ \f} as a single command. + if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { + C = *TokenPtr; + if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { + TokenPtr++; + Length++; } - formTokenWithChars(T, TokenPtr, CommandKind); - T.setCommandID(Info->getID()); - return; } - case '&': - lexHTMLCharacterReference(T); - return; - - case '<': { - TokenPtr++; - if (TokenPtr == CommentEnd) { - formTextToken(T, TokenPtr); + StringRef CommandName(BufferPtr + 1, Length); + + const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); + if (!Info) { + if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { + StringRef CorrectedName = Info->Name; + SourceLocation Loc = getSourceLocation(BufferPtr); + SourceLocation EndLoc = getSourceLocation(TokenPtr); + SourceRange FullRange = SourceRange(Loc, EndLoc); + SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); + Diag(Loc, diag::warn_correct_comment_command_name) + << FullRange << CommandName << CorrectedName + << FixItHint::CreateReplacement(CommandRange, CorrectedName); + } else { + formTokenWithChars(T, TokenPtr, tok::unknown_command); + T.setUnknownCommandName(CommandName); + Diag(T.getLocation(), diag::warn_unknown_comment_command_name) + << SourceRange(T.getLocation(), T.getEndLocation()); return; } - const char C = *TokenPtr; - if (isHTMLIdentifierStartingCharacter(C)) - setupAndLexHTMLStartTag(T); - else if (C == '/') - setupAndLexHTMLEndTag(T); - else - formTextToken(T, TokenPtr); + } + if (Info->IsVerbatimBlockCommand) { + setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); return; } - - case '\n': - case '\r': - TokenPtr = skipNewline(TokenPtr, CommentEnd); - formTokenWithChars(T, TokenPtr, tok::newline); - - if (CommentState == LCS_InsideCComment) - skipLineStartingDecorations(); + if (Info->IsVerbatimLineCommand) { + setupAndLexVerbatimLine(T, TokenPtr, Info); return; + } + formTokenWithChars(T, TokenPtr, CommandKind); + T.setCommandID(Info->getID()); + return; + } - default: { - size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). - find_first_of("\n\r\\@&<"); - if (End != StringRef::npos) - TokenPtr += End; - else - TokenPtr = CommentEnd; + case '&': + lexHTMLCharacterReference(T); + return; + + case '<': { + TokenPtr++; + if (TokenPtr == CommentEnd) { formTextToken(T, TokenPtr); return; } + const char C = *TokenPtr; + if (isHTMLIdentifierStartingCharacter(C)) + setupAndLexHTMLStartTag(T); + else if (C == '/') + setupAndLexHTMLEndTag(T); + else + formTextToken(T, TokenPtr); + return; } + + default: + return HandleNonCommandToken(); } } @@ -727,14 +740,13 @@ } Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, - const CommandTraits &Traits, - SourceLocation FileLoc, - const char *BufferStart, const char *BufferEnd): - Allocator(Allocator), Diags(Diags), Traits(Traits), - BufferStart(BufferStart), BufferEnd(BufferEnd), - FileLoc(FileLoc), BufferPtr(BufferStart), - CommentState(LCS_BeforeComment), State(LS_Normal) { -} + const CommandTraits &Traits, SourceLocation FileLoc, + const char *BufferStart, const char *BufferEnd, + bool ParseCommands) + : Allocator(Allocator), Diags(Diags), Traits(Traits), + BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc), + BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal), + ParseCommands(ParseCommands) {} void Lexer::lex(Token &T) { again: Index: lib/AST/RawCommentList.cpp =================================================================== --- lib/AST/RawCommentList.cpp +++ lib/AST/RawCommentList.cpp @@ -335,3 +335,98 @@ BeforeThanCompare(SourceMgr)); std::swap(Comments, MergedComments); } + +std::string RawComment::getFormattedText(const ASTContext &Ctx) const { + return getFormattedText(Ctx.getSourceManager(), Ctx.getDiagnostics()); +} + +std::string RawComment::getFormattedText(const SourceManager &SourceMgr, + DiagnosticsEngine &Diags) const { + llvm::StringRef CommentText = getRawText(SourceMgr); + if (CommentText.empty()) + return ""; + + llvm::BumpPtrAllocator Allocator; + // We do not parse any commands, so CommentOptions are ignored by + // comments::Lexer. Therefore, we just use default-constructed options. + CommentOptions DefOpts; + comments::CommandTraits EmptyTraits(Allocator, DefOpts); + comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(), + CommentText.begin(), CommentText.end(), + /*ParseCommands=*/false); + + std::string Result; + // A column number of the first non-whitespace token in the comment text. + // We skip whitespace up to this column, but keep the whitespace after this + // column. IndentColumn is calculated when lexing the first line and reused + // for the rest of lines. + unsigned IndentColumn = 0; + + // Processes one line of the comment and adds it to the result. + // Handles skipping the indent at the start of the line. + // Returns false when eof is reached and true otherwise. + auto LexLine = [&](bool IsFirstLine) -> bool { + comments::Token Tok; + // Lex the first token on the line. We handle it separately, because we to + // fix up its indentation. + L.lex(Tok); + if (Tok.is(comments::tok::eof)) + return false; + if (Tok.is(comments::tok::newline)) { + Result += "\n"; + return true; + } + llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr); + bool LocInvalid = false; + unsigned TokColumn = + SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid); + assert(!LocInvalid && "getFormattedText for invalid location"); + + // Amount of leading whitespace in TokText. + size_t WhitespaceLen = TokText.find_first_not_of(" \t"); + if (WhitespaceLen == StringRef::npos) + WhitespaceLen = TokText.size(); + // Remember the amount of whitespace we skipped in the first line to remove + // indent up to that column in the following lines. + if (IsFirstLine) + IndentColumn = TokColumn + WhitespaceLen; + + // Amount of leading whitespace we actually want to skip. + // For the first line we skip all the whitespace. + // For the rest of the lines, we skip whitespace up to IndentColumn. + unsigned SkipLen = + IsFirstLine + ? WhitespaceLen + : std::min( + WhitespaceLen, + std::max(static_cast(IndentColumn) - TokColumn, 0)); + llvm::StringRef Trimmed = TokText.drop_front(SkipLen); + Result += Trimmed; + // Lex all tokens in the rest of the line. + for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) { + if (Tok.is(comments::tok::newline)) { + Result += "\n"; + return true; + } + Result += L.getSpelling(Tok, SourceMgr); + } + // We've reached the end of file token. + return false; + }; + + auto DropTrailingNewLines = [](std::string &Str) { + while (Str.back() == '\n') + Str.pop_back(); + }; + + // Proces first line separately to remember indent for the following lines. + if (!LexLine(/*IsFirstLine=*/true)) { + DropTrailingNewLines(Result); + return Result; + } + // Process the rest of the lines. + while (LexLine(/*IsFirstLine=*/false)) + ; + DropTrailingNewLines(Result); + return Result; +} Index: unittests/AST/CMakeLists.txt =================================================================== --- unittests/AST/CMakeLists.txt +++ unittests/AST/CMakeLists.txt @@ -9,6 +9,7 @@ ASTVectorTest.cpp CommentLexer.cpp CommentParser.cpp + CommentTextTest.cpp DataCollectionTest.cpp DeclPrinterTest.cpp DeclTest.cpp Index: unittests/AST/CommentTextTest.cpp =================================================================== --- /dev/null +++ unittests/AST/CommentTextTest.cpp @@ -0,0 +1,128 @@ +//===- unittest/AST/CommentTextTest.cpp - Comment text extraction test ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tests for user-friendly output formatting of comments, i.e. +// RawComment::getFormattedText(). +// +//===----------------------------------------------------------------------===// + +#include "clang/AST/RawCommentList.h" +#include "clang/Basic/CommentOptions.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticIDs.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/FileSystemOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/VirtualFileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include + +namespace clang { + +class CommentTextTest : public ::testing::Test { +protected: + std::string formatComment(llvm::StringRef CommentText) { + llvm::IntrusiveRefCntPtr EmptyFS( + new vfs::InMemoryFileSystem); + FileSystemOptions Opts; + FileManager FileMgr(Opts, EmptyFS); + + DiagnosticsEngine Diags(new DiagnosticIDs, new DiagnosticOptions); + SourceManager SourceMgr(Diags, FileMgr); + + auto File = SourceMgr.createFileID( + llvm::MemoryBuffer::getMemBuffer(CommentText, "test-comment")); + + auto CommentStartOffset = CommentText.find("/"); + assert(CommentStartOffset != llvm::StringRef::npos); + + SourceRange CommentRange( + SourceMgr.getLocForStartOfFile(File).getLocWithOffset( + CommentStartOffset), + SourceMgr.getLocForEndOfFile(File)); + CommentOptions EmptyOpts; + // FIXME: technically, merged that we set here is incorrect, but that + // shouldn't matter. + RawComment Comment(SourceMgr, CommentRange, EmptyOpts, /*Merged=*/true); + return Comment.getFormattedText(SourceMgr, Diags); + } +}; + +TEST_F(CommentTextTest, FormattedText) { + // clang-format off + auto ExpectedOutput = +R"(This function does this and that. +For example, + Runnning it in that case will give you + this result. +That's about it.)"; + // Two-slash comments. + EXPECT_EQ(ExpectedOutput, formatComment( +R"cpp( +// This function does this and that. +// For example, +// Runnning it in that case will give you +// this result. +// That's about it.)cpp")); + + // Three-slash comments. + EXPECT_EQ(ExpectedOutput, formatComment( +R"cpp( +/// This function does this and that. +/// For example, +/// Runnning it in that case will give you +/// this result. +/// That's about it.)cpp")); + + // Block comments. + EXPECT_EQ(ExpectedOutput, formatComment( +R"cpp( +/* This function does this and that. + * For example, + * Runnning it in that case will give you + * this result. + * That's about it.*/)cpp")); + + // Doxygen-style block comments. + EXPECT_EQ(ExpectedOutput, formatComment( +R"cpp( +/** This function does this and that. + * For example, + * Runnning it in that case will give you + * this result. + * That's about it.*/)cpp")); + + // Weird indentation. + EXPECT_EQ(ExpectedOutput, formatComment( +R"cpp( + // This function does this and that. + // For example, + // Runnning it in that case will give you + // this result. + // That's about it.)cpp")); + // clang-format on +} + +TEST_F(CommentTextTest, KeepsDoxygenControlSeqs) { + // clang-format off + auto ExpectedOutput = +R"(\brief This is the brief part of the comment. +\param a something about a. +@param b something about b.)"; + + EXPECT_EQ(ExpectedOutput, formatComment( +R"cpp( +/// \brief This is the brief part of the comment. +/// \param a something about a. +/// @param b something about b.)cpp")); + // clang-format on +} + +} // namespace clang