Index: include/clang/AST/CommentLexer.h =================================================================== --- include/clang/AST/CommentLexer.h +++ include/clang/AST/CommentLexer.h @@ -281,6 +281,11 @@ /// command, including command marker. SmallString<16> VerbatimBlockEndCommandName; + /// If true, the commands, html tags, etc will be parsed and reported as + /// separate tokens inside the comment body. If false, the comment text will + /// be parsed into text and newline tokens. + bool ParseCommands; + /// Given a character reference name (e.g., "lt"), return the character that /// it stands for (e.g., "<"). StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; @@ -315,12 +320,11 @@ /// Eat string matching regexp \code \s*\* \endcode. void skipLineStartingDecorations(); - /// Lex stuff inside comments. CommentEnd should be set correctly. + /// Lex comment text, including commands if ParseCommands is set to true. void lexCommentText(Token &T); - void setupAndLexVerbatimBlock(Token &T, - const char *TextBegin, - char Marker, const CommandInfo *Info); + void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, + const CommandInfo *Info); void lexVerbatimBlockFirstLine(Token &T); @@ -343,14 +347,13 @@ public: Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, - const CommandTraits &Traits, - SourceLocation FileLoc, - const char *BufferStart, const char *BufferEnd); + const CommandTraits &Traits, SourceLocation FileLoc, + const char *BufferStart, const char *BufferEnd, + bool ParseCommands = true); void lex(Token &T); - StringRef getSpelling(const Token &Tok, - const SourceManager &SourceMgr, + StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr, bool *Invalid = nullptr) const; }; Index: include/clang/AST/RawCommentList.h =================================================================== --- include/clang/AST/RawCommentList.h +++ include/clang/AST/RawCommentList.h @@ -111,6 +111,29 @@ return 
extractBriefText(Context); } + /// Returns sanitized comment text, suitable for presentation in editor UIs. + /// E.g. will transform: + /// // This is a long multiline comment. + /// // Parts of it might be indented. + /// /* The comments styles might be mixed. */ + /// into + /// "This is a long multiline comment.\n" + /// " Parts of it might be indented.\n" + /// "The comments styles might be mixed." + /// Also removes leading indentation and sanitizes some common cases: + /// /* This is a first line. + /// * This is a second line. It is indented. + /// * This is a third line. */ + /// and + /// /* This is a first line. + /// This is a second line. It is indented. + /// This is a third line. */ + /// will both turn into: + /// "This is a first line.\n" + /// " This is a second line. It is indented.\n" + /// "This is a third line." + std::string getFormattedText(const ASTContext &Context) const; + /// Parse the comment, assuming it is attached to decl \c D. comments::FullComment *parse(const ASTContext &Context, const Preprocessor *PP, const Decl *D) const; Index: lib/AST/CommentLexer.cpp =================================================================== --- lib/AST/CommentLexer.cpp +++ lib/AST/CommentLexer.cpp @@ -294,6 +294,38 @@ assert(CommentState == LCS_InsideBCPLComment || CommentState == LCS_InsideCComment); + // Handles lexing non-command text, i.e. text and newline. 
+ auto HandleNonCommandToken = [&]() -> void { + assert(State == LS_Normal); + + const char *TokenPtr = BufferPtr; + assert(TokenPtr < CommentEnd); + switch (*TokenPtr) { + case '\n': + case '\r': + TokenPtr = skipNewline(TokenPtr, CommentEnd); + formTokenWithChars(T, TokenPtr, tok::newline); + + if (CommentState == LCS_InsideCComment) + skipLineStartingDecorations(); + return; + // Command/HTML markers end a text token only when ParseCommands is set. + default: { + size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr) + .find_first_of(ParseCommands ? "\n\r\\@&<" : "\n\r"); + if (End != StringRef::npos) + TokenPtr += End; + else + TokenPtr = CommentEnd; + formTextToken(T, TokenPtr); + return; + } + } + }; + + if (!ParseCommands) + return HandleNonCommandToken(); + switch (State) { case LS_Normal: break; @@ -315,136 +347,116 @@ } assert(State == LS_Normal); - const char *TokenPtr = BufferPtr; assert(TokenPtr < CommentEnd); - while (TokenPtr != CommentEnd) { - switch(*TokenPtr) { - case '\\': - case '@': { - // Commands that start with a backslash and commands that start with - // 'at' have equivalent semantics. But we keep information about the - // exact syntax in AST for comments. - tok::TokenKind CommandKind = - (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; + switch(*TokenPtr) { + case '\\': + case '@': { + // Commands that start with a backslash and commands that start with + // 'at' have equivalent semantics. But we keep information about the + // exact syntax in AST for comments. + tok::TokenKind CommandKind = + (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return; + } + char C = *TokenPtr; + switch (C) { + default: + break; + + case '\\': case '@': case '&': case '$': + case '#': case '<': case '>': case '%': + case '\"': case '.': case ':': + // This is one of \\ \@ \& \$ etc escape sequences.
TokenPtr++; - if (TokenPtr == CommentEnd) { - formTextToken(T, TokenPtr); - return; - } - char C = *TokenPtr; - switch (C) { - default: - break; - - case '\\': case '@': case '&': case '$': - case '#': case '<': case '>': case '%': - case '\"': case '.': case ':': - // This is one of \\ \@ \& \$ etc escape sequences. + if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { + // This is the \:: escape sequence. TokenPtr++; - if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { - // This is the \:: escape sequence. - TokenPtr++; - } - StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(UnescapedText); - return; } + StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(UnescapedText); + return; + } - // Don't make zero-length commands. - if (!isCommandNameStartCharacter(*TokenPtr)) { - formTextToken(T, TokenPtr); - return; - } + // Don't make zero-length commands. + if (!isCommandNameStartCharacter(*TokenPtr)) { + formTextToken(T, TokenPtr); + return; + } - TokenPtr = skipCommandName(TokenPtr, CommentEnd); - unsigned Length = TokenPtr - (BufferPtr + 1); - - // Hardcoded support for lexing LaTeX formula commands - // \f$ \f[ \f] \f{ \f} as a single command. 
- if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { - C = *TokenPtr; - if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { - TokenPtr++; - Length++; - } - } + TokenPtr = skipCommandName(TokenPtr, CommentEnd); + unsigned Length = TokenPtr - (BufferPtr + 1); - StringRef CommandName(BufferPtr + 1, Length); - - const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); - if (!Info) { - if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { - StringRef CorrectedName = Info->Name; - SourceLocation Loc = getSourceLocation(BufferPtr); - SourceLocation EndLoc = getSourceLocation(TokenPtr); - SourceRange FullRange = SourceRange(Loc, EndLoc); - SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); - Diag(Loc, diag::warn_correct_comment_command_name) - << FullRange << CommandName << CorrectedName - << FixItHint::CreateReplacement(CommandRange, CorrectedName); - } else { - formTokenWithChars(T, TokenPtr, tok::unknown_command); - T.setUnknownCommandName(CommandName); - Diag(T.getLocation(), diag::warn_unknown_comment_command_name) - << SourceRange(T.getLocation(), T.getEndLocation()); - return; - } - } - if (Info->IsVerbatimBlockCommand) { - setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); - return; - } - if (Info->IsVerbatimLineCommand) { - setupAndLexVerbatimLine(T, TokenPtr, Info); - return; + // Hardcoded support for lexing LaTeX formula commands + // \f$ \f[ \f] \f{ \f} as a single command. 
+ if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { + C = *TokenPtr; + if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { + TokenPtr++; + Length++; } - formTokenWithChars(T, TokenPtr, CommandKind); - T.setCommandID(Info->getID()); - return; } - case '&': - lexHTMLCharacterReference(T); - return; - - case '<': { - TokenPtr++; - if (TokenPtr == CommentEnd) { - formTextToken(T, TokenPtr); + StringRef CommandName(BufferPtr + 1, Length); + + const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); + if (!Info) { + if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { + StringRef CorrectedName = Info->Name; + SourceLocation Loc = getSourceLocation(BufferPtr); + SourceLocation EndLoc = getSourceLocation(TokenPtr); + SourceRange FullRange = SourceRange(Loc, EndLoc); + SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); + Diag(Loc, diag::warn_correct_comment_command_name) + << FullRange << CommandName << CorrectedName + << FixItHint::CreateReplacement(CommandRange, CorrectedName); + } else { + formTokenWithChars(T, TokenPtr, tok::unknown_command); + T.setUnknownCommandName(CommandName); + Diag(T.getLocation(), diag::warn_unknown_comment_command_name) + << SourceRange(T.getLocation(), T.getEndLocation()); return; } - const char C = *TokenPtr; - if (isHTMLIdentifierStartingCharacter(C)) - setupAndLexHTMLStartTag(T); - else if (C == '/') - setupAndLexHTMLEndTag(T); - else - formTextToken(T, TokenPtr); + } + if (Info->IsVerbatimBlockCommand) { + setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); return; } - - case '\n': - case '\r': - TokenPtr = skipNewline(TokenPtr, CommentEnd); - formTokenWithChars(T, TokenPtr, tok::newline); - - if (CommentState == LCS_InsideCComment) - skipLineStartingDecorations(); + if (Info->IsVerbatimLineCommand) { + setupAndLexVerbatimLine(T, TokenPtr, Info); return; + } + formTokenWithChars(T, TokenPtr, CommandKind); + T.setCommandID(Info->getID()); + return; + } - default: { - size_t End 
= StringRef(TokenPtr, CommentEnd - TokenPtr). - find_first_of("\n\r\\@&<"); - if (End != StringRef::npos) - TokenPtr += End; - else - TokenPtr = CommentEnd; + case '&': + lexHTMLCharacterReference(T); + return; + + case '<': { + TokenPtr++; + if (TokenPtr == CommentEnd) { formTextToken(T, TokenPtr); return; } + const char C = *TokenPtr; + if (isHTMLIdentifierStartingCharacter(C)) + setupAndLexHTMLStartTag(T); + else if (C == '/') + setupAndLexHTMLEndTag(T); + else + formTextToken(T, TokenPtr); + return; } + + default: + return HandleNonCommandToken(); } } @@ -727,14 +739,13 @@ } Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, - const CommandTraits &Traits, - SourceLocation FileLoc, - const char *BufferStart, const char *BufferEnd): - Allocator(Allocator), Diags(Diags), Traits(Traits), - BufferStart(BufferStart), BufferEnd(BufferEnd), - FileLoc(FileLoc), BufferPtr(BufferStart), - CommentState(LCS_BeforeComment), State(LS_Normal) { -} + const CommandTraits &Traits, SourceLocation FileLoc, + const char *BufferStart, const char *BufferEnd, + bool ParseCommands) + : Allocator(Allocator), Diags(Diags), Traits(Traits), + BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc), + BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal), + ParseCommands(ParseCommands) {} void Lexer::lex(Token &T) { again: Index: lib/AST/RawCommentList.cpp =================================================================== --- lib/AST/RawCommentList.cpp +++ lib/AST/RawCommentList.cpp @@ -335,3 +335,90 @@ BeforeThanCompare(SourceMgr)); std::swap(Comments, MergedComments); } + +std::string RawComment::getFormattedText(const ASTContext &Ctx) const { + auto &SourceMgr = Ctx.getSourceManager(); + llvm::StringRef CommentText = getRawText(SourceMgr); + if (CommentText.empty()) + return ""; + + llvm::BumpPtrAllocator Allocator; + comments::Lexer L(Allocator, Ctx.getDiagnostics(), + Ctx.getCommentCommandTraits(), getSourceRange().getBegin(), + 
CommentText.begin(), CommentText.end(), + /*ParseCommands=*/false); + + std::string Result; + // A column number of the first non-whitespace token in the comment text. + // We skip whitespace up to this column, but keep the whitespace after this + // column. IndentColumn is calculated when lexing the first line and reused + // for the rest of lines. + unsigned IndentColumn = 0; + + // Processes one line of the comment and adds it to the result. + // Handles skipping the indent at the start of the line. + // Returns false when eof is reached and true otherwise. + auto LexLine = [&](bool IsFirstLine) -> bool { + comments::Token Tok; + // Lex the first token on the line. We handle it separately, because we + // need to fix up its indentation. + L.lex(Tok); + if (Tok.is(comments::tok::eof)) + return false; + if (Tok.is(comments::tok::newline)) { + Result += "\n"; + return true; + } + llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr); + bool LocInvalid = false; + unsigned TokColumn = + SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid); + if (LocInvalid) + TokColumn = 0; + // Amount of leading whitespace in TokText. + size_t WhitespaceLen = TokText.find_first_not_of(" \t"); + if (WhitespaceLen == StringRef::npos) + WhitespaceLen = TokText.size(); + // Remember the amount of whitespace we skipped in the first line to remove + // indent up to that column in the following lines. + if (IsFirstLine) + IndentColumn = TokColumn + WhitespaceLen; + + // Amount of leading whitespace we actually want to skip. + // For the first line we skip all the whitespace. + // For the rest of the lines, we skip whitespace up to IndentColumn. + unsigned SkipLen = + LocInvalid || IsFirstLine + ? WhitespaceLen + : std::min(WhitespaceLen, + (size_t)std::max((int)IndentColumn - (int)TokColumn, 0)); + llvm::StringRef Trimmed = TokText.drop_front(SkipLen); + Result += Trimmed; + // Lex all tokens in the rest of the line.
+ for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) { + if (Tok.is(comments::tok::newline)) { + Result += "\n"; + return true; + } + Result += L.getSpelling(Tok, SourceMgr); + } + // We've reached the end of the comment (eof token). + return false; + }; + // Result may be empty or all newlines, so check before calling back(). + auto DropTrailingNewLines = [](std::string &Str) { + while (!Str.empty() && Str.back() == '\n') + Str.pop_back(); + }; + + // Process first line separately to remember indent for the following lines. + if (!LexLine(/*IsFirstLine=*/true)) { + DropTrailingNewLines(Result); + return Result; + } + // Process the rest of the lines. + while (LexLine(/*IsFirstLine=*/false)) + ; + DropTrailingNewLines(Result); + return Result; +}