Index: include/clang/AST/CommentLexer.h =================================================================== --- include/clang/AST/CommentLexer.h +++ include/clang/AST/CommentLexer.h @@ -281,6 +281,11 @@ /// command, including command marker. SmallString<16> VerbatimBlockEndCommandName; + /// If true, the commands, html tags, etc will be parsed and reported as + /// separate tokens inside the comment body. If false, the comment text will + /// be parsed into text and newline tokens. + bool ParseCommands; + /// Given a character reference name (e.g., "lt"), return the character that /// it stands for (e.g., "<"). StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; @@ -315,12 +320,22 @@ /// Eat string matching regexp \code \s*\* \endcode. void skipLineStartingDecorations(); - /// Lex stuff inside comments. CommentEnd should be set correctly. + /// Calls lexCommentText(With|Without)Commands, depending on value of + /// ParseCommands. void lexCommentText(Token &T); - void setupAndLexVerbatimBlock(Token &T, - const char *TextBegin, - char Marker, const CommandInfo *Info); + /// Try to lex commands, html tags and verbatim blocks inside comment text. + /// CommentEnd should be set correctly. + /// Returns true if command was lexed and no further processing is required. + /// Returns false if no special tokens were found and lexing of ordinary comment text need to be performed instead. + bool tryLexCommands(Token &T); + + /// Lex only newlines and text inside comments. CommentEnd should be set + /// correctly. + void lexCommentTextWithoutCommands(Token &T); + + void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, + const CommandInfo *Info); void lexVerbatimBlockFirstLine(Token &T); @@ -343,14 +358,13 @@ public: Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, - const CommandTraits &Traits, - SourceLocation FileLoc, - const char *BufferStart, const char *BufferEnd); + const CommandTraits &Traits, SourceLocation FileLoc, + const char *BufferStart, const char *BufferEnd, + bool ParseCommands = true); void lex(Token &T); - StringRef getSpelling(const Token &Tok, - const SourceManager &SourceMgr, + StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr, bool *Invalid = nullptr) const; }; Index: include/clang/AST/RawCommentList.h =================================================================== --- include/clang/AST/RawCommentList.h +++ include/clang/AST/RawCommentList.h @@ -111,6 +111,29 @@ return extractBriefText(Context); } + /// Returns sanitized comment text, suitable for presentation in editor UIs. + /// E.g. will transform: + /// // This is a long multiline comment. + /// // Parts of it might be indented. + /// /* The comments styles might be mixed. */ + /// into + /// "This is a long multiline comment.\n" + /// " Parts of it might be indented.\n" + /// "The comments styles might be mixed." + /// Also removes leading indentation and sanitizes some common cases: + /// /* This is a first line. + /// * This is a second line. It is indented. + /// * This is a third line. */ + /// and + /// /* This is a first line. + /// This is a second line. It is indented. + /// This is a third line. */ + /// will both turn into: + /// "This is a first line.\n" + /// " This is a second line. It is indented.\n" + /// "This is a third line." + std::string getFormattedText(const ASTContext &Context) const; + /// Parse the comment, assuming it is attached to decl \c D. comments::FullComment *parse(const ASTContext &Context, const Preprocessor *PP, const Decl *D) const; Index: lib/AST/CommentLexer.cpp =================================================================== --- lib/AST/CommentLexer.cpp +++ lib/AST/CommentLexer.cpp @@ -291,159 +291,181 @@ } void Lexer::lexCommentText(Token &T) { + if (ParseCommands) { + if (tryLexCommands(T)) + return; + } + lexCommentTextWithoutCommands(T); +} + +bool Lexer::tryLexCommands(Token &T) { assert(CommentState == LCS_InsideBCPLComment || CommentState == LCS_InsideCComment); + if (!ParseCommands) + return false; switch (State) { case LS_Normal: break; case LS_VerbatimBlockFirstLine: lexVerbatimBlockFirstLine(T); - return; + return true; case LS_VerbatimBlockBody: lexVerbatimBlockBody(T); - return; + return true; case LS_VerbatimLineText: lexVerbatimLineText(T); - return; + return true; case LS_HTMLStartTag: lexHTMLStartTag(T); - return; + return true; case LS_HTMLEndTag: lexHTMLEndTag(T); - return; + return true; } assert(State == LS_Normal); const char *TokenPtr = BufferPtr; assert(TokenPtr < CommentEnd); - while (TokenPtr != CommentEnd) { - switch(*TokenPtr) { - case '\\': - case '@': { - // Commands that start with a backslash and commands that start with - // 'at' have equivalent semantics. But we keep information about the - // exact syntax in AST for comments. - tok::TokenKind CommandKind = - (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; + switch(*TokenPtr) { + case '\\': + case '@': { + // Commands that start with a backslash and commands that start with + // 'at' have equivalent semantics. But we keep information about the + // exact syntax in AST for comments. + tok::TokenKind CommandKind = + (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return true; + } + char C = *TokenPtr; + switch (C) { + default: + break; + + case '\\': case '@': case '&': case '$': + case '#': case '<': case '>': case '%': + case '\"': case '.': case ':': + // This is one of \\ \@ \& \$ etc escape sequences. TokenPtr++; - if (TokenPtr == CommentEnd) { - formTextToken(T, TokenPtr); - return; - } - char C = *TokenPtr; - switch (C) { - default: - break; - - case '\\': case '@': case '&': case '$': - case '#': case '<': case '>': case '%': - case '\"': case '.': case ':': - // This is one of \\ \@ \& \$ etc escape sequences. + if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { + // This is the \:: escape sequence. TokenPtr++; - if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { - // This is the \:: escape sequence. - TokenPtr++; - } - StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(UnescapedText); - return; } + StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(UnescapedText); + return true; + } - // Don't make zero-length commands. - if (!isCommandNameStartCharacter(*TokenPtr)) { - formTextToken(T, TokenPtr); - return; - } + // Don't make zero-length commands. + if (!isCommandNameStartCharacter(*TokenPtr)) { + formTextToken(T, TokenPtr); + return true; + } - TokenPtr = skipCommandName(TokenPtr, CommentEnd); - unsigned Length = TokenPtr - (BufferPtr + 1); - - // Hardcoded support for lexing LaTeX formula commands - // \f$ \f[ \f] \f{ \f} as a single command. - if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { - C = *TokenPtr; - if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { - TokenPtr++; - Length++; - } - } + TokenPtr = skipCommandName(TokenPtr, CommentEnd); + unsigned Length = TokenPtr - (BufferPtr + 1); - StringRef CommandName(BufferPtr + 1, Length); - - const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); - if (!Info) { - if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { - StringRef CorrectedName = Info->Name; - SourceLocation Loc = getSourceLocation(BufferPtr); - SourceLocation EndLoc = getSourceLocation(TokenPtr); - SourceRange FullRange = SourceRange(Loc, EndLoc); - SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); - Diag(Loc, diag::warn_correct_comment_command_name) - << FullRange << CommandName << CorrectedName - << FixItHint::CreateReplacement(CommandRange, CorrectedName); - } else { - formTokenWithChars(T, TokenPtr, tok::unknown_command); - T.setUnknownCommandName(CommandName); - Diag(T.getLocation(), diag::warn_unknown_comment_command_name) - << SourceRange(T.getLocation(), T.getEndLocation()); - return; - } - } - if (Info->IsVerbatimBlockCommand) { - setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); - return; + // Hardcoded support for lexing LaTeX formula commands + // \f$ \f[ \f] \f{ \f} as a single command. + if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { + C = *TokenPtr; + if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { + TokenPtr++; + Length++; } - if (Info->IsVerbatimLineCommand) { - setupAndLexVerbatimLine(T, TokenPtr, Info); - return; + } + + StringRef CommandName(BufferPtr + 1, Length); + + const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); + if (!Info) { + if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { + StringRef CorrectedName = Info->Name; + SourceLocation Loc = getSourceLocation(BufferPtr); + SourceLocation EndLoc = getSourceLocation(TokenPtr); + SourceRange FullRange = SourceRange(Loc, EndLoc); + SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); + Diag(Loc, diag::warn_correct_comment_command_name) + << FullRange << CommandName << CorrectedName + << FixItHint::CreateReplacement(CommandRange, CorrectedName); + } else { + formTokenWithChars(T, TokenPtr, tok::unknown_command); + T.setUnknownCommandName(CommandName); + Diag(T.getLocation(), diag::warn_unknown_comment_command_name) + << SourceRange(T.getLocation(), T.getEndLocation()); + return true; } - formTokenWithChars(T, TokenPtr, CommandKind); - T.setCommandID(Info->getID()); - return; } + if (Info->IsVerbatimBlockCommand) { + setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); + return true; + } + if (Info->IsVerbatimLineCommand) { + setupAndLexVerbatimLine(T, TokenPtr, Info); + return true; + } + formTokenWithChars(T, TokenPtr, CommandKind); + T.setCommandID(Info->getID()); + return true; + } - case '&': - lexHTMLCharacterReference(T); - return; + case '&': + lexHTMLCharacterReference(T); + return true; - case '<': { - TokenPtr++; - if (TokenPtr == CommentEnd) { - formTextToken(T, TokenPtr); - return; - } - const char C = *TokenPtr; - if (isHTMLIdentifierStartingCharacter(C)) - setupAndLexHTMLStartTag(T); - else if (C == '/') - setupAndLexHTMLEndTag(T); - else - formTextToken(T, TokenPtr); - return; + case '<': { + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return true; } + const char C = *TokenPtr; + if (isHTMLIdentifierStartingCharacter(C)) + setupAndLexHTMLStartTag(T); + else if (C == '/') + setupAndLexHTMLEndTag(T); + else + formTextToken(T, TokenPtr); + return true; + } - case '\n': - case '\r': + default: + // Signal lexing code for an ordinary comment should be run instead. + return false; + } +} + +void Lexer::lexCommentTextWithoutCommands(Token &T) { + assert(CommentState == LCS_InsideBCPLComment || + CommentState == LCS_InsideCComment); + assert(State == LS_Normal); + + const char *TokenPtr = BufferPtr; + assert(TokenPtr < CommentEnd); + switch (*TokenPtr) { + case '\n': + case '\r': TokenPtr = skipNewline(TokenPtr, CommentEnd); formTokenWithChars(T, TokenPtr, tok::newline); if (CommentState == LCS_InsideCComment) - skipLineStartingDecorations(); + skipLineStartingDecorations(); return; - default: { - size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). - find_first_of("\n\r\\@&<"); + default: { + size_t End = + StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\\@&<"); if (End != StringRef::npos) - TokenPtr += End; + TokenPtr += End; else - TokenPtr = CommentEnd; + TokenPtr = CommentEnd; formTextToken(T, TokenPtr); return; - } } } } @@ -727,14 +749,13 @@ } Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, - const CommandTraits &Traits, - SourceLocation FileLoc, - const char *BufferStart, const char *BufferEnd): - Allocator(Allocator), Diags(Diags), Traits(Traits), - BufferStart(BufferStart), BufferEnd(BufferEnd), - FileLoc(FileLoc), BufferPtr(BufferStart), - CommentState(LCS_BeforeComment), State(LS_Normal) { -} + const CommandTraits &Traits, SourceLocation FileLoc, + const char *BufferStart, const char *BufferEnd, + bool ParseCommands) + : Allocator(Allocator), Diags(Diags), Traits(Traits), + BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc), + BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal), + ParseCommands(ParseCommands) {} void Lexer::lex(Token &T) { again: Index: lib/AST/RawCommentList.cpp =================================================================== --- lib/AST/RawCommentList.cpp +++ lib/AST/RawCommentList.cpp @@ -335,3 +335,88 @@ BeforeThanCompare(SourceMgr)); std::swap(Comments, MergedComments); } + +std::string RawComment::getFormattedText(const ASTContext &Ctx) const { + auto &SourceMgr = Ctx.getSourceManager(); + llvm::StringRef CommentText = getRawText(SourceMgr); + if (CommentText.empty()) + return ""; // we couldn't retreive the comment. + + llvm::BumpPtrAllocator Allocator; + comments::Lexer L(Allocator, Ctx.getDiagnostics(), + Ctx.getCommentCommandTraits(), getSourceRange().getBegin(), + CommentText.begin(), CommentText.end(), + /*ParseCommentText=*/false); + + std::string Result; + unsigned IndentColumn = 0; + + // Processes one line of the comment and adds it to the result. + // Handles skipping the indent at the start of the line. + // Returns false when eof is reached and true otherwise. + auto LexLine = [&](bool IsFirstLine) -> bool { + comments::Token Tok; + // Lex the first token on the line. We handle it separately, because we to + // fix up its indentation. + L.lex(Tok); + if (Tok.is(comments::tok::eof)) + return false; + if (Tok.is(comments::tok::newline)) { + Result += "\n"; + return true; + } + llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr); + bool LocInvalid = false; + unsigned TokColumn = + SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid); + if (LocInvalid) + TokColumn = 0; + // Compute the length of whitespace we're allowed to skip. + size_t MaxSkip; + if (IsFirstLine) { + // For the first line we skip all leading ws. + MaxSkip = std::numeric_limits::max(); + } else { + // For the rest, we skip up to the column of first non-ws symbol on the + // first line.. + MaxSkip = std::max((int)IndentColumn - (int)TokColumn, 0); + } + // Amount of leading whitespace in TokText. + size_t WhitespaceLen = TokText.find_first_not_of(" \t"); + if (WhitespaceLen == StringRef::npos) + WhitespaceLen = TokText.size(); + // Remember the amount of whitespace we skipped in the first line to remove + // indent up to that column in the following lines. + if (IsFirstLine) + IndentColumn = TokColumn + WhitespaceLen; + + llvm::StringRef Trimmed = TokText.drop_front(std::min(MaxSkip, WhitespaceLen)); + Result += Trimmed; + // Lex all tokens in the rest of the line. + for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) { + if (Tok.is(comments::tok::newline)) { + Result += "\n"; + return true; + } + Result += L.getSpelling(Tok, SourceMgr); + } + // We've reached the end of the line. + return false; + }; + + auto DropTrailingNewLines = [](std::string &Str) { + while (Str.back() == '\n') + Str.pop_back(); + }; + + // Proces first line separately to remember indent for the following lines. + if (!LexLine(/*IsFirstLine=*/true)) { + DropTrailingNewLines(Result); + return Result; + } + // Process the rest of the lines. + while (LexLine(/*IsFirstLine=*/false)) + ; + DropTrailingNewLines(Result); + return Result; +}