Index: include/clang/AST/CommentLexer.h
===================================================================
--- include/clang/AST/CommentLexer.h
+++ include/clang/AST/CommentLexer.h
@@ -281,6 +281,11 @@
   /// command, including command marker.
   SmallString<16> VerbatimBlockEndCommandName;
 
+  /// If true, the commands, HTML tags, etc. will be parsed and reported as
+  /// separate tokens inside the comment body. If false, the comment text will
+  /// be parsed into text and newline tokens only.
+  bool ParseCommands;
+
   /// Given a character reference name (e.g., "lt"), return the character that
   /// it stands for (e.g., "<").
   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
@@ -315,12 +320,19 @@
   /// Eat string matching regexp \code \s*\* \endcode.
   void skipLineStartingDecorations();
 
-  /// Lex stuff inside comments.  CommentEnd should be set correctly.
+  /// Calls lexCommentText(With|Without)Commands, depending on the value of
+  /// ParseCommands.
   void lexCommentText(Token &T);
 
-  void setupAndLexVerbatimBlock(Token &T,
-                                const char *TextBegin,
-                                char Marker, const CommandInfo *Info);
+  /// Lex stuff inside comments.  CommentEnd should be set correctly.
+  void lexCommentTextWithCommands(Token &T);
+
+  /// Lex only newlines and text inside comments. CommentEnd should be set
+  /// correctly.
+  void lexCommentTextWithoutCommands(Token &T);
+
+  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
+                                const CommandInfo *Info);
 
   void lexVerbatimBlockFirstLine(Token &T);
 
@@ -343,14 +355,13 @@
 
 public:
   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-        const CommandTraits &Traits,
-        SourceLocation FileLoc,
-        const char *BufferStart, const char *BufferEnd);
+        const CommandTraits &Traits, SourceLocation FileLoc,
+        const char *BufferStart, const char *BufferEnd,
+        bool ParseCommands = true);
 
   void lex(Token &T);
 
-  StringRef getSpelling(const Token &Tok,
-                        const SourceManager &SourceMgr,
+  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                         bool *Invalid = nullptr) const;
 };
 
Index: include/clang/AST/RawCommentList.h
===================================================================
--- include/clang/AST/RawCommentList.h
+++ include/clang/AST/RawCommentList.h
@@ -111,6 +111,29 @@
     return extractBriefText(Context);
   }
 
+  /// Returns sanitized comment text, suitable for presentation in editor UIs.
+  /// E.g. will transform:
+  ///     // This is a long multiline comment.
+  ///     //   Parts of it might be indented.
+  ///     /* The comments styles might be mixed. */
+  /// into
+  ///     "This is a long multiline comment.\n"
+  ///     "  Parts of it might be indented.\n"
+  ///     "The comments styles might be mixed."
+  /// Also removes leading indentation and sanitizes some common cases:
+  ///     /* This is a first line.
+  ///      *   This is a second line. It is indented.
+  ///      * This is a third line. */
+  /// and
+  ///     /* This is a first line.
+  ///          This is a second line. It is indented.
+  ///        This is a third line. */
+  /// will both turn into:
+  ///     "This is a first line.\n"
+  ///     "  This is a second line. It is indented.\n"
+  ///     "This is a third line."
+  std::string getFormattedText(const ASTContext &Context) const;
+
   /// Parse the comment, assuming it is attached to decl \c D.
   comments::FullComment *parse(const ASTContext &Context,
                                const Preprocessor *PP, const Decl *D) const;
Index: lib/AST/CommentLexer.cpp
===================================================================
--- lib/AST/CommentLexer.cpp
+++ lib/AST/CommentLexer.cpp
@@ -291,6 +291,14 @@
 }
 
 void Lexer::lexCommentText(Token &T) {
+  if (ParseCommands)
+    lexCommentTextWithCommands(T);
+  else
+    lexCommentTextWithoutCommands(T);
+}
+
+void Lexer::lexCommentTextWithCommands(Token &T) {
+  assert(ParseCommands);
   assert(CommentState == LCS_InsideBCPLComment ||
          CommentState == LCS_InsideCComment);
 
@@ -448,6 +456,39 @@
   }
 }
 
+/// Lexes one token in the "no commands" mode: the comment body is split only
+/// into newline and text tokens, and command markers, HTML tags and character
+/// references are left inside the text verbatim.
+void Lexer::lexCommentTextWithoutCommands(Token &T) {
+  assert(!ParseCommands);
+  assert(CommentState == LCS_InsideBCPLComment ||
+         CommentState == LCS_InsideCComment);
+  assert(State == LS_Normal);
+
+  const char *TokenPtr = BufferPtr;
+  assert(TokenPtr < CommentEnd);
+
+  if (*TokenPtr == '\n' || *TokenPtr == '\r') {
+    TokenPtr = skipNewline(TokenPtr, CommentEnd);
+    formTokenWithChars(T, TokenPtr, tok::newline);
+
+    if (CommentState == LCS_InsideCComment)
+      skipLineStartingDecorations();
+    return;
+  }
+
+  // Only a newline ends a text token here. Stopping at '\\', '@', '&' or '<'
+  // as well would form an empty token whenever the text starts with one of
+  // them, so the lexer would never advance past that character.
+  const size_t End =
+      StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r");
+  if (End != StringRef::npos)
+    TokenPtr += End;
+  else
+    TokenPtr = CommentEnd;
+  formTextToken(T, TokenPtr);
+}
+
 void Lexer::setupAndLexVerbatimBlock(Token &T,
                                      const char *TextBegin,
                                      char Marker, const CommandInfo *Info) {
@@ -727,14 +768,13 @@
 }
 
 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-             const CommandTraits &Traits,
-             SourceLocation FileLoc,
-             const char *BufferStart, const char *BufferEnd):
-    Allocator(Allocator), Diags(Diags), Traits(Traits),
-    BufferStart(BufferStart), BufferEnd(BufferEnd),
-    FileLoc(FileLoc), BufferPtr(BufferStart),
-    CommentState(LCS_BeforeComment), State(LS_Normal) {
-}
+             const CommandTraits &Traits, SourceLocation FileLoc,
+             const char *BufferStart, const char *BufferEnd,
+             bool ParseCommands)
+    : Allocator(Allocator), Diags(Diags), Traits(Traits),
+      BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
+      BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
+      ParseCommands(ParseCommands) {}
 
 void Lexer::lex(Token &T) {
 again:
Index: lib/AST/RawCommentList.cpp
===================================================================
--- lib/AST/RawCommentList.cpp
+++ lib/AST/RawCommentList.cpp
@@ -335,3 +335,95 @@
              BeforeThanCompare<RawComment>(SourceMgr));
   std::swap(Comments, MergedComments);
 }
+
+std::string RawComment::getFormattedText(const ASTContext &Ctx) const {
+  auto &SourceMgr = Ctx.getSourceManager();
+  llvm::StringRef CommentText = getRawText(SourceMgr);
+  if (CommentText.empty())
+    return ""; // We couldn't retrieve the comment.
+
+  // Lex with ParseCommands disabled, so that the comment body is reported only
+  // as text and newline tokens.
+  llvm::BumpPtrAllocator Allocator;
+  comments::Lexer L(Allocator, Ctx.getDiagnostics(),
+                    Ctx.getCommentCommandTraits(), getSourceRange().getBegin(),
+                    CommentText.begin(), CommentText.end(),
+                    /*ParseCommands=*/false);
+
+  // Trim whitespace at the start of \p S of length up to the value of \p
+  // MaxSkip.
+  auto SkipWs = [](llvm::StringRef S, unsigned MaxSkip) -> llvm::StringRef {
+    unsigned SkipLen = std::min(
+        MaxSkip, (unsigned)std::min(S.size(), S.find_first_not_of(" \t")));
+    return S.drop_front(SkipLen);
+  };
+
+  std::string Result;
+  unsigned IndentColumn = 0;
+
+  // Processes one line of the comment and adds it to the result.
+  // Handles skipping the indent at the start of the line.
+  // Returns false when eof is reached and true otherwise.
+  auto LexLine = [&](bool IsFirstLine) -> bool {
+    comments::Token Tok;
+    // Lex the first token on the line. We handle it separately, because we
+    // need to fix up its indentation.
+    L.lex(Tok);
+    if (Tok.is(comments::tok::eof))
+      return false;
+    if (Tok.is(comments::tok::newline)) {
+      Result += "\n";
+      return true;
+    }
+    llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
+    bool LocInvalid = false;
+    unsigned TokColumn =
+        SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
+    if (LocInvalid)
+      TokColumn = 0;
+    // Compute the length of whitespace we're allowed to skip.
+    unsigned MaxSkip;
+    if (IsFirstLine) {
+      // For the first line we skip all leading ws.
+      MaxSkip = std::numeric_limits<unsigned>::max();
+    } else {
+      // For the rest, we skip up to the column of first non-ws symbol on the
+      // first line.
+      MaxSkip = IndentColumn > TokColumn ? IndentColumn - TokColumn : 0;
+    }
+    llvm::StringRef Trimmed = SkipWs(TokText, MaxSkip);
+    Result += Trimmed;
+    // Remember the amount of whitespace we skipped in the first line to remove
+    // indent up to that column in the following lines.
+    if (IsFirstLine)
+      IndentColumn = TokColumn + TokText.size() - Trimmed.size();
+    // Lex all tokens in the rest of the line.
+    for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
+      if (Tok.is(comments::tok::newline)) {
+        Result += "\n";
+        return true;
+      }
+      Result += L.getSpelling(Tok, SourceMgr);
+    }
+    // We've reached the end of file token.
+    return false;
+  };
+
+  auto DropTrailingNewLines = [](std::string &Str) {
+    // Str may be empty or consist only of newlines; never call back() on an
+    // empty string.
+    while (!Str.empty() && Str.back() == '\n')
+      Str.pop_back();
+  };
+
+  // Process first line separately to remember indent for the following lines.
+  if (!LexLine(/*IsFirstLine=*/true)) {
+    DropTrailingNewLines(Result);
+    return Result;
+  }
+  // Process the rest of the lines.
+  while (LexLine(/*IsFirstLine=*/false))
+    ;
+  DropTrailingNewLines(Result);
+  return Result;
+}