Index: include/clang/Format/Format.h =================================================================== --- include/clang/Format/Format.h +++ include/clang/Format/Format.h @@ -1270,6 +1270,11 @@ /// \brief The number of columns used for tab stops. unsigned TabWidth; + /// \brief A regular expression matching the delimiter of a raw string + /// containing textual protocol buffer messages, for example ``(pb|proto)``. + /// The matching is case-insensitive. + std::string TextProtoRawStringDelimeter; + /// \brief Different ways to use tab in formatting. enum UseTabStyle { /// Never use tab. @@ -1367,6 +1372,7 @@ SpacesInParentheses == R.SpacesInParentheses && SpacesInSquareBrackets == R.SpacesInSquareBrackets && Standard == R.Standard && TabWidth == R.TabWidth && + TextProtoRawStringDelimeter == R.TextProtoRawStringDelimeter && UseTab == R.UseTab; } }; Index: lib/Format/Format.cpp =================================================================== --- lib/Format/Format.cpp +++ lib/Format/Format.cpp @@ -359,6 +359,8 @@ IO.mapOptional("SpacesInSquareBrackets", Style.SpacesInSquareBrackets); IO.mapOptional("Standard", Style.Standard); IO.mapOptional("TabWidth", Style.TabWidth); + IO.mapOptional("TextProtoRawStringDelimeter", + Style.TextProtoRawStringDelimeter); IO.mapOptional("UseTab", Style.UseTab); } }; @@ -547,6 +549,7 @@ LLVMStyle.JavaScriptQuotes = FormatStyle::JSQS_Leave; LLVMStyle.JavaScriptWrapImports = true; LLVMStyle.TabWidth = 8; + LLVMStyle.TextProtoRawStringDelimeter = "(pb|proto)"; LLVMStyle.MaxEmptyLinesToKeep = 1; LLVMStyle.KeepEmptyLinesAtTheStartOfBlocks = true; LLVMStyle.NamespaceIndentation = FormatStyle::NI_None; Index: lib/Format/FormatToken.h =================================================================== --- lib/Format/FormatToken.h +++ lib/Format/FormatToken.h @@ -268,6 +268,9 @@ /// Only set to true if \c Type == \c TT_LineComment. 
bool ContinuesLineCommentSection = false; + /// \brief Does this token start a raw string literal with proto message text. + bool StartsRawStringProtoLiteral = false; + /// \brief If this is a bracket, this points to the matching one. FormatToken *MatchingParen = nullptr; Index: lib/Format/FormatTokenLexer.h =================================================================== --- lib/Format/FormatTokenLexer.h +++ lib/Format/FormatTokenLexer.h @@ -76,6 +76,8 @@ bool tryMergeConflictMarkers(); + void tryParseTextProtoRawStringLiteral(); + FormatToken *getStashedToken(); FormatToken *getNextToken(); @@ -102,6 +104,7 @@ llvm::Regex MacroBlockBeginRegex; llvm::Regex MacroBlockEndRegex; + llvm::Regex TextProtoRawStringDelimeterRegex; void readRawToken(FormatToken &Tok); Index: lib/Format/FormatTokenLexer.cpp =================================================================== --- lib/Format/FormatTokenLexer.cpp +++ lib/Format/FormatTokenLexer.cpp @@ -20,6 +20,11 @@ #include "clang/Format/Format.h" #include "llvm/Support/Regex.h" +#define DEBUG_TYPE "format-lexer" + +#define DBG(A) DEBUG({ llvm::dbgs() << __func__ << ":" << __LINE__ << "::" \ + << #A << " = " << A << "\n"; }); + namespace clang { namespace format { @@ -31,7 +36,9 @@ Style(Style), IdentTable(getFormattingLangOpts(Style)), Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0), FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), - MacroBlockEndRegex(Style.MacroBlockEnd) { + MacroBlockEndRegex(Style.MacroBlockEnd), + TextProtoRawStringDelimeterRegex(Style.TextProtoRawStringDelimeter, + llvm::Regex::IgnoreCase) { Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr, getFormattingLangOpts(Style))); Lex->SetKeepWhitespaceMode(true); @@ -53,6 +60,8 @@ tryMergePreviousTokens(); if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) FirstInLineIndex = Tokens.size() - 1; + if (Style.isCpp()) + tryParseTextProtoRawStringLiteral(); } while 
(Tokens.back()->Tok.isNot(tok::eof)); return Tokens; } @@ -229,6 +238,53 @@ resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); } +// Matches the text of a raw string literal, like: +// R"delimiter(contents)delimiter" +// The first group captures the delimiter; the second captures the contents. +static llvm::Regex + kRawStringLiteralPattern(R"regex(^R"([a-zA-Z0-9_]*)\((.*)\)\1"$)regex"); + +void FormatTokenLexer::tryParseTextProtoRawStringLiteral() { + FormatToken *RawTok = Tokens.back(); + SmallVector Groups; + if (!RawTok->is(tok::string_literal)) + return; + StringRef TokenText = RawTok->TokenText; + DBG(TokenText); + if (!kRawStringLiteralPattern.match(RawTok->TokenText, &Groups)) + return; + assert(Groups.size() > 1); + StringRef Delimiter = Groups[1]; + if (!TextProtoRawStringDelimeterRegex.match(Delimiter)) + return; + DBG(Delimiter); + // The opening text is 'R"delimiter('. + StringRef OpenText = TokenText.substr(0, Delimiter.size() + 3); + // The closing text is ')delimiter"'. 
+ StringRef CloseText = + TokenText.substr(TokenText.size() - Delimiter.size() - 2); + RawTok->TokenText = OpenText; + RawTok->ColumnWidth = OpenText.size(); + RawTok->IsMultiline = false; + RawTok->StartsRawStringProtoLiteral = true; + RawTok->Tok.setKind(tok::l_brace); + RawTok->Tok.setIdentifierInfo(nullptr); + const char *Offset = Lex->getBufferLocation(); + resetLexer(SourceMgr.getFileOffset( + RawTok->Tok.getLocation().getLocWithOffset(OpenText.size()))); + while (FormatToken *Tok = getNextToken()) { + if (Lex->getBufferLocation() > Offset - CloseText.size()) { + Tok->TokenText = CloseText; + Tok->ColumnWidth = CloseText.size(); + Tok->Tok.setKind(tok::r_brace); + Tokens.push_back(Tok); + break; + } + Tokens.push_back(Tok); + } + resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); +} + void FormatTokenLexer::handleTemplateStrings() { FormatToken *BacktickToken = Tokens.back(); Index: lib/Format/UnwrappedLineParser.cpp =================================================================== --- lib/Format/UnwrappedLineParser.cpp +++ lib/Format/UnwrappedLineParser.cpp @@ -118,9 +118,10 @@ class ScopedLineState { public: - ScopedLineState(UnwrappedLineParser &Parser, + ScopedLineState(UnwrappedLineParser &Parser, bool AddScope = true, bool SwitchToPreprocessorLines = false) - : Parser(Parser), OriginalLines(Parser.CurrentLines) { + : Parser(Parser), AddScope(AddScope), OriginalLines(Parser.CurrentLines) { + if (!AddScope) return; if (SwitchToPreprocessorLines) Parser.CurrentLines = &Parser.PreprocessorDirectives; else if (!Parser.Line->Tokens.empty()) @@ -132,9 +133,9 @@ } ~ScopedLineState() { - if (!Parser.Line->Tokens.empty()) { + if (!AddScope) return; + if (!Parser.Line->Tokens.empty()) Parser.addUnwrappedLine(); - } assert(Parser.Line->Tokens.empty()); Parser.Line = std::move(PreBlockLine); if (Parser.CurrentLines == &Parser.PreprocessorDirectives) @@ -144,6 +145,7 @@ private: UnwrappedLineParser &Parser; + const bool AddScope; std::unique_ptr 
PreBlockLine; SmallVectorImpl *OriginalLines; @@ -1286,85 +1288,92 @@ bool UnwrappedLineParser::parseBracedList(bool ContinueOnSemicolons) { bool HasError = false; + bool AddScope = FormatTok->StartsRawStringProtoLiteral; nextToken(); - - // FIXME: Once we have an expression parser in the UnwrappedLineParser, - // replace this by using parseAssigmentExpression() inside. - do { - if (Style.Language == FormatStyle::LK_JavaScript) { - if (FormatTok->is(Keywords.kw_function) || - FormatTok->startsSequence(Keywords.kw_async, Keywords.kw_function)) { - tryToParseJSFunction(); - continue; - } - if (FormatTok->is(TT_JsFatArrow)) { - nextToken(); - // Fat arrows can be followed by simple expressions or by child blocks - // in curly braces. + { + ScopedLineState State(*this, AddScope); + // FIXME: Once we have an expression parser in the UnwrappedLineParser, + // replace this by using parseAssigmentExpression() inside. + while (!eof()) { + if (Style.Language == FormatStyle::LK_JavaScript) { + if (FormatTok->is(Keywords.kw_function) || + FormatTok->startsSequence(Keywords.kw_async, + Keywords.kw_function)) { + tryToParseJSFunction(); + continue; + } + if (FormatTok->is(TT_JsFatArrow)) { + nextToken(); + // Fat arrows can be followed by simple expressions or by child blocks + // in curly braces. + if (FormatTok->is(tok::l_brace)) { + parseChildBlock(); + continue; + } + } if (FormatTok->is(tok::l_brace)) { + // Could be a method inside of a braced list `{a() { return 1; }}`. + if (tryToParseBracedList()) + continue; parseChildBlock(); - continue; } } - if (FormatTok->is(tok::l_brace)) { - // Could be a method inside of a braced list `{a() { return 1; }}`. 
- if (tryToParseBracedList()) - continue; - parseChildBlock(); - } - } - switch (FormatTok->Tok.getKind()) { - case tok::caret: - nextToken(); - if (FormatTok->is(tok::l_brace)) { - parseChildBlock(); - } - break; - case tok::l_square: - tryToParseLambda(); - break; - case tok::l_paren: - parseParens(); - // JavaScript can just have free standing methods and getters/setters in - // object literals. Detect them by a "{" following ")". - if (Style.Language == FormatStyle::LK_JavaScript) { - if (FormatTok->is(tok::l_brace)) + if (FormatTok->is(tok::r_brace)) break; + switch (FormatTok->Tok.getKind()) { + case tok::caret: + nextToken(); + if (FormatTok->is(tok::l_brace)) { parseChildBlock(); + } break; - } - break; - case tok::l_brace: - // Assume there are no blocks inside a braced init list apart - // from the ones we explicitly parse out (like lambdas). - FormatTok->BlockKind = BK_BracedInit; - parseBracedList(); - break; - case tok::r_brace: - nextToken(); - return !HasError; - case tok::semi: - // JavaScript (or more precisely TypeScript) can have semicolons in braced - // lists (in so-called TypeMemberLists). Thus, the semicolon cannot be - // used for error recovery if we have otherwise determined that this is - // a braced list. - if (Style.Language == FormatStyle::LK_JavaScript) { + case tok::l_square: + tryToParseLambda(); + break; + case tok::l_paren: + parseParens(); + // JavaScript can just have free standing methods and getters/setters in + // object literals. Detect them by a "{" following ")". + if (Style.Language == FormatStyle::LK_JavaScript) { + if (FormatTok->is(tok::l_brace)) + parseChildBlock(); + break; + } + break; + case tok::l_brace: + // Assume there are no blocks inside a braced init list apart from the + // ones we explicitly parse out (like lambdas). 
+ FormatTok->BlockKind = BK_BracedInit; + parseBracedList(); + break; + case tok::r_brace: + assert(0); + nextToken(); + return !HasError; + case tok::semi: + // JavaScript (or more precisely TypeScript) can have semicolons in + // braced lists (in so-called TypeMemberLists). Thus, the semicolon + // cannot be used for error recovery if we have otherwise determined + // that this is a braced list. + if (Style.Language == FormatStyle::LK_JavaScript) { + nextToken(); + break; + } + HasError = true; + if (!ContinueOnSemicolons) + return !HasError; + nextToken(); + break; + case tok::comma: + nextToken(); + break; + default: nextToken(); break; } - HasError = true; - if (!ContinueOnSemicolons) - return !HasError; - nextToken(); - break; - case tok::comma: - nextToken(); - break; - default: - nextToken(); - break; } - } while (!eof()); - return false; + } // destroy ScopedLineState + if (FormatTok->is(tok::r_brace)) nextToken(); + return !HasError; } void UnwrappedLineParser::parseParens() { @@ -2293,7 +2302,8 @@ // If there is an unfinished unwrapped line, we flush the preprocessor // directives only after that unwrapped line was finished later. bool SwitchToPreprocessorLines = !Line->Tokens.empty(); - ScopedLineState BlockState(*this, SwitchToPreprocessorLines); + ScopedLineState BlockState(*this, /*AddScope=*/true, + SwitchToPreprocessorLines); // Comments stored before the preprocessor directive need to be output // before the preprocessor directive, at the same level as the // preprocessor directive, as we consider them to apply to the directive. 
Index: unittests/Format/FormatTest.cpp =================================================================== --- unittests/Format/FormatTest.cpp +++ unittests/Format/FormatTest.cpp @@ -10208,6 +10208,15 @@ EXPECT_EQ("auto c = u8'a';", format("auto c = u8'a';")); } +TEST_F(FormatTest, TODO) { + std::string Code = R"code(auto a = R"pb( + field_a: OK + field_b: "OK" + field_c: "OK" + msg_field: {field_d: 123} )pb";)code"; + EXPECT_EQ(Code, format(Code)); +} + } // end namespace } // end namespace format } // end namespace clang