// Patch overview (reviewer notes; this text sits ahead of the first
// "diff --git" header and is not part of any hunk).
//
// NOTE(review): the patch text below appears to have lost its original line
// breaks and blank lines, so the line counts in the "@@" headers cannot be
// checked against the visible content. It is reproduced unchanged.
//
// clang/lib/Format/FormatTokenLexer.h
//   * Declares the member function `void resizeToken(size_t NewLen)`.
//
// clang/lib/Format/FormatTokenLexer.cpp
//   * Adds FormatTokenLexer::resizeToken(NewLen). It calls resetLexer() with
//     the file offset of `Lex->getBufferLocation() - TokenText.size() + NewLen`,
//     cuts FormatTok->TokenText down to its first NewLen characters,
//     recomputes FormatTok->ColumnWidth with encoding::columnWidthWithTabs(),
//     and sets the length of FormatTok->Tok to NewLen. Presumably the buffer
//     location is just past the current token whenever this is called --
//     TODO confirm against the callers.
//   * Adds the file-local helper countLeadingWhitespace(Text). It returns how
//     many leading characters of Text are '\n', '\r', '\f', '\v', ' ' or '\t',
//     or belong to a newline escaped by a backslash or by the '?' '?' '/'
//     trigraph. It reads up to three characters beyond the current one
//     without a bounds check; the code's own comment justifies this by the
//     null byte that ends the source buffer.
//   * getNextToken(): the whitespace-consuming loop now runs for every
//     non-EOF token instead of only tok::unknown tokens. It stops at the
//     first token without leading whitespace and uses resizeToken() to split
//     a token that starts with whitespace but also contains other text.
//   * Newline accounting in that loop is driven by a new InEscape flag in
//     place of the removed EscapesNewline lambda: '\\', '?' and '/' set the
//     flag, a newline sets HasUnescapedNewline only when the flag is clear,
//     and the flag is cleared after each counted newline. A '\r' directly
//     followed by '\n' is skipped so the pair is counted once.
//   * The `default` case of the switch changes from tagging the token as
//     TT_ImplicitStringLiteral to `assert(false)`. Some cases of this switch
//     lie between hunks and are not visible here.
//   * WhitespaceLength now grows by Text.size() rather than
//     FormatTok->Tok.getLength().
//   * After the loop, a token that is still tok::unknown is given the type
//     TT_ImplicitStringLiteral.
//   * The backslash-before-newline truncation that follows the
//     JavaScript/Java comment now calls resizeToken() instead of repeating
//     the lexer reset and token truncation inline.
//   * The trailing loop that stripped escaped newlines from the start of a
//     token is removed.
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -92,6 +92,8 @@ bool tryMergeConflictMarkers(); + void resizeToken(size_t NewLen); + FormatToken *getStashedToken(); FormatToken *getNextToken(); diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -836,6 +836,61 @@ return FormatTok; } +void FormatTokenLexer::resizeToken(size_t NewLen) { + resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation( + Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen))); + FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen); + FormatTok->ColumnWidth = encoding::columnWidthWithTabs( + FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth, + Encoding); + FormatTok->Tok.setLength(NewLen); +} + +/// Count the length of leading whitespace in a token. +static size_t countLeadingWhitespace(StringRef Text) { + // Basically counting the length matched by this regex. + // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+" + // Directly using the regex turned out to be slow. With the regex + // version formatting all files in this directory took about 1.25 + // seconds. This version took about 0.5 seconds. + bool Done = false; + const char *Cur = Text.begin(); + while (!Done && Cur < Text.end()) + switch (Cur[0]) { + case '\n': + case '\r': + case '\f': + case '\v': + case ' ': + case '\t': + ++Cur; + break; + // A '\' followed by a newline always escapes the newline, regardless + // of whether there is another '\' before it. + case '\\': + // The source has a null byte at the end. It is not necessary to + // check Cur + 1 < Text.end(). + if (Cur[1] == '\n' || Cur[1] == '\r') + Cur += 2; + else + Done = true; + break; + // Newlines can also be escaped by a '?' '?' '/' trigraph. 
By the way, the + // characters are quoted individually in this comment because if we write + // them together some compilers warn that we have a trigraph in the code. + case '?': + if (Cur[1] == '?' && Cur[2] == '/' && (Cur[3] == '\n' || Cur[3] == '\r')) + Cur += 4; + else + Done = true; + break; + default: + Done = true; + break; + } + return Cur - Text.begin(); +} + FormatToken *FormatTokenLexer::getNextToken() { if (StateStack.top() == LexerState::TOKEN_STASHED) { StateStack.pop(); @@ -850,34 +905,29 @@ IsFirstToken = false; // Consume and record whitespace until we find a significant token. + // Some tok::unknown tokens are not just whitespace, e.g. whitespace + // followed by a symbol such as backtick. Those symbols may be + // significant in other languages. unsigned WhitespaceLength = TrailingWhitespace; - while (FormatTok->is(tok::unknown)) { + while (FormatTok->isNot(tok::eof)) { + auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText); + if (!LeadingWhitespace) + break; + if (LeadingWhitespace < FormatTok->TokenText.size()) + resizeToken(LeadingWhitespace); StringRef Text = FormatTok->TokenText; - auto EscapesNewline = [&](int pos) { - // A '\r' here is just part of '\r\n'. Skip it. - if (pos >= 0 && Text[pos] == '\r') - --pos; - // See whether there is an odd number of '\' before this. - // FIXME: This is wrong. A '\' followed by a newline is always removed, - // regardless of whether there is another '\' before it. - // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph. - unsigned count = 0; - for (; pos >= 0; --pos, ++count) - if (Text[pos] != '\\') - break; - return count & 1; - }; - // FIXME: This miscounts tok:unknown tokens that are not just - // whitespace, e.g. a '`' character. 
+ bool InEscape = false; for (int i = 0, e = Text.size(); i != e; ++i) { switch (Text[i]) { + case '\r': + if (i + 1 < e && Text[i + 1] == '\n') + break; + LLVM_FALLTHROUGH; case '\n': ++FormatTok->NewlinesBefore; - FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1); - FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; - Column = 0; - break; - case '\r': + if (!InEscape) + FormatTok->HasUnescapedNewline = true; + InEscape = false; FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; Column = 0; break; @@ -893,24 +943,23 @@ Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0); break; case '\\': - if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n')) - FormatTok->setType(TT_ImplicitStringLiteral); + case '?': + case '/': + InEscape = true; break; default: - FormatTok->setType(TT_ImplicitStringLiteral); + // This shouldn't happen. + assert(false); break; } - if (FormatTok->getType() == TT_ImplicitStringLiteral) - break; } - - if (FormatTok->is(TT_ImplicitStringLiteral)) - break; - WhitespaceLength += FormatTok->Tok.getLength(); - + WhitespaceLength += Text.size(); readRawToken(*FormatTok); } + if (FormatTok->is(tok::unknown)) + FormatTok->setType(TT_ImplicitStringLiteral); + // JavaScript and Java do not allow to escape the end of the line with a // backslash. Backslashes are syntax errors in plain source, but can occur in // comments. 
When a single line comment ends with a \, it'll cause the next @@ -924,41 +973,13 @@ while (BackslashPos != StringRef::npos) { if (BackslashPos + 1 < FormatTok->TokenText.size() && FormatTok->TokenText[BackslashPos + 1] == '\n') { - const char *Offset = Lex->getBufferLocation(); - Offset -= FormatTok->TokenText.size(); - Offset += BackslashPos + 1; - resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); - FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1); - FormatTok->ColumnWidth = encoding::columnWidthWithTabs( - FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth, - Encoding); + resizeToken(BackslashPos + 1); break; } BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1); } } - // In case the token starts with escaped newlines, we want to - // take them into account as whitespace - this pattern is quite frequent - // in macro definitions. - // FIXME: Add a more explicit test. - while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') { - unsigned SkippedWhitespace = 0; - if (FormatTok->TokenText.size() > 2 && - (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n')) - SkippedWhitespace = 3; - else if (FormatTok->TokenText[1] == '\n') - SkippedWhitespace = 2; - else - break; - - ++FormatTok->NewlinesBefore; - WhitespaceLength += SkippedWhitespace; - FormatTok->LastNewlineOffset = SkippedWhitespace; - Column = 0; - FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace); - } - FormatTok->WhitespaceRange = SourceRange( WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));