Index: include/clang/Format/Format.h =================================================================== --- include/clang/Format/Format.h +++ include/clang/Format/Format.h @@ -852,6 +852,22 @@ FormatStyle getStyle(StringRef StyleName, StringRef FileName, StringRef FallbackStyle, vfs::FileSystem *FS = nullptr); +// \brief Returns a string representation of ``Language`` for debugging. +inline StringRef getLanguageName(FormatStyle::LanguageKind Language) { + switch (Language) { + case FormatStyle::LK_Cpp: + return "C++"; + case FormatStyle::LK_Java: + return "Java"; + case FormatStyle::LK_JavaScript: + return "JavaScript"; + case FormatStyle::LK_Proto: + return "Proto"; + default: + return "Unknown"; + } +} + } // end namespace format } // end namespace clang Index: lib/Format/CMakeLists.txt =================================================================== --- lib/Format/CMakeLists.txt +++ lib/Format/CMakeLists.txt @@ -6,6 +6,9 @@ ContinuationIndenter.cpp Format.cpp FormatToken.cpp + FormatTokenLexer.cpp + SortJavaScriptImports.cpp + TokenAnalyzer.cpp TokenAnnotator.cpp UnwrappedLineFormatter.cpp UnwrappedLineParser.cpp Index: lib/Format/Format.cpp =================================================================== --- lib/Format/Format.cpp +++ lib/Format/Format.cpp @@ -16,6 +16,9 @@ #include "clang/Format/Format.h" #include "AffectedRangeManager.h" #include "ContinuationIndenter.h" +#include "FormatTokenLexer.h" +#include "SortJavaScriptImports.h" +#include "TokenAnalyzer.h" #include "TokenAnnotator.h" #include "UnwrappedLineFormatter.h" #include "UnwrappedLineParser.h" @@ -782,776 +785,6 @@ namespace { -class FormatTokenLexer { -public: - FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, - const FormatStyle &Style, encoding::Encoding Encoding) - : FormatTok(nullptr), IsFirstToken(true), GreaterStashed(false), - LessStashed(false), Column(0), TrailingWhitespace(0), - SourceMgr(SourceMgr), ID(ID), Style(Style), - IdentTable(getFormattingLangOpts(Style)), Keywords(IdentTable), - Encoding(Encoding), FirstInLineIndex(0), FormattingDisabled(false), - MacroBlockBeginRegex(Style.MacroBlockBegin), - MacroBlockEndRegex(Style.MacroBlockEnd) { - Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr, - getFormattingLangOpts(Style))); - Lex->SetKeepWhitespaceMode(true); - - for (const std::string &ForEachMacro : Style.ForEachMacros) - ForEachMacros.push_back(&IdentTable.get(ForEachMacro)); - std::sort(ForEachMacros.begin(), ForEachMacros.end()); - } - - ArrayRef lex() { - assert(Tokens.empty()); - assert(FirstInLineIndex == 0); - do { - Tokens.push_back(getNextToken()); - if (Style.Language == FormatStyle::LK_JavaScript) { - tryParseJSRegexLiteral(); - tryParseTemplateString(); - } - tryMergePreviousTokens(); - if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) - FirstInLineIndex = Tokens.size() - 1; - } while (Tokens.back()->Tok.isNot(tok::eof)); - return Tokens; - } - - const AdditionalKeywords &getKeywords() { return Keywords; } - -private: - void tryMergePreviousTokens() { - if (tryMerge_TMacro()) - return; - if (tryMergeConflictMarkers()) - return; - if (tryMergeLessLess()) - return; - - if (Style.Language == FormatStyle::LK_JavaScript) { - static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal}; - static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal, - tok::equal}; - static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater, - tok::greaterequal}; - static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater}; 
- // FIXME: Investigate what token type gives the correct operator priority. - if (tryMergeTokens(JSIdentity, TT_BinaryOperator)) - return; - if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator)) - return; - if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator)) - return; - if (tryMergeTokens(JSRightArrow, TT_JsFatArrow)) - return; - } - } - - bool tryMergeLessLess() { - // Merge X,less,less,Y into X,lessless,Y unless X or Y is less. - if (Tokens.size() < 3) - return false; - - bool FourthTokenIsLess = false; - if (Tokens.size() > 3) - FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less); - - auto First = Tokens.end() - 3; - if (First[2]->is(tok::less) || First[1]->isNot(tok::less) || - First[0]->isNot(tok::less) || FourthTokenIsLess) - return false; - - // Only merge if there currently is no whitespace between the two "<". - if (First[1]->WhitespaceRange.getBegin() != - First[1]->WhitespaceRange.getEnd()) - return false; - - First[0]->Tok.setKind(tok::lessless); - First[0]->TokenText = "<<"; - First[0]->ColumnWidth += 1; - Tokens.erase(Tokens.end() - 2); - return true; - } - - bool tryMergeTokens(ArrayRef Kinds, TokenType NewType) { - if (Tokens.size() < Kinds.size()) - return false; - - SmallVectorImpl::const_iterator First = - Tokens.end() - Kinds.size(); - if (!First[0]->is(Kinds[0])) - return false; - unsigned AddLength = 0; - for (unsigned i = 1; i < Kinds.size(); ++i) { - if (!First[i]->is(Kinds[i]) || - First[i]->WhitespaceRange.getBegin() != - First[i]->WhitespaceRange.getEnd()) - return false; - AddLength += First[i]->TokenText.size(); - } - Tokens.resize(Tokens.size() - Kinds.size() + 1); - First[0]->TokenText = StringRef(First[0]->TokenText.data(), - First[0]->TokenText.size() + AddLength); - First[0]->ColumnWidth += AddLength; - First[0]->Type = NewType; - return true; - } - - // Returns \c true if \p Tok can only be followed by an operand in JavaScript. - bool precedesOperand(FormatToken *Tok) { - // NB: This is not entirely correct, as an r_paren can introduce an operand - // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough - // corner case to not matter in practice, though. - return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace, - tok::r_brace, tok::l_square, tok::semi, tok::exclaim, - tok::colon, tok::question, tok::tilde) || - Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw, - tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void, - tok::kw_typeof, Keywords.kw_instanceof, - Keywords.kw_in) || - Tok->isBinaryOperator(); - } - - bool canPrecedeRegexLiteral(FormatToken *Prev) { - if (!Prev) - return true; - - // Regex literals can only follow after prefix unary operators, not after - // postfix unary operators. If the '++' is followed by a non-operand - // introducing token, the slash here is the operand and not the start of a - // regex. - if (Prev->isOneOf(tok::plusplus, tok::minusminus)) - return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3])); - - // The previous token must introduce an operand location where regex - // literals can occur. - if (!precedesOperand(Prev)) - return false; - - return true; - } - - // Tries to parse a JavaScript Regex literal starting at the current token, - // if that begins with a slash and is in a location where JavaScript allows - // regex literals. Changes the current token to a regex literal and updates - // its text if successful. 
- void tryParseJSRegexLiteral() { - FormatToken *RegexToken = Tokens.back(); - if (!RegexToken->isOneOf(tok::slash, tok::slashequal)) - return; - - FormatToken *Prev = nullptr; - for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) { - // NB: Because previous pointers are not initialized yet, this cannot use - // Token.getPreviousNonComment. - if ((*I)->isNot(tok::comment)) { - Prev = *I; - break; - } - } - - if (!canPrecedeRegexLiteral(Prev)) - return; - - // 'Manually' lex ahead in the current file buffer. - const char *Offset = Lex->getBufferLocation(); - const char *RegexBegin = Offset - RegexToken->TokenText.size(); - StringRef Buffer = Lex->getBuffer(); - bool InCharacterClass = false; - bool HaveClosingSlash = false; - for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) { - // Regular expressions are terminated with a '/', which can only be - // escaped using '\' or a character class between '[' and ']'. - // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5. - switch (*Offset) { - case '\\': - // Skip the escaped character. - ++Offset; - break; - case '[': - InCharacterClass = true; - break; - case ']': - InCharacterClass = false; - break; - case '/': - if (!InCharacterClass) - HaveClosingSlash = true; - break; - } - } - - RegexToken->Type = TT_RegexLiteral; - // Treat regex literals like other string_literals. - RegexToken->Tok.setKind(tok::string_literal); - RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin); - RegexToken->ColumnWidth = RegexToken->TokenText.size(); - - resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); - } - - void tryParseTemplateString() { - FormatToken *BacktickToken = Tokens.back(); - if (!BacktickToken->is(tok::unknown) || BacktickToken->TokenText != "`") - return; - - // 'Manually' lex ahead in the current file buffer. - const char *Offset = Lex->getBufferLocation(); - const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`" - for (; Offset != Lex->getBuffer().end() && *Offset != '`'; ++Offset) { - if (*Offset == '\\') - ++Offset; // Skip the escaped character. - } - - StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1); - BacktickToken->Type = TT_TemplateString; - BacktickToken->Tok.setKind(tok::string_literal); - BacktickToken->TokenText = LiteralText; - - // Adjust width for potentially multiline string literals. - size_t FirstBreak = LiteralText.find('\n'); - StringRef FirstLineText = FirstBreak == StringRef::npos - ? LiteralText - : LiteralText.substr(0, FirstBreak); - BacktickToken->ColumnWidth = encoding::columnWidthWithTabs( - FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding); - size_t LastBreak = LiteralText.rfind('\n'); - if (LastBreak != StringRef::npos) { - BacktickToken->IsMultiline = true; - unsigned StartColumn = 0; // The template tail spans the entire line. 
- BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs( - LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, - Style.TabWidth, Encoding); - } - - resetLexer( - SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1))); - } - - bool tryMerge_TMacro() { - if (Tokens.size() < 4) - return false; - FormatToken *Last = Tokens.back(); - if (!Last->is(tok::r_paren)) - return false; - - FormatToken *String = Tokens[Tokens.size() - 2]; - if (!String->is(tok::string_literal) || String->IsMultiline) - return false; - - if (!Tokens[Tokens.size() - 3]->is(tok::l_paren)) - return false; - - FormatToken *Macro = Tokens[Tokens.size() - 4]; - if (Macro->TokenText != "_T") - return false; - - const char *Start = Macro->TokenText.data(); - const char *End = Last->TokenText.data() + Last->TokenText.size(); - String->TokenText = StringRef(Start, End - Start); - String->IsFirst = Macro->IsFirst; - String->LastNewlineOffset = Macro->LastNewlineOffset; - String->WhitespaceRange = Macro->WhitespaceRange; - String->OriginalColumn = Macro->OriginalColumn; - String->ColumnWidth = encoding::columnWidthWithTabs( - String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding); - String->NewlinesBefore = Macro->NewlinesBefore; - String->HasUnescapedNewline = Macro->HasUnescapedNewline; - - Tokens.pop_back(); - Tokens.pop_back(); - Tokens.pop_back(); - Tokens.back() = String; - return true; - } - - bool tryMergeConflictMarkers() { - if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof)) - return false; - - // Conflict lines look like: - // - // For example: - // >>>>>>> /file/in/file/system at revision 1234 - // - // We merge all tokens in a line that starts with a conflict marker - // into a single token with a special token type that the unwrapped line - // parser will use to correctly rebuild the underlying code. - - FileID ID; - // Get the position of the first token in the line. - unsigned FirstInLineOffset; - std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc( - Tokens[FirstInLineIndex]->getStartOfNonWhitespace()); - StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer(); - // Calculate the offset of the start of the current line. - auto LineOffset = Buffer.rfind('\n', FirstInLineOffset); - if (LineOffset == StringRef::npos) { - LineOffset = 0; - } else { - ++LineOffset; - } - - auto FirstSpace = Buffer.find_first_of(" \n", LineOffset); - StringRef LineStart; - if (FirstSpace == StringRef::npos) { - LineStart = Buffer.substr(LineOffset); - } else { - LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset); - } - - TokenType Type = TT_Unknown; - if (LineStart == "<<<<<<<" || LineStart == ">>>>") { - Type = TT_ConflictStart; - } else if (LineStart == "|||||||" || LineStart == "=======" || - LineStart == "====") { - Type = TT_ConflictAlternative; - } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") { - Type = TT_ConflictEnd; - } - - if (Type != TT_Unknown) { - FormatToken *Next = Tokens.back(); - - Tokens.resize(FirstInLineIndex + 1); - // We do not need to build a complete token here, as we will skip it - // during parsing anyway (as we must not touch whitespace around conflict - // markers). - Tokens.back()->Type = Type; - Tokens.back()->Tok.setKind(tok::kw___unknown_anytype); - - Tokens.push_back(Next); - return true; - } - - return false; - } - - FormatToken *getStashedToken() { - // Create a synthesized second '>' or '<' token. 
- Token Tok = FormatTok->Tok; - StringRef TokenText = FormatTok->TokenText; - - unsigned OriginalColumn = FormatTok->OriginalColumn; - FormatTok = new (Allocator.Allocate()) FormatToken; - FormatTok->Tok = Tok; - SourceLocation TokLocation = - FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1); - FormatTok->Tok.setLocation(TokLocation); - FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation); - FormatTok->TokenText = TokenText; - FormatTok->ColumnWidth = 1; - FormatTok->OriginalColumn = OriginalColumn + 1; - - return FormatTok; - } - - FormatToken *getNextToken() { - if (GreaterStashed) { - GreaterStashed = false; - return getStashedToken(); - } - if (LessStashed) { - LessStashed = false; - return getStashedToken(); - } - - FormatTok = new (Allocator.Allocate()) FormatToken; - readRawToken(*FormatTok); - SourceLocation WhitespaceStart = - FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace); - FormatTok->IsFirst = IsFirstToken; - IsFirstToken = false; - - // Consume and record whitespace until we find a significant token. - unsigned WhitespaceLength = TrailingWhitespace; - while (FormatTok->Tok.is(tok::unknown)) { - StringRef Text = FormatTok->TokenText; - auto EscapesNewline = [&](int pos) { - // A '\r' here is just part of '\r\n'. Skip it. - if (pos >= 0 && Text[pos] == '\r') - --pos; - // See whether there is an odd number of '\' before this. - unsigned count = 0; - for (; pos >= 0; --pos, ++count) - if (Text[pos] != '\\') - break; - return count & 1; - }; - // FIXME: This miscounts tok:unknown tokens that are not just - // whitespace, e.g. a '`' character. - for (int i = 0, e = Text.size(); i != e; ++i) { - switch (Text[i]) { - case '\n': - ++FormatTok->NewlinesBefore; - FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1); - FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; - Column = 0; - break; - case '\r': - FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; - Column = 0; - break; - case '\f': - case '\v': - Column = 0; - break; - case ' ': - ++Column; - break; - case '\t': - Column += Style.TabWidth - Column % Style.TabWidth; - break; - case '\\': - if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n')) - FormatTok->Type = TT_ImplicitStringLiteral; - break; - default: - FormatTok->Type = TT_ImplicitStringLiteral; - break; - } - if (FormatTok->Type == TT_ImplicitStringLiteral) - break; - } - - if (FormatTok->is(TT_ImplicitStringLiteral)) - break; - WhitespaceLength += FormatTok->Tok.getLength(); - - readRawToken(*FormatTok); - } - - // In case the token starts with escaped newlines, we want to - // take them into account as whitespace - this pattern is quite frequent - // in macro definitions. - // FIXME: Add a more explicit test. - while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' && - FormatTok->TokenText[1] == '\n') { - ++FormatTok->NewlinesBefore; - WhitespaceLength += 2; - FormatTok->LastNewlineOffset = 2; - Column = 0; - FormatTok->TokenText = FormatTok->TokenText.substr(2); - } - - FormatTok->WhitespaceRange = SourceRange( - WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); - - FormatTok->OriginalColumn = Column; - - TrailingWhitespace = 0; - if (FormatTok->Tok.is(tok::comment)) { - // FIXME: Add the trimmed whitespace to Column. 
- StringRef UntrimmedText = FormatTok->TokenText; - FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f"); - TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size(); - } else if (FormatTok->Tok.is(tok::raw_identifier)) { - IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText); - FormatTok->Tok.setIdentifierInfo(&Info); - FormatTok->Tok.setKind(Info.getTokenID()); - if (Style.Language == FormatStyle::LK_Java && - FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete, - tok::kw_operator)) { - FormatTok->Tok.setKind(tok::identifier); - FormatTok->Tok.setIdentifierInfo(nullptr); - } else if (Style.Language == FormatStyle::LK_JavaScript && - FormatTok->isOneOf(tok::kw_struct, tok::kw_union, - tok::kw_operator)) { - FormatTok->Tok.setKind(tok::identifier); - FormatTok->Tok.setIdentifierInfo(nullptr); - } - } else if (FormatTok->Tok.is(tok::greatergreater)) { - FormatTok->Tok.setKind(tok::greater); - FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); - GreaterStashed = true; - } else if (FormatTok->Tok.is(tok::lessless)) { - FormatTok->Tok.setKind(tok::less); - FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); - LessStashed = true; - } - - // Now FormatTok is the next non-whitespace token. - - StringRef Text = FormatTok->TokenText; - size_t FirstNewlinePos = Text.find('\n'); - if (FirstNewlinePos == StringRef::npos) { - // FIXME: ColumnWidth actually depends on the start column, we need to - // take this into account when the token is moved. - FormatTok->ColumnWidth = - encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding); - Column += FormatTok->ColumnWidth; - } else { - FormatTok->IsMultiline = true; - // FIXME: ColumnWidth actually depends on the start column, we need to - // take this into account when the token is moved. - FormatTok->ColumnWidth = encoding::columnWidthWithTabs( - Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding); - - // The last line of the token always starts in column 0. - // Thus, the length can be precomputed even in the presence of tabs. - FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs( - Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, - Encoding); - Column = FormatTok->LastLineColumnWidth; - } - - if (Style.Language == FormatStyle::LK_Cpp) { - if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() && - Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() == - tok::pp_define) && - std::find(ForEachMacros.begin(), ForEachMacros.end(), - FormatTok->Tok.getIdentifierInfo()) != - ForEachMacros.end()) { - FormatTok->Type = TT_ForEachMacro; - } else if (FormatTok->is(tok::identifier)) { - if (MacroBlockBeginRegex.match(Text)) { - FormatTok->Type = TT_MacroBlockBegin; - } else if (MacroBlockEndRegex.match(Text)) { - FormatTok->Type = TT_MacroBlockEnd; - } - } - } - - return FormatTok; - } - - FormatToken *FormatTok; - bool IsFirstToken; - bool GreaterStashed, LessStashed; - unsigned Column; - unsigned TrailingWhitespace; - std::unique_ptr Lex; - const SourceManager &SourceMgr; - FileID ID; - const FormatStyle &Style; - IdentifierTable IdentTable; - AdditionalKeywords Keywords; - encoding::Encoding Encoding; - llvm::SpecificBumpPtrAllocator Allocator; - // Index (in 'Tokens') of the last token that starts a new line. 
- unsigned FirstInLineIndex; - SmallVector Tokens; - SmallVector ForEachMacros; - - bool FormattingDisabled; - - llvm::Regex MacroBlockBeginRegex; - llvm::Regex MacroBlockEndRegex; - - void readRawToken(FormatToken &Tok) { - Lex->LexFromRawLexer(Tok.Tok); - Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()), - Tok.Tok.getLength()); - // For formatting, treat unterminated string literals like normal string - // literals. - if (Tok.is(tok::unknown)) { - if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') { - Tok.Tok.setKind(tok::string_literal); - Tok.IsUnterminatedLiteral = true; - } else if (Style.Language == FormatStyle::LK_JavaScript && - Tok.TokenText == "''") { - Tok.Tok.setKind(tok::string_literal); - } - } - - if (Style.Language == FormatStyle::LK_JavaScript && - Tok.is(tok::char_constant)) { - Tok.Tok.setKind(tok::string_literal); - } - - if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" || - Tok.TokenText == "/* clang-format on */")) { - FormattingDisabled = false; - } - - Tok.Finalized = FormattingDisabled; - - if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" || - Tok.TokenText == "/* clang-format off */")) { - FormattingDisabled = true; - } - } - - void resetLexer(unsigned Offset) { - StringRef Buffer = SourceMgr.getBufferData(ID); - Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), - getFormattingLangOpts(Style), Buffer.begin(), - Buffer.begin() + Offset, Buffer.end())); - Lex->SetKeepWhitespaceMode(true); - TrailingWhitespace = 0; - } -}; - -static StringRef getLanguageName(FormatStyle::LanguageKind Language) { - switch (Language) { - case FormatStyle::LK_Cpp: - return "C++"; - case FormatStyle::LK_Java: - return "Java"; - case FormatStyle::LK_JavaScript: - return "JavaScript"; - case FormatStyle::LK_Proto: - return "Proto"; - default: - return "Unknown"; - } -} - -class Environment { -public: - Environment(SourceManager &SM, FileID ID, ArrayRef Ranges) - : ID(ID), CharRanges(Ranges.begin(), Ranges.end()), SM(SM) {} - - Environment(FileID ID, std::unique_ptr FileMgr, - std::unique_ptr VirtualSM, - std::unique_ptr Diagnostics, - const std::vector &CharRanges) - : ID(ID), CharRanges(CharRanges.begin(), CharRanges.end()), - SM(*VirtualSM), FileMgr(std::move(FileMgr)), - VirtualSM(std::move(VirtualSM)), Diagnostics(std::move(Diagnostics)) {} - - // This sets up an virtual file system with file \p FileName containing \p - // Code. - static std::unique_ptr - CreateVirtualEnvironment(StringRef Code, StringRef FileName, - ArrayRef Ranges) { - // This is referenced by `FileMgr` and will be released by `FileMgr` when it - // is deleted. - IntrusiveRefCntPtr InMemoryFileSystem( - new vfs::InMemoryFileSystem); - // This is passed to `SM` as reference, so the pointer has to be referenced - // in `Environment` so that `FileMgr` can out-live this function scope. - std::unique_ptr FileMgr( - new FileManager(FileSystemOptions(), InMemoryFileSystem)); - // This is passed to `SM` as reference, so the pointer has to be referenced - // by `Environment` due to the same reason above. - std::unique_ptr Diagnostics(new DiagnosticsEngine( - IntrusiveRefCntPtr(new DiagnosticIDs), - new DiagnosticOptions)); - // This will be stored as reference, so the pointer has to be stored in - // due to the same reason above. 
- std::unique_ptr VirtualSM( - new SourceManager(*Diagnostics, *FileMgr)); - InMemoryFileSystem->addFile( - FileName, 0, llvm::MemoryBuffer::getMemBuffer( - Code, FileName, /*RequiresNullTerminator=*/false)); - FileID ID = VirtualSM->createFileID( - FileMgr->getFile(FileName), SourceLocation(), clang::SrcMgr::C_User); - assert(ID.isValid()); - SourceLocation StartOfFile = VirtualSM->getLocForStartOfFile(ID); - std::vector CharRanges; - for (const tooling::Range &Range : Ranges) { - SourceLocation Start = StartOfFile.getLocWithOffset(Range.getOffset()); - SourceLocation End = Start.getLocWithOffset(Range.getLength()); - CharRanges.push_back(CharSourceRange::getCharRange(Start, End)); - } - return llvm::make_unique(ID, std::move(FileMgr), - std::move(VirtualSM), - std::move(Diagnostics), CharRanges); - } - - FileID getFileID() const { return ID; } - - StringRef getFileName() const { return FileName; } - - ArrayRef getCharRanges() const { return CharRanges; } - - const SourceManager &getSourceManager() const { return SM; } - -private: - FileID ID; - StringRef FileName; - SmallVector CharRanges; - SourceManager &SM; - - // The order of these fields are important - they should be in the same order - // as they are created in `CreateVirtualEnvironment` so that they can be - // deleted in the reverse order as they are created. - std::unique_ptr FileMgr; - std::unique_ptr VirtualSM; - std::unique_ptr Diagnostics; -}; - -class TokenAnalyzer : public UnwrappedLineConsumer { -public: - TokenAnalyzer(const Environment &Env, const FormatStyle &Style) - : Style(Style), Env(Env), - AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()), - UnwrappedLines(1), - Encoding(encoding::detectEncoding( - Env.getSourceManager().getBufferData(Env.getFileID()))) { - DEBUG(llvm::dbgs() << "File encoding: " - << (Encoding == encoding::Encoding_UTF8 ? 
"UTF8" - : "unknown") - << "\n"); - DEBUG(llvm::dbgs() << "Language: " << getLanguageName(Style.Language) - << "\n"); - } - - tooling::Replacements process() { - tooling::Replacements Result; - FormatTokenLexer Tokens(Env.getSourceManager(), Env.getFileID(), Style, - Encoding); - - UnwrappedLineParser Parser(Style, Tokens.getKeywords(), Tokens.lex(), - *this); - Parser.parse(); - assert(UnwrappedLines.rbegin()->empty()); - for (unsigned Run = 0, RunE = UnwrappedLines.size(); Run + 1 != RunE; - ++Run) { - DEBUG(llvm::dbgs() << "Run " << Run << "...\n"); - SmallVector AnnotatedLines; - - TokenAnnotator Annotator(Style, Tokens.getKeywords()); - for (unsigned i = 0, e = UnwrappedLines[Run].size(); i != e; ++i) { - AnnotatedLines.push_back(new AnnotatedLine(UnwrappedLines[Run][i])); - Annotator.annotate(*AnnotatedLines.back()); - } - - tooling::Replacements RunResult = - analyze(Annotator, AnnotatedLines, Tokens, Result); - - DEBUG({ - llvm::dbgs() << "Replacements for run " << Run << ":\n"; - for (tooling::Replacements::iterator I = RunResult.begin(), - E = RunResult.end(); - I != E; ++I) { - llvm::dbgs() << I->toString() << "\n"; - } - }); - for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) { - delete AnnotatedLines[i]; - } - Result.insert(RunResult.begin(), RunResult.end()); - } - return Result; - } - -protected: - virtual tooling::Replacements - analyze(TokenAnnotator &Annotator, - SmallVectorImpl &AnnotatedLines, - FormatTokenLexer &Tokens, tooling::Replacements &Result) = 0; - - void consumeUnwrappedLine(const UnwrappedLine &TheLine) override { - assert(!UnwrappedLines.empty()); - UnwrappedLines.back().push_back(TheLine); - } - - void finishRun() override { - UnwrappedLines.push_back(SmallVector()); - } - - FormatStyle Style; - // Stores Style, FileID and SourceManager etc. - const Environment &Env; - // AffectedRangeMgr stores ranges to be fixed. - AffectedRangeManager AffectedRangeMgr; - SmallVector, 2> UnwrappedLines; - encoding::Encoding Encoding; -}; - class Formatter : public TokenAnalyzer { public: Formatter(const Environment &Env, const FormatStyle &Style, @@ -1987,6 +1220,8 @@ tooling::Replacements Replaces; if (!Style.SortIncludes) return Replaces; + if (Style.Language == FormatStyle::LanguageKind::LK_JavaScript) + return sortJavaScriptImports(Style, Code, Ranges, FileName); unsigned Prev = 0; unsigned SearchFrom = 0; Index: lib/Format/FormatToken.h =================================================================== --- lib/Format/FormatToken.h +++ lib/Format/FormatToken.h @@ -535,6 +535,7 @@ kw_NS_ENUM = &IdentTable.get("NS_ENUM"); kw_NS_OPTIONS = &IdentTable.get("NS_OPTIONS"); + kw_as = &IdentTable.get("as"); kw_async = &IdentTable.get("async"); kw_await = &IdentTable.get("await"); kw_finally = &IdentTable.get("finally"); @@ -585,6 +586,7 @@ IdentifierInfo *kw___except; // JavaScript keywords. + IdentifierInfo *kw_as; IdentifierInfo *kw_async; IdentifierInfo *kw_await; IdentifierInfo *kw_finally; Index: lib/Format/FormatTokenLexer.h =================================================================== --- /dev/null +++ lib/Format/FormatTokenLexer.h @@ -0,0 +1,97 @@ +//===--- FormatTokenLexer.h - Format C++ code ----------------*- C++ ----*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file contains FormatTokenLexer, which tokenizes a source file +/// into a token stream suitable for ClangFormat. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H +#define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H + +#include "Encoding.h" +#include "FormatToken.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Format/Format.h" +#include "llvm/Support/Regex.h" + +namespace clang { +namespace format { + +class FormatTokenLexer { +public: + FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, + const FormatStyle &Style, encoding::Encoding Encoding); + + ArrayRef lex(); + + const AdditionalKeywords &getKeywords() { return Keywords; } + +private: + void tryMergePreviousTokens(); + + bool tryMergeLessLess(); + + bool tryMergeTokens(ArrayRef Kinds, TokenType NewType); + + // Returns \c true if \p Tok can only be followed by an operand in JavaScript. + bool precedesOperand(FormatToken *Tok); + + bool canPrecedeRegexLiteral(FormatToken *Prev); + + // Tries to parse a JavaScript Regex literal starting at the current token, + // if that begins with a slash and is in a location where JavaScript allows + // regex literals. Changes the current token to a regex literal and updates + // its text if successful. + void tryParseJSRegexLiteral(); + + void tryParseTemplateString(); + + bool tryMerge_TMacro(); + + bool tryMergeConflictMarkers(); + + FormatToken *getStashedToken(); + + FormatToken *getNextToken(); + + FormatToken *FormatTok; + bool IsFirstToken; + bool GreaterStashed, LessStashed; + unsigned Column; + unsigned TrailingWhitespace; + std::unique_ptr Lex; + const SourceManager &SourceMgr; + FileID ID; + const FormatStyle &Style; + IdentifierTable IdentTable; + AdditionalKeywords Keywords; + encoding::Encoding Encoding; + llvm::SpecificBumpPtrAllocator Allocator; + // Index (in 'Tokens') of the last token that starts a new line. + unsigned FirstInLineIndex; + SmallVector Tokens; + SmallVector ForEachMacros; + + bool FormattingDisabled; + + llvm::Regex MacroBlockBeginRegex; + llvm::Regex MacroBlockEndRegex; + + void readRawToken(FormatToken &Tok); + + void resetLexer(unsigned Offset); +}; + +} // namespace format +} // namespace clang + +#endif Index: lib/Format/FormatTokenLexer.cpp =================================================================== --- /dev/null +++ lib/Format/FormatTokenLexer.cpp @@ -0,0 +1,600 @@ +//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements FormatTokenLexer, which tokenizes a source file +/// into a FormatToken stream suitable for ClangFormat. 
+///
+//===----------------------------------------------------------------------===//
+
+#include "FormatTokenLexer.h"
+#include "FormatToken.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/Support/Regex.h"
+
+namespace clang {
+namespace format {
+
+FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
+                                   const FormatStyle &Style,
+                                   encoding::Encoding Encoding)
+    : FormatTok(nullptr), IsFirstToken(true), GreaterStashed(false),
+      LessStashed(false), Column(0), TrailingWhitespace(0),
+      SourceMgr(SourceMgr), ID(ID), Style(Style),
+      IdentTable(getFormattingLangOpts(Style)), Keywords(IdentTable),
+      Encoding(Encoding), FirstInLineIndex(0), FormattingDisabled(false),
+      MacroBlockBeginRegex(Style.MacroBlockBegin),
+      MacroBlockEndRegex(Style.MacroBlockEnd) {
+  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
+                      getFormattingLangOpts(Style)));
+  Lex->SetKeepWhitespaceMode(true);
+
+  for (const std::string &ForEachMacro : Style.ForEachMacros)
+    ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
+  std::sort(ForEachMacros.begin(), ForEachMacros.end());
+}
+
+ArrayRef<FormatToken *> FormatTokenLexer::lex() {
+  assert(Tokens.empty());
+  assert(FirstInLineIndex == 0);
+  do {
+    Tokens.push_back(getNextToken());
+    if (Style.Language == FormatStyle::LK_JavaScript) {
+      tryParseJSRegexLiteral();
+      tryParseTemplateString();
+    }
+    tryMergePreviousTokens();
+    if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
+      FirstInLineIndex = Tokens.size() - 1;
+  } while (Tokens.back()->Tok.isNot(tok::eof));
+  return Tokens;
+}
+
+void FormatTokenLexer::tryMergePreviousTokens() {
+  if (tryMerge_TMacro())
+    return;
+  if (tryMergeConflictMarkers())
+    return;
+  if (tryMergeLessLess())
+    return;
+
+  if (Style.Language == FormatStyle::LK_JavaScript) {
+    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
+    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
+                                                   tok::equal};
+    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
+                                                  tok::greaterequal};
+    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
+    // FIXME: Investigate what token type gives the correct operator priority.
+    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
+      return;
+    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
+      return;
+    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
+      return;
+    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
+      return;
+  }
+}
+
+bool FormatTokenLexer::tryMergeLessLess() {
+  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
+  if (Tokens.size() < 3)
+    return false;
+
+  bool FourthTokenIsLess = false;
+  if (Tokens.size() > 3)
+    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
+
+  auto First = Tokens.end() - 3;
+  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
+      First[0]->isNot(tok::less) || FourthTokenIsLess)
+    return false;
+
+  // Only merge if there currently is no whitespace between the two "<".
+ if (First[1]->WhitespaceRange.getBegin() != + First[1]->WhitespaceRange.getEnd()) + return false; + + First[0]->Tok.setKind(tok::lessless); + First[0]->TokenText = "<<"; + First[0]->ColumnWidth += 1; + Tokens.erase(Tokens.end() - 2); + return true; +} + +bool FormatTokenLexer::tryMergeTokens(ArrayRef Kinds, + TokenType NewType) { + if (Tokens.size() < Kinds.size()) + return false; + + SmallVectorImpl::const_iterator First = + Tokens.end() - Kinds.size(); + if (!First[0]->is(Kinds[0])) + return false; + unsigned AddLength = 0; + for (unsigned i = 1; i < Kinds.size(); ++i) { + if (!First[i]->is(Kinds[i]) || + First[i]->WhitespaceRange.getBegin() != + First[i]->WhitespaceRange.getEnd()) + return false; + AddLength += First[i]->TokenText.size(); + } + Tokens.resize(Tokens.size() - Kinds.size() + 1); + First[0]->TokenText = StringRef(First[0]->TokenText.data(), + First[0]->TokenText.size() + AddLength); + First[0]->ColumnWidth += AddLength; + First[0]->Type = NewType; + return true; +} + +// Returns \c true if \p Tok can only be followed by an operand in JavaScript. +bool FormatTokenLexer::precedesOperand(FormatToken *Tok) { + // NB: This is not entirely correct, as an r_paren can introduce an operand + // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough + // corner case to not matter in practice, though. + return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace, + tok::r_brace, tok::l_square, tok::semi, tok::exclaim, + tok::colon, tok::question, tok::tilde) || + Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw, + tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void, + tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) || + Tok->isBinaryOperator(); +} + +bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) { + if (!Prev) + return true; + + // Regex literals can only follow after prefix unary operators, not after + // postfix unary operators. If the '++' is followed by a non-operand + // introducing token, the slash here is the operand and not the start of a + // regex. + if (Prev->isOneOf(tok::plusplus, tok::minusminus)) + return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3])); + + // The previous token must introduce an operand location where regex + // literals can occur. + if (!precedesOperand(Prev)) + return false; + + return true; +} + +// Tries to parse a JavaScript Regex literal starting at the current token, +// if that begins with a slash and is in a location where JavaScript allows +// regex literals. Changes the current token to a regex literal and updates +// its text if successful. +void FormatTokenLexer::tryParseJSRegexLiteral() { + FormatToken *RegexToken = Tokens.back(); + if (!RegexToken->isOneOf(tok::slash, tok::slashequal)) + return; + + FormatToken *Prev = nullptr; + for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) { + // NB: Because previous pointers are not initialized yet, this cannot use + // Token.getPreviousNonComment. + if ((*I)->isNot(tok::comment)) { + Prev = *I; + break; + } + } + + if (!canPrecedeRegexLiteral(Prev)) + return; + + // 'Manually' lex ahead in the current file buffer. 
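+  // For example, in `var r = /\/[/]a/;` the scan below skips the escaped '/'
+  // and the '/' inside the character class, and stops only at the closing
+  // unescaped '/'.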
+ const char *Offset = Lex->getBufferLocation(); + const char *RegexBegin = Offset - RegexToken->TokenText.size(); + StringRef Buffer = Lex->getBuffer(); + bool InCharacterClass = false; + bool HaveClosingSlash = false; + for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) { + // Regular expressions are terminated with a '/', which can only be + // escaped using '\' or a character class between '[' and ']'. + // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5. + switch (*Offset) { + case '\\': + // Skip the escaped character. + ++Offset; + break; + case '[': + InCharacterClass = true; + break; + case ']': + InCharacterClass = false; + break; + case '/': + if (!InCharacterClass) + HaveClosingSlash = true; + break; + } + } + + RegexToken->Type = TT_RegexLiteral; + // Treat regex literals like other string_literals. + RegexToken->Tok.setKind(tok::string_literal); + RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin); + RegexToken->ColumnWidth = RegexToken->TokenText.size(); + + resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); +} + +void FormatTokenLexer::tryParseTemplateString() { + FormatToken *BacktickToken = Tokens.back(); + if (!BacktickToken->is(tok::unknown) || BacktickToken->TokenText != "`") + return; + + // 'Manually' lex ahead in the current file buffer. + const char *Offset = Lex->getBufferLocation(); + const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`" + for (; Offset != Lex->getBuffer().end() && *Offset != '`'; ++Offset) { + if (*Offset == '\\') + ++Offset; // Skip the escaped character. + } + + StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1); + BacktickToken->Type = TT_TemplateString; + BacktickToken->Tok.setKind(tok::string_literal); + BacktickToken->TokenText = LiteralText; + + // Adjust width for potentially multiline string literals. + size_t FirstBreak = LiteralText.find('\n'); + StringRef FirstLineText = FirstBreak == StringRef::npos + ? LiteralText + : LiteralText.substr(0, FirstBreak); + BacktickToken->ColumnWidth = encoding::columnWidthWithTabs( + FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding); + size_t LastBreak = LiteralText.rfind('\n'); + if (LastBreak != StringRef::npos) { + BacktickToken->IsMultiline = true; + unsigned StartColumn = 0; // The template tail spans the entire line. 
+ BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs( + LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, + Style.TabWidth, Encoding); + } + + resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1))); +} + +bool FormatTokenLexer::tryMerge_TMacro() { + if (Tokens.size() < 4) + return false; + FormatToken *Last = Tokens.back(); + if (!Last->is(tok::r_paren)) + return false; + + FormatToken *String = Tokens[Tokens.size() - 2]; + if (!String->is(tok::string_literal) || String->IsMultiline) + return false; + + if (!Tokens[Tokens.size() - 3]->is(tok::l_paren)) + return false; + + FormatToken *Macro = Tokens[Tokens.size() - 4]; + if (Macro->TokenText != "_T") + return false; + + const char *Start = Macro->TokenText.data(); + const char *End = Last->TokenText.data() + Last->TokenText.size(); + String->TokenText = StringRef(Start, End - Start); + String->IsFirst = Macro->IsFirst; + String->LastNewlineOffset = Macro->LastNewlineOffset; + String->WhitespaceRange = Macro->WhitespaceRange; + String->OriginalColumn = Macro->OriginalColumn; + String->ColumnWidth = encoding::columnWidthWithTabs( + String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding); + String->NewlinesBefore = Macro->NewlinesBefore; + String->HasUnescapedNewline = Macro->HasUnescapedNewline; + + Tokens.pop_back(); + Tokens.pop_back(); + Tokens.pop_back(); + Tokens.back() = String; + return true; +} + +bool FormatTokenLexer::tryMergeConflictMarkers() { + if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof)) + return false; + + // Conflict lines look like: + // + // For example: + // >>>>>>> /file/in/file/system at revision 1234 + // + // We merge all tokens in a line that starts with a conflict marker + // into a single token with a special token type that the unwrapped line + // parser will use to correctly rebuild the underlying code. + + FileID ID; + // Get the position of the first token in the line. + unsigned FirstInLineOffset; + std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc( + Tokens[FirstInLineIndex]->getStartOfNonWhitespace()); + StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer(); + // Calculate the offset of the start of the current line. + auto LineOffset = Buffer.rfind('\n', FirstInLineOffset); + if (LineOffset == StringRef::npos) { + LineOffset = 0; + } else { + ++LineOffset; + } + + auto FirstSpace = Buffer.find_first_of(" \n", LineOffset); + StringRef LineStart; + if (FirstSpace == StringRef::npos) { + LineStart = Buffer.substr(LineOffset); + } else { + LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset); + } + + TokenType Type = TT_Unknown; + if (LineStart == "<<<<<<<" || LineStart == ">>>>") { + Type = TT_ConflictStart; + } else if (LineStart == "|||||||" || LineStart == "=======" || + LineStart == "====") { + Type = TT_ConflictAlternative; + } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") { + Type = TT_ConflictEnd; + } + + if (Type != TT_Unknown) { + FormatToken *Next = Tokens.back(); + + Tokens.resize(FirstInLineIndex + 1); + // We do not need to build a complete token here, as we will skip it + // during parsing anyway (as we must not touch whitespace around conflict + // markers). + Tokens.back()->Type = Type; + Tokens.back()->Tok.setKind(tok::kw___unknown_anytype); + + Tokens.push_back(Next); + return true; + } + + return false; +} + +FormatToken *FormatTokenLexer::getStashedToken() { + // Create a synthesized second '>' or '<' token. 
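+  // Note: getNextToken() splits 'tok::greatergreater' / 'tok::lessless' into
+  // two single '>' / '<' tokens and sets GreaterStashed / LessStashed; this
+  // builds the second half, one column to the right of the first.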
+ Token Tok = FormatTok->Tok; + StringRef TokenText = FormatTok->TokenText; + + unsigned OriginalColumn = FormatTok->OriginalColumn; + FormatTok = new (Allocator.Allocate()) FormatToken; + FormatTok->Tok = Tok; + SourceLocation TokLocation = + FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1); + FormatTok->Tok.setLocation(TokLocation); + FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation); + FormatTok->TokenText = TokenText; + FormatTok->ColumnWidth = 1; + FormatTok->OriginalColumn = OriginalColumn + 1; + + return FormatTok; +} + +FormatToken *FormatTokenLexer::getNextToken() { + if (GreaterStashed) { + GreaterStashed = false; + return getStashedToken(); + } + if (LessStashed) { + LessStashed = false; + return getStashedToken(); + } + + FormatTok = new (Allocator.Allocate()) FormatToken; + readRawToken(*FormatTok); + SourceLocation WhitespaceStart = + FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace); + FormatTok->IsFirst = IsFirstToken; + IsFirstToken = false; + + // Consume and record whitespace until we find a significant token. + unsigned WhitespaceLength = TrailingWhitespace; + while (FormatTok->Tok.is(tok::unknown)) { + StringRef Text = FormatTok->TokenText; + auto EscapesNewline = [&](int pos) { + // A '\r' here is just part of '\r\n'. Skip it. + if (pos >= 0 && Text[pos] == '\r') + --pos; + // See whether there is an odd number of '\' before this. + unsigned count = 0; + for (; pos >= 0; --pos, ++count) + if (Text[pos] != '\\') + break; + return count & 1; + }; + // FIXME: This miscounts tok:unknown tokens that are not just + // whitespace, e.g. a '`' character. + for (int i = 0, e = Text.size(); i != e; ++i) { + switch (Text[i]) { + case '\n': + ++FormatTok->NewlinesBefore; + FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1); + FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; + Column = 0; + break; + case '\r': + FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; + Column = 0; + break; + case '\f': + case '\v': + Column = 0; + break; + case ' ': + ++Column; + break; + case '\t': + Column += Style.TabWidth - Column % Style.TabWidth; + break; + case '\\': + if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n')) + FormatTok->Type = TT_ImplicitStringLiteral; + break; + default: + FormatTok->Type = TT_ImplicitStringLiteral; + break; + } + if (FormatTok->Type == TT_ImplicitStringLiteral) + break; + } + + if (FormatTok->is(TT_ImplicitStringLiteral)) + break; + WhitespaceLength += FormatTok->Tok.getLength(); + + readRawToken(*FormatTok); + } + + // In case the token starts with escaped newlines, we want to + // take them into account as whitespace - this pattern is quite frequent + // in macro definitions. + // FIXME: Add a more explicit test. + while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' && + FormatTok->TokenText[1] == '\n') { + ++FormatTok->NewlinesBefore; + WhitespaceLength += 2; + FormatTok->LastNewlineOffset = 2; + Column = 0; + FormatTok->TokenText = FormatTok->TokenText.substr(2); + } + + FormatTok->WhitespaceRange = SourceRange( + WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); + + FormatTok->OriginalColumn = Column; + + TrailingWhitespace = 0; + if (FormatTok->Tok.is(tok::comment)) { + // FIXME: Add the trimmed whitespace to Column. 
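+    // For example, a comment "// foo  " is trimmed to "// foo"; the removed
+    // characters are accounted for as TrailingWhitespace of the next token.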
+ StringRef UntrimmedText = FormatTok->TokenText; + FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f"); + TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size(); + } else if (FormatTok->Tok.is(tok::raw_identifier)) { + IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText); + FormatTok->Tok.setIdentifierInfo(&Info); + FormatTok->Tok.setKind(Info.getTokenID()); + if (Style.Language == FormatStyle::LK_Java && + FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete, + tok::kw_operator)) { + FormatTok->Tok.setKind(tok::identifier); + FormatTok->Tok.setIdentifierInfo(nullptr); + } else if (Style.Language == FormatStyle::LK_JavaScript && + FormatTok->isOneOf(tok::kw_struct, tok::kw_union, + tok::kw_operator)) { + FormatTok->Tok.setKind(tok::identifier); + FormatTok->Tok.setIdentifierInfo(nullptr); + } + } else if (FormatTok->Tok.is(tok::greatergreater)) { + FormatTok->Tok.setKind(tok::greater); + FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); + GreaterStashed = true; + } else if (FormatTok->Tok.is(tok::lessless)) { + FormatTok->Tok.setKind(tok::less); + FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); + LessStashed = true; + } + + // Now FormatTok is the next non-whitespace token. + + StringRef Text = FormatTok->TokenText; + size_t FirstNewlinePos = Text.find('\n'); + if (FirstNewlinePos == StringRef::npos) { + // FIXME: ColumnWidth actually depends on the start column, we need to + // take this into account when the token is moved. + FormatTok->ColumnWidth = + encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding); + Column += FormatTok->ColumnWidth; + } else { + FormatTok->IsMultiline = true; + // FIXME: ColumnWidth actually depends on the start column, we need to + // take this into account when the token is moved. + FormatTok->ColumnWidth = encoding::columnWidthWithTabs( + Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding); + + // The last line of the token always starts in column 0. + // Thus, the length can be precomputed even in the presence of tabs. + FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs( + Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding); + Column = FormatTok->LastLineColumnWidth; + } + + if (Style.Language == FormatStyle::LK_Cpp) { + if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() && + Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() == + tok::pp_define) && + std::find(ForEachMacros.begin(), ForEachMacros.end(), + FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) { + FormatTok->Type = TT_ForEachMacro; + } else if (FormatTok->is(tok::identifier)) { + if (MacroBlockBeginRegex.match(Text)) { + FormatTok->Type = TT_MacroBlockBegin; + } else if (MacroBlockEndRegex.match(Text)) { + FormatTok->Type = TT_MacroBlockEnd; + } + } + } + + return FormatTok; +} + +void FormatTokenLexer::readRawToken(FormatToken &Tok) { + Lex->LexFromRawLexer(Tok.Tok); + Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()), + Tok.Tok.getLength()); + // For formatting, treat unterminated string literals like normal string + // literals. 
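+  // For example, a lone `"abc` at the end of the buffer is lexed as
+  // tok::unknown; it is re-tagged as a string literal and marked
+  // IsUnterminatedLiteral.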
+ if (Tok.is(tok::unknown)) { + if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') { + Tok.Tok.setKind(tok::string_literal); + Tok.IsUnterminatedLiteral = true; + } else if (Style.Language == FormatStyle::LK_JavaScript && + Tok.TokenText == "''") { + Tok.Tok.setKind(tok::string_literal); + } + } + + if (Style.Language == FormatStyle::LK_JavaScript && + Tok.is(tok::char_constant)) { + Tok.Tok.setKind(tok::string_literal); + } + + if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" || + Tok.TokenText == "/* clang-format on */")) { + FormattingDisabled = false; + } + + Tok.Finalized = FormattingDisabled; + + if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" || + Tok.TokenText == "/* clang-format off */")) { + FormattingDisabled = true; + } +} + +void FormatTokenLexer::resetLexer(unsigned Offset) { + StringRef Buffer = SourceMgr.getBufferData(ID); + Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), + getFormattingLangOpts(Style), Buffer.begin(), + Buffer.begin() + Offset, Buffer.end())); + Lex->SetKeepWhitespaceMode(true); + TrailingWhitespace = 0; +} + +} // namespace format +} // namespace clang Index: lib/Format/SortJavaScriptImports.h =================================================================== --- /dev/null +++ lib/Format/SortJavaScriptImports.h @@ -0,0 +1,35 @@ +//===--- SortJavaScriptImports.h - Sort ES6 Imports -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a sorter for JavaScript ES6 imports. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H +#define LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H + +#include "clang/Basic/LLVM.h" +#include "clang/Format/Format.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" + +namespace clang { +namespace format { + +// Sort JavaScript ES6 imports/exports in ``Code``. +tooling::Replacements sortJavaScriptImports(const FormatStyle &Style, + StringRef Code, + ArrayRef Ranges, + StringRef FileName); + +} // end namespace format +} // end namespace clang + +#endif Index: lib/Format/SortJavaScriptImports.cpp =================================================================== --- /dev/null +++ lib/Format/SortJavaScriptImports.cpp @@ -0,0 +1,310 @@ +//===--- SortJavaScriptImports.h - Sort ES6 Imports -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a sort operation for JavaScript ES6 imports. 
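+///
+/// Illustrative example: an affected range containing
+///   import {B} from './b';
+///   import {A} from './a';
+/// is rewritten with the two statements swapped, so './a' comes first.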
+///
+//===----------------------------------------------------------------------===//
+
+#include "SortJavaScriptImports.h"
+#include "TokenAnalyzer.h"
+#include "TokenAnnotator.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+
+#define DEBUG_TYPE "format-formatter"
+
+namespace clang {
+namespace format {
+
+class FormatTokenLexer;
+
+using clang::format::FormatStyle;
+
+// An imported symbol in a JavaScript ES6 import/export, possibly aliased.
+struct JsImportedSymbol {
+  StringRef Symbol;
+  StringRef Alias;
+};
+
+struct JsImportExport {
+  bool IsExport;
+  // JS imports are sorted into these categories, in order.
+  enum JsImportCategory {
+    SIDE_EFFECT,     // "import 'something';"
+    ABSOLUTE,        // from 'something'
+    RELATIVE_PARENT, // from '../*'
+    RELATIVE,        // from './*'
+  };
+  JsImportCategory Category;
+  // The URL imported, e.g. `import .. from 'url';`. Empty for `export {a, b};`.
+  StringRef URL;
+  // Prefix from "import * as prefix". Empty for symbol imports. Implies an
+  // empty names list.
+  StringRef Prefix;
+  // Symbols from `import {SymbolA, SymbolB, ...} from ...;`.
+  SmallVector<JsImportedSymbol, 1> Symbols;
+  // Textual position of the import/export, including preceding and trailing
+  // comments.
+  SourceLocation Start;
+  SourceLocation End;
+};
+
+bool operator<(const JsImportExport &LHS, const JsImportExport &RHS) {
+  if (LHS.IsExport != RHS.IsExport)
+    return LHS.IsExport < RHS.IsExport;
+  if (LHS.Category != RHS.Category)
+    return LHS.Category < RHS.Category;
+  if (LHS.Category == JsImportExport::JsImportCategory::SIDE_EFFECT)
+    // Side effect imports might be ordering sensitive. Consider them equal so
+    // that they maintain their relative order in the stable sort below.
+    return false;
+  // NB: empty URLs sort *last* (for export {...};).
+  if (LHS.URL.empty() != RHS.URL.empty())
+    return LHS.URL.empty() < RHS.URL.empty();
+  if (LHS.URL != RHS.URL)
+    return LHS.URL < RHS.URL;
+  // NB: '*' imports (with prefix) sort before {a, b, ...} imports.
+  if (LHS.Prefix.empty() != RHS.Prefix.empty())
+    return LHS.Prefix.empty() < RHS.Prefix.empty();
+  if (LHS.Prefix != RHS.Prefix)
+    return LHS.Prefix > RHS.Prefix;
+  return false;
+}
+
+// JavaScriptImportSorter sorts JavaScript ES6 imports and exports. It is
+// implemented as a TokenAnalyzer because ES6 imports have substantial
+// syntactic structure, making it messy to sort them using regular expressions.
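+//
+// With the comparison above, a sorted block ends up ordered roughly like this
+// (illustrative example):
+//   import 'side/effect';            // side-effect imports first, kept stable
+//   import {A} from 'absolute/path'; // then absolute URLs
+//   import {B} from '../parent';     // then '../' relative URLs
+//   import {C} from './sibling';     // then './' relative URLs
+//   export {D};                      // exports sort after all imports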
+class JavaScriptImportSorter : public TokenAnalyzer { +public: + JavaScriptImportSorter(const Environment &Env, const FormatStyle &Style) + : TokenAnalyzer(Env, Style), + FileContents(Env.getSourceManager().getBufferData(Env.getFileID())) {} + + tooling::Replacements + analyze(TokenAnnotator &Annotator, + SmallVectorImpl &AnnotatedLines, + FormatTokenLexer &Tokens, tooling::Replacements &Result) override { + AffectedRangeMgr.computeAffectedLines(AnnotatedLines.begin(), + AnnotatedLines.end()); + + const AdditionalKeywords &Keywords = Tokens.getKeywords(); + + SmallVector Imports; + SourceLocation LastStart; + for (auto Line : AnnotatedLines) { + if (!Line->Affected) + break; + Current = Line->First; + LineEnd = Line->Last; + JsImportExport ImpExp; + skipComments(); + if (LastStart.isInvalid() || Imports.empty()) + // After the first file level comment, consider line comments to be part + // of the import that immediately follows them by using the previously + // set LastStart. + LastStart = Line->First->Tok.getLocation(); + if (!Current) + continue; // Only comments on this line. + ImpExp.Start = LastStart; + LastStart = SourceLocation(); + if (!parseImportExport(Keywords, ImpExp)) + break; + ImpExp.End = LineEnd->Tok.getEndLoc(); + DEBUG({ + llvm::dbgs() << "Import: {" + << "is_export: " << ImpExp.IsExport + << ", cat: " << ImpExp.Category << ", url: " << ImpExp.URL + << ", prefix: " << ImpExp.Prefix; + for (size_t i = 0; i < ImpExp.Symbols.size(); ++i) + llvm::dbgs() << ", " << ImpExp.Symbols[i].Symbol << " as " + << ImpExp.Symbols[i].Alias; + llvm::dbgs() << ", text: " << getSourceText(ImpExp.Start, ImpExp.End); + llvm::dbgs() << "}\n"; + }); + Imports.push_back(ImpExp); + } + + if (Imports.empty()) + return Result; + + SmallVector Indices; + for (unsigned i = 0, e = Imports.size(); i != e; ++i) + Indices.push_back(i); + std::stable_sort(Indices.begin(), Indices.end(), + [&](unsigned LHSI, unsigned RHSI) { + return Imports[LHSI] < Imports[RHSI]; + }); + // FIXME: Pull this into a common function. + bool OutOfOrder = false; + for (unsigned i = 0, e = Indices.size(); i != e; ++i) { + if (i != Indices[i]) { + OutOfOrder = true; + break; + } + } + if (!OutOfOrder) + return Result; + + // Replace all existing import/export statements. + std::string ImportsText; + for (unsigned i = 0, e = Indices.size(); i != e; ++i) { + JsImportExport ImpExp = Imports[Indices[i]]; + StringRef ImportStmt = getSourceText(ImpExp.Start, ImpExp.End); + ImportsText += ImportStmt; + ImportsText += "\n"; + // Separate groups outside of exports with two line breaks. 
+      if (i + 1 < e && !ImpExp.IsExport &&
+          ImpExp.Category != Imports[Indices[i + 1]].Category)
+        ImportsText += "\n";
+    }
+    SourceLocation InsertionPoint = Imports[0].Start;
+    SourceLocation End = Imports[Imports.size() - 1].End;
+    DEBUG(llvm::dbgs() << "Replacing imports:\n"
+                       << getSourceText(InsertionPoint, End) << "\nwith:\n"
+                       << ImportsText);
+    Result.insert(tooling::Replacement(
+        Env.getSourceManager(),
+        CharSourceRange::getCharRange(InsertionPoint, End), ImportsText));
+
+    return Result;
+  }
+
+private:
+  FormatToken *Current;
+  FormatToken *LineEnd;
+  StringRef FileContents;
+
+  void skipComments() { Current = skipComments(Current); }
+
+  FormatToken *skipComments(FormatToken *Tok) {
+    while (Tok && Tok->is(tok::comment))
+      Tok = Tok->Next;
+    return Tok;
+  }
+
+  bool nextToken() {
+    Current = Current->Next;
+    skipComments();
+    return Current && Current != LineEnd->Next;
+  }
+
+  StringRef getSourceText(SourceLocation Start, SourceLocation End) {
+    const SourceManager &SM = Env.getSourceManager();
+    return FileContents.substr(SM.getFileOffset(Start),
+                               SM.getFileOffset(End) - SM.getFileOffset(Start));
+  }
+
+  bool parseImportExport(const AdditionalKeywords &Keywords,
+                         JsImportExport &ImpExp) {
+    if (!Current || !Current->isOneOf(Keywords.kw_import, tok::kw_export))
+      return false;
+    ImpExp.IsExport = Current->is(tok::kw_export);
+
+    if (!nextToken())
+      return false;
+
+    if (Current->isStringLiteral() && !ImpExp.IsExport) {
+      // "import 'side-effect';"
+      ImpExp.Category = JsImportExport::JsImportCategory::SIDE_EFFECT;
+      ImpExp.URL = Current->TokenText.substr(1, Current->TokenText.size() - 2);
+      return true;
+    }
+
+    if (!parseImportExportSpecifier(Keywords, ImpExp) || !nextToken())
+      return false;
+    if (Current->is(Keywords.kw_from)) {
+      // Imports have a 'from' clause; exports might not.
+      if (!nextToken())
+        return false;
+      if (!Current->isStringLiteral())
+        return false;
+      // URL = TokenText without the quotes.
+      ImpExp.URL = Current->TokenText.substr(1, Current->TokenText.size() - 2);
+      if (ImpExp.URL.startswith(".."))
+        ImpExp.Category = JsImportExport::JsImportCategory::RELATIVE_PARENT;
+      else if (ImpExp.URL.startswith("."))
+        ImpExp.Category = JsImportExport::JsImportCategory::RELATIVE;
+      else
+        ImpExp.Category = JsImportExport::JsImportCategory::ABSOLUTE;
+    } else {
+      // Exports without a 'from' clause have no URL; group them with the
+      // relative imports.
+      ImpExp.Category = JsImportExport::JsImportCategory::RELATIVE;
+    }
+    return true;
+  }
+
+  bool parseImportExportSpecifier(const AdditionalKeywords &Keywords,
+                                  JsImportExport &ImpExp) {
+    // * as prefix from '...';
+    if (Current->is(tok::star)) {
+      if (!nextToken())
+        return false;
+      if (!Current->is(Keywords.kw_as) || !nextToken())
+        return false;
+      if (!Current->is(tok::identifier))
+        return false;
+      ImpExp.Prefix = Current->TokenText;
+      return true;
+    }
+
+    if (!Current->is(tok::l_brace))
+      return false;
+
+    // {sym as alias, sym2 as ...} from '...';
+    if (!nextToken())
+      return false;
+    while (true) {
+      if (!Current->is(tok::identifier))
+        return false;
+
+      JsImportedSymbol Symbol;
+      Symbol.Symbol = Current->TokenText;
+      nextToken();
+
+      if (Current->is(Keywords.kw_as)) {
+        nextToken();
+        if (!Current->is(tok::identifier))
+          return false;
+        Symbol.Alias = Current->TokenText;
+        nextToken();
+      }
+      ImpExp.Symbols.push_back(Symbol);
+
+      if (Current->is(tok::r_brace))
+        return true;
+      if (!Current->is(tok::comma))
+        return false;
+      nextToken();
+    }
+  }
+};
+
+tooling::Replacements sortJavaScriptImports(const FormatStyle &Style,
+                                            StringRef Code,
+                                            ArrayRef<tooling::Range> Ranges,
+                                            StringRef FileName) {
+  // FIXME: Cursor support.
+  std::unique_ptr<Environment> Env =
+      Environment::CreateVirtualEnvironment(Code, FileName, Ranges);
+  JavaScriptImportSorter Sorter(*Env, Style);
+  return Sorter.process();
+}
+
+} // end namespace format
+} // end namespace clang
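Note (illustrative only, not part of the patch): the comparator above is a strict weak ordering, and the sorter applies it through std::stable_sort over an index vector, so equivalent entries (in particular side-effect imports, for which the comparator always returns false) keep their original relative order. A minimal, self-contained sketch of that grouping behaviour, using a made-up SimpleRef stand-in for JsImportExport:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

enum Category { SIDE_EFFECT, ABSOLUTE, RELATIVE_PARENT, RELATIVE };

struct SimpleRef {
  bool IsExport;
  Category Cat;
  std::string URL;
};

static bool lessRef(const SimpleRef &L, const SimpleRef &R) {
  if (L.IsExport != R.IsExport)
    return L.IsExport < R.IsExport;       // Imports sort before exports.
  if (L.Cat != R.Cat)
    return L.Cat < R.Cat;                 // Category defines the group order.
  if (L.Cat == SIDE_EFFECT)
    return false;                         // Side effects keep relative order.
  if (L.URL.empty() != R.URL.empty())
    return L.URL.empty() < R.URL.empty(); // Empty URLs (export {...};) last.
  return L.URL < R.URL;
}

int main() {
  std::vector<SimpleRef> Refs = {
      {false, RELATIVE, "./relative"}, {false, SIDE_EFFECT, "zz"},
      {false, ABSOLUTE, "absolute"},   {false, SIDE_EFFECT, "aa"},
      {true, ABSOLUTE, "apath"},       {false, RELATIVE_PARENT, "../parent"}};
  // Sort indices rather than the references themselves, as the patch does.
  std::vector<unsigned> Indices;
  for (unsigned I = 0; I != Refs.size(); ++I)
    Indices.push_back(I);
  std::stable_sort(Indices.begin(), Indices.end(),
                   [&](unsigned L, unsigned R) {
                     return lessRef(Refs[L], Refs[R]);
                   });
  for (unsigned I : Indices)
    std::cout << Refs[I].URL << "\n";
  // Prints: zz, aa, absolute, ../parent, ./relative, apath (the export).
}

Sorting indices instead of the JsImportExport objects keeps each entry's Start/End source range attached to its original statement while the output order is read off Indices; the i != Indices[i] check above is what detects whether any reordering is needed at all.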
Index: lib/Format/TokenAnalyzer.h
===================================================================
--- /dev/null
+++ lib/Format/TokenAnalyzer.h
@@ -0,0 +1,108 @@
+//===--- TokenAnalyzer.h - Analyze Token Streams ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares an abstract TokenAnalyzer and associated helper
+/// classes. TokenAnalyzer can be extended to generate replacements based on
+/// an annotated and pre-processed token stream.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_FORMAT_TOKENANALYZER_H
+#define LLVM_CLANG_LIB_FORMAT_TOKENANALYZER_H
+
+#include "AffectedRangeManager.h"
+#include "Encoding.h"
+#include "FormatToken.h"
+#include "FormatTokenLexer.h"
+#include "TokenAnnotator.h"
+#include "UnwrappedLineParser.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "format-formatter"
+
+namespace clang {
+namespace format {
+
+class Environment {
+public:
+  Environment(SourceManager &SM, FileID ID, ArrayRef<CharSourceRange> Ranges)
+      : ID(ID), CharRanges(Ranges.begin(), Ranges.end()), SM(SM) {}
+
+  Environment(FileID ID, std::unique_ptr<FileManager> FileMgr,
+              std::unique_ptr<SourceManager> VirtualSM,
+              std::unique_ptr<DiagnosticsEngine> Diagnostics,
+              const std::vector<CharSourceRange> &CharRanges)
+      : ID(ID), CharRanges(CharRanges.begin(), CharRanges.end()),
+        SM(*VirtualSM), FileMgr(std::move(FileMgr)),
+        VirtualSM(std::move(VirtualSM)), Diagnostics(std::move(Diagnostics)) {}
+
+  // This sets up a virtual file system with a file \p FileName containing
+  // \p Code.
+  static std::unique_ptr<Environment>
+  CreateVirtualEnvironment(StringRef Code, StringRef FileName,
+                           ArrayRef<tooling::Range> Ranges);
+
+  FileID getFileID() const { return ID; }
+
+  StringRef getFileName() const { return FileName; }
+
+  ArrayRef<CharSourceRange> getCharRanges() const { return CharRanges; }
+
+  const SourceManager &getSourceManager() const { return SM; }
+
+private:
+  FileID ID;
+  StringRef FileName;
+  SmallVector<CharSourceRange, 8> CharRanges;
+  SourceManager &SM;
+
+  // The order of these fields is important - they should be in the same order
+  // as they are created in `CreateVirtualEnvironment` so that they can be
+  // deleted in the reverse order of their creation.
+  std::unique_ptr<FileManager> FileMgr;
+  std::unique_ptr<SourceManager> VirtualSM;
+  std::unique_ptr<DiagnosticsEngine> Diagnostics;
+};
+
+class TokenAnalyzer : public UnwrappedLineConsumer {
+public:
+  TokenAnalyzer(const Environment &Env, const FormatStyle &Style);
+
+  tooling::Replacements process();
+
+protected:
+  virtual tooling::Replacements
+  analyze(TokenAnnotator &Annotator,
+          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+          FormatTokenLexer &Tokens, tooling::Replacements &Result) = 0;
+
+  void consumeUnwrappedLine(const UnwrappedLine &TheLine) override;
+
+  void finishRun() override;
+
+  FormatStyle Style;
+  // Stores Style, FileID and SourceManager etc.
+  const Environment &Env;
+  // AffectedRangeMgr stores ranges to be fixed.
+  AffectedRangeManager AffectedRangeMgr;
+  SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines;
+  encoding::Encoding Encoding;
+};
+
+} // end namespace format
+} // end namespace clang
+
+#endif
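Note (illustrative only, not part of the patch): TokenAnalyzer is the reusable piece of this change. A client derives from it, and the base class takes care of lexing, unwrapped-line parsing and annotation before handing the annotated lines to the subclass's analyze(). A minimal sketch of the contract a subclass has to fulfil; DoNothingAnalyzer is a made-up name, and its body only marks the affected lines and returns the replacements unchanged:

#include "TokenAnalyzer.h"

namespace clang {
namespace format {

// Hypothetical example: the smallest possible TokenAnalyzer subclass.
class DoNothingAnalyzer : public TokenAnalyzer {
public:
  DoNothingAnalyzer(const Environment &Env, const FormatStyle &Style)
      : TokenAnalyzer(Env, Style) {}

protected:
  tooling::Replacements
  analyze(TokenAnnotator &Annotator,
          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
          FormatTokenLexer &Tokens, tooling::Replacements &Result) override {
    // Restrict processing to the requested ranges, as other analyzers do.
    AffectedRangeMgr.computeAffectedLines(AnnotatedLines.begin(),
                                          AnnotatedLines.end());
    // A real analyzer would inspect AnnotatedLines here and add
    // tooling::Replacement objects to Result.
    return Result;
  }
};

// Usage mirrors sortJavaScriptImports above:
//   std::unique_ptr<Environment> Env =
//       Environment::CreateVirtualEnvironment(Code, FileName, Ranges);
//   DoNothingAnalyzer Analyzer(*Env, Style);
//   tooling::Replacements Replaces = Analyzer.process();

} // end namespace format
} // end namespace clang

JavaScriptImportSorter in SortJavaScriptImports.cpp is the first real subclass; its analyze() follows the same shape but fills Result with the re-ordered import block.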
Index: lib/Format/TokenAnalyzer.cpp
===================================================================
--- /dev/null
+++ lib/Format/TokenAnalyzer.cpp
@@ -0,0 +1,138 @@
+//===--- TokenAnalyzer.cpp - Analyze Token Streams --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements an abstract TokenAnalyzer and associated helper
+/// classes. TokenAnalyzer can be extended to generate replacements based on
+/// an annotated and pre-processed token stream.
+///
+//===----------------------------------------------------------------------===//
+
+#include "TokenAnalyzer.h"
+#include "AffectedRangeManager.h"
+#include "Encoding.h"
+#include "FormatToken.h"
+#include "FormatTokenLexer.h"
+#include "TokenAnnotator.h"
+#include "UnwrappedLineParser.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "format-formatter"
+
+namespace clang {
+namespace format {
+
+// This sets up a virtual file system with a file \p FileName containing
+// \p Code.
+std::unique_ptr<Environment>
+Environment::CreateVirtualEnvironment(StringRef Code, StringRef FileName,
+                                      ArrayRef<tooling::Range> Ranges) {
+  // This is referenced by `FileMgr` and will be released by `FileMgr` when it
+  // is deleted.
+  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
+      new vfs::InMemoryFileSystem);
+  // This is passed to `SM` as a reference, so the pointer has to be stored in
+  // `Environment` so that `FileMgr` outlives this function scope.
+  std::unique_ptr<FileManager> FileMgr(
+      new FileManager(FileSystemOptions(), InMemoryFileSystem));
+  // This is passed to `SM` as a reference, so the pointer has to be stored by
+  // `Environment` for the same reason as above.
+  std::unique_ptr<DiagnosticsEngine> Diagnostics(new DiagnosticsEngine(
+      IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs),
+      new DiagnosticOptions));
+  // This will be stored as a reference, so the pointer has to be stored in
+  // `Environment` for the same reason as above.
+  std::unique_ptr<SourceManager> VirtualSM(
+      new SourceManager(*Diagnostics, *FileMgr));
+  InMemoryFileSystem->addFile(
+      FileName, 0, llvm::MemoryBuffer::getMemBuffer(
+                       Code, FileName, /*RequiresNullTerminator=*/false));
+  FileID ID = VirtualSM->createFileID(FileMgr->getFile(FileName),
+                                      SourceLocation(), clang::SrcMgr::C_User);
+  assert(ID.isValid());
+  SourceLocation StartOfFile = VirtualSM->getLocForStartOfFile(ID);
+  std::vector<CharSourceRange> CharRanges;
+  for (const tooling::Range &Range : Ranges) {
+    SourceLocation Start = StartOfFile.getLocWithOffset(Range.getOffset());
+    SourceLocation End = Start.getLocWithOffset(Range.getLength());
+    CharRanges.push_back(CharSourceRange::getCharRange(Start, End));
+  }
+  return llvm::make_unique<Environment>(ID, std::move(FileMgr),
+                                        std::move(VirtualSM),
+                                        std::move(Diagnostics), CharRanges);
+}
+
+TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style)
+    : Style(Style), Env(Env),
+      AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()),
+      UnwrappedLines(1),
+      Encoding(encoding::detectEncoding(
+          Env.getSourceManager().getBufferData(Env.getFileID()))) {
+  DEBUG(
+      llvm::dbgs() << "File encoding: "
+                   << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
+                   << "\n");
+  DEBUG(llvm::dbgs() << "Language: " << getLanguageName(Style.Language)
+                     << "\n");
+}
+
+tooling::Replacements TokenAnalyzer::process() {
+  tooling::Replacements Result;
+  FormatTokenLexer Tokens(Env.getSourceManager(), Env.getFileID(), Style,
+                          Encoding);
+
+  UnwrappedLineParser Parser(Style, Tokens.getKeywords(), Tokens.lex(), *this);
+  Parser.parse();
+  assert(UnwrappedLines.rbegin()->empty());
+  for (unsigned Run = 0, RunE = UnwrappedLines.size(); Run + 1 != RunE; ++Run) {
+    DEBUG(llvm::dbgs() << "Run " << Run << "...\n");
+    SmallVector<AnnotatedLine *, 16> AnnotatedLines;
+
+    TokenAnnotator Annotator(Style, Tokens.getKeywords());
+    for (unsigned i = 0, e = UnwrappedLines[Run].size(); i != e; ++i) {
+      AnnotatedLines.push_back(new AnnotatedLine(UnwrappedLines[Run][i]));
+      Annotator.annotate(*AnnotatedLines.back());
+    }
+
+    tooling::Replacements RunResult =
+        analyze(Annotator, AnnotatedLines, Tokens, Result);
+
+    DEBUG({
+      llvm::dbgs() << "Replacements for run " << Run << ":\n";
+      for (tooling::Replacements::iterator I = RunResult.begin(),
+                                           E = RunResult.end();
+           I != E; ++I) {
+        llvm::dbgs() << I->toString() << "\n";
+      }
+    });
+    for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
+      delete AnnotatedLines[i];
+    }
+    Result.insert(RunResult.begin(), RunResult.end());
+  }
+  return Result;
+}
+
+void TokenAnalyzer::consumeUnwrappedLine(const UnwrappedLine &TheLine) {
+  assert(!UnwrappedLines.empty());
+  UnwrappedLines.back().push_back(TheLine);
+}
+
+void TokenAnalyzer::finishRun() {
+  UnwrappedLines.push_back(SmallVector<UnwrappedLine, 16>());
+}
+
+} // end namespace format
+} // end namespace clang
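Note (illustrative only, not part of the patch): end to end, the feature is reachable through the existing public entry points, which is how the new unit test below drives it: sortIncludes() with an LK_JavaScript style produces the import-sorting replacements, and the result is then reformatted so the moved lines are laid out cleanly. A sketch under those assumptions; the helper name sortAndFormatJS and the file name "input.js" are placeholders:

#include "clang/Format/Format.h"
#include "clang/Tooling/Core/Replacement.h"
#include "llvm/ADT/StringRef.h"
#include <string>
#include <vector>

using namespace clang;

std::string sortAndFormatJS(StringRef Code) {
  format::FormatStyle Style =
      format::getGoogleStyle(format::FormatStyle::LK_JavaScript);
  std::vector<tooling::Range> Ranges(1, tooling::Range(0, Code.size()));
  // Sort the ES6 imports/exports first...
  std::string Sorted = tooling::applyAllReplacements(
      Code, format::sortIncludes(Style, Code, Ranges, "input.js"));
  // ...then reformat the result so the rearranged block is formatted properly.
  return tooling::applyAllReplacements(
      Sorted, format::reformat(Style, Sorted, Ranges, "input.js"));
}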
Index: unittests/Format/CMakeLists.txt
===================================================================
--- unittests/Format/CMakeLists.txt
+++ unittests/Format/CMakeLists.txt
@@ -9,6 +9,7 @@
   FormatTestJS.cpp
   FormatTestProto.cpp
   FormatTestSelective.cpp
+  SortImportsTestJS.cpp
   SortIncludesTest.cpp
   )
Index: unittests/Format/SortImportsTestJS.cpp
===================================================================
--- /dev/null
+++ unittests/Format/SortImportsTestJS.cpp
@@ -0,0 +1,137 @@
+//===- unittest/Format/SortImportsTestJS.cpp - JS import sort unit tests --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FormatTestUtils.h"
+#include "clang/Format/Format.h"
+#include "llvm/Support/Debug.h"
+#include "gtest/gtest.h"
+
+#define DEBUG_TYPE "format-test"
+
+namespace clang {
+namespace format {
+namespace {
+
+class SortImportsTestJS : public ::testing::Test {
+protected:
+  std::vector<tooling::Range> GetCodeRange(StringRef Code) {
+    return std::vector<tooling::Range>(1, tooling::Range(0, Code.size()));
+  }
+
+  std::string sort(StringRef Code, StringRef FileName = "input.js") {
+    auto Ranges = GetCodeRange(Code);
+    std::string Sorted =
+        applyAllReplacements(Code, sortIncludes(Style, Code, Ranges, FileName));
+    return applyAllReplacements(Sorted,
+                                reformat(Style, Sorted, Ranges, FileName));
+  }
+
+  void verifySort(llvm::StringRef Expected, llvm::StringRef Code) {
+    std::string Result = sort(Code);
+    EXPECT_EQ(Expected.str(), Result) << "Formatted:\n" << Result;
+  }
+
+  FormatStyle Style = getGoogleStyle(FormatStyle::LK_JavaScript);
+};
+
+TEST_F(SortImportsTestJS, BasicSorting) {
+  verifySort("import {sym} from 'a';\n"
+             "import {sym} from 'b';\n"
+             "import {sym} from 'c';\n"
+             "\n"
+             "let x = 1;",
+             "import {sym} from 'a';\n"
+             "import {sym} from 'c';\n"
+             "import {sym} from 'b';\n"
+             "let x = 1;");
+}
+
+TEST_F(SortImportsTestJS, Comments) {
+  verifySort("/** @fileoverview This is a great file. */\n"
+             "// A very important import follows.\n"
+             "import {sym} from 'a'; /* more comments */\n"
+             "import {sym} from 'b'; // from //foo:bar\n",
+             "/** @fileoverview This is a great file. */\n"
*/\n" + "import {sym} from 'b'; // from //foo:bar\n" + "// A very important import follows.\n" + "import {sym} from 'a'; /* more comments */\n"); +} + +TEST_F(SortImportsTestJS, SortStar) { + verifySort("import * as foo from 'a';\n" + "import {sym} from 'a';\n" + "import * as bar from 'b';\n", + "import {sym} from 'a';\n" + "import * as foo from 'a';\n" + "import * as bar from 'b';\n"); +} + +TEST_F(SortImportsTestJS, AliasesSymbols) { + verifySort("import {sym1 as alias1} from 'b';\n" + "import {sym2 as alias2, sym3 as alias3} from 'c';\n", + "import {sym2 as alias2, sym3 as alias3} from 'c';\n" + "import {sym1 as alias1} from 'b';\n"); +} + +TEST_F(SortImportsTestJS, GroupImports) { + verifySort("import {a} from 'absolute';\n" + "\n" + "import {b} from '../parent';\n" + "import {b} from '../parent/nested';\n" + "\n" + "import {b} from './relative/path';\n" + "import {b} from './relative/path/nested';\n" + "\n" + "let x = 1;\n", + "import {b} from './relative/path/nested';\n" + "import {b} from './relative/path';\n" + "import {b} from '../parent/nested';\n" + "import {b} from '../parent';\n" + "import {a} from 'absolute';\n" + "let x = 1;\n"); +} + +TEST_F(SortImportsTestJS, Exports) { + verifySort("import {S} from 'bpath';\n" + "\n" + "import {T} from './cpath';\n" + "\n" + "export {A, B} from 'apath';\n" + "export {P} from '../parent';\n" + "export {R} from './relative';\n" + "export {S};\n" + "\n" + "let x = 1;\n" + "export y = 1;\n", + "export {R} from './relative';\n" + "import {T} from './cpath';\n" + "export {S};\n" + "export {A, B} from 'apath';\n" + "import {S} from 'bpath';\n" + "export {P} from '../parent';\n" + "let x = 1;\n" + "export y = 1;\n"); +} + +TEST_F(SortImportsTestJS, SideEffectImports) { + verifySort("import 'ZZside-effect';\n" + "import 'AAside-effect';\n" + "\n" + "import {A} from 'absolute';\n" + "\n" + "import {R} from './relative';\n", + "import {R} from './relative';\n" + "import 'ZZside-effect';\n" + "import {A} from 'absolute';\n" + "import 'AAside-effect';\n"); +} + +} // end namespace +} // end namespace format +} // end namespace clang