diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -135,6 +135,8 @@ TYPE(UnaryOperator) \ TYPE(UnionLBrace) \ TYPE(UntouchableMacroFunc) \ + /* for the base in a number literal, not including the quote */ \ + TYPE(VerilogNumberBase) \ TYPE(Unknown) /// Determines the semantic type of a syntactic token, e.g. whether "<" is a @@ -368,6 +370,9 @@ } bool isTypeFinalized() const { return TypeIsFinalized; } + /// Used to set an operator precedence explicitly. + prec::Level ForcedPrecedence = prec::Unknown; + /// The number of newlines immediately before the \c Token. /// /// This can be used to determine what the user wrote in the original code @@ -697,6 +702,8 @@ } prec::Level getPrecedence() const { + if (ForcedPrecedence != prec::Unknown) + return ForcedPrecedence; return getBinOpPrecedence(Tok.getKind(), /*GreaterThanIsOperator=*/true, /*CPlusPlus11=*/true); } @@ -1119,6 +1126,7 @@ // Symbols that are treated as keywords. kw_verilogHash = &IdentTable.get("#"); kw_verilogHashHash = &IdentTable.get("##"); + kw_apostrophe = &IdentTable.get("\'"); // Keep this at the end of the constructor to make sure everything here // is @@ -1511,11 +1519,14 @@ IdentifierInfo *kw_verilogHash; IdentifierInfo *kw_verilogHashHash; + // Symbols in Verilog that don't exist in C++. + IdentifierInfo *kw_apostrophe; + /// Returns \c true if \p Tok is a keyword or an identifier. bool isWordLike(const FormatToken &Tok) const { // getIdentifierinfo returns non-null for keywords as well as identifiers. return Tok.Tok.getIdentifierInfo() != nullptr && - !Tok.isOneOf(kw_verilogHash, kw_verilogHashHash); + !Tok.isOneOf(kw_verilogHash, kw_verilogHashHash, kw_apostrophe); } /// Returns \c true if \p Tok is a true JavaScript identifier, returns @@ -1644,6 +1655,11 @@ } } + bool isVerilogWordOperator(const FormatToken &Tok) const { + return Tok.isOneOf(kw_before, kw_intersect, kw_dist, kw_iff, kw_inside, + kw_with); + } + bool isVerilogIdentifier(const FormatToken &Tok) const { switch (Tok.Tok.getKind()) { case tok::kw_case: diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -60,7 +60,14 @@ bool tryMergeForEach(); bool tryTransformTryUsageForC(); + // Merge the most recently lexed tokens into a single token if their kinds are + // correct. bool tryMergeTokens(ArrayRef Kinds, TokenType NewType); + // Merge without checking their kinds. + bool tryMergeTokens(size_t Count, TokenType NewType); + // Merge if their kinds match any one of Kinds. + bool tryMergeTokensAny(ArrayRef> Kinds, + TokenType NewType); // Returns \c true if \p Tok can only be followed by an operand in JavaScript. bool precedesOperand(FormatToken *Tok); diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -193,6 +193,78 @@ if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator)) return; } + + if (Style.isVerilog()) { + // Merge the number following a base like `'h?a0`. + if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) && + Tokens.end()[-2]->is(tok::numeric_constant) && + Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier, + tok::question) && + tryMergeTokens(2, TT_Unknown)) { + return; + } + // Part select. + if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}}, + TT_BitFieldColon)) { + return; + } + // Xnor. The combined token is treated as a caret which can also be either a + // unary or binary operator. The actual type is determined in + // TokenAnnotator. We also check the token length so we know it is not + // already a merged token. + if (Tokens.back()->TokenText.size() == 1 && + tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}}, + TT_BinaryOperator)) { + Tokens.back()->Tok.setKind(tok::caret); + return; + } + // Signed shift and distribution weight. + if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) { + Tokens.back()->Tok.setKind(tok::lessless); + return; + } + if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) { + Tokens.back()->Tok.setKind(tok::greatergreater); + return; + } + if (tryMergeTokensAny({{tok::lessless, tok::equal}, + {tok::lessless, tok::lessequal}, + {tok::greatergreater, tok::equal}, + {tok::greatergreater, tok::greaterequal}, + {tok::colon, tok::equal}, + {tok::colon, tok::slash}}, + TT_BinaryOperator)) { + Tokens.back()->ForcedPrecedence = prec::Assignment; + return; + } + // Exponentiation, signed shift, case equality, and wildcard equality. + if (tryMergeTokensAny({{tok::star, tok::star}, + {tok::lessless, tok::less}, + {tok::greatergreater, tok::greater}, + {tok::exclaimequal, tok::equal}, + {tok::exclaimequal, tok::question}, + {tok::equalequal, tok::equal}, + {tok::equalequal, tok::question}}, + TT_BinaryOperator)) { + return; + } + // Module paths in specify blocks and implications in properties. + if (tryMergeTokensAny({{tok::plusequal, tok::greater}, + {tok::plus, tok::star, tok::greater}, + {tok::minusequal, tok::greater}, + {tok::minus, tok::star, tok::greater}, + {tok::less, tok::arrow}, + {tok::equal, tok::greater}, + {tok::star, tok::greater}, + {tok::pipeequal, tok::greater}, + {tok::pipe, tok::arrow}, + {tok::hash, tok::minus, tok::hash}, + {tok::hash, tok::equal, tok::hash}}, + TT_BinaryOperator)) { + Tokens.back()->ForcedPrecedence = prec::Comma; + return; + } + } } bool FormatTokenLexer::tryMergeNSStringLiteral() { @@ -412,15 +484,28 @@ SmallVectorImpl::const_iterator First = Tokens.end() - Kinds.size(); - if (!First[0]->is(Kinds[0])) + for (unsigned i = 0; i < Kinds.size(); ++i) + if (!First[i]->is(Kinds[i])) + return false; + + return tryMergeTokens(Kinds.size(), NewType); +} + +bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) { + if (Tokens.size() < Count) return false; + + SmallVectorImpl::const_iterator First = Tokens.end() - Count; unsigned AddLength = 0; - for (unsigned i = 1; i < Kinds.size(); ++i) { - if (!First[i]->is(Kinds[i]) || First[i]->hasWhitespaceBefore()) + for (size_t i = 1; i < Count; ++i) { + // If there is whitespace separating the token and the previous one, + // they should not be merged. + if (First[i]->hasWhitespaceBefore()) return false; AddLength += First[i]->TokenText.size(); } - Tokens.resize(Tokens.size() - Kinds.size() + 1); + + Tokens.resize(Tokens.size() - Count + 1); First[0]->TokenText = StringRef(First[0]->TokenText.data(), First[0]->TokenText.size() + AddLength); First[0]->ColumnWidth += AddLength; @@ -428,6 +513,14 @@ return true; } +bool FormatTokenLexer::tryMergeTokensAny( + ArrayRef> Kinds, TokenType NewType) { + return std::any_of(Kinds.begin(), Kinds.end(), + [this, NewType](ArrayRef Kinds) { + return tryMergeTokens(Kinds, NewType); + }); +} + // Returns \c true if \p Tok can only be followed by an operand in JavaScript. bool FormatTokenLexer::precedesOperand(FormatToken *Tok) { // NB: This is not entirely correct, as an r_paren can introduce an operand @@ -1004,12 +1097,19 @@ } if (Style.isVerilog()) { + static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase); + SmallVector Matches; // Verilog uses the backtick instead of the hash for preprocessor stuff. // And it uses the hash for delays and parameter lists. In order to continue // using `tok::hash` in other places, the backtick gets marked as the hash // here. And in order to tell the backtick and hash apart for // Verilog-specific stuff, the hash becomes an identifier. - if (FormatTok->isOneOf(tok::hash, tok::hashhash)) { + if (FormatTok->is(tok::numeric_constant)) { + // In Verilog the quote is not part of a number. + auto Quote = FormatTok->TokenText.find('\''); + if (Quote != StringRef::npos) + truncateToken(Quote); + } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) { FormatTok->Tok.setKind(tok::raw_identifier); } else if (FormatTok->is(tok::raw_identifier)) { if (FormatTok->TokenText == "`") { @@ -1018,6 +1118,15 @@ } else if (FormatTok->TokenText == "``") { FormatTok->Tok.setIdentifierInfo(nullptr); FormatTok->Tok.setKind(tok::hashhash); + } else if (Tokens.size() > 0 && + Tokens.back()->is(Keywords.kw_apostrophe) && + NumberBase.match(FormatTok->TokenText, &Matches)) { + // In Verilog in a based number literal like `'b10`, there may be + // whitespace between `'b` and `10`. Therefore we handle the base and + // the rest of the number literal as two tokens. But if there is no + // space in the input code, we need to manually separate the two parts. + truncateToken(Matches[0].size()); + FormatTok->setFinalizedType(TT_VerilogNumberBase); } } } @@ -1060,6 +1169,13 @@ StateStack.push(LexerState::TOKEN_STASHED); } + if (Style.isVerilog() && Tokens.size() > 0 && + Tokens.back()->is(TT_VerilogNumberBase) && + FormatTok->Tok.isOneOf(tok::identifier, tok::question)) { + // Mark the number following a base like `'h?a0` as a number. + FormatTok->Tok.setKind(tok::numeric_constant); + } + // Now FormatTok is the next non-whitespace token. StringRef Text = FormatTok->TokenText; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1842,7 +1842,8 @@ Current, Contexts.back().CanBeExpression && Contexts.back().IsExpression, Contexts.back().ContextType == Context::TemplateArgument)); - } else if (Current.isOneOf(tok::minus, tok::plus, tok::caret)) { + } else if (Current.isOneOf(tok::minus, tok::plus, tok::caret) || + (Style.isVerilog() && Current.is(tok::pipe))) { Current.setType(determinePlusMinusCaretUsage(Current)); if (Current.is(TT_UnaryOperator) && Current.is(tok::caret)) Contexts.back().CaretFound = true; @@ -3996,6 +3997,23 @@ Left.MatchingParen->endsSequence(tok::l_paren, tok::at)))) { return true; } + // Don't add embedded spaces in a number literal like `16'h1?ax` or an array + // literal like `'{}`. + if (Left.is(Keywords.kw_apostrophe) || + (Left.is(TT_VerilogNumberBase) && Right.is(tok::numeric_constant))) { + return false; + } + // Don't add spaces between a casting type and the quote or repetition count + // and the brace. + if ((Right.is(Keywords.kw_apostrophe) || + (Right.is(BK_BracedInit) && Right.is(tok::l_brace))) && + !(Left.isOneOf(Keywords.kw_assign, Keywords.kw_unique) || + Keywords.isVerilogWordOperator(Left)) && + (Left.isOneOf(tok::r_square, tok::r_paren, tok::r_brace, + tok::numeric_constant) || + Keywords.isWordLike(Left))) { + return false; + } } if (Left.is(TT_ImplicitStringLiteral)) return Right.hasWhitespaceBefore(); diff --git a/clang/unittests/Format/FormatTestVerilog.cpp b/clang/unittests/Format/FormatTestVerilog.cpp --- a/clang/unittests/Format/FormatTestVerilog.cpp +++ b/clang/unittests/Format/FormatTestVerilog.cpp @@ -45,6 +45,27 @@ } }; +TEST_F(FormatTestVerilog, BasedLiteral) { + verifyFormat("x = '0;"); + verifyFormat("x = '1;"); + verifyFormat("x = 'X;"); + verifyFormat("x = 'x;"); + verifyFormat("x = 'Z;"); + verifyFormat("x = 'z;"); + verifyFormat("x = 659;"); + verifyFormat("x = 'h837ff;"); + verifyFormat("x = 'o7460;"); + verifyFormat("x = 4'b1001;"); + verifyFormat("x = 5'D3;"); + verifyFormat("x = 3'b01x;"); + verifyFormat("x = 12'hx;"); + verifyFormat("x = 16'hz;"); + verifyFormat("x = -8'd6;"); + verifyFormat("x = 4'shf;"); + verifyFormat("x = -4'sd15;"); + verifyFormat("x = 16'sd?;"); +} + TEST_F(FormatTestVerilog, Delay) { // Delay by the default unit. verifyFormat("#0;"); @@ -139,6 +160,64 @@ " {x} = {x};"); } +TEST_F(FormatTestVerilog, Operators) { + // Test that unary operators are not followed by space. + verifyFormat("x = +x;"); + verifyFormat("x = -x;"); + verifyFormat("x = !x;"); + verifyFormat("x = ~x;"); + verifyFormat("x = &x;"); + verifyFormat("x = ~&x;"); + verifyFormat("x = |x;"); + verifyFormat("x = ~|x;"); + verifyFormat("x = ^x;"); + verifyFormat("x = ~^x;"); + verifyFormat("x = ^~x;"); + verifyFormat("x = ++x;"); + verifyFormat("x = --x;"); + + // Test that operators don't get split. + verifyFormat("x = x++;"); + verifyFormat("x = x--;"); + verifyFormat("x = x ** x;"); + verifyFormat("x = x << x;"); + verifyFormat("x = x >> x;"); + verifyFormat("x = x <<< x;"); + verifyFormat("x = x >>> x;"); + verifyFormat("x = x <= x;"); + verifyFormat("x = x >= x;"); + verifyFormat("x = x == x;"); + verifyFormat("x = x != x;"); + verifyFormat("x = x === x;"); + verifyFormat("x = x !== x;"); + verifyFormat("x = x ==? x;"); + verifyFormat("x = x !=? x;"); + verifyFormat("x = x ~^ x;"); + verifyFormat("x = x ^~ x;"); + verifyFormat("x = x && x;"); + verifyFormat("x = x || x;"); + verifyFormat("x = x->x;"); + verifyFormat("x = x <-> x;"); + verifyFormat("x += x;"); + verifyFormat("x -= x;"); + verifyFormat("x *= x;"); + verifyFormat("x /= x;"); + verifyFormat("x %= x;"); + verifyFormat("x &= x;"); + verifyFormat("x ^= x;"); + verifyFormat("x |= x;"); + verifyFormat("x <<= x;"); + verifyFormat("x >>= x;"); + verifyFormat("x <<<= x;"); + verifyFormat("x >>>= x;"); + verifyFormat("x <= x;"); + + // Test that space is added between operators. + EXPECT_EQ("x = x < -x;", format("x=x<-x;")); + EXPECT_EQ("x = x << -x;", format("x=x<<-x;")); + EXPECT_EQ("x = x <<< -x;", format("x=x<<<-x;")); +} + TEST_F(FormatTestVerilog, Preprocessor) { auto Style = getLLVMStyle(FormatStyle::LK_Verilog); Style.ColumnLimit = 20; diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -38,6 +38,8 @@ EXPECT_EQ((FormatTok)->Tok.getKind(), Kind) << *(FormatTok) #define EXPECT_TOKEN_TYPE(FormatTok, Type) \ EXPECT_EQ((FormatTok)->getType(), Type) << *(FormatTok) +#define EXPECT_TOKEN_PRECEDENCE(FormatTok, Prec) \ + EXPECT_EQ((FormatTok)->getPrecedence(), Prec) << *(FormatTok) #define EXPECT_TOKEN(FormatTok, Kind, Type) \ do { \ EXPECT_TOKEN_KIND(FormatTok, Kind); \ @@ -764,6 +766,67 @@ EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_LambdaLBrace); } +TEST_F(TokenAnnotatorTest, UnderstandsVerilogOperators) { + auto Annotate = [this](llvm::StringRef Code) { + return annotate(Code, getLLVMStyle(FormatStyle::LK_Verilog)); + }; + // Test that unary operators get labeled as such and that operators like '++' + // don't get split. + tok::TokenKind Unary[] = {tok::plus, tok::minus, tok::exclaim, + tok::tilde, tok::amp, tok::pipe, + tok::caret, tok::plusplus, tok::minusminus}; + for (auto Kind : Unary) { + auto Tokens = + Annotate(std::string("x = ") + tok::getPunctuatorSpelling(Kind) + "x;"); + ASSERT_EQ(Tokens.size(), 6u) << Tokens; + EXPECT_TOKEN(Tokens[2], Kind, TT_UnaryOperator); + } + // Operators formed by joining two operators like '^~'. For some of these + // joined operators, we don't have a separate type, so we only test for their + // precedence. + std::pair JoinedBinary[] = { + {prec::Comma, "<->"}, {prec::Assignment, "+="}, + {prec::Assignment, "-="}, {prec::Assignment, "*="}, + {prec::Assignment, "/="}, {prec::Assignment, "%="}, + {prec::Assignment, "&="}, {prec::Assignment, "^="}, + {prec::Assignment, "<<="}, {prec::Assignment, ">>="}, + {prec::Assignment, "<<<="}, {prec::Assignment, ">>>="}, + {prec::LogicalOr, "||"}, {prec::LogicalAnd, "&&"}, + {prec::Equality, "=="}, {prec::Equality, "!="}, + {prec::Equality, "==="}, {prec::Equality, "!=="}, + {prec::Equality, "==?"}, {prec::Equality, "!=?"}, + {prec::ExclusiveOr, "~^"}, {prec::ExclusiveOr, "^~"}, + }; + for (auto Operator : JoinedBinary) { + auto Tokens = Annotate(std::string("x = x ") + Operator.second + " x;"); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN_TYPE(Tokens[3], TT_BinaryOperator); + EXPECT_TOKEN_PRECEDENCE(Tokens[3], Operator.first); + } + // '~^' and '^~' can be unary as well as binary operators. + auto Tokens = Annotate("x = ~^x;"); + ASSERT_EQ(Tokens.size(), 6u) << Tokens; + EXPECT_TOKEN_TYPE(Tokens[2], TT_UnaryOperator); + Tokens = Annotate("x = ^~x;"); + ASSERT_EQ(Tokens.size(), 6u) << Tokens; + EXPECT_TOKEN_TYPE(Tokens[2], TT_UnaryOperator); + // The unary operators '~&' and '~|' can only be unary operators. The current + // implementation treats each of them as separate unary '~' and '&' or '|' + // operators, which is enough for formatting purposes. In FormatTestVerilog, + // there is a test that there is no space in between. And even if a new line + // is inserted between the '~' and '|', the semantic meaning is the same as + // the joined operator, so the CanBreakBefore property doesn't need to be + // false for the second operator. + Tokens = Annotate("x = ~&x;"); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::tilde, TT_UnaryOperator); + EXPECT_TOKEN(Tokens[3], tok::amp, TT_UnaryOperator); + Tokens = Annotate("x = ~|x;"); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::tilde, TT_UnaryOperator); + EXPECT_TOKEN(Tokens[3], tok::pipe, TT_UnaryOperator); +} + } // namespace } // namespace format } // namespace clang