diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h --- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h @@ -20,7 +20,7 @@ // non-terminal or terminal, identified by a SymbolID. // // Notions about the BNF grammar: -// - "_" is the augmented symbol, formed by start symbols. +// - "_" is the start symbol of the augmented grammar; // - single-line comment is supported, starting with a # // - A rule describes how a nonterminal (left side of :=) is constructed, and // it is *per line* in the grammar file @@ -38,6 +38,7 @@ #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include #include @@ -110,13 +111,16 @@ // It is a building block for constructing a table-based parser. class Grammar { public: - explicit Grammar(std::unique_ptr T) : T(std::move(T)) {} + explicit Grammar(std::unique_ptr); // Parses grammar from a BNF file. // Diagnostics emitted during parsing are stored in Diags. static std::unique_ptr parseBNF(llvm::StringRef BNF, std::vector &Diags); + // Returns the SymbolID of the start symbol '_'. + SymbolID startSymbol() const { return StartSymbol; }; + // Returns all rules of the given non-terminal symbol. llvm::ArrayRef rulesFor(SymbolID SID) const; const Rule &lookupRule(RuleID RID) const; @@ -136,7 +140,15 @@ private: std::unique_ptr T; + // The start symbol '_' of the augmented grammar. + SymbolID StartSymbol; }; +// For each nonterminal X, computes the set of terminals that begin strings +// derived from X. (Known as FIRST sets in grammar-based parsers). +std::vector> firstSets(const Grammar &); +// For each nonterminal X, computes the set of terminals that could immediately +// follow X. (Known as FOLLOW sets in grammar-based parsers). +std::vector> followSets(const Grammar &); // Storage for the underlying data of the Grammar. // It can be constructed dynamically (from compiling BNF file) or statically diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp --- a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp +++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -23,6 +24,16 @@ llvm::copy(Sequence, this->Sequence); } +Grammar::Grammar(std::unique_ptr Table) : T(std::move(Table)) { + // start symbol is named _, binary search it. + auto It = llvm::partition_point( + T->Nonterminals, + [](const GrammarTable::Nonterminal &X) { return X.Name < "_"; }); + assert(It != T->Nonterminals.end() && It->Name == "_" && + "symbol _ must exist in the grammar!"); + StartSymbol = It - T->Nonterminals.begin(); +} + llvm::ArrayRef Grammar::rulesFor(SymbolID SID) const { assert(isNonterminal(SID)); const auto &R = T->Nonterminals[SID].RuleRange; @@ -72,6 +83,86 @@ return OS.str(); } +std::vector> firstSets(const Grammar &G) { + std::vector> FirstSets( + G.table().Nonterminals.size()); + auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) { + assert(isNonterminal(Target)); + if (isToken(First)) + return FirstSets[Target].insert(First).second; + bool Changed = false; + for (SymbolID SID : FirstSets[First]) + Changed |= FirstSets[Target].insert(SID).second; + return Changed; + }; + + // A rule S := T ... implies elements in FIRST(S): + // - if T is a terminal, FIRST(S) contains T + // - if T is a nonterminal, FIRST(S) contains FIRST(T) + // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may + // end up being incomplete. + // We iterate until we hit a fixed point. + // (This isn't particularly efficient, but table building isn't on the + // critical path). + bool Changed = true; + while (Changed) { + Changed = false; + for (const auto &R : G.table().Rules) + // We only need to consider the first element because symbols are + // non-nullable. + Changed |= ExpandFirstSet(R.Target, R.seq().front()); + } + return FirstSets; +} + +std::vector> followSets(const Grammar &G) { + auto FirstSets = firstSets(G); + std::vector> FollowSets( + G.table().Nonterminals.size()); + // Expand the follow set of a non-terminal symbol Y by adding all from the + // given symbol set. + auto ExpandFollowSet = [&FollowSets](SymbolID Y, + const llvm::DenseSet &ToAdd) { + assert(isNonterminal(Y)); + bool Changed = false; + for (SymbolID F : ToAdd) + Changed |= FollowSets[Y].insert(F).second; + return Changed; + }; + // Follow sets is computed based on the following 3 rules, the computation + // is completed at a fixed point where there is no more new symbols can be + // added to any of the follow sets. + // + // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol. + FollowSets[G.startSymbol()].insert(tokenSymbol(tok::eof)); + bool Changed = true; + while (Changed) { + Changed = false; + for (const auto &R : G.table().Rules) { + // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to + // FOLLOW(Y). + for (size_t i = 0; i + 1 < R.seq().size(); ++i) { + if (isToken(R.seq()[i])) + continue; + // We only need to consider the next symbol because symbols are + // non-nullable. + SymbolID Next = R.seq()[i + 1]; + if (isToken(Next)) + // First set for a terminal is itself. + Changed |= ExpandFollowSet(R.seq()[i], {Next}); + else + Changed |= ExpandFollowSet(R.seq()[i], FirstSets[Next]); + } + // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to + // FOLLOW(Z). + SymbolID Z = R.seq().back(); + if (isNonterminal(Z)) + Changed |= ExpandFollowSet(Z, FollowSets[R.Target]); + } + } + return FollowSets; +} + } // namespace pseudo } // namespace syntax } // namespace clang diff --git a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp --- a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp +++ b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp @@ -19,6 +19,7 @@ using testing::AllOf; using testing::ElementsAre; using testing::IsEmpty; +using testing::Pair; using testing::UnorderedElementsAre; MATCHER_P(TargetID, SID, "") { return arg.Target == SID; } @@ -33,7 +34,7 @@ G = Grammar::parseBNF(BNF, Diags); } - SymbolID lookup(llvm::StringRef Name) const { + SymbolID id(llvm::StringRef Name) const { for (unsigned I = 0; I < NumTerminals; ++I) if (G->table().Terminals[I] == Name) return tokenSymbol(static_cast(I)); @@ -50,31 +51,28 @@ }; TEST_F(GrammarTest, Basic) { - build("expression := IDENTIFIER + expression # comment"); + build("_ := IDENTIFIER + _ # comment"); EXPECT_THAT(Diags, IsEmpty()); auto ExpectedRule = - AllOf(TargetID(lookup("expression")), - Sequence(lookup("IDENTIFIER"), lookup("+"), lookup("expression"))); - auto ExpressionID = lookup("expression"); - EXPECT_EQ(G->symbolName(ExpressionID), "expression"); - EXPECT_THAT(G->rulesFor(ExpressionID), UnorderedElementsAre(ExpectedRule)); + AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_"))); + EXPECT_EQ(G->symbolName(id("_")), "_"); + EXPECT_THAT(G->rulesFor(id("_")), UnorderedElementsAre(ExpectedRule)); const auto &Rule = G->lookupRule(/*RID=*/0); EXPECT_THAT(Rule, ExpectedRule); EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER"); EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+"); - EXPECT_THAT(G->symbolName(Rule.seq()[2]), "expression"); + EXPECT_THAT(G->symbolName(Rule.seq()[2]), "_"); } TEST_F(GrammarTest, EliminatedOptional) { build("_ := CONST_opt INT ;_opt"); EXPECT_THAT(Diags, IsEmpty()); EXPECT_THAT(G->table().Rules, - UnorderedElementsAre( - Sequence(lookup("INT")), - Sequence(lookup("CONST"), lookup("INT")), - Sequence(lookup("CONST"), lookup("INT"), lookup(";")), - Sequence(lookup("INT"), lookup(";")))); + UnorderedElementsAre(Sequence(id("INT")), + Sequence(id("CONST"), id("INT")), + Sequence(id("CONST"), id("INT"), id(";")), + Sequence(id("INT"), id(";")))); } TEST_F(GrammarTest, Diagnostics) { @@ -87,6 +85,7 @@ invalid )cpp"); + EXPECT_EQ(G->startSymbol(), id("_")); EXPECT_THAT(Diags, UnorderedElementsAre( "Rule '_ := ,_opt' has a nullable RHS", "Rule 'null := ' has a nullable RHS", @@ -96,6 +95,66 @@ "No rules for nonterminal: IDENFIFIE")); } +TEST_F(GrammarTest, FirstAndFollowSets) { + build( + R"bnf( +_ := expr +expr := expr - term +expr := term +term := IDENTIFIER +term := ( expr ) +)bnf"); + ASSERT_TRUE(Diags.empty()); + auto ToPairs = [](std::vector> Input) { + std::vector>> Sets; + for (SymbolID ID = 0; ID < Input.size(); ++ID) + Sets.emplace_back(ID, std::move(Input[ID])); + return Sets; + }; + + EXPECT_THAT( + ToPairs(firstSets(*G)), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), + Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), + Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("("))))); + EXPECT_THAT( + ToPairs(followSets(*G)), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("EOF"))), + Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))), + Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))))); + + build(R"bnf( +# A simplfied C++ decl-specifier-seq. +_ := decl-specifier-seq +decl-specifier-seq := decl-specifier decl-specifier-seq +decl-specifier-seq := decl-specifier +decl-specifier := simple-type-specifier +decl-specifier := INLINE +simple-type-specifier := INT + )bnf"); + ASSERT_TRUE(Diags.empty()); + EXPECT_THAT( + ToPairs(firstSets(*G)), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))), + Pair(id("decl-specifier-seq"), + UnorderedElementsAre(id("INLINE"), id("INT"))), + Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))), + Pair(id("decl-specifier"), + UnorderedElementsAre(id("INLINE"), id("INT"))))); + EXPECT_THAT( + ToPairs(followSets(*G)), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("EOF"))), + Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))), + Pair(id("decl-specifier"), + UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))), + Pair(id("simple-type-specifier"), + UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))))); +} + } // namespace } // namespace pseudo } // namespace syntax