diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h --- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h @@ -20,7 +20,7 @@ // non-terminal or terminal, identified by a SymbolID. // // Notions about the BNF grammar: -// - "_" is the augmented symbol, formed by start symbols. +// - "_" is the start symbol of the augmented grammar; // - single-line comment is supported, starting with a # // - A rule describes how a nonterminal (left side of :=) is constructed, and // it is *per line* in the grammar file @@ -38,6 +38,7 @@ #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include #include @@ -117,6 +118,9 @@ static std::unique_ptr parseBNF(llvm::StringRef BNF, std::vector &Diags); + // Returns the SymbolID of the start symbol '_'. + SymbolID startSymbol() const; + // Returns all rules of the given non-terminal symbol. llvm::ArrayRef rulesFor(SymbolID SID) const; const Rule &lookupRule(RuleID RID) const; @@ -137,6 +141,12 @@ private: std::unique_ptr T; }; +// For each nonterminal X, computes the set of terminals that begin strings +// derived from X. (Known as FIRST sets in grammar-based parsers). +std::vector> firstSets(const Grammar &); +// For each nonterminal X, computes the set of terminals that could immediately +// follow X. (Known as FOLLOW sets in grammar-based parsers). +std::vector> followSets(const Grammar &); // Storage for the underlying data of the Grammar. // It can be constructed dynamically (from compiling BNF file) or statically diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp --- a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp +++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -72,6 +73,94 @@ return OS.str(); } +std::vector> firstSets(const Grammar &G) { + std::vector> FirstSets( + G.table().Nonterminals.size()); + auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) { + assert(isNonterminal(Target)); + if (isToken(First)) + return FirstSets[Target].insert(First).second; + bool Changed = false; + for (SymbolID SID : FirstSets[First]) + Changed |= FirstSets[Target].insert(SID).second; + return Changed; + }; + + // A rule S := T ... implies elements in FIRST(S): + // - if T is a terminal, FIRST(S) contains T + // - if T is a nonterminal, FIRST(S) contains FIRST(T) + // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may + // end up being incomplete. + // We iterate until we hit a fixed point. + // (This isn't particularly efficient, but table building isn't on the + // critical path). + bool Changed = true; + while (Changed) { + Changed = false; + for (const auto &R : G.table().Rules) + // We only need to consider the first element because symbols are + // non-nullable. + Changed |= ExpandFirstSet(R.Target, R.seq().front()); + } + return FirstSets; +} + +std::vector> followSets(const Grammar &G) { + auto FirstSets = firstSets(G); + std::vector> FollowSets( + G.table().Nonterminals.size()); + // Expand the follow set of a non-terminal symbol Y by adding all from the + // given symbol set. + auto ExpandFollowSet = [&FollowSets](SymbolID Y, + const llvm::DenseSet &ToAdd) { + assert(isNonterminal(Y)); + bool Changed = false; + for (SymbolID F : ToAdd) + Changed |= FollowSets[Y].insert(F).second; + return Changed; + }; + // Follow sets is computed based on the following 3 rules, the computation + // is completed at a fixed point where there is no more new symbols can be + // added to any of the follow sets. + // + // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol. + FollowSets[G.startSymbol()].insert(tokenSymbol(tok::eof)); + bool Changed = true; + while (Changed) { + Changed = false; + for (const auto &R : G.table().Rules) { + // Rule 2: for a rule X := ...YZ, we add all symbols from FIRST(Z) to + // FOLLOW(Y). + for (size_t i = 0; i + 1 < R.seq().size(); ++i) { + if (isToken(R.seq()[i])) + continue; + // We only need to consider the next symbol because symbols are + // non-nullable. + SymbolID Next = R.seq()[i + 1]; + if (isToken(Next)) + // First set for a terminal is itself. + Changed |= ExpandFollowSet(R.seq()[i], {Next}); + else + Changed |= ExpandFollowSet(R.seq()[i], FirstSets[Next]); + } + // Rule 3: for a rule X := ...Z, we add all symbols from FOLLOW(X) to + // FOLLOW(Z). + SymbolID Z = R.seq().back(); + if (isNonterminal(Z)) + Changed |= ExpandFollowSet(Z, FollowSets[R.Target]); + } + } + return FollowSets; +} + +SymbolID Grammar::startSymbol() const { + // start symbol is named _, binary search it. + auto It = llvm::partition_point( + T->Nonterminals, + [](const GrammarTable::Nonterminal &X) { return X.Name < "_"; }); + assert(It != T->Nonterminals.end() && It->Name == "_"); + return It - T->Nonterminals.begin(); +} } // namespace pseudo } // namespace syntax } // namespace clang diff --git a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp --- a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp +++ b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp @@ -1,4 +1,4 @@ -//===--- GrammarTest.cpp - grammar tests -----------------------*- C++ -*-===// +//===--- GrammarTest.cpp - grammar tests ------------------------*- C++-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -19,6 +19,7 @@ using testing::AllOf; using testing::ElementsAre; using testing::IsEmpty; +using testing::Pair; using testing::UnorderedElementsAre; MATCHER_P(TargetID, SID, "") { return arg.Target == SID; } @@ -33,7 +34,7 @@ G = Grammar::parseBNF(BNF, Diags); } - SymbolID lookup(llvm::StringRef Name) const { + SymbolID id(llvm::StringRef Name) const { for (unsigned I = 0; I < NumTerminals; ++I) if (G->table().Terminals[I] == Name) return tokenSymbol(static_cast(I)); @@ -54,9 +55,9 @@ EXPECT_THAT(Diags, IsEmpty()); auto ExpectedRule = - AllOf(TargetID(lookup("expression")), - Sequence(lookup("IDENTIFIER"), lookup("+"), lookup("expression"))); - auto ExpressionID = lookup("expression"); + AllOf(TargetID(id("expression")), + Sequence(id("IDENTIFIER"), id("+"), id("expression"))); + auto ExpressionID = id("expression"); EXPECT_EQ(G->symbolName(ExpressionID), "expression"); EXPECT_THAT(G->rulesFor(ExpressionID), UnorderedElementsAre(ExpectedRule)); const auto &Rule = G->lookupRule(/*RID=*/0); @@ -70,11 +71,10 @@ build("_ := CONST_opt INT ;_opt"); EXPECT_THAT(Diags, IsEmpty()); EXPECT_THAT(G->table().Rules, - UnorderedElementsAre( - Sequence(lookup("INT")), - Sequence(lookup("CONST"), lookup("INT")), - Sequence(lookup("CONST"), lookup("INT"), lookup(";")), - Sequence(lookup("INT"), lookup(";")))); + UnorderedElementsAre(Sequence(id("INT")), + Sequence(id("CONST"), id("INT")), + Sequence(id("CONST"), id("INT"), id(";")), + Sequence(id("INT"), id(";")))); } TEST_F(GrammarTest, Diagnostics) { @@ -96,6 +96,66 @@ "No rules for nonterminal: IDENFIFIE")); } +TEST_F(GrammarTest, FirstAndFollowSets) { + build( + R"bnf( +_ := expr +expr := expr - term +expr := term +term := IDENTIFIER +term := ( expr ) +)bnf"); + ASSERT_TRUE(Diags.empty()); + auto ToPair = [](std::vector> Input) { + std::vector>> Sets; + for (SymbolID ID = 0; ID < Input.size(); ++ID) + Sets.emplace_back(ID, std::move(Input[ID])); + return Sets; + }; + + EXPECT_THAT( + ToPair(firstSets(*G)), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), + Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), + Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("("))))); + EXPECT_THAT( + ToPair(followSets(*G)), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("EOF"))), + Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))), + Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))))); + + build(R"bnf( +# A simplfied C++ decl-specifier-seq. +_ := decl-specifier-seq +decl-specifier-seq := decl-specifier decl-specifier-seq +decl-specifier-seq := decl-specifier +decl-specifier := simple-type-specifier +decl-specifier := INLINE +simple-type-specifier := INT + )bnf"); + ASSERT_TRUE(Diags.empty()); + EXPECT_THAT( + ToPair(firstSets(*G)), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))), + Pair(id("decl-specifier-seq"), + UnorderedElementsAre(id("INLINE"), id("INT"))), + Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))), + Pair(id("decl-specifier"), + UnorderedElementsAre(id("INLINE"), id("INT"))))); + EXPECT_THAT( + ToPair(followSets(*G)), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("EOF"))), + Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))), + Pair(id("decl-specifier"), + UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))), + Pair(id("simple-type-specifier"), + UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))))); +} + } // namespace } // namespace pseudo } // namespace syntax