diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h @@ -165,8 +165,15 @@ } RuleRange; }; - // The rules are sorted (and thus grouped) by target symbol. - // RuleID is the index of the vector. + // RuleID is an index into this table of rule definitions. + // + // Rules with the same target symbol (LHS) are grouped into a single range. + // The relative order of different target symbols is *not* by SymbolID, but + // rather a topological sort: if S := T then the rules producing T have lower + // RuleIDs than rules producing S. + // (This strange order simplifies the GLR parser: for a given token range, if + // we reduce in increasing RuleID order then we need never backtrack -- + // prerequisite reductions are reached before dependent ones). std::vector Rules; // A table of terminals (aka tokens). It corresponds to the clang::Token. // clang::tok::TokenKind is the index of the table. diff --git a/clang-tools-extra/pseudo/lib/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/GrammarBNF.cpp --- a/clang-tools-extra/pseudo/lib/GrammarBNF.cpp +++ b/clang-tools-extra/pseudo/lib/GrammarBNF.cpp @@ -9,9 +9,11 @@ #include "clang-pseudo/Grammar.h" #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/FormatVariadic.h" #include +#include namespace clang { namespace pseudo { @@ -87,23 +89,75 @@ } assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in RuleID bits!"); - llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) { - // Sorted by the Target. - return std::tie(Left.Target, Left.Size) < - std::tie(Right.Target, Right.Size); - }); - RuleID RulePos = 0; + const auto &SymbolOrder = getTopologicalOrder(T.get()); + llvm::stable_sort( + T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) { + // Sorted by the topological order of the nonterminal Target. + return SymbolOrder[Left.Target] < SymbolOrder[Right.Target]; + }); for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) { - RuleID Start = RulePos; - while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID) - ++RulePos; - T->Nonterminals[SID].RuleRange = {Start, RulePos}; + auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) { + return SymbolOrder[R.Target] < SymbolOrder[SID]; + }); + RuleID Start = StartIt - T->Rules.begin(); + RuleID End = Start; + while (End < T->Rules.size() && T->Rules[End].Target == SID) + ++End; + T->Nonterminals[SID].RuleRange = {Start, End}; } auto G = std::make_unique(std::move(T)); diagnoseGrammar(*G); return G; } + // Gets topological order for nonterminal symbols. + // + // The topological order is defined as: if a *single* nonterminal A produces + // (or transitively) a nonterminal B (that said, there is a production rule + // B := A), then A is less than B. + // + // It returns the sort key for each symbol, the array is indexed by SymbolID. + std::vector getTopologicalOrder(GrammarTable *T) { + std::vector> Dependencies; + for (const auto &Rule : T->Rules) { + // if A := B, A depends on B. + if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0])) + Dependencies.push_back({Rule.Target, Rule.Sequence[0]}); + } + llvm::sort(Dependencies); + std::vector Order; + // Each nonterminal state flows: NotVisited -> Visiting -> Visited. + enum State { + NotVisited, + Visiting, + Visited, + }; + std::vector VisitStates(T->Nonterminals.size(), NotVisited); + std::function DFS = [&](SymbolID SID) -> void { + if (VisitStates[SID] == Visited) + return; + if (VisitStates[SID] == Visiting) { + Diagnostics.push_back( + llvm::formatv("The grammar contains a cycle involving symbol {0}", + T->Nonterminals[SID].Name)); + return; + } + VisitStates[SID] = Visiting; + for (auto It = llvm::lower_bound(Dependencies, + std::pair{SID, 0}); + It != Dependencies.end() && It->first == SID; ++It) + DFS(It->second); + VisitStates[SID] = Visited; + Order.push_back(SID); + }; + for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID) + DFS(ID); + std::vector Result(T->Nonterminals.size(), 0); + for (size_t I = 0; I < Order.size(); ++I) + Result[Order[I]] = I; + return Result; + } + private: // Text representation of a BNF grammar rule. struct RuleSpec { diff --git a/clang-tools-extra/pseudo/test/lr-build-basic.test b/clang-tools-extra/pseudo/test/lr-build-basic.test --- a/clang-tools-extra/pseudo/test/lr-build-basic.test +++ b/clang-tools-extra/pseudo/test/lr-build-basic.test @@ -26,4 +26,4 @@ # TABLE-NEXT: State 2 # TABLE-NEXT: 'EOF': reduce by rule 1 'expr := id' # TABLE-NEXT: State 3 -# TABLE-NEXT: 'EOF': reduce by rule 2 'id := IDENTIFIER' +# TABLE-NEXT: 'EOF': reduce by rule 0 'id := IDENTIFIER' diff --git a/clang-tools-extra/pseudo/test/lr-build-conflicts.test b/clang-tools-extra/pseudo/test/lr-build-conflicts.test --- a/clang-tools-extra/pseudo/test/lr-build-conflicts.test +++ b/clang-tools-extra/pseudo/test/lr-build-conflicts.test @@ -5,8 +5,8 @@ # RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH # GRAPH: States # GRAPH-NEXT: State 0 -# GRAPH-NEXT: _ := • expr # GRAPH-NEXT: expr := • expr - expr +# GRAPH-NEXT: _ := • expr # GRAPH-NEXT: expr := • IDENTIFIER # GRAPH-NEXT: State 1 # GRAPH-NEXT: _ := expr • @@ -42,6 +42,6 @@ # TABLE-NEXT: 'IDENTIFIER': shift state 2 # TABLE-NEXT: 'expr': go to state 4 # TABLE-NEXT: State 4 -# TABLE-NEXT: 'EOF': reduce by rule 2 'expr := expr - expr' +# TABLE-NEXT: 'EOF': reduce by rule 0 'expr := expr - expr' # TABLE-NEXT: '-': shift state 3 -# TABLE-NEXT: '-': reduce by rule 2 'expr := expr - expr' +# TABLE-NEXT: '-': reduce by rule 0 'expr := expr - expr' diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp --- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp @@ -44,6 +44,16 @@ return 0; } + RuleID ruleFor(llvm::StringRef NonterminalName) const { + auto RuleRange = G->table().Nonterminals[id(NonterminalName)].RuleRange; + if (RuleRange.End - RuleRange.Start == 1) + return G->table().Nonterminals[id(NonterminalName)].RuleRange.Start; + ADD_FAILURE() << "Expected a single rule for " << NonterminalName + << ", but it has " << RuleRange.End - RuleRange.Start + << " rule!\n"; + return 0; + } + protected: std::unique_ptr G; std::vector Diags; @@ -74,6 +84,21 @@ Sequence(id("INT"), id(";")))); } +TEST_F(GrammarTest, RuleIDSorted) { + build(R"bnf( + _ := x + + x := y + y := z + z := IDENTIFIER + )bnf"); + ASSERT_TRUE(Diags.empty()); + + EXPECT_LT(ruleFor("z"), ruleFor("y")); + EXPECT_LT(ruleFor("y"), ruleFor("x")); + EXPECT_LT(ruleFor("x"), ruleFor("_")); +} + TEST_F(GrammarTest, Diagnostics) { build(R"cpp( _ := ,_opt @@ -82,6 +107,9 @@ _ := IDENFIFIE # a typo of the terminal IDENFITIER invalid + # cycle + a := b + b := a )cpp"); EXPECT_EQ(G->startSymbol(), id("_")); @@ -91,7 +119,8 @@ "No rules for nonterminal: undefined-sym", "Failed to parse 'invalid': no separator :=", "Token-like name IDENFIFIE is used as a nonterminal", - "No rules for nonterminal: IDENFIFIE")); + "No rules for nonterminal: IDENFIFIE", + "The grammar contains a cycle involving symbol a")); } TEST_F(GrammarTest, FirstAndFollowSets) {