diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h @@ -165,7 +165,8 @@ } RuleRange; }; - // The rules are sorted (and thus grouped) by target symbol. + // The rules are topological sorted (and thus grouped) by the target symbol. + // // RuleID is the index of the vector. std::vector Rules; // A table of terminals (aka tokens). It corresponds to the clang::Token. diff --git a/clang-tools-extra/pseudo/lib/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/GrammarBNF.cpp --- a/clang-tools-extra/pseudo/lib/GrammarBNF.cpp +++ b/clang-tools-extra/pseudo/lib/GrammarBNF.cpp @@ -9,6 +9,7 @@ #include "clang-pseudo/Grammar.h" #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/FormatVariadic.h" #include @@ -87,23 +88,77 @@ } assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in RuleID bits!"); - llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) { - // Sorted by the Target. - return std::tie(Left.Target, Left.Size) < - std::tie(Right.Target, Right.Size); - }); - RuleID RulePos = 0; + const auto &TopologicalOrder = getTopologicalOrder(T.get()); + llvm::stable_sort( + T->Rules, [&TopologicalOrder](const Rule &Left, const Rule &Right) { + // Sorted by the topological order of the nonterminal Target. + return TopologicalOrder[Left.Target] < TopologicalOrder[Right.Target]; + }); for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) { - RuleID Start = RulePos; - while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID) - ++RulePos; - T->Nonterminals[SID].RuleRange = {Start, RulePos}; + auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) { + return TopologicalOrder[R.Target] < TopologicalOrder[SID]; + }); + RuleID Start = StartIt - T->Rules.begin(); + RuleID End = Start; + while (End < T->Rules.size() && + TopologicalOrder[T->Rules[End].Target] == TopologicalOrder[SID]) + ++End; + T->Nonterminals[SID].RuleRange = {Start, End}; } auto G = std::make_unique(std::move(T)); diagnoseGrammar(*G); return G; } + // Gets topological order for nonterminal symbols. + // + // The topological ordering is defined as: if a *single* nonterminal A produce + // (or transitively) a nonterminal B (that said, there is a production rule + // B := A in the grammar), then A comes before B. + // + // It returns a "lookup" vector where the index is SymbolID, and the value is + // the index of the SymbolID in the topological-orderred array. + std::vector getTopologicalOrder(GrammarTable *T) { + llvm::DenseMap> DependencyGraph; + for (const auto &Rule : T->Rules) { + // if A := B, A depends on B. + if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0])) + DependencyGraph[Rule.Target].insert(Rule.Sequence[0]); + } + std::vector Order; + // Each nonterminal state flows: NotVisited -> Visiting -> Visited. + enum State { + NotVisited, + Visiting, + Visited, + }; + std::vector VisitStates(T->Nonterminals.size(), NotVisited); + std::function DFS = [&](SymbolID SID) -> void { + if (VisitStates[SID] == Visited) + return; + if (VisitStates[SID] == Visiting) { + Diagnostics.push_back( + llvm::formatv("The grammar is cyclic, see symbol {0}", + T->Nonterminals[SID].Name)); + return; + } + VisitStates[SID] = Visiting; + auto It = DependencyGraph.find(SID); + if (It != DependencyGraph.end()) { + for (SymbolID Dep : (It->getSecond())) + DFS(Dep); + } + VisitStates[SID] = Visited; + Order.push_back(SID); + }; + for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID) + DFS(ID); + std::vector Resuslt(T->Nonterminals.size(), 0); + for (size_t I = 0; I < Order.size(); ++I) + Resuslt[Order[I]] = I; + return Resuslt; + } + private: // Text representation of a BNF grammar rule. struct RuleSpec { diff --git a/clang-tools-extra/pseudo/test/lr-build-basic.test b/clang-tools-extra/pseudo/test/lr-build-basic.test --- a/clang-tools-extra/pseudo/test/lr-build-basic.test +++ b/clang-tools-extra/pseudo/test/lr-build-basic.test @@ -26,4 +26,4 @@ # TABLE-NEXT: State 2 # TABLE-NEXT: 'EOF': reduce by rule 1 'expr := id' # TABLE-NEXT: State 3 -# TABLE-NEXT: 'EOF': reduce by rule 2 'id := IDENTIFIER' +# TABLE-NEXT: 'EOF': reduce by rule 0 'id := IDENTIFIER' diff --git a/clang-tools-extra/pseudo/test/lr-build-conflicts.test b/clang-tools-extra/pseudo/test/lr-build-conflicts.test --- a/clang-tools-extra/pseudo/test/lr-build-conflicts.test +++ b/clang-tools-extra/pseudo/test/lr-build-conflicts.test @@ -5,8 +5,8 @@ # RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH # GRAPH: States # GRAPH-NEXT: State 0 -# GRAPH-NEXT: _ := • expr # GRAPH-NEXT: expr := • expr - expr +# GRAPH-NEXT: _ := • expr # GRAPH-NEXT: expr := • IDENTIFIER # GRAPH-NEXT: State 1 # GRAPH-NEXT: _ := expr • @@ -42,6 +42,6 @@ # TABLE-NEXT: 'IDENTIFIER': shift state 2 # TABLE-NEXT: 'expr': go to state 4 # TABLE-NEXT: State 4 -# TABLE-NEXT: 'EOF': reduce by rule 2 'expr := expr - expr' +# TABLE-NEXT: 'EOF': reduce by rule 0 'expr := expr - expr' # TABLE-NEXT: '-': shift state 3 -# TABLE-NEXT: '-': reduce by rule 2 'expr := expr - expr' +# TABLE-NEXT: '-': reduce by rule 0 'expr := expr - expr' diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt --- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt +++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt @@ -7,6 +7,7 @@ DirectiveMapTest.cpp GrammarTest.cpp LRTableTest.cpp + TestGrammar.cpp TokenTest.cpp ) diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp --- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/Grammar.h" +#include "TestGrammar.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include @@ -26,76 +27,78 @@ return testing::Property(&Rule::seq, ElementsAre(IDs...)); } -class GrammarTest : public ::testing::Test { -public: - void build(llvm::StringRef BNF) { - Diags.clear(); - G = Grammar::parseBNF(BNF, Diags); - } - - SymbolID id(llvm::StringRef Name) const { - for (unsigned I = 0; I < NumTerminals; ++I) - if (G->table().Terminals[I] == Name) - return tokenSymbol(static_cast(I)); - for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID) - if (G->table().Nonterminals[ID].Name == Name) - return ID; - ADD_FAILURE() << "No such symbol found: " << Name; - return 0; - } - -protected: - std::unique_ptr G; - std::vector Diags; -}; - -TEST_F(GrammarTest, Basic) { - build("_ := IDENTIFIER + _ # comment"); - EXPECT_THAT(Diags, IsEmpty()); +TEST(GrammarTest, Basic) { + const auto &TG = TestGrammar::build("_ := IDENTIFIER + _ # comment"); + EXPECT_THAT(TG.Diags, IsEmpty()); auto ExpectedRule = - AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_"))); - EXPECT_EQ(G->symbolName(id("_")), "_"); - EXPECT_THAT(G->rulesFor(id("_")), UnorderedElementsAre(ExpectedRule)); - const auto &Rule = G->lookupRule(/*RID=*/0); + AllOf(TargetID(TG.symbol("_")), + Sequence(TG.symbol("IDENTIFIER"), TG.symbol("+"), TG.symbol("_"))); + EXPECT_EQ(TG.G->symbolName(TG.symbol("_")), "_"); + EXPECT_THAT(TG.G->rulesFor(TG.symbol("_")), + UnorderedElementsAre(ExpectedRule)); + const auto &Rule = TG.G->lookupRule(/*RID=*/0); EXPECT_THAT(Rule, ExpectedRule); - EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER"); - EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+"); - EXPECT_THAT(G->symbolName(Rule.seq()[2]), "_"); + EXPECT_THAT(TG.G->symbolName(Rule.seq()[0]), "IDENTIFIER"); + EXPECT_THAT(TG.G->symbolName(Rule.seq()[1]), "+"); + EXPECT_THAT(TG.G->symbolName(Rule.seq()[2]), "_"); } -TEST_F(GrammarTest, EliminatedOptional) { - build("_ := CONST_opt INT ;_opt"); - EXPECT_THAT(Diags, IsEmpty()); - EXPECT_THAT(G->table().Rules, - UnorderedElementsAre(Sequence(id("INT")), - Sequence(id("CONST"), id("INT")), - Sequence(id("CONST"), id("INT"), id(";")), - Sequence(id("INT"), id(";")))); +TEST(GrammarTest, RuleIDSorted) { + const auto &TG = TestGrammar::build(R"bnf( + _ := x + + x := y + y := z + z := IDENTIFIER + )bnf"); + EXPECT_THAT(TG.Diags, IsEmpty()); + + EXPECT_LT(TG.singleRuleFor("z"), TG.singleRuleFor("y")); + EXPECT_LT(TG.singleRuleFor("y"), TG.singleRuleFor("x")); + EXPECT_LT(TG.singleRuleFor("x"), TG.singleRuleFor("_")); +} + +TEST(GrammarTest, EliminatedOptional) { + const auto &TG = TestGrammar::build("_ := CONST_opt INT ;_opt"); + EXPECT_THAT(TG.Diags, IsEmpty()); + EXPECT_THAT( + TG.G->table().Rules, + UnorderedElementsAre( + Sequence(TG.symbol("INT")), + Sequence(TG.symbol("CONST"), TG.symbol("INT")), + Sequence(TG.symbol("CONST"), TG.symbol("INT"), TG.symbol(";")), + Sequence(TG.symbol("INT"), TG.symbol(";")))); } -TEST_F(GrammarTest, Diagnostics) { - build(R"cpp( +TEST(GrammarTest, Diagnostics) { + const auto &TG = TestGrammar::build(R"cpp( _ := ,_opt _ := undefined-sym null := _ := IDENFIFIE # a typo of the terminal IDENFITIER invalid + + # cycle + a := b + b := a )cpp"); - EXPECT_EQ(G->startSymbol(), id("_")); - EXPECT_THAT(Diags, UnorderedElementsAre( - "Rule '_ := ,_opt' has a nullable RHS", - "Rule 'null := ' has a nullable RHS", - "No rules for nonterminal: undefined-sym", - "Failed to parse 'invalid': no separator :=", - "Token-like name IDENFIFIE is used as a nonterminal", - "No rules for nonterminal: IDENFIFIE")); + EXPECT_EQ(TG.G->startSymbol(), TG.symbol("_")); + EXPECT_THAT( + TG.Diags, + UnorderedElementsAre("Rule '_ := ,_opt' has a nullable RHS", + "Rule 'null := ' has a nullable RHS", + "No rules for nonterminal: undefined-sym", + "Failed to parse 'invalid': no separator :=", + "Token-like name IDENFIFIE is used as a nonterminal", + "No rules for nonterminal: IDENFIFIE", + "The grammar is cyclic, see symbol a")); } -TEST_F(GrammarTest, FirstAndFollowSets) { - build( +TEST(GrammarTest, FirstAndFollowSets) { + auto TG = TestGrammar::build( R"bnf( _ := expr expr := expr - term @@ -103,7 +106,7 @@ term := IDENTIFIER term := ( expr ) )bnf"); - ASSERT_TRUE(Diags.empty()); + ASSERT_TRUE(TG.Diags.empty()); auto ToPairs = [](std::vector> Input) { std::vector>> Sets; for (SymbolID ID = 0; ID < Input.size(); ++ID) @@ -112,19 +115,25 @@ }; EXPECT_THAT( - ToPairs(firstSets(*G)), + ToPairs(firstSets(*TG.G)), UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), - Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), - Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("("))))); - EXPECT_THAT( - ToPairs(followSets(*G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("EOF"))), - Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))), - Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))))); - - build(R"bnf( + Pair(TG.symbol("_"), + UnorderedElementsAre(TG.symbol("IDENTIFIER"), TG.symbol("("))), + Pair(TG.symbol("expr"), + UnorderedElementsAre(TG.symbol("IDENTIFIER"), TG.symbol("("))), + Pair(TG.symbol("term"), + UnorderedElementsAre(TG.symbol("IDENTIFIER"), TG.symbol("("))))); + EXPECT_THAT(ToPairs(followSets(*TG.G)), + UnorderedElementsAre( + Pair(TG.symbol("_"), UnorderedElementsAre(TG.symbol("EOF"))), + Pair(TG.symbol("expr"), + UnorderedElementsAre(TG.symbol("-"), TG.symbol("EOF"), + TG.symbol(")"))), + Pair(TG.symbol("term"), + UnorderedElementsAre(TG.symbol("-"), TG.symbol("EOF"), + TG.symbol(")"))))); + + TG = TestGrammar::build(R"bnf( # A simplfied C++ decl-specifier-seq. _ := decl-specifier-seq decl-specifier-seq := decl-specifier decl-specifier-seq @@ -133,25 +142,30 @@ decl-specifier := INLINE simple-type-specifier := INT )bnf"); - ASSERT_TRUE(Diags.empty()); + ASSERT_TRUE(TG.Diags.empty()); EXPECT_THAT( - ToPairs(firstSets(*G)), + ToPairs(firstSets(*TG.G)), UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))), - Pair(id("decl-specifier-seq"), - UnorderedElementsAre(id("INLINE"), id("INT"))), - Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))), - Pair(id("decl-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"))))); + Pair(TG.symbol("_"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"))), + Pair(TG.symbol("decl-specifier-seq"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"))), + Pair(TG.symbol("simple-type-specifier"), + UnorderedElementsAre(TG.symbol("INT"))), + Pair(TG.symbol("decl-specifier"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"))))); EXPECT_THAT( - ToPairs(followSets(*G)), + ToPairs(followSets(*TG.G)), UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("EOF"))), - Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))), - Pair(id("decl-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))), - Pair(id("simple-type-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))))); + Pair(TG.symbol("_"), UnorderedElementsAre(TG.symbol("EOF"))), + Pair(TG.symbol("decl-specifier-seq"), + UnorderedElementsAre(TG.symbol("EOF"))), + Pair(TG.symbol("decl-specifier"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"), + TG.symbol("EOF"))), + Pair(TG.symbol("simple-type-specifier"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"), + TG.symbol("EOF"))))); } } // namespace diff --git a/clang-tools-extra/pseudo/unittests/TestGrammar.h b/clang-tools-extra/pseudo/unittests/TestGrammar.h new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/unittests/TestGrammar.h @@ -0,0 +1,33 @@ +//===--- TestGrammar.h ----------------------------------------------------===// + +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides shared grammar-related test facilities among tests. +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Grammar.h" + +namespace clang { +namespace pseudo { + +struct TestGrammar { + static TestGrammar build(llvm::StringRef BNF); + + // Returns the symbol id for the given name. + SymbolID symbol(llvm::StringRef Name) const; + + // Returns the rule id for the given nonterminal name. + // The nonterminal symbo is expected to have a single rule in the grammar. + RuleID singleRuleFor(llvm::StringRef NonterminalName) const; + + std::unique_ptr G; + std::vector Diags; +}; + +} // namespace pseudo +} // namespace clang diff --git a/clang-tools-extra/pseudo/unittests/TestGrammar.cpp b/clang-tools-extra/pseudo/unittests/TestGrammar.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/unittests/TestGrammar.cpp @@ -0,0 +1,43 @@ +//===--- TestGrammar.cpp ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TestGrammar.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace pseudo { + +TestGrammar TestGrammar::build(llvm::StringRef BNF) { + TestGrammar T; + T.G = Grammar::parseBNF(BNF, T.Diags); + return T; +} + +SymbolID TestGrammar::symbol(llvm::StringRef Name) const { + for (unsigned I = 0; I < NumTerminals; ++I) + if (G->table().Terminals[I] == Name) + return tokenSymbol(static_cast(I)); + for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID) + if (G->table().Nonterminals[ID].Name == Name) + return ID; + llvm::errs() << "No such symbol found: " << Name; + std::abort(); +} + +RuleID TestGrammar::singleRuleFor(llvm::StringRef NonterminalName) const { + auto RuleRange = G->table().Nonterminals[symbol(NonterminalName)].RuleRange; + if (RuleRange.End - RuleRange.Start == 1) + return G->table().Nonterminals[symbol(NonterminalName)].RuleRange.Start; + llvm::errs() << "Expected a single rule for " << NonterminalName + << ", but it has " << RuleRange.End - RuleRange.Start + << " rule!\n"; + std::abort(); +} + +} // namespace pseudo +} // namespace clang