diff --git a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp --- a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp +++ b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp @@ -103,10 +103,11 @@ clang::LangOptions LangOpts = genericLangOpts(); LRTable Table = clang::pseudo::LRTable::buildSLR(*G); TokenStream ParseableStream = parseableTokenStream(); + SymbolID StartSymbol = *G->findNonterminal("translation-unit"); for (auto _ : State) { pseudo::ForestArena Forest; pseudo::GSS GSS; - glrParse(ParseableStream, ParseParams{*G, Table, Forest, GSS}); + glrParse(ParseableStream, ParseParams{*G, Table, Forest, GSS}, StartSymbol); } State.SetBytesProcessed(static_cast(State.iterations()) * SourceText->size()); @@ -116,10 +117,12 @@ static void runParseOverall(benchmark::State &State) { clang::LangOptions LangOpts = genericLangOpts(); LRTable Table = clang::pseudo::LRTable::buildSLR(*G); + SymbolID StartSymbol = *G->findNonterminal("translation-unit"); for (auto _ : State) { pseudo::ForestArena Forest; pseudo::GSS GSS; - glrParse(parseableTokenStream(), ParseParams{*G, Table, Forest, GSS}); + glrParse(parseableTokenStream(), ParseParams{*G, Table, Forest, GSS}, + StartSymbol); } State.SetBytesProcessed(static_cast(State.iterations()) * SourceText->size()); diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp --- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp +++ b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp @@ -58,8 +58,9 @@ clang::pseudo::ForestArena Arena; clang::pseudo::GSS GSS; - auto &Root = glrParse(ParseableStream, - clang::pseudo::ParseParams{*G, T, Arena, GSS}); + auto &Root = + glrParse(ParseableStream, clang::pseudo::ParseParams{*G, T, Arena, GSS}, + *G->findNonterminal("translation-unit")); if (Print) llvm::outs() << Root.dumpRecursive(*G); } diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -122,13 +122,14 @@ ForestArena &Forest; // Storage for the output forest. GSS &GSStack; // Storage for parsing stacks. }; -// Parse the given token stream with the GLR algorithm, and return a forest node -// of the start symbol. +// Parses the given token stream as the start symbol with the GLR algorithm, +// and returns a forest node of the start symbol. // -// If the parsing fails, we model it as an opaque node in the forest. +// A rule `_ := StartSymbol` must exit for the chosen start symbol. // -// FIXME: add support for variant start symbols. -const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params); +// If the parsing fails, we model it as an opaque node in the forest. +const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params, + SymbolID StartSymbol); // An active stack head can have multiple available actions (reduce/reduce // actions, reduce/shift actions). diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h @@ -39,6 +39,7 @@ #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/StringRef.h" #include #include @@ -117,8 +118,8 @@ static std::unique_ptr parseBNF(llvm::StringRef BNF, std::vector &Diags); - // Returns the SymbolID of the start symbol '_'. - SymbolID startSymbol() const { return StartSymbol; }; + // Returns the SymbolID of the symbol '_'. + SymbolID underscore() const { return Underscore; }; // Returns all rules of the given nonterminal symbol. llvm::ArrayRef rulesFor(SymbolID SID) const; @@ -128,6 +129,9 @@ // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator). llvm::StringRef symbolName(SymbolID) const; + // Lookup the SymbolID of the nonterminal symbol by Name. + llvm::Optional findNonterminal(llvm::StringRef Name) const; + // Dumps the whole grammar. std::string dump() const; // Dumps a particular rule. @@ -139,8 +143,9 @@ private: std::unique_ptr T; - // The start symbol '_' of the augmented grammar. - SymbolID StartSymbol; + // The symbol ID of '_'. (In the LR literature, this is the start symbol of + // the augmented grammar.) + SymbolID Underscore; }; // For each nonterminal X, computes the set of terminals that begin strings // derived from X. (Known as FIRST sets in grammar-based parsers). diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h @@ -139,15 +139,21 @@ llvm::ArrayRef states() const { return States; } llvm::ArrayRef edges() const { return Edges; } + llvm::ArrayRef> startStates() const { + return StartStates; + } std::string dumpForTests(const Grammar &) const; private: - LRGraph(std::vector States, std::vector Edges) - : States(std::move(States)), Edges(std::move(Edges)) {} + LRGraph(std::vector States, std::vector Edges, + std::vector> StartStates) + : States(std::move(States)), Edges(std::move(Edges)), + StartStates(std::move(StartStates)) {} std::vector States; std::vector Edges; + std::vector> StartStates; }; } // namespace pseudo diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h @@ -132,6 +132,17 @@ // Returns empty if no available actions in the table. llvm::ArrayRef find(StateID State, SymbolID Symbol) const; + // Returns the state from which the LR parser should start to parse the input + // tokens as the given StartSymbol. + // + // In LR parsing, the start state of `translation-unit` corresponds to + // `_ := • translation-unit`. + // + // Each start state responds to **a** single grammar rule like `_ := start`. + // REQUIRE: The given StartSymbol must exist in the grammar (in a form of + // `_ := start`). + StateID getStartState(SymbolID StartSymbol) const; + size_t bytes() const { return sizeof(*this) + Actions.capacity() * sizeof(Action) + States.capacity() * sizeof(StateID) + @@ -171,7 +182,10 @@ std::vector States; // A flat list of available actions, sorted by (SymbolID, State). std::vector Actions; + // A sorted table, storing the start state for each target parsing symbol. + std::vector> StartStates; }; + llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &); } // namespace pseudo diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -36,8 +36,8 @@ return OS; } -const ForestNode &glrParse(const TokenStream &Tokens, - const ParseParams &Params) { +const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params, + SymbolID Target) { llvm::ArrayRef Terminals = Params.Forest.createTerminals(Tokens); auto &G = Params.G; auto &GSS = Params.GSStack; @@ -61,9 +61,9 @@ } } }; - std::vector NewHeads = { - GSS.addNode(/*State=*/0, /*ForestNode*/ nullptr, {})}; + GSS.addNode(/*State=*/Params.Table.getStartState(Target), + /*ForestNode=*/nullptr, {})}; for (const ForestNode &Terminal : Terminals) { LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n", G.symbolName(Terminal.symbol()), @@ -101,11 +101,7 @@ return *PendingAccept.front().Head->Payload; } // We failed to parse the input, returning an opaque forest node for recovery. - auto RulesForStart = G.rulesFor(G.startSymbol()); - // FIXME: support multiple start symbols. - assert(RulesForStart.size() == 1 && RulesForStart.front().Size == 1 && - "start symbol _ must have exactly one rule"); - return Params.Forest.createOpaque(RulesForStart.front().Sequence[0], 0); + return Params.Forest.createOpaque(Target, /*Token::Index=*/0); } // Apply all pending shift actions. diff --git a/clang-tools-extra/pseudo/lib/Grammar.cpp b/clang-tools-extra/pseudo/lib/Grammar.cpp --- a/clang-tools-extra/pseudo/lib/Grammar.cpp +++ b/clang-tools-extra/pseudo/lib/Grammar.cpp @@ -24,13 +24,7 @@ } Grammar::Grammar(std::unique_ptr Table) : T(std::move(Table)) { - // start symbol is named _, binary search it. - auto It = llvm::partition_point( - T->Nonterminals, - [](const GrammarTable::Nonterminal &X) { return X.Name < "_"; }); - assert(It != T->Nonterminals.end() && It->Name == "_" && - "symbol _ must exist in the grammar!"); - StartSymbol = It - T->Nonterminals.begin(); + Underscore = *findNonterminal("_"); } llvm::ArrayRef Grammar::rulesFor(SymbolID SID) const { @@ -51,6 +45,15 @@ return T->Nonterminals[SID].Name; } +llvm::Optional Grammar::findNonterminal(llvm::StringRef Name) const { + auto It = llvm::partition_point( + T->Nonterminals, + [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; }); + if (It != T->Nonterminals.end() && It->Name == Name) + return It - T->Nonterminals.begin(); + return llvm::None; +} + std::string Grammar::dumpRule(RuleID RID) const { std::string Result; llvm::raw_string_ostream OS(Result); @@ -132,8 +135,9 @@ // is completed at a fixed point where there is no more new symbols can be // added to any of the follow sets. // - // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol. - FollowSets[G.startSymbol()].insert(tokenSymbol(tok::eof)); + // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the + // augmented grammar, in our case it is '_'. + FollowSets[G.underscore()].insert(tokenSymbol(tok::eof)); bool Changed = true; while (Changed) { Changed = false; @@ -179,5 +183,15 @@ } GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {} +SymbolID findNonterminal(llvm::StringRef Name, + const clang::pseudo::GrammarTable >) { + auto It = llvm::partition_point( + GT.Nonterminals, + [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; }); + assert(It != GT.Nonterminals.end() && It->Name == Name && + "Given nonterminal must exist in the grammar!"); + return It - GT.Nonterminals.begin(); +} + } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/lib/LRGraph.cpp b/clang-tools-extra/pseudo/lib/LRGraph.cpp --- a/clang-tools-extra/pseudo/lib/LRGraph.cpp +++ b/clang-tools-extra/pseudo/lib/LRGraph.cpp @@ -187,10 +187,17 @@ return States[ID]; } + void addStartState(SymbolID Sym, StateID State) { + StartStates.push_back({Sym, State}); + } + LRGraph build() && { States.shrink_to_fit(); Edges.shrink_to_fit(); - return LRGraph(std::move(States), std::move(Edges)); + llvm::sort(StartStates); + StartStates.shrink_to_fit(); + return LRGraph(std::move(States), std::move(Edges), + std::move(StartStates)); } private: @@ -199,16 +206,22 @@ std::vector States; std::vector Edges; const Grammar &G; + std::vector> StartStates; } Builder(G); std::vector PendingStates; // Initialize states with the start symbol. - auto RRange = G.table().Nonterminals[G.startSymbol()].RuleRange; + auto RRange = G.table().Nonterminals[G.underscore()].RuleRange; for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) { auto StartState = std::vector{Item::start(RID, G)}; auto Result = Builder.insert(std::move(StartState)); assert(Result.second && "State must be new"); PendingStates.push_back(Result.first); + + const Rule &StartRule = G.lookupRule(RID); + assert(StartRule.Size == 1 && + "Start rule must have exactly one symbol in its body!"); + Builder.addStartState(StartRule.seq().front(), Result.first); } while (!PendingStates.empty()) { diff --git a/clang-tools-extra/pseudo/lib/LRTable.cpp b/clang-tools-extra/pseudo/lib/LRTable.cpp --- a/clang-tools-extra/pseudo/lib/LRTable.cpp +++ b/clang-tools-extra/pseudo/lib/LRTable.cpp @@ -120,5 +120,16 @@ /*length=*/End - Start); } +LRTable::StateID LRTable::getStartState(SymbolID Target) const { + assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!"); + auto It = llvm::partition_point( + StartStates, [Target](const std::pair &X) { + return X.first < Target; + }); + assert(It != StartStates.end() && It->first == Target && + "target symbol doesn't have a start state!"); + return It->second; +} + } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/lib/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/LRTableBuild.cpp --- a/clang-tools-extra/pseudo/lib/LRTableBuild.cpp +++ b/clang-tools-extra/pseudo/lib/LRTableBuild.cpp @@ -40,6 +40,9 @@ class LRTable::Builder { public: + Builder(llvm::ArrayRef> StartStates) + : StartStates(StartStates) {} + bool insert(Entry E) { return Entries.insert(std::move(E)).second; } LRTable build(const GrammarTable >) && { // E.g. given the following parsing table with 3 states and 3 terminals: @@ -92,24 +95,26 @@ tokenSymbol(static_cast(Terminal))) ++SortedIndex; } + Table.StartStates = std::move(StartStates); return Table; } private: llvm::DenseSet Entries; + std::vector> StartStates; }; LRTable LRTable::buildForTests(const GrammarTable >, llvm::ArrayRef Entries) { - Builder Build; + Builder Build({}); for (const Entry &E : Entries) Build.insert(E); return std::move(Build).build(GT); } LRTable LRTable::buildSLR(const Grammar &G) { - Builder Build; auto Graph = LRGraph::buildLR0(G); + Builder Build(Graph.startStates()); for (const auto &T : Graph.edges()) { Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst); Build.insert({T.Src, T.Label, Act}); @@ -120,7 +125,7 @@ for (StateID SID = 0; SID < Graph.states().size(); ++SID) { for (const Item &I : Graph.states()[SID].Items) { // If we've just parsed the start symbol, we can accept the input. - if (G.lookupRule(I.rule()).Target == G.startSymbol() && !I.hasNext()) { + if (G.lookupRule(I.rule()).Target == G.underscore() && !I.hasNext()) { Build.insert({SID, tokenSymbol(tok::eof), Action::accept(I.rule())}); continue; } diff --git a/clang-tools-extra/pseudo/lib/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx.bnf --- a/clang-tools-extra/pseudo/lib/cxx.bnf +++ b/clang-tools-extra/pseudo/lib/cxx.bnf @@ -23,10 +23,15 @@ # - optional symbols are supported, with a _opt suffix; # # [1] https://isocpp.org/files/papers/N4860.pdf + +# _ lists all the start-symbols which we support parsing. # -# -# _ serves as a "fake" start symbol, coming with real grammar symbols. +# We list important nonterminals as start symbols, rather than doing it for all +# nonterminals by default, this reduces the number of states by 30% and LRTable +# actions by 16%. _ := translation-unit +_ := statement-seq +_ := declaration-seq # gram.key typedef-name := IDENTIFIER diff --git a/clang-tools-extra/pseudo/test/glr-variant-start.cpp b/clang-tools-extra/pseudo/test/glr-variant-start.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/test/glr-variant-start.cpp @@ -0,0 +1,9 @@ +// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s + +a + a; +// CHECK: statement-seq~expression-statement := expression ; +// CHECK-NEXT: ├─expression~additive-expression := additive-expression + multiplicative-expression +// CHECK-NEXT: │ ├─additive-expression~IDENTIFIER := +// CHECK-NEXT: │ ├─+ := +// CHECK-NEXT: │ └─multiplicative-expression~IDENTIFIER := +// CHECK-NEXT: └─; := diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -43,6 +43,9 @@ desc("Strip directives and select conditional sections")); static opt PrintStatistics("print-statistics", desc("Print GLR parser statistics")); static opt PrintForest("print-forest", desc("Print parse forest")); +static opt StartSymbol("start-symbol", + desc("specify the start symbol to parse"), + init("translation-unit")); static std::string readOrDie(llvm::StringRef Path) { llvm::ErrorOr> Text = @@ -110,9 +113,16 @@ if (ParseableStream) { clang::pseudo::ForestArena Arena; clang::pseudo::GSS GSS; - auto &Root = - glrParse(*ParseableStream, - clang::pseudo::ParseParams{*G, LRTable, Arena, GSS}); + llvm::Optional StartSymID = + G->findNonterminal(StartSymbol); + if (!StartSymID) { + llvm::errs() << llvm::formatv( + "The start symbol {0} doesn't exit in the grammar!\n", Grammar); + return 2; + } + auto &Root = glrParse(*ParseableStream, + clang::pseudo::ParseParams{*G, LRTable, Arena, GSS}, + *StartSymID); if (PrintForest) llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true); diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -344,7 +344,8 @@ const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions); auto LRTable = LRTable::buildSLR(*G); - const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack}); + const ForestNode &Parsed = + glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test")); // Verify that there is no duplicated sequence node of `expr := IDENTIFIER` // in the forest, see the `#1` and `=#1` in the dump string. EXPECT_EQ(Parsed.dumpRecursive(*G), @@ -381,7 +382,8 @@ const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions); auto LRTable = LRTable::buildSLR(*G); - const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack}); + const ForestNode &Parsed = + glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test")); EXPECT_EQ(Parsed.dumpRecursive(*G), "[ 0, end) test := \n" "[ 0, end) ├─test := IDENTIFIER\n" diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp --- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp @@ -112,7 +112,7 @@ b := a )cpp"); - EXPECT_EQ(G->startSymbol(), id("_")); + EXPECT_EQ(G->underscore(), id("_")); EXPECT_THAT(Diags, UnorderedElementsAre( "Rule '_ := ,_opt' has a nullable RHS", "Rule 'null := ' has a nullable RHS",