diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -122,13 +122,12 @@ ForestArena &Forest; // Storage for the output forest. GSS &GSStack; // Storage for parsing stacks. }; -// Parse the given token stream with the GLR algorithm, and return a forest node -// of the start symbol. +// Parses the given token stream as the Target symbol with the GLR algorithm, +// and returns a forest node of the target symbol. // // If the parsing fails, we model it as an opaque node in the forest. -// -// FIXME: add support for variant start symbols. -const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params); +const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params, + SymbolID Target); // An active stack head can have multiple available actions (reduce/reduce // actions, reduce/shift actions). diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h @@ -139,15 +139,21 @@ llvm::ArrayRef states() const { return States; } llvm::ArrayRef edges() const { return Edges; } + llvm::ArrayRef> startStates() const { + return StartStates; + } std::string dumpForTests(const Grammar &) const; private: - LRGraph(std::vector States, std::vector Edges) - : States(std::move(States)), Edges(std::move(Edges)) {} + LRGraph(std::vector States, std::vector Edges, + std::vector> StartStates) + : States(std::move(States)), Edges(std::move(Edges)), + StartStates(std::move(StartStates)) {} std::vector States; std::vector Edges; + std::vector> StartStates; }; } // namespace pseudo diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h @@ -132,6 +132,16 @@ // Returns empty if no available actions in the table. llvm::ArrayRef find(StateID State, SymbolID Symbol) const; + // Returns the state from which the LR parser should start to parse the input + // tokens as the given Target symbol. + // + // In LR parsing, the start state of `translation-unit` corresponds to + // `_ := • translation-unit`. + // + // REQUIRE: The given Target symbol must exist in the grammar (in a form of + // `_ := target`). + StateID getStartState(SymbolID Target) const; + size_t bytes() const { return sizeof(*this) + Actions.capacity() * sizeof(Action) + States.capacity() * sizeof(StateID) + @@ -171,7 +181,10 @@ std::vector States; // A flat list of available actions, sorted by (SymbolID, State). std::vector Actions; + // A sorted table, storing the start state for each target parsing symbol. + std::vector> StartStates; }; + llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &); } // namespace pseudo diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -36,8 +36,8 @@ return OS; } -const ForestNode &glrParse(const TokenStream &Tokens, - const ParseParams &Params) { +const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params, + SymbolID Target) { llvm::ArrayRef Terminals = Params.Forest.createTerminals(Tokens); auto &G = Params.G; auto &GSS = Params.GSStack; @@ -61,9 +61,9 @@ } } }; - std::vector NewHeads = { - GSS.addNode(/*State=*/0, /*ForestNode*/ nullptr, {})}; + GSS.addNode(/*State=*/Params.Table.getStartState(Target), + /*ForestNode=*/nullptr, {})}; for (const ForestNode &Terminal : Terminals) { LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n", G.symbolName(Terminal.symbol()), @@ -101,11 +101,7 @@ return *PendingAccept.front().Head->Payload; } // We failed to parse the input, returning an opaque forest node for recovery. - auto RulesForStart = G.rulesFor(G.startSymbol()); - // FIXME: support multiple start symbols. - assert(RulesForStart.size() == 1 && RulesForStart.front().Size == 1 && - "start symbol _ must have exactly one rule"); - return Params.Forest.createOpaque(RulesForStart.front().Sequence[0], 0); + return Params.Forest.createOpaque(Target, /*Token::Index=*/0); } // Apply all pending shift actions. diff --git a/clang-tools-extra/pseudo/lib/LRGraph.cpp b/clang-tools-extra/pseudo/lib/LRGraph.cpp --- a/clang-tools-extra/pseudo/lib/LRGraph.cpp +++ b/clang-tools-extra/pseudo/lib/LRGraph.cpp @@ -160,9 +160,12 @@ } LRGraph LRGraph::buildLR0(const Grammar &G) { + std::vector> StartStates; class Builder { public: - Builder(const Grammar &G) : G(G) {} + Builder(const Grammar &G, + std::vector> &StartStates) + : G(G), StartStates(StartStates) {} // Adds a given state if not existed. std::pair insert(ItemSet KernelItems) { @@ -190,7 +193,10 @@ LRGraph build() && { States.shrink_to_fit(); Edges.shrink_to_fit(); - return LRGraph(std::move(States), std::move(Edges)); + llvm::sort(StartStates); + StartStates.shrink_to_fit(); + return LRGraph(std::move(States), std::move(Edges), + std::move(StartStates)); } private: @@ -199,7 +205,8 @@ std::vector States; std::vector Edges; const Grammar &G; - } Builder(G); + std::vector> &StartStates; + } Builder(G, StartStates); std::vector PendingStates; // Initialize states with the start symbol. @@ -209,6 +216,11 @@ auto Result = Builder.insert(std::move(StartState)); assert(Result.second && "State must be new"); PendingStates.push_back(Result.first); + + const Rule &StartRule = G.lookupRule(RID); + assert(StartRule.Size == 1 && + "Start rule must have exactly one symbol in its body!"); + StartStates.push_back({StartRule.seq().front(), Result.first}); } while (!PendingStates.empty()) { diff --git a/clang-tools-extra/pseudo/lib/LRTable.cpp b/clang-tools-extra/pseudo/lib/LRTable.cpp --- a/clang-tools-extra/pseudo/lib/LRTable.cpp +++ b/clang-tools-extra/pseudo/lib/LRTable.cpp @@ -120,5 +120,16 @@ /*length=*/End - Start); } +LRTable::StateID LRTable::getStartState(SymbolID Target) const { + assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!"); + auto It = llvm::partition_point( + StartStates, [Target](const std::pair &X) { + return X.first < Target; + }); + assert(It != StartStates.end() && It->first == Target && + "target symbol doesn't have a start state!"); + return It->second; +} + } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/lib/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/LRTableBuild.cpp --- a/clang-tools-extra/pseudo/lib/LRTableBuild.cpp +++ b/clang-tools-extra/pseudo/lib/LRTableBuild.cpp @@ -40,6 +40,9 @@ class LRTable::Builder { public: + Builder(std::vector> StartStates) + : StartStates(std::move(StartStates)) {} + bool insert(Entry E) { return Entries.insert(std::move(E)).second; } LRTable build(const GrammarTable >) && { // E.g. given the following parsing table with 3 states and 3 terminals: @@ -92,24 +95,26 @@ tokenSymbol(static_cast(Terminal))) ++SortedIndex; } + Table.StartStates = std::move(StartStates); return Table; } private: llvm::DenseSet Entries; + std::vector> StartStates; }; LRTable LRTable::buildForTests(const GrammarTable >, llvm::ArrayRef Entries) { - Builder Build; + Builder Build({}); for (const Entry &E : Entries) Build.insert(E); return std::move(Build).build(GT); } LRTable LRTable::buildSLR(const Grammar &G) { - Builder Build; auto Graph = LRGraph::buildLR0(G); + Builder Build(Graph.startStates()); for (const auto &T : Graph.edges()) { Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst); Build.insert({T.Src, T.Label, Act}); diff --git a/clang-tools-extra/pseudo/lib/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx.bnf --- a/clang-tools-extra/pseudo/lib/cxx.bnf +++ b/clang-tools-extra/pseudo/lib/cxx.bnf @@ -25,8 +25,11 @@ # [1] https://isocpp.org/files/papers/N4860.pdf # # -# _ serves as a "fake" start symbol, coming with real grammar symbols. +# _ serves as a "fake" start symbol, coming with real grammar symbols which we +# target to parse. _ := translation-unit +_ := statement-seq +_ := declaration-seq # gram.key typedef-name := IDENTIFIER diff --git a/clang-tools-extra/pseudo/test/glr-variant-start.cpp b/clang-tools-extra/pseudo/test/glr-variant-start.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/test/glr-variant-start.cpp @@ -0,0 +1,9 @@ +// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --parse-symbol=statement-seq --print-forest | FileCheck %s + +a + a; +// CHECK: statement-seq~expression-statement := expression ; +// CHECK-NEXT: ├─expression~additive-expression := additive-expression + multiplicative-expression +// CHECK-NEXT: │ ├─additive-expression~IDENTIFIER := +// CHECK-NEXT: │ ├─+ := +// CHECK-NEXT: │ └─multiplicative-expression~IDENTIFIER := +// CHECK-NEXT: └─; := diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -38,6 +38,9 @@ desc("Print directive structure of source code")); static opt PrintStatistics("print-statistics", desc("Print GLR parser statistics")); static opt PrintForest("print-forest", desc("Print parse forest")); +static opt ParseSymbol("parse-symbol", + desc("specify the target symbol to parse"), + init("translation-unit")); static std::string readOrDie(llvm::StringRef Path) { llvm::ErrorOr> Text = @@ -50,6 +53,14 @@ return Text.get()->getBuffer().str(); } +static clang::pseudo::SymbolID +findNonterminalID(llvm::StringRef Name, const clang::pseudo::Grammar &G) { + for (clang::pseudo::SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID) + if (G.table().Nonterminals[ID].Name == Name) + return ID; + return 0; +} + int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); @@ -96,9 +107,9 @@ if (ParseableStream) { clang::pseudo::ForestArena Arena; clang::pseudo::GSS GSS; - auto &Root = - glrParse(*ParseableStream, - clang::pseudo::ParseParams{*G, LRTable, Arena, GSS}); + auto &Root = glrParse(*ParseableStream, + clang::pseudo::ParseParams{*G, LRTable, Arena, GSS}, + findNonterminalID(ParseSymbol, *G)); if (PrintForest) llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true); diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -344,7 +344,8 @@ const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions); auto LRTable = LRTable::buildSLR(*G); - const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack}); + const ForestNode &Parsed = + glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test")); // Verify that there is no duplicated sequence node of `expr := IDENTIFIER` // in the forest, see the `#1` and `=#1` in the dump string. EXPECT_EQ(Parsed.dumpRecursive(*G), @@ -381,7 +382,8 @@ const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions); auto LRTable = LRTable::buildSLR(*G); - const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack}); + const ForestNode &Parsed = + glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test")); EXPECT_EQ(Parsed.dumpRecursive(*G), "[ 0, end) test := \n" "[ 0, end) ├─test := IDENTIFIER\n"