diff --git a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp --- a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp +++ b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp @@ -77,11 +77,11 @@ } BENCHMARK(parseBNF); -static void buildSLR(benchmark::State &State) { +static void buildLR0(benchmark::State &State) { for (auto _ : State) - LRTable::buildSLR(*G); + LRTable::buildLR0(*G); } -BENCHMARK(buildSLR); +BENCHMARK(buildLR0); TokenStream lexAndPreprocess() { clang::LangOptions LangOpts = genericLangOpts(); @@ -129,7 +129,7 @@ BENCHMARK(preprocess); static void glrParse(benchmark::State &State) { - LRTable Table = clang::pseudo::LRTable::buildSLR(*G); + LRTable Table = clang::pseudo::LRTable::buildLR0(*G); SymbolID StartSymbol = *G->findNonterminal("translation-unit"); TokenStream Stream = lexAndPreprocess(); for (auto _ : State) { @@ -143,7 +143,7 @@ BENCHMARK(glrParse); static void full(benchmark::State &State) { - LRTable Table = clang::pseudo::LRTable::buildSLR(*G); + LRTable Table = clang::pseudo::LRTable::buildLR0(*G); SymbolID StartSymbol = *G->findNonterminal("translation-unit"); for (auto _ : State) { TokenStream Stream = lexAndPreprocess(); diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -132,34 +132,23 @@ const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params, SymbolID StartSymbol); -// An active stack head can have multiple available actions (reduce/reduce -// actions, reduce/shift actions). -// A step is any one action applied to any one stack head. -struct ParseStep { - // A specific stack head. - const GSS::Node *Head = nullptr; - // An action associated with the head. - LRTable::Action Action = LRTable::Action::sentinel(); -}; -// A callback is invoked whenever a new GSS head is created during the GLR -// parsing process (glrShift, or glrReduce). -using NewHeadCallback = std::function; // Apply all PendingShift actions on a given GSS state, newly-created heads are // passed to the callback. // // When this function returns, PendingShift is empty. // // Exposed for testing only. -void glrShift(std::vector &PendingShift, const ForestNode &NextTok, - const ParseParams &Params, NewHeadCallback NewHeadCB); +void glrShift(llvm::ArrayRef OldHeads, + const ForestNode &NextTok, const ParseParams &Params, + std::vector &NewHeads); // Applies PendingReduce actions, until no more reduce actions are available. // // When this function returns, PendingReduce is empty. Calls to NewHeadCB may // add elements to PendingReduce // // Exposed for testing only. -void glrReduce(std::vector &PendingReduce, const ParseParams &Params, - NewHeadCallback NewHeadCB); +void glrReduce(std::vector &Heads, + const ParseParams &Params); } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h @@ -174,12 +174,6 @@ // the augmented grammar.) SymbolID Underscore; }; -// For each nonterminal X, computes the set of terminals that begin strings -// derived from X. (Known as FIRST sets in grammar-based parsers). -std::vector> firstSets(const Grammar &); -// For each nonterminal X, computes the set of terminals that could immediately -// follow X. (Known as FOLLOW sets in grammar-based parsers). -std::vector> followSets(const Grammar &); // Storage for the underlying data of the Grammar. // It can be constructed dynamically (from compiling BNF file) or statically diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h @@ -38,9 +38,39 @@ #include "clang-pseudo/Grammar.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Capacity.h" +#include "llvm/Support/raw_ostream.h" #include #include +namespace clang { +namespace pseudo { +struct StateSymbol { + using StateID = uint16_t; // XXX + StateID State; + SymbolID Symbol; +}; +} // namespace pseudo +} // namespace clang +namespace llvm { +template <> struct llvm::DenseMapInfo { + using StateSymbol = clang::pseudo::StateSymbol; + static inline StateSymbol getEmptyKey() { + return StateSymbol{StateSymbol::StateID(-1), 0}; + } + static inline StateSymbol getTombstoneKey() { + return StateSymbol{StateSymbol::StateID(-1), 1}; + } + static unsigned getHashValue(const StateSymbol &Val) { + return (Val.State * 2754435761U) ^ Val.Symbol; // Knuth hash + } + static bool isEqual(const StateSymbol &LHS, const StateSymbol &RHS) { + return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol; + } +}; +} // namespace llvm namespace clang { namespace pseudo { @@ -60,79 +90,26 @@ using StateID = uint16_t; static constexpr unsigned StateBits = 13; - // Action represents the terminal and nonterminal actions, it combines the - // entry of the ACTION and GOTO tables from the LR literature. - class Action { - public: - enum Kind : uint8_t { - Sentinel = 0, - // Terminal actions, corresponding to entries of ACTION table. - - // Shift to state n: move forward with the lookahead, and push state n - // onto the state stack. - // A shift is a forward transition, and the value n is the next state that - // the parser is to enter. - Shift, - // Reduce by a rule: pop the state stack. - Reduce, - - // NOTE: there are no typical accept actions in the LRtable, accept - // actions are handled specifically in the parser -- if the parser - // reaches to a target state (which is goto(StartState, StartSymbol)) at - // the EOF token after a reduce, this indicates the input has been parsed - // as the StartSymbol successfully. - - // Nonterminal actions, corresponding to entry of GOTO table. - - // Go to state n: push state n onto the state stack. - // Similar to Shift, but it is a nonterminal forward transition. - GoTo, - }; - - static Action goTo(StateID S) { return Action(GoTo, S); } - static Action shift(StateID S) { return Action(Shift, S); } - static Action reduce(RuleID RID) { return Action(Reduce, RID); } - static Action sentinel() { return Action(Sentinel, 0); } - - StateID getShiftState() const { - assert(kind() == Shift); - return Value; - } - StateID getGoToState() const { - assert(kind() == GoTo); - return Value; - } - RuleID getReduceRule() const { - assert(kind() == Reduce); - return Value; - } - Kind kind() const { return static_cast(K); } - - bool operator==(const Action &L) const { return opaque() == L.opaque(); } - uint16_t opaque() const { return K << ValueBits | Value; }; - - private: - Action(Kind K1, unsigned Value) : K(K1), Value(Value) {} - static constexpr unsigned ValueBits = StateBits; - static constexpr unsigned KindBits = 3; - static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID"); - static_assert(KindBits + ValueBits <= 16, - "Must be able to store kind and value efficiently"); - uint16_t K : KindBits; - // Either StateID or RuleID, depending on the Kind. - uint16_t Value : ValueBits; - }; - - // Returns all available actions for the given state on a terminal. - // Expected to be called by LR parsers. - llvm::ArrayRef getActions(StateID State, SymbolID Terminal) const; - // Returns the state after we reduce a nonterminal. - // Expected to be called by LR parsers. - StateID getGoToState(StateID State, SymbolID Nonterminal) const; - - // Looks up available actions. - // Returns empty if no available actions in the table. - llvm::ArrayRef find(StateID State, SymbolID Symbol) const; + // Return the list of reductions applicable from a given state. + // These correspond to items with the dot at the end. + llvm::ArrayRef getReduceRules(StateID State) const { + return Reduce.find(State); + } + // Return the state to transition to after shifting a token Terminal. + // Returns None if this terminal is not legal here. + llvm::Optional getShiftState(StateID From, SymbolID Terminal) const { + auto It = Shift.find(StateSymbol{From, Terminal}); + if (It == Shift.end()) + return llvm::None; + return It->second; + } + // Return the state to transition to after reducing a symbol Nonterminal. + // REQUIRES: this nonterminal is legal here. + StateID getGoToState(StateID From, SymbolID Nonterminal) const { + auto It = Goto.find(StateSymbol{From, Nonterminal}); + assert(It != Goto.end()); + return It->second; + } // Returns the state from which the LR parser should start to parse the input // tokens as the given StartSymbol. @@ -146,18 +123,16 @@ StateID getStartState(SymbolID StartSymbol) const; size_t bytes() const { - return sizeof(*this) + Actions.capacity() * sizeof(Action) + - Symbols.capacity() * sizeof(SymbolID) + - StateOffset.capacity() * sizeof(uint32_t); + return sizeof(*this) + llvm::capacity_in_bytes(Shift) + + llvm::capacity_in_bytes(Goto) + Reduce.bytes(); } std::string dumpStatistics() const; std::string dumpForTests(const Grammar &G) const; - // Build a SLR(1) parsing table. - static LRTable buildSLR(const Grammar &G); + static LRTable buildLR0(const Grammar &G); - class Builder; +#if 0 // Represents an entry in the table, used for building the LRTable. struct Entry { StateID State; @@ -166,27 +141,63 @@ }; // Build a specifid table for testing purposes. static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef); +#endif private: - // Conceptually the LR table is a multimap from (State, SymbolID) => Action. - // Our physical representation is quite different for compactness. - - // Index is StateID, value is the offset into Symbols/Actions - // where the entries for this state begin. - // Give a state id, the corresponding half-open range of Symbols/Actions is - // [StateOffset[id], StateOffset[id+1]). - std::vector StateOffset; - // Parallel to Actions, the value is SymbolID (columns of the matrix). - // Grouped by the StateID, and only subranges are sorted. - std::vector Symbols; - // A flat list of available actions, sorted by (State, SymbolID). - std::vector Actions; + // A multimap from K => V, where Ks form a dense range [0, n). + // A flat array stores the values: + // Values = [ values for k=0 | values for k=1 | values for k=2 | ... ] + // And another stores the index for each K: + // Keys = [ index for k=0, index for k=1, ... , Values.size()] + // Lookup[k] is Values[ Keys[k]...Keys[k+1] ] + template struct OffsetTable { + std::vector Keys; + std::vector Values; + + OffsetTable(std::vector> &&Pairs, K Limit) { + llvm::stable_sort(Pairs, llvm::less_first{}); + Keys.reserve(Limit + 1); + Values.reserve(Pairs.size()); + unsigned I = 0; + for (K Key = 0; Key < Limit; ++Key) { + Keys.push_back(Values.size()); + while (I < Pairs.size() && Pairs[I].first == Key) + Values.push_back(Pairs[I++].second); + } + Keys.push_back(Values.size()); + assert(Values.size() == Pairs.size()); + assert(Keys.size() == Limit + 1); + } + + size_t bytes() const { + return sizeof(*this) + llvm::capacity_in_bytes(Keys) + + llvm::capacity_in_bytes(Values); + } + size_t size() const { return Values.size(); } + size_t keys() const { return Keys.size() - 1; } + + llvm::ArrayRef find(K Key) const { + return llvm::makeArrayRef(&Values[Keys[Key]], &Values[Keys[Key + 1]]); + } + }; + + llvm::DenseMap Shift; + OffsetTable Reduce; + llvm::DenseMap Goto; // A sorted table, storing the start state for each target parsing symbol. std::vector> StartStates; + + LRTable(llvm::DenseMap Shift, + OffsetTable Reduce, + llvm::DenseMap GoTo, + std::vector> StartStates) + : Shift(std::move(Shift)), Reduce(std::move(Reduce)), + Goto(std::move(GoTo)), StartStates(std::move(StartStates)) {} }; -llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &); } // namespace pseudo } // namespace clang +namespace llvm {} // namespace llvm + #endif // CLANG_PSEUDO_LRTABLE_H diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -46,67 +46,35 @@ auto &GSS = Params.GSStack; // Lists of active shift, reduce actions. - std::vector PendingShift, PendingReduce; - auto AddSteps = [&](const GSS::Node *Head, SymbolID NextTok) { - for (const auto &Action : Params.Table.getActions(Head->State, NextTok)) { - switch (Action.kind()) { - case LRTable::Action::Shift: - PendingShift.push_back({Head, Action}); - break; - case LRTable::Action::Reduce: - PendingReduce.push_back({Head, Action}); - break; - default: - llvm_unreachable("unexpected action kind!"); - } - } - }; StateID StartState = Params.Table.getStartState(StartSymbol); - std::vector NewHeads = { + std::vector Heads = { GSS.addNode(/*State=*/StartState, /*ForestNode=*/nullptr, {})}; + std::vector NextHeads; auto MaybeGC = [&, Roots(std::vector{}), I(0u)]() mutable { - assert(PendingShift.empty() && PendingReduce.empty() && - "Running GC at the wrong time!"); - + assert(NextHeads.empty() && "Running GC at the wrong time!"); if (++I != 20) // Run periodically to balance CPU and memory usage. return; I = 0; // We need to copy the list: Roots is consumed by the GC. - Roots = NewHeads; + Roots = Heads; GSS.gc(std::move(Roots)); }; for (const ForestNode &Terminal : Terminals) { LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n", G.symbolName(Terminal.symbol()), Terminal.symbol())); - for (const auto *Head : NewHeads) - AddSteps(Head, Terminal.symbol()); - NewHeads.clear(); - glrReduce(PendingReduce, Params, - [&](const GSS::Node * NewHead) { - // A reduce will enable more steps. - AddSteps(NewHead, Terminal.symbol()); - }); - - glrShift(PendingShift, Terminal, Params, - [&](const GSS::Node *NewHead) { NewHeads.push_back(NewHead); }); + glrShift(Heads, Terminal, Params, NextHeads); + std::swap(Heads, NextHeads); + glrReduce(Heads, Params); + NextHeads.clear(); MaybeGC(); } - LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n")); - for (const auto *Heads : NewHeads) - AddSteps(Heads, tokenSymbol(tok::eof)); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n")); StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol); // Collect new heads created from the final reduce. - std::vector Heads; - glrReduce(PendingReduce, Params, [&](const GSS::Node *NewHead) { - Heads.push_back(NewHead); - // A reduce will enable more steps. - AddSteps(NewHead, tokenSymbol(tok::eof)); - }); - const ForestNode *Result = nullptr; for (const auto *Head : Heads) { if (Head->State == AcceptState) { @@ -138,42 +106,40 @@ // After the shift action, the GSS is: // 0---1---2---4 // └---3---┘ -void glrShift(std::vector &PendingShift, const ForestNode &NewTok, - const ParseParams &Params, NewHeadCallback NewHeadCB) { +void glrShift(llvm::ArrayRef OldHeads, + const ForestNode &NewTok, const ParseParams &Params, + std::vector &NewHeads) { assert(NewTok.kind() == ForestNode::Terminal); - assert(llvm::all_of(PendingShift, - [](const ParseStep &Step) { - return Step.Action.kind() == LRTable::Action::Shift; - }) && - "Pending shift actions must be shift actions"); LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n", Params.G.symbolName(NewTok.symbol()), - PendingShift.size())); + OldHeads.size())); // We group pending shifts by their target state so we can merge them. - llvm::stable_sort(PendingShift, [](const ParseStep &L, const ParseStep &R) { - return L.Action.getShiftState() < R.Action.getShiftState(); - }); - auto Rest = llvm::makeArrayRef(PendingShift); + llvm::SmallVector, 8> Shifts; + for (const auto* H : OldHeads) + if (auto S = Params.Table.getShiftState(H->State, NewTok.symbol())) + Shifts.push_back({*S, H}); + llvm::stable_sort(Shifts, llvm::less_first{}); + + auto Rest = llvm::makeArrayRef(Shifts); llvm::SmallVector Parents; while (!Rest.empty()) { // Collect the batch of PendingShift that have compatible shift states. // Their heads become TempParents, the parents of the new GSS node. - StateID NextState = Rest.front().Action.getShiftState(); + StateID NextState = Rest.front().first; Parents.clear(); for (const auto &Base : Rest) { - if (Base.Action.getShiftState() != NextState) + if (Base.first != NextState) break; - Parents.push_back(Base.Head); + Parents.push_back(Base.second); } Rest = Rest.drop_front(Parents.size()); LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n", NextState, Parents.size())); - NewHeadCB(Params.GSStack.addNode(NextState, &NewTok, Parents)); + NewHeads.push_back(Params.GSStack.addNode(NextState, &NewTok, Parents)); } - PendingShift.clear(); } namespace { @@ -231,8 +197,8 @@ // After reducing 3 by `pointer := class-name STAR` and // 2 by`enum-name := class-name STAR`: // 0--5(pointer) // 5 is goto(0, pointer) -void glrReduce(std::vector &PendingReduce, const ParseParams &Params, - NewHeadCallback NewHeadCB) { +void glrReduce(std::vector &Heads, + const ParseParams &Params) { // There are two interacting complications: // 1. Performing one reduce can unlock new reduces on the newly-created head. // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes). @@ -311,10 +277,36 @@ }; DFS(Head, 0, DFS); }; + unsigned PoppedHeads = 0; + auto FastPath = [&]() -> bool { + if (!Sequences.empty() || Heads.size() != PoppedHeads + 1) + return false; + const GSS::Node *Head = Heads.back(); + auto Rules = Params.Table.getReduceRules(Head->State); + if (Rules.size() != 1) + return false; + const auto &Rule = Params.G.lookupRule(Rules.front()); + const GSS::Node *Base = Head; + TempSequence.resize_for_overwrite(Rule.Size); + for (unsigned I = 0; I < Rule.Size; ++I) { + if (Base->parents().size() != 1) + return false; + TempSequence[Rule.Size - 1 - I] = Base->Payload; + Base = Base->parents().front(); + } + const ForestNode *Parsed = + &Params.Forest.createSequence(Rule.Target, Rules.front(), TempSequence); + StateID NextState = Params.Table.getGoToState(Base->State, Rule.Target); + Heads.push_back(Params.GSStack.addNode(NextState, Parsed, {Base})); + return true; + }; auto PopPending = [&] { - for (const ParseStep &Pending : PendingReduce) - Pop(Pending.Head, Pending.Action.getReduceRule()); - PendingReduce.clear(); + for (; PoppedHeads < Heads.size(); ++PoppedHeads) { + if (FastPath()) + continue; + for (RuleID R : Params.Table.getReduceRules(Heads[PoppedHeads]->State)) + Pop(Heads[PoppedHeads], R); + } }; std::vector> FamilyBases; @@ -379,9 +371,7 @@ } BasesLeft = BasesLeft.drop_front(Parents.size()); - // Invoking the callback for new heads, a real GLR parser may add new - // reduces to the PendingReduce queue! - NewHeadCB(Params.GSStack.addNode(NextState, Parsed, Parents)); + Heads.push_back(Params.GSStack.addNode(NextState, Parsed, Parents)); } PopPending(); } diff --git a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp --- a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp +++ b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp @@ -25,7 +25,7 @@ } const LRTable &getLRTable() { - static LRTable *Table = new LRTable(LRTable::buildSLR(getGrammar())); + static LRTable *Table = new LRTable(LRTable::buildLR0(getGrammar())); return *Table; } diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp --- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp @@ -87,87 +87,6 @@ return OS.str(); } -std::vector> firstSets(const Grammar &G) { - std::vector> FirstSets( - G.table().Nonterminals.size()); - auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) { - assert(isNonterminal(Target)); - if (isToken(First)) - return FirstSets[Target].insert(First).second; - bool Changed = false; - for (SymbolID SID : FirstSets[First]) - Changed |= FirstSets[Target].insert(SID).second; - return Changed; - }; - - // A rule S := T ... implies elements in FIRST(S): - // - if T is a terminal, FIRST(S) contains T - // - if T is a nonterminal, FIRST(S) contains FIRST(T) - // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may - // end up being incomplete. - // We iterate until we hit a fixed point. - // (This isn't particularly efficient, but table building isn't on the - // critical path). - bool Changed = true; - while (Changed) { - Changed = false; - for (const auto &R : G.table().Rules) - // We only need to consider the first element because symbols are - // non-nullable. - Changed |= ExpandFirstSet(R.Target, R.seq().front()); - } - return FirstSets; -} - -std::vector> followSets(const Grammar &G) { - auto FirstSets = firstSets(G); - std::vector> FollowSets( - G.table().Nonterminals.size()); - // Expand the follow set of a nonterminal symbol Y by adding all from the - // given symbol set. - auto ExpandFollowSet = [&FollowSets](SymbolID Y, - const llvm::DenseSet &ToAdd) { - assert(isNonterminal(Y)); - bool Changed = false; - for (SymbolID F : ToAdd) - Changed |= FollowSets[Y].insert(F).second; - return Changed; - }; - // Follow sets is computed based on the following 3 rules, the computation - // is completed at a fixed point where there is no more new symbols can be - // added to any of the follow sets. - // - // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the - // augmented grammar, in our case it is '_'. - FollowSets[G.underscore()].insert(tokenSymbol(tok::eof)); - bool Changed = true; - while (Changed) { - Changed = false; - for (const auto &R : G.table().Rules) { - // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to - // FOLLOW(Y). - for (size_t I = 0; I + 1 < R.seq().size(); ++I) { - if (isToken(R.seq()[I])) - continue; - // We only need to consider the next symbol because symbols are - // non-nullable. - SymbolID Next = R.seq()[I + 1]; - if (isToken(Next)) - // First set for a terminal is itself. - Changed |= ExpandFollowSet(R.seq()[I], {Next}); - else - Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]); - } - // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to - // FOLLOW(Z). - SymbolID Z = R.seq().back(); - if (isNonterminal(Z)) - Changed |= ExpandFollowSet(Z, FollowSets[R.Target]); - } - } - return FollowSets; -} - static llvm::ArrayRef getTerminalNames() { static const auto &TerminalNames = []() { auto TerminalNames = new std::string[NumTerminals]; diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp @@ -10,35 +10,19 @@ #include "clang-pseudo/Grammar.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" namespace clang { namespace pseudo { -llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) { - switch (A.kind()) { - case LRTable::Action::Shift: - return OS << llvm::formatv("shift state {0}", A.getShiftState()); - case LRTable::Action::Reduce: - return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule()); - case LRTable::Action::GoTo: - return OS << llvm::formatv("go to state {0}", A.getGoToState()); - case LRTable::Action::Sentinel: - llvm_unreachable("unexpected Sentinel action kind!"); - } - llvm_unreachable("unexpected action kind!"); -} - std::string LRTable::dumpStatistics() const { return llvm::formatv(R"( Statistics of the LR parsing table: - number of states: {0} - number of actions: {1} - size of the table (bytes): {2} + number of actions: shift={0} reduce={1} goto={2} + size of the table (bytes): {3} )", - StateOffset.size() - 1, Actions.size(), bytes()) + Shift.size(), Reduce.size(), Goto.size(), bytes()) .str(); } @@ -46,66 +30,28 @@ std::string Result; llvm::raw_string_ostream OS(Result); OS << "LRTable:\n"; - for (StateID S = 0; S < StateOffset.size() - 1; ++S) { + for (StateID S = 0; S < Reduce.keys(); ++S) { OS << llvm::formatv("State {0}\n", S); + for (RuleID R : getReduceRules(S)) + OS.indent(4) << llvm::formatv("reduce by rule {0} '{1}'\n", R, + G.dumpRule(R)); for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) { SymbolID TokID = tokenSymbol(static_cast(Terminal)); - for (auto A : find(S, TokID)) { - if (A.kind() == LRTable::Action::Shift) - OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n", - G.symbolName(TokID), A.getShiftState()); - else if (A.kind() == LRTable::Action::Reduce) - OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n", - G.symbolName(TokID), A.getReduceRule(), - G.dumpRule(A.getReduceRule())); - } + if (auto Next = getShiftState(S, TokID)) + OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n", + G.symbolName(TokID), *Next); } for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size(); ++NontermID) { - if (find(S, NontermID).empty()) - continue; - OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n", - G.symbolName(NontermID), - getGoToState(S, NontermID)); + auto It = Goto.find(StateSymbol{S, NontermID}); + if (It != Goto.end()) + OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n", + G.symbolName(NontermID), It->second); } } return OS.str(); } -llvm::ArrayRef LRTable::getActions(StateID State, - SymbolID Terminal) const { - assert(pseudo::isToken(Terminal) && "expect terminal symbol!"); - return find(State, Terminal); -} - -LRTable::StateID LRTable::getGoToState(StateID State, - SymbolID Nonterminal) const { - assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!"); - auto Result = find(State, Nonterminal); - assert(Result.size() == 1 && Result.front().kind() == Action::GoTo); - return Result.front().getGoToState(); -} - -llvm::ArrayRef LRTable::find(StateID Src, SymbolID ID) const { - assert(Src + 1u < StateOffset.size()); - std::pair Range = - std::make_pair(StateOffset[Src], StateOffset[Src + 1]); - auto SymbolRange = llvm::makeArrayRef(Symbols.data() + Range.first, - Symbols.data() + Range.second); - - assert(llvm::is_sorted(SymbolRange) && - "subrange of the Symbols should be sorted!"); - const LRTable::StateID *Start = - llvm::partition_point(SymbolRange, [&ID](SymbolID S) { return S < ID; }); - if (Start == SymbolRange.end()) - return {}; - const LRTable::StateID *End = Start; - while (End != SymbolRange.end() && *End == ID) - ++End; - return llvm::makeArrayRef(&Actions[Start - Symbols.data()], - /*length=*/End - Start); -} - LRTable::StateID LRTable::getStartState(SymbolID Target) const { assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!"); auto It = llvm::partition_point( diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp @@ -9,35 +9,12 @@ #include "clang-pseudo/Grammar.h" #include "clang-pseudo/LRGraph.h" #include "clang-pseudo/LRTable.h" -#include "clang/Basic/TokenKinds.h" #include -namespace llvm { -template <> struct DenseMapInfo { - using Entry = clang::pseudo::LRTable::Entry; - static inline Entry getEmptyKey() { - static Entry E{static_cast(-1), 0, - clang::pseudo::LRTable::Action::sentinel()}; - return E; - } - static inline Entry getTombstoneKey() { - static Entry E{static_cast(-2), 0, - clang::pseudo::LRTable::Action::sentinel()}; - return E; - } - static unsigned getHashValue(const Entry &I) { - return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque()); - } - static bool isEqual(const Entry &LHS, const Entry &RHS) { - return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol && - LHS.Act == RHS.Act; - } -}; -} // namespace llvm - namespace clang { namespace pseudo { +#if 0 class LRTable::Builder { public: Builder(llvm::ArrayRef> StartStates) @@ -45,48 +22,6 @@ bool insert(Entry E) { return Entries.insert(std::move(E)).second; } LRTable build(const GrammarTable >, unsigned NumStates) && { - // E.g. given the following parsing table with 3 states and 3 terminals: - // - // a b c - // +-------+----+-------+-+ - // |state0 | | s0,r0 | | - // |state1 | acc| | | - // |state2 | | r1 | | - // +-------+----+-------+-+ - // - // The final LRTable: - // - StateOffset: [s0] = 0, [s1] = 2, [s2] = 3, [sentinel] = 4 - // - Symbols: [ b, b, a, b] - // Actions: [ s0, r0, acc, r1] - // ~~~~~~ range for state 0 - // ~~~~ range for state 1 - // ~~ range for state 2 - // First step, we sort all entries by (State, Symbol, Action). - std::vector Sorted(Entries.begin(), Entries.end()); - llvm::sort(Sorted, [](const Entry &L, const Entry &R) { - return std::forward_as_tuple(L.State, L.Symbol, L.Act.opaque()) < - std::forward_as_tuple(R.State, R.Symbol, R.Act.opaque()); - }); - - LRTable Table; - Table.Actions.reserve(Sorted.size()); - Table.Symbols.reserve(Sorted.size()); - // We are good to finalize the States and Actions. - for (const auto &E : Sorted) { - Table.Actions.push_back(E.Act); - Table.Symbols.push_back(E.Symbol); - } - // Initialize the terminal and nonterminal offset, all ranges are empty by - // default. - Table.StateOffset = std::vector(NumStates + 1, 0); - size_t SortedIndex = 0; - for (StateID State = 0; State < Table.StateOffset.size(); ++State) { - Table.StateOffset[State] = SortedIndex; - while (SortedIndex < Sorted.size() && Sorted[SortedIndex].State == State) - ++SortedIndex; - } - Table.StartStates = std::move(StartStates); - return Table; } private: @@ -105,34 +40,40 @@ return std::move(Build).build(GT, /*NumStates=*/MaxState + 1); } -LRTable LRTable::buildSLR(const Grammar &G) { +#endif + +LRTable LRTable::buildLR0(const Grammar &G) { auto Graph = LRGraph::buildLR0(G); - Builder Build(Graph.startStates()); - for (const auto &T : Graph.edges()) { - Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst); - Build.insert({T.Src, T.Label, Act}); - } assert(Graph.states().size() <= (1 << StateBits) && "Graph states execceds the maximum limit!"); - auto FollowSets = followSets(G); + + std::vector> Reduce; + llvm::DenseMap Shift; + llvm::DenseMap GoTo; + + for (const auto &T : Graph.edges()) { + (isToken(T.Label) ? Shift : GoTo) + .try_emplace(StateSymbol{T.Src, T.Label}, T.Dst); + } + for (StateID SID = 0; SID < Graph.states().size(); ++SID) { for (const Item &I : Graph.states()[SID].Items) { - // If we've just parsed the start symbol, this means we successfully parse - // the input. We don't add the reduce action of `_ := start_symbol` in the - // LRTable (the GLR parser handles it specifically). - if (G.lookupRule(I.rule()).Target == G.underscore() && !I.hasNext()) - continue; if (!I.hasNext()) { - // If we've reached the end of a rule A := ..., then we can reduce if - // the next token is in the follow set of A. - for (SymbolID Follow : FollowSets[G.lookupRule(I.rule()).Target]) { - assert(isToken(Follow)); - Build.insert({SID, Follow, Action::reduce(I.rule())}); - } + // If we've just parsed the start symbol, this means we successfully + // parse the input. We don't add the reduce action of `_ := + // start_symbol` in the LRTable (the GLR parser handles it + // specifically). + if (G.lookupRule(I.rule()).Target == G.underscore()) + continue; + // If we've reached the end of a rule A := ..., then we can reduce. + Reduce.push_back({SID, I.rule()}); } } } - return std::move(Build).build(G.table(), Graph.states().size()); + return LRTable( + std::move(Shift), + OffsetTable(std::move(Reduce), Graph.states().size()), + std::move(GoTo), Graph.startStates()); } } // namespace pseudo diff --git a/clang-tools-extra/pseudo/test/lr-build-basic.test b/clang-tools-extra/pseudo/test/lr-build-basic.test --- a/clang-tools-extra/pseudo/test/lr-build-basic.test +++ b/clang-tools-extra/pseudo/test/lr-build-basic.test @@ -23,6 +23,6 @@ # TABLE-NEXT: 'id': go to state 2 # TABLE-NEXT: State 1 # TABLE-NEXT: State 2 -# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := id' +# TABLE-NEXT: reduce by rule 1 'expr := id' # TABLE-NEXT: State 3 -# TABLE-NEXT: 'EOF': reduce by rule 0 'id := IDENTIFIER' +# TABLE-NEXT: reduce by rule 0 'id := IDENTIFIER' diff --git a/clang-tools-extra/pseudo/test/lr-build-conflicts.test b/clang-tools-extra/pseudo/test/lr-build-conflicts.test --- a/clang-tools-extra/pseudo/test/lr-build-conflicts.test +++ b/clang-tools-extra/pseudo/test/lr-build-conflicts.test @@ -35,12 +35,10 @@ # TABLE-NEXT: State 1 # TABLE-NEXT: '-': shift state 3 # TABLE-NEXT: State 2 -# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := IDENTIFIER' -# TABLE-NEXT: '-': reduce by rule 1 'expr := IDENTIFIER' +# TABLE-NEXT: reduce by rule 1 'expr := IDENTIFIER' # TABLE-NEXT: State 3 # TABLE-NEXT: 'IDENTIFIER': shift state 2 # TABLE-NEXT: 'expr': go to state 4 # TABLE-NEXT: State 4 -# TABLE-NEXT: 'EOF': reduce by rule 0 'expr := expr - expr' +# TABLE-NEXT: reduce by rule 0 'expr := expr - expr' # TABLE-NEXT: '-': shift state 3 -# TABLE-NEXT: '-': reduce by rule 0 'expr := expr - expr' diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -108,7 +108,7 @@ llvm::outs() << G->dump(); if (PrintGraph) llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G); - auto LRTable = clang::pseudo::LRTable::buildSLR(*G); + auto LRTable = clang::pseudo::LRTable::buildLR0(*G); if (PrintTable) llvm::outs() << LRTable.dumpForTests(*G); if (PrintStatistics) diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp --- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp @@ -142,66 +142,6 @@ "Unknown attribute 'unknown'")); } -TEST_F(GrammarTest, FirstAndFollowSets) { - build( - R"bnf( -_ := expr -expr := expr - term -expr := term -term := IDENTIFIER -term := ( expr ) -)bnf"); - ASSERT_TRUE(Diags.empty()); - auto ToPairs = [](std::vector> Input) { - std::vector>> Sets; - for (SymbolID ID = 0; ID < Input.size(); ++ID) - Sets.emplace_back(ID, std::move(Input[ID])); - return Sets; - }; - - EXPECT_THAT( - ToPairs(firstSets(*G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), - Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), - Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("("))))); - EXPECT_THAT( - ToPairs(followSets(*G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("EOF"))), - Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))), - Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))))); - - build(R"bnf( -# A simplfied C++ decl-specifier-seq. -_ := decl-specifier-seq -decl-specifier-seq := decl-specifier decl-specifier-seq -decl-specifier-seq := decl-specifier -decl-specifier := simple-type-specifier -decl-specifier := INLINE -simple-type-specifier := INT - )bnf"); - ASSERT_TRUE(Diags.empty()); - EXPECT_THAT( - ToPairs(firstSets(*G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))), - Pair(id("decl-specifier-seq"), - UnorderedElementsAre(id("INLINE"), id("INT"))), - Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))), - Pair(id("decl-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"))))); - EXPECT_THAT( - ToPairs(followSets(*G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("EOF"))), - Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))), - Pair(id("decl-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))), - Pair(id("simple-type-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))))); -} - } // namespace } // namespace pseudo } // namespace clang