diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include #include @@ -37,62 +36,6 @@ return OS; } -const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params, - SymbolID StartSymbol) { - assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal"); - llvm::ArrayRef Terminals = Params.Forest.createTerminals(Tokens); - auto &G = Params.G; - (void)G; - auto &GSS = Params.GSStack; - - StateID StartState = Params.Table.getStartState(StartSymbol); - std::vector Heads = {GSS.addNode(/*State=*/StartState, - /*ForestNode=*/nullptr, - {})}; - std::vector NextHeads; - auto MaybeGC = [&, Roots(std::vector{}), I(0u)]() mutable { - assert(NextHeads.empty() && "Running GC at the wrong time!"); - if (++I != 20) // Run periodically to balance CPU and memory usage. - return; - I = 0; - - // We need to copy the list: Roots is consumed by the GC. - Roots = Heads; - GSS.gc(std::move(Roots)); - }; - for (unsigned I = 0; I < Terminals.size(); ++I) { - LLVM_DEBUG(llvm::dbgs() << llvm::formatv( - "Next token {0} (id={1})\n", - G.symbolName(Terminals[I].symbol()), Terminals[I].symbol())); - glrShift(Heads, Terminals[I], Params, NextHeads); - std::swap(Heads, NextHeads); - SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof) - : Terminals[I + 1].symbol(); - glrReduce(Heads, Lookahead, Params); - NextHeads.clear(); - MaybeGC(); - } - LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n")); - - StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol); - const ForestNode *Result = nullptr; - for (const auto *Head : Heads) { - if (Head->State == AcceptState) { - assert(Head->Payload->symbol() == StartSymbol); - assert(Result == nullptr && "multiple results!"); - Result = Head->Payload; - } - } - if (Result) - return *Result; - // We failed to parse the input, returning an opaque forest node for recovery. - // - // FIXME: We will need to invoke our generic error-recovery handlers when we - // reach EOF without reaching accept state, and involving the eof - // token in the above main for-loopmay be the best way to reuse the code). - return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); -} - // Apply all pending shift actions. // In theory, LR parsing doesn't have shift/shift conflicts on a single head. // But we may have multiple active heads, and each head has a shift action. @@ -153,7 +96,6 @@ llvm::sort(Vec); Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end()); } -} // namespace // Perform reduces until no more are possible. // @@ -197,9 +139,12 @@ // After reducing 3 by `pointer := class-name STAR` and // 2 by`enum-name := class-name STAR`: // 0--5(pointer) // 5 is goto(0, pointer) -void glrReduce(std::vector &Heads, SymbolID Lookahead, - const ParseParams &Params) { - assert(isToken(Lookahead)); +// +// (This is a functor rather than a function to allow it to reuse scratch +// storage across calls). +class GLRReduce { + const ParseParams &Params; + // There are two interacting complications: // 1. Performing one reduce can unlock new reduces on the newly-created head. // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes). @@ -255,49 +200,38 @@ const GSS::Node* Base = nullptr; Sequence Seq; }; - KeyedQueue Sequences; - - Sequence TempSequence; + KeyedQueue Sequences; // FIXME: rename => PendingPushes? // We treat Heads as a queue of Pop operations still to be performed. // PoppedHeads is our position within it. - unsigned PoppedHeads = 0; - // In general the sequencing is complicated: each pop can yield multiple - // pending pushes that might run in a different order than we found them. - // However in trivial cases (only pop that yields only one push) we can - // bypass all these fancy queues and pop+push directly. This is very common. - auto PopAndPushTrivial = [&]() -> bool { - if (!Sequences.empty() || Heads.size() != PoppedHeads + 1) - return false; - const GSS::Node *Head = Heads.back(); - llvm::Optional RID; - for (auto &A : Params.Table.getActions(Head->State, Lookahead)) { - if (A.kind() != LRTable::Action::Reduce) - continue; - if (RID.hasValue()) - return false; - RID = A.getReduceRule(); - } - if (!RID.hasValue()) - return false; - const auto &Rule = Params.G.lookupRule(*RID); - const GSS::Node *Base = Head; - TempSequence.resize_for_overwrite(Rule.Size); - for (unsigned I = 0; I < Rule.Size; ++I) { - if (Base->parents().size() != 1) - return false; - TempSequence[Rule.Size - 1 - I] = Base->Payload; - Base = Base->parents().front(); + std::vector *Heads; + unsigned PoppedHeads; + SymbolID Lookahead; + + Sequence TempSequence; + +public: + GLRReduce(const ParseParams &Params) : Params(Params) {} + + void operator()(std::vector &Heads, SymbolID Lookahead) { + assert(isToken(Lookahead)); + + PoppedHeads = 0; + this->Heads = &Heads; + this->Lookahead = Lookahead; + assert(Sequences.empty()); + + popPending(); + while (!Sequences.empty()) { + pushNext(); + popPending(); } - const ForestNode *Parsed = - &Params.Forest.createSequence(Rule.Target, *RID, TempSequence); - StateID NextState = Params.Table.getGoToState(Base->State, Rule.Target); - Heads.push_back(Params.GSStack.addNode(NextState, Parsed, {Base})); - return true; - }; - // Pop walks up the parent chain(s) for a reduction from Head by to Rule. + } + +private: + // pop walks up the parent chain(s) for a reduction from Head by to Rule. // Once we reach the end, record the bases and sequences. - auto Pop = [&](const GSS::Node *Head, RuleID RID) { + void pop(const GSS::Node *Head, RuleID RID) { LLVM_DEBUG(llvm::dbgs() << " Pop " << Params.G.dumpRule(RID) << "\n"); const auto &Rule = Params.G.lookupRule(RID); Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID}; @@ -314,32 +248,32 @@ DFS(Parent, I + 1, DFS); }; DFS(Head, 0, DFS); - }; - auto PopPending = [&] { - for (; PoppedHeads < Heads.size(); ++PoppedHeads) { - if (PopAndPushTrivial()) + } + + // popPending pops every available reduction. + void popPending() { + for (; PoppedHeads < Heads->size(); ++PoppedHeads) { + if (popAndPushTrivial()) continue; for (const auto &A : - Params.Table.getActions(Heads[PoppedHeads]->State, Lookahead)) { + Params.Table.getActions((*Heads)[PoppedHeads]->State, Lookahead)) { if (A.kind() != LRTable::Action::Reduce) continue; - Pop(Heads[PoppedHeads], A.getReduceRule()); + pop((*Heads)[PoppedHeads], A.getReduceRule()); } } - }; + } + // Storage reused by each call to pushNext. std::vector> FamilyBases; std::vector> FamilySequences; + std::vector Parents; + std::vector SequenceNodes; - std::vector TempGSSNodes; - std::vector TempForestNodes; - - // Main reduction loop: - // - pop as much as we can - // - process one family at a time, forming a forest node - // - produces new GSS heads which may enable more pops - PopPending(); - while (!Sequences.empty()) { + // Process one push family, forming a forest node. + // This produces new GSS heads which may enable more pops. + void pushNext() { + assert(!Sequences.empty()); Family F = Sequences.top().first; LLVM_DEBUG(llvm::dbgs() << " Push " << Params.G.symbolName(F.Symbol) @@ -363,7 +297,6 @@ } while (!Sequences.empty() && Sequences.top().first == F); // Build a forest node for each unique sequence. sortAndUnique(FamilySequences); - auto &SequenceNodes = TempForestNodes; SequenceNodes.clear(); for (const auto &SequenceSpec : FamilySequences) SequenceNodes.push_back(&Params.Forest.createSequence( @@ -381,7 +314,6 @@ llvm::ArrayRef BasesLeft = FamilyBases; while (!BasesLeft.empty()) { StateID NextState = BasesLeft.front().first; - auto &Parents = TempGSSNodes; Parents.clear(); for (const auto &Base : BasesLeft) { if (Base.first != NextState) @@ -392,11 +324,109 @@ // Invoking the callback for new heads, a real GLR parser may add new // reduces to the PendingReduce queue! - Heads.push_back(Params.GSStack.addNode(NextState, Parsed, Parents)); + Heads->push_back(Params.GSStack.addNode(NextState, Parsed, Parents)); } - PopPending(); } - assert(Sequences.empty()); + + // In general the sequencing is complicated: each pop can yield multiple + // pending pushes that might run in a different order than we found them. + // However in trivial cases (only pop that yields only one push) we can + // bypass all these fancy queues and pop+push directly. This is very common. + bool popAndPushTrivial() { + if (!Sequences.empty() || Heads->size() != PoppedHeads + 1) + return false; + const GSS::Node *Head = Heads->back(); + llvm::Optional RID; + for (auto &A : Params.Table.getActions(Head->State, Lookahead)) { + if (A.kind() != LRTable::Action::Reduce) + continue; + if (RID.hasValue()) + return false; + RID = A.getReduceRule(); + } + if (!RID.hasValue()) + return false; + const auto &Rule = Params.G.lookupRule(*RID); + const GSS::Node *Base = Head; + TempSequence.resize_for_overwrite(Rule.Size); + for (unsigned I = 0; I < Rule.Size; ++I) { + if (Base->parents().size() != 1) + return false; + TempSequence[Rule.Size - 1 - I] = Base->Payload; + Base = Base->parents().front(); + } + const ForestNode *Parsed = + &Params.Forest.createSequence(Rule.Target, *RID, TempSequence); + StateID NextState = Params.Table.getGoToState(Base->State, Rule.Target); + Heads->push_back(Params.GSStack.addNode(NextState, Parsed, {Base})); + return true; + } +}; + +} // namespace + +const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params, + SymbolID StartSymbol) { + GLRReduce Reduce(Params); + + assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal"); + llvm::ArrayRef Terminals = Params.Forest.createTerminals(Tokens); + auto &G = Params.G; + (void)G; + auto &GSS = Params.GSStack; + + StateID StartState = Params.Table.getStartState(StartSymbol); + std::vector Heads = {GSS.addNode(/*State=*/StartState, + /*ForestNode=*/nullptr, + {})}; + std::vector NextHeads; + auto MaybeGC = [&, Roots(std::vector{}), I(0u)]() mutable { + assert(NextHeads.empty() && "Running GC at the wrong time!"); + if (++I != 20) // Run periodically to balance CPU and memory usage. + return; + I = 0; + + // We need to copy the list: Roots is consumed by the GC. + Roots = Heads; + GSS.gc(std::move(Roots)); + }; + for (unsigned I = 0; I < Terminals.size(); ++I) { + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + "Next token {0} (id={1})\n", + G.symbolName(Terminals[I].symbol()), Terminals[I].symbol())); + glrShift(Heads, Terminals[I], Params, NextHeads); + std::swap(Heads, NextHeads); + SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof) + : Terminals[I + 1].symbol(); + Reduce(Heads, Lookahead); + NextHeads.clear(); + MaybeGC(); + } + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n")); + + StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol); + const ForestNode *Result = nullptr; + for (const auto *Head : Heads) { + if (Head->State == AcceptState) { + assert(Head->Payload->symbol() == StartSymbol); + assert(Result == nullptr && "multiple results!"); + Result = Head->Payload; + } + } + if (Result) + return *Result; + // We failed to parse the input, returning an opaque forest node for recovery. + // + // FIXME: We will need to invoke our generic error-recovery handlers when we + // reach EOF without reaching accept state, and involving the eof + // token in the above main for-loopmay be the best way to reuse the code). + return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); +} + +void glrReduce(std::vector &Heads, SymbolID Lookahead, + const ParseParams &Params) { + // Create a new GLRReduce each time for tests, performance doesn't matter. + GLRReduce{Params}(Heads, Lookahead); } const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,