diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -132,34 +132,17 @@ const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params, SymbolID StartSymbol); -// An active stack head can have multiple available actions (reduce/reduce -// actions, reduce/shift actions). -// A step is any one action applied to any one stack head. -struct ParseStep { - // A specific stack head. - const GSS::Node *Head = nullptr; - // An action associated with the head. - LRTable::Action Action = LRTable::Action::sentinel(); -}; -// A callback is invoked whenever a new GSS head is created during the GLR -// parsing process (glrShift, or glrReduce). -using NewHeadCallback = std::function; -// Apply all PendingShift actions on a given GSS state, newly-created heads are -// passed to the callback. -// -// When this function returns, PendingShift is empty. +// Shift a token onto all OldHeads, placing the results into NewHeads. // // Exposed for testing only. -void glrShift(std::vector &PendingShift, const ForestNode &NextTok, - const ParseParams &Params, NewHeadCallback NewHeadCB); -// Applies PendingReduce actions, until no more reduce actions are available. -// -// When this function returns, PendingReduce is empty. Calls to NewHeadCB may -// add elements to PendingReduce +void glrShift(llvm::ArrayRef OldHeads, + const ForestNode &NextTok, const ParseParams &Params, + std::vector &NewHeads); +// Applies available reductions on Heads, appending resulting heads to the list. // // Exposed for testing only. -void glrReduce(std::vector &PendingReduce, const ParseParams &Params, - NewHeadCallback NewHeadCB); +void glrReduce(std::vector &Heads, SymbolID Lookahead, + const ParseParams &Params); } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h @@ -128,7 +128,12 @@ llvm::ArrayRef getActions(StateID State, SymbolID Terminal) const; // Returns the state after we reduce a nonterminal. // Expected to be called by LR parsers. + // REQUIRES: Nonterminal is valid here. StateID getGoToState(StateID State, SymbolID Nonterminal) const; + // Returns the state after we shift a terminal. + // Expected to be called by LR parsers. + // If the terminal is invalid here, returns None. + llvm::Optional getShiftState(StateID State, SymbolID Terminal) const; // Looks up available actions. // Returns empty if no available actions in the table. diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -45,68 +45,41 @@ (void)G; auto &GSS = Params.GSStack; - // Lists of active shift, reduce actions. - std::vector PendingShift, PendingReduce; - auto AddSteps = [&](const GSS::Node *Head, SymbolID NextTok) { - for (const auto &Action : Params.Table.getActions(Head->State, NextTok)) { - switch (Action.kind()) { - case LRTable::Action::Shift: - PendingShift.push_back({Head, Action}); - break; - case LRTable::Action::Reduce: - PendingReduce.push_back({Head, Action}); - break; - default: - llvm_unreachable("unexpected action kind!"); - } - } - }; StateID StartState = Params.Table.getStartState(StartSymbol); - std::vector NewHeads = { - GSS.addNode(/*State=*/StartState, - /*ForestNode=*/nullptr, {})}; + // Heads correspond to the parse of tokens [0, I), NextHeads to [0, I+1). + std::vector Heads = {GSS.addNode(/*State=*/StartState, + /*ForestNode=*/nullptr, + {})}; + std::vector NextHeads; auto MaybeGC = [&, Roots(std::vector{}), I(0u)]() mutable { - assert(PendingShift.empty() && PendingReduce.empty() && - "Running GC at the wrong time!"); - + assert(NextHeads.empty() && "Running GC at the wrong time!"); if (++I != 20) // Run periodically to balance CPU and memory usage. return; I = 0; // We need to copy the list: Roots is consumed by the GC. - Roots = NewHeads; + Roots = Heads; GSS.gc(std::move(Roots)); }; - for (const ForestNode &Terminal : Terminals) { - LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n", - G.symbolName(Terminal.symbol()), - Terminal.symbol())); - for (const auto *Head : NewHeads) - AddSteps(Head, Terminal.symbol()); - NewHeads.clear(); - glrReduce(PendingReduce, Params, - [&](const GSS::Node * NewHead) { - // A reduce will enable more steps. - AddSteps(NewHead, Terminal.symbol()); - }); - - glrShift(PendingShift, Terminal, Params, - [&](const GSS::Node *NewHead) { NewHeads.push_back(NewHead); }); + // Each iteration fully processes a single token. + for (unsigned I = 0; I < Terminals.size(); ++I) { + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + "Next token {0} (id={1})\n", + G.symbolName(Terminals[I].symbol()), Terminals[I].symbol())); + // Consume the token. + glrShift(Heads, Terminals[I], Params, NextHeads); + // Form nonterminals containing the token we just consumed. + SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof) + : Terminals[I + 1].symbol(); + glrReduce(NextHeads, Lookahead, Params); + // Prepare for the next token. + std::swap(Heads, NextHeads); + NextHeads.clear(); MaybeGC(); } - LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n")); - for (const auto *Heads : NewHeads) - AddSteps(Heads, tokenSymbol(tok::eof)); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n")); StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol); - // Collect new heads created from the final reduce. - std::vector Heads; - glrReduce(PendingReduce, Params, [&](const GSS::Node *NewHead) { - Heads.push_back(NewHead); - // A reduce will enable more steps. - AddSteps(NewHead, tokenSymbol(tok::eof)); - }); - const ForestNode *Result = nullptr; for (const auto *Head : Heads) { if (Head->State == AcceptState) { @@ -138,42 +111,40 @@ // After the shift action, the GSS is: // 0---1---2---4 // └---3---┘ -void glrShift(std::vector &PendingShift, const ForestNode &NewTok, - const ParseParams &Params, NewHeadCallback NewHeadCB) { +void glrShift(llvm::ArrayRef OldHeads, + const ForestNode &NewTok, const ParseParams &Params, + std::vector &NewHeads) { assert(NewTok.kind() == ForestNode::Terminal); - assert(llvm::all_of(PendingShift, - [](const ParseStep &Step) { - return Step.Action.kind() == LRTable::Action::Shift; - }) && - "Pending shift actions must be shift actions"); LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n", Params.G.symbolName(NewTok.symbol()), - PendingShift.size())); + OldHeads.size())); // We group pending shifts by their target state so we can merge them. - llvm::stable_sort(PendingShift, [](const ParseStep &L, const ParseStep &R) { - return L.Action.getShiftState() < R.Action.getShiftState(); - }); - auto Rest = llvm::makeArrayRef(PendingShift); + llvm::SmallVector, 8> Shifts; + for (const auto *H : OldHeads) + if (auto S = Params.Table.getShiftState(H->State, NewTok.symbol())) + Shifts.push_back({*S, H}); + llvm::stable_sort(Shifts, llvm::less_first{}); + + auto Rest = llvm::makeArrayRef(Shifts); llvm::SmallVector Parents; while (!Rest.empty()) { // Collect the batch of PendingShift that have compatible shift states. // Their heads become TempParents, the parents of the new GSS node. - StateID NextState = Rest.front().Action.getShiftState(); + StateID NextState = Rest.front().first; Parents.clear(); for (const auto &Base : Rest) { - if (Base.Action.getShiftState() != NextState) + if (Base.first != NextState) break; - Parents.push_back(Base.Head); + Parents.push_back(Base.second); } Rest = Rest.drop_front(Parents.size()); LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n", NextState, Parents.size())); - NewHeadCB(Params.GSStack.addNode(NextState, &NewTok, Parents)); + NewHeads.push_back(Params.GSStack.addNode(NextState, &NewTok, Parents)); } - PendingShift.clear(); } namespace { @@ -231,8 +202,9 @@ // After reducing 3 by `pointer := class-name STAR` and // 2 by`enum-name := class-name STAR`: // 0--5(pointer) // 5 is goto(0, pointer) -void glrReduce(std::vector &PendingReduce, const ParseParams &Params, - NewHeadCallback NewHeadCB) { +void glrReduce(std::vector &Heads, SymbolID Lookahead, + const ParseParams &Params) { + assert(isToken(Lookahead)); // There are two interacting complications: // 1. Performing one reduce can unlock new reduces on the newly-created head. // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes). @@ -291,6 +263,10 @@ KeyedQueue Sequences; Sequence TempSequence; + + // We treat Heads as a queue of Pop operations still to be performed. + // NextPopHead is our position within it. + unsigned NextPopHead = 0; // Pop walks up the parent chain(s) for a reduction from Head by to Rule. // Once we reach the end, record the bases and sequences. auto Pop = [&](const GSS::Node *Head, RuleID RID) { @@ -312,9 +288,16 @@ DFS(Head, 0, DFS); }; auto PopPending = [&] { - for (const ParseStep &Pending : PendingReduce) - Pop(Pending.Head, Pending.Action.getReduceRule()); - PendingReduce.clear(); + for (; NextPopHead < Heads.size(); ++NextPopHead) { + // FIXME: if there's exactly one head in the queue, and the pop stage + // is trivial, we could pop + push without touching the expensive queues. + for (const auto &A : + Params.Table.getActions(Heads[NextPopHead]->State, Lookahead)) { + if (A.kind() != LRTable::Action::Reduce) + continue; + Pop(Heads[NextPopHead], A.getReduceRule()); + } + } }; std::vector> FamilyBases; @@ -378,10 +361,7 @@ Parents.push_back(Base.second); } BasesLeft = BasesLeft.drop_front(Parents.size()); - - // Invoking the callback for new heads, a real GLR parser may add new - // reduces to the PendingReduce queue! - NewHeadCB(Params.GSStack.addNode(NextState, Parsed, Parents)); + Heads.push_back(Params.GSStack.addNode(NextState, Parsed, Parents)); } PopPending(); } diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp @@ -72,6 +72,17 @@ return OS.str(); } +llvm::Optional +LRTable::getShiftState(StateID State, SymbolID Terminal) const { + // FIXME: we spend a significant amount of time on misses here. + // We could consider storing a std::bitset for a cheaper test? + assert(pseudo::isToken(Terminal) && "expected terminal symbol!"); + for (const auto &Result : getActions(State, Terminal)) + if (Result.kind() == Action::Shift) + return Result.getShiftState(); // unique: no shift/shift conflicts. + return llvm::None; +} + llvm::ArrayRef LRTable::getActions(StateID State, SymbolID Terminal) const { assert(pseudo::isToken(Terminal) && "expect terminal symbol!"); diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/GLR.h" -#include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/Token.h" +#include "clang-pseudo/grammar/Grammar.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/StringExtras.h" @@ -31,6 +31,7 @@ using Action = LRTable::Action; using testing::AllOf; +using testing::UnorderedElementsAre; MATCHER_P(state, StateID, "") { return arg->State == StateID; } MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; } @@ -83,17 +84,10 @@ return 0; } - NewHeadCallback captureNewHeads() { - return [this](const GSS::Node *NewHead) { - NewHeadResults.push_back(NewHead); - }; - }; - protected: std::unique_ptr G; ForestArena Arena; GSS GSStack; - std::vector NewHeadResults; }; TEST_F(GLRTest, ShiftMergingHeads) { @@ -109,31 +103,32 @@ // └---3---5 auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); - auto *GSSNode1 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, + auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0}); - auto *GSSNode2 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, + auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0}); - auto *GSSNode3 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, + auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0}); buildGrammar({}, {}); // Create a fake empty grammar. - LRTable T = LRTable::buildForTests(G->table(), /*Entries=*/{}); + LRTable T = + LRTable::buildForTests(G->table(), /*Entries=*/{ + {1, tokenSymbol(tok::semi), Action::shift(4)}, + {2, tokenSymbol(tok::semi), Action::shift(4)}, + {3, tokenSymbol(tok::semi), Action::shift(5)}, + }); ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0); - std::vector PendingShift = { - {GSSNode1, Action::shift(4)}, - {GSSNode3, Action::shift(5)}, - {GSSNode2, Action::shift(4)}, - }; - glrShift(PendingShift, SemiTerminal, {*G, T, Arena, GSStack}, - captureNewHeads()); - - EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre( - AllOf(state(4), parsedSymbol(&SemiTerminal), - parents({GSSNode1, GSSNode2})), - AllOf(state(5), parsedSymbol(&SemiTerminal), - parents({GSSNode3})))) - << NewHeadResults; + std::vector NewHeads; + glrShift({GSSNode1, GSSNode2, GSSNode3}, SemiTerminal, + {*G, T, Arena, GSStack}, NewHeads); + + EXPECT_THAT(NewHeads, + UnorderedElementsAre(AllOf(state(4), parsedSymbol(&SemiTerminal), + parents({GSSNode1, GSSNode2})), + AllOf(state(5), parsedSymbol(&SemiTerminal), + parents({GSSNode3})))) + << NewHeads; } TEST_F(GLRTest, ReduceConflictsSplitting) { @@ -147,25 +142,29 @@ {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"}); LRTable Table = LRTable::buildForTests( - G->table(), {{/*State=*/0, id("class-name"), Action::goTo(2)}, - {/*State=*/0, id("enum-name"), Action::goTo(3)}}); + G->table(), { + {/*State=*/0, id("class-name"), Action::goTo(2)}, + {/*State=*/0, id("enum-name"), Action::goTo(3)}, + {/*State=*/1, tokenSymbol(tok::l_brace), + Action::reduce(ruleFor("class-name"))}, + {/*State=*/1, tokenSymbol(tok::l_brace), + Action::reduce(ruleFor("enum-name"))}, + }); const auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); const auto *GSSNode1 = - GSStack.addNode(3, &Arena.createTerminal(tok::identifier, 0), {GSSNode0}); - - std::vector PendingReduce = { - {GSSNode1, Action::reduce(ruleFor("class-name"))}, - {GSSNode1, Action::reduce(ruleFor("enum-name"))}}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, - captureNewHeads()); - EXPECT_THAT(NewHeadResults, - testing::UnorderedElementsAre( - AllOf(state(2), parsedSymbolID(id("class-name")), - parents({GSSNode0})), - AllOf(state(3), parsedSymbolID(id("enum-name")), - parents({GSSNode0})))) << NewHeadResults; + GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0}); + + std::vector Heads = {GSSNode1}; + glrReduce(Heads, tokenSymbol(tok::l_brace), {*G, Table, Arena, GSStack}); + EXPECT_THAT(Heads, UnorderedElementsAre( + GSSNode1, + AllOf(state(2), parsedSymbolID(id("class-name")), + parents({GSSNode0})), + AllOf(state(3), parsedSymbolID(id("enum-name")), + parents({GSSNode0})))) + << Heads; } TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) { @@ -191,22 +190,25 @@ LRTable Table = LRTable::buildForTests( G->table(), - {{/*State=*/2, id("ptr-operator"), Action::goTo(/*NextState=*/5)}, - {/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)}}); - std::vector PendingReduce = { - {GSSNode4, Action::reduce(ruleFor("ptr-operator"))}}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, - captureNewHeads()); - - EXPECT_THAT(NewHeadResults, - testing::UnorderedElementsAre( - AllOf(state(5), parsedSymbolID(id("ptr-operator")), - parents({GSSNode2})), - AllOf(state(6), parsedSymbolID(id("ptr-operator")), - parents({GSSNode3})))) << NewHeadResults; + { + {/*State=*/2, id("ptr-operator"), Action::goTo(/*NextState=*/5)}, + {/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)}, + {/*State=*/4, tokenSymbol(tok::identifier), + Action::reduce(ruleFor("ptr-operator"))}, + }); + std::vector Heads = {GSSNode4}; + glrReduce(Heads, tokenSymbol(tok::identifier), {*G, Table, Arena, GSStack}); + + EXPECT_THAT(Heads, UnorderedElementsAre( + GSSNode4, + AllOf(state(5), parsedSymbolID(id("ptr-operator")), + parents({GSSNode2})), + AllOf(state(6), parsedSymbolID(id("ptr-operator")), + parents({GSSNode3})))) + << Heads; // Verify that the payload of the two new heads is shared, only a single // ptr-operator node is created in the forest. - EXPECT_EQ(NewHeadResults[0]->Payload, NewHeadResults[1]->Payload); + EXPECT_EQ(Heads[1]->Payload, Heads[2]->Payload); } TEST_F(GLRTest, ReduceJoiningWithMultipleBases) { @@ -238,28 +240,28 @@ GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode, /*Parents=*/{GSSNode2}); + // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! LRTable Table = LRTable::buildForTests( G->table(), - {{/*State=*/1, id("type-name"), Action::goTo(/*NextState=*/5)}, - {/*State=*/2, id("type-name"), Action::goTo(/*NextState=*/5)}}); - // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! - std::vector PendingReduce = { { - GSSNode3, Action::reduce(/*RuleID=*/0) // type-name := class-name - }, - { - GSSNode4, Action::reduce(/*RuleID=*/1) // type-name := enum-name - }}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, - captureNewHeads()); + {/*State=*/1, id("type-name"), Action::goTo(/*NextState=*/5)}, + {/*State=*/2, id("type-name"), Action::goTo(/*NextState=*/5)}, + {/*State=*/3, tokenSymbol(tok::l_paren), + Action::reduce(/* type-name := class-name */ 0)}, + {/*State=*/4, tokenSymbol(tok::l_paren), + Action::reduce(/* type-name := enum-name */ 1)}, + }); + std::vector Heads = {GSSNode3, GSSNode4}; + glrReduce(Heads, tokenSymbol(tok::l_paren), {*G, Table, Arena, GSStack}); // Verify that the stack heads are joint at state 5 after reduces. - EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(AllOf( - state(5), parsedSymbolID(id("type-name")), - parents({GSSNode1, GSSNode2})))) - << NewHeadResults; + EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4, + AllOf(state(5), + parsedSymbolID(id("type-name")), + parents({GSSNode1, GSSNode2})))) + << Heads; // Verify that we create an ambiguous ForestNode of two parses of `type-name`. - EXPECT_EQ(NewHeadResults.front()->Payload->dumpRecursive(*G), + EXPECT_EQ(Heads.back()->Payload->dumpRecursive(*G), "[ 1, end) type-name := \n" "[ 1, end) ├─type-name := class-name\n" "[ 1, end) │ └─class-name := \n" @@ -296,24 +298,24 @@ GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal, /*Parents=*/{GSSNode2}); - LRTable Table = LRTable::buildForTests( - G->table(), {{/*State=*/0, id("pointer"), Action::goTo(5)}}); // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! - std::vector PendingReduce = { - { - GSSNode3, Action::reduce(/*RuleID=*/0) // pointer := class-name * - }, - { - GSSNode4, Action::reduce(/*RuleID=*/1) // pointer := enum-name * - }}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, - captureNewHeads()); - - EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre( + LRTable Table = LRTable::buildForTests( + G->table(), { + {/*State=*/0, id("pointer"), Action::goTo(5)}, + {3, tokenSymbol(tok::l_paren), + Action::reduce(/* pointer := class-name */ 0)}, + {4, tokenSymbol(tok::l_paren), + Action::reduce(/* pointer := enum-name */ 1)}, + }); + std::vector Heads = {GSSNode3, GSSNode4}; + glrReduce(Heads, tokenSymbol(tok::l_paren), {*G, Table, Arena, GSStack}); + + EXPECT_THAT( + Heads, UnorderedElementsAre(GSSNode3, GSSNode4, AllOf(state(5), parsedSymbolID(id("pointer")), parents({GSSNode0})))) - << NewHeadResults; - EXPECT_EQ(NewHeadResults.front()->Payload->dumpRecursive(*G), + << Heads; + EXPECT_EQ(Heads.back()->Payload->dumpRecursive(*G), "[ 0, end) pointer := \n" "[ 0, end) ├─pointer := class-name *\n" "[ 0, 1) │ ├─class-name := \n"