diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -71,6 +71,8 @@ LRTable::StateID State; // Used internally to track reachability during garbage collection. bool GCParity; + // Have we already used this node for error recovery? (prevents loops) + mutable bool Recovered = false; // Number of the parents of this node. // The parents hold previous parsed symbols, and may resume control after // this node is reduced. diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp --- a/clang-tools-extra/pseudo/lib/Forest.cpp +++ b/clang-tools-extra/pseudo/lib/Forest.cpp @@ -178,7 +178,7 @@ llvm::ArrayRef ForestArena::createTerminals(const TokenStream &Code) { - ForestNode *Terminals = Arena.Allocate(Code.tokens().size()); + ForestNode *Terminals = Arena.Allocate(Code.tokens().size() + 1); size_t Index = 0; for (const auto &T : Code.tokens()) { new (&Terminals[Index]) @@ -186,6 +186,12 @@ /*Start=*/Index, /*TerminalData*/ 0); ++Index; } + // Include an `eof` terminal. + // This is important to drive the final shift/recover/reduce loop. + new (&Terminals[Index]) + ForestNode(ForestNode::Terminal, tokenSymbol(tok::eof), + /*Start=*/Index, /*TerminalData*/ 0); + ++Index; NodeCount = Index; return llvm::makeArrayRef(Terminals, Index); } diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -95,17 +95,19 @@ auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) { if (!Seen.insert(N).second) return; - for (auto Strategy : Lang.Table.getRecovery(N->State)) { - Options.push_back(PlaceholderRecovery{ - NextTok, - Strategy.Result, - Strategy.Strategy, - N, - Path, - }); - LLVM_DEBUG(llvm::dbgs() - << "Option: recover " << Lang.G.symbolName(Strategy.Result) - << " at token " << NextTok << "\n"); + if (!N->Recovered) { // Don't recover the same way twice! + for (auto Strategy : Lang.Table.getRecovery(N->State)) { + Options.push_back(PlaceholderRecovery{ + NextTok, + Strategy.Result, + Strategy.Strategy, + N, + Path, + }); + LLVM_DEBUG(llvm::dbgs() + << "Option: recover " << Lang.G.symbolName(Strategy.Result) + << " at token " << NextTok << "\n"); + } } Path.push_back(N->Payload); for (const GSS::Node *Parent : N->parents()) @@ -180,6 +182,7 @@ // There are various options, including simply breaking ties between options. // For now it's obscure enough to ignore. for (const PlaceholderRecovery *Option : BestOptions) { + Option->RecoveryNode->Recovered = true; const ForestNode &Placeholder = Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin); LRTable::StateID OldState = Option->RecoveryNode->State; @@ -587,6 +590,9 @@ auto NextState = Lang.Table.getGoToState(Base->State, Rule.Target); assert(NextState.has_value() && "goto must succeed after reduce!"); Heads->push_back(Params.GSStack.addNode(*NextState, Parsed, {Base})); + LLVM_DEBUG(llvm::dbgs() + << " Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n" + << " --> S" << Heads->back()->State << "\n"); return true; } }; @@ -638,7 +644,7 @@ // We discard all heads formed by reduction, and recreate them without // this constraint. This may duplicate some nodes, but it's rare. LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. " - "Re-reducing without lookahead."); + "Re-reducing without lookahead.\n"); Heads.resize(HeadsPartition); Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown)); @@ -662,34 +668,26 @@ } LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n")); - // The parse was successful if we're in state `_ := start-symbol .` - auto AcceptState = Lang.Table.getGoToState(StartState, StartSymbol); - assert(AcceptState.has_value() && "goto must succeed after start symbol!"); + // The parse was successful if in state `_ := start-symbol EOF .` + // The GSS parent has `_ := start-symbol . EOF`; its payload is the parse. + auto AfterStart = Lang.Table.getGoToState(StartState, StartSymbol); + assert(AfterStart.has_value() && "goto must succeed after start symbol!"); + auto Accept = Lang.Table.getShiftState(*AfterStart, tokenSymbol(tok::eof)); + assert(Accept.has_value() && "shift EOF must succeed!"); auto SearchForAccept = [&](llvm::ArrayRef Heads) { const ForestNode *Result = nullptr; for (const auto *Head : Heads) { - if (Head->State == *AcceptState) { - assert(Head->Payload->symbol() == StartSymbol); + if (Head->State == *Accept) { + assert(Head->Payload->symbol() == tokenSymbol(tok::eof)); assert(Result == nullptr && "multiple results!"); - Result = Head->Payload; + Result = Head->parents().front()->Payload; + assert(Result->symbol() == StartSymbol); } } return Result; }; if (auto *Result = SearchForAccept(Heads)) return *Result; - // Failed to parse the input, attempt to run recovery. - // FIXME: this awkwardly repeats the recovery in the loop, when shift fails. - // More elegant is to include EOF in the token stream, and make the - // augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF - // would not be a special case: it show up as a failure to shift the EOF - // token. - unsigned I = Terminals.size(); - glrRecover(Heads, I, Params, Lang, NextHeads); - Reduce(NextHeads, tokenSymbol(tok::eof)); - if (auto *Result = SearchForAccept(NextHeads)) - return *Result; - // We failed to parse the input, returning an opaque forest node for recovery. // FIXME: as above, we can add fallback error handling so this is impossible. return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); @@ -704,8 +702,10 @@ const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol, llvm::ArrayRef Parents) { - Node *Result = new (allocate(Parents.size())) - Node({State, GCParity, static_cast(Parents.size())}); + Node *Result = new (allocate(Parents.size())) Node(); + Result->State = State; + Result->GCParity = GCParity; + Result->ParentCount = Parents.size(); Alive.push_back(Result); ++NodesCreated; Result->Payload = Symbol; diff --git a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf --- a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf +++ b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf @@ -29,9 +29,9 @@ # We list important nonterminals as start symbols, rather than doing it for all # nonterminals by default, this reduces the number of states by 30% and LRTable # actions by 16%. -_ := translation-unit -_ := statement-seq -_ := declaration-seq +_ := translation-unit EOF +_ := statement-seq EOF +_ := declaration-seq EOF # gram.key #! we don't distinguish between namespaces and namespace aliases, as it's hard diff --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp @@ -240,8 +240,9 @@ PendingStates.push_back(Result.first); const Rule &StartRule = G.lookupRule(RID); - assert(StartRule.Size == 1 && - "Start rule must have exactly one symbol in its body!"); + assert(StartRule.Size == 2 && + StartRule.seq().back() == tokenSymbol(tok::eof) && + "Start rule must be of the form `_ := start-symbol EOF`!"); Builder.addStartState(StartRule.seq().front(), Result.first); } diff --git a/clang-tools-extra/pseudo/test/lr-build-basic.test b/clang-tools-extra/pseudo/test/lr-build-basic.test --- a/clang-tools-extra/pseudo/test/lr-build-basic.test +++ b/clang-tools-extra/pseudo/test/lr-build-basic.test @@ -1,19 +1,21 @@ -_ := expr +_ := expr EOF expr := id id := IDENTIFIER # RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH # GRAPH: States: # GRAPH-NEXT: State 0 -# GRAPH-NEXT: _ := • expr +# GRAPH-NEXT: _ := • expr EOF # GRAPH-NEXT: expr := • id # GRAPH-NEXT: id := • IDENTIFIER # GRAPH-NEXT: State 1 -# GRAPH-NEXT: _ := expr • +# GRAPH-NEXT: _ := expr • EOF # GRAPH-NEXT: State 2 # GRAPH-NEXT: expr := id • # GRAPH-NEXT: State 3 # GRAPH-NEXT: id := IDENTIFIER • +# GRAPH-NEXT: State 4 +# GRAPH-NEXT: _ := expr EOF • # RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE # TABLE: LRTable: @@ -22,7 +24,9 @@ # TABLE-NEXT: expr: go to state 1 # TABLE-NEXT: id: go to state 2 # TABLE-NEXT: State 1 +# TABLE-NEXT: EOF: shift state 4 # TABLE-NEXT: State 2 -# TABLE-NEXT: EOF: reduce by rule 1 'expr := id' +# TABLE-NEXT: EOF: reduce by rule 2 'expr := id' # TABLE-NEXT: State 3 -# TABLE-NEXT: EOF: reduce by rule 0 'id := IDENTIFIER' +# TABLE-NEXT: EOF: reduce by rule 1 'id := IDENTIFIER' +# TABLE-NEXT: State 4 diff --git a/clang-tools-extra/pseudo/test/lr-build-conflicts.test b/clang-tools-extra/pseudo/test/lr-build-conflicts.test --- a/clang-tools-extra/pseudo/test/lr-build-conflicts.test +++ b/clang-tools-extra/pseudo/test/lr-build-conflicts.test @@ -1,31 +1,34 @@ -_ := expr +_ := expr EOF expr := expr - expr # S/R conflict at state 4 on '-' token expr := IDENTIFIER # RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH # GRAPH: States # GRAPH-NEXT: State 0 +# GRAPH-NEXT: _ := • expr EOF # GRAPH-NEXT: expr := • expr - expr -# GRAPH-NEXT: _ := • expr # GRAPH-NEXT: expr := • IDENTIFIER # GRAPH-NEXT: State 1 -# GRAPH-NEXT: _ := expr • +# GRAPH-NEXT: _ := expr • EOF # GRAPH-NEXT: expr := expr • - expr # GRAPH-NEXT: State 2 # GRAPH-NEXT: expr := IDENTIFIER • # GRAPH-NEXT: State 3 +# GRAPH-NEXT: _ := expr EOF • +# GRAPH-NEXT: State 4 # GRAPH-NEXT: expr := • expr - expr # GRAPH-NEXT: expr := expr - • expr # GRAPH-NEXT: expr := • IDENTIFIER -# GRAPH-NEXT: State 4 +# GRAPH-NEXT: State 5 # GRAPH-NEXT: expr := expr - expr • # GRAPH-NEXT: expr := expr • - expr # GRAPH-NEXT: 0 ->[expr] 1 # GRAPH-NEXT: 0 ->[IDENTIFIER] 2 -# GRAPH-NEXT: 1 ->[-] 3 -# GRAPH-NEXT: 3 ->[expr] 4 -# GRAPH-NEXT: 3 ->[IDENTIFIER] 2 -# GRAPH-NEXT: 4 ->[-] 3 +# GRAPH-NEXT: 1 ->[EOF] 3 +# GRAPH-NEXT: 1 ->[-] 4 +# GRAPH-NEXT: 4 ->[expr] 5 +# GRAPH-NEXT: 4 ->[IDENTIFIER] 2 +# GRAPH-NEXT: 5 ->[-] 4 # RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE # TABLE: LRTable: @@ -33,12 +36,14 @@ # TABLE-NEXT: IDENTIFIER: shift state 2 # TABLE-NEXT: expr: go to state 1 # TABLE-NEXT: State 1 -# TABLE-NEXT: -: shift state 3 +# TABLE-NEXT: EOF: shift state 3 +# TABLE-NEXT: -: shift state 4 # TABLE-NEXT: State 2 -# TABLE-NEXT: EOF -: reduce by rule 1 'expr := IDENTIFIER' +# TABLE-NEXT: EOF -: reduce by rule 2 'expr := IDENTIFIER' # TABLE-NEXT: State 3 -# TABLE-NEXT: IDENTIFIER: shift state 2 -# TABLE-NEXT: expr: go to state 4 # TABLE-NEXT: State 4 -# TABLE-NEXT: -: shift state 3 -# TABLE-NEXT: EOF -: reduce by rule 0 'expr := expr - expr' +# TABLE-NEXT: IDENTIFIER: shift state 2 +# TABLE-NEXT: expr: go to state 5 +# TABLE-NEXT: State 5 +# TABLE-NEXT: -: shift state 4 +# TABLE-NEXT: EOF -: reduce by rule 1 'expr := expr - expr' diff --git a/clang-tools-extra/pseudo/unittests/ForestTest.cpp b/clang-tools-extra/pseudo/unittests/ForestTest.cpp --- a/clang-tools-extra/pseudo/unittests/ForestTest.cpp +++ b/clang-tools-extra/pseudo/unittests/ForestTest.cpp @@ -54,7 +54,7 @@ TEST_F(ForestTest, DumpBasic) { build(R"cpp( - _ := add-expression + _ := add-expression EOF add-expression := id-expression + id-expression id-expression := IDENTIFIER )cpp"); @@ -64,7 +64,7 @@ cook(lex("a + b", clang::LangOptions()), clang::LangOptions()); auto T = Arena.createTerminals(TS); - ASSERT_EQ(T.size(), 3u); + ASSERT_EQ(T.size(), 4u); const auto *Left = &Arena.createSequence( symbol("id-expression"), ruleFor("id-expression"), {&T.front()}); const auto *Right = &Arena.createSequence(symbol("id-expression"), @@ -89,9 +89,9 @@ TEST_F(ForestTest, DumpAmbiguousAndRefs) { build(R"cpp( - _ := type - type := class-type # rule 3 - type := enum-type # rule 4 + _ := type EOF + type := class-type # rule 4 + type := enum-type # rule 5 class-type := shared-type enum-type := shared-type shared-type := IDENTIFIER)cpp"); @@ -100,7 +100,7 @@ const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions()); auto Terminals = Arena.createTerminals(TS); - ASSERT_EQ(Terminals.size(), 1u); + ASSERT_EQ(Terminals.size(), 2u); const auto *SharedType = &Arena.createSequence( symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()}); @@ -109,9 +109,9 @@ const auto *EnumType = &Arena.createSequence( symbol("enum-type"), ruleFor("enum-type"), {SharedType}); const auto *Alternative1 = - &Arena.createSequence(symbol("type"), /*RuleID=*/3, {ClassType}); + &Arena.createSequence(symbol("type"), /*RuleID=*/4, {ClassType}); const auto *Alternative2 = - &Arena.createSequence(symbol("type"), /*RuleID=*/4, {EnumType}); + &Arena.createSequence(symbol("type"), /*RuleID=*/5, {EnumType}); const auto *Type = &Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2}); EXPECT_EQ(Type->dumpRecursive(G), diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -509,7 +509,7 @@ // item `expr := • IDENTIFIER`, and both have different goto states on the // nonterminal `expr`. build(R"bnf( - _ := test + _ := test EOF test := { expr test := { IDENTIFIER @@ -548,7 +548,7 @@ // foo should be reduced first, so that in step 2 we have completed reduces // for test, and form an ambiguous forest node. build(R"bnf( - _ := test + _ := test EOF test := IDENTIFIER test := foo @@ -575,7 +575,7 @@ // - multiple possible recovery rules // - recovery from outer scopes is rejected build(R"bnf( - _ := block + _ := block EOF block := { block [recover=Braces] } block := { numbers [recover=Braces] } @@ -606,14 +606,14 @@ TEST_F(GLRTest, RecoverTerminal) { build(R"bnf( - _ := stmt + _ := stmt EOF stmt := IDENTIFIER ; [recover=Skip] )bnf"); TestLang.Table = LRTable::buildSLR(TestLang.G); TestLang.RecoveryStrategies.try_emplace( extensionID("Skip"), - [](Token::Index Start, const TokenStream &) { return Start + 1; }); + [](Token::Index Start, const TokenStream &) { return Start; }); clang::LangOptions LOptions; TokenStream Tokens = cook(lex("foo", LOptions), LOptions); @@ -630,7 +630,7 @@ // We would not normally reduce `word := IDENTIFIER`, but do so for recovery. build(R"bnf( - _ := sentence + _ := sentence EOF word := IDENTIFIER sentence := word word [recover=AcceptAnyTokenInstead] @@ -652,9 +652,40 @@ "[ 1, end) └─word := \n"); } +TEST_F(GLRTest, RepeatedRecovery) { + // We require multiple steps of recovery at eof and then a reduction in order + // to successfully parse. + build(R"bnf( + _ := function EOF + # FIXME: this forces EOF to be in follow(signature). + # Remove it once we use unconstrained reduction for recovery. + _ := signature EOF + + function := signature body [recover=Skip] + signature := IDENTIFIER params [recover=Skip] + params := ( ) + body := { } + )bnf"); + TestLang.Table = LRTable::buildSLR(TestLang.G); + TestLang.RecoveryStrategies.try_emplace( + extensionID("Skip"), + [](Token::Index Start, const TokenStream &) { return Start; }); + clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("main", LOptions), LOptions); + + const ForestNode &Parsed = + glrParse({Tokens, Arena, GSStack}, id("function"), TestLang); + EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), + "[ 0, end) function := signature body [recover=Skip]\n" + "[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n" + "[ 0, 1) │ ├─IDENTIFIER := tok[0]\n" + "[ 1, 1) │ └─params := \n" + "[ 1, end) └─body := \n"); +} + TEST_F(GLRTest, NoExplicitAccept) { build(R"bnf( - _ := test + _ := test EOF test := IDENTIFIER test test := IDENTIFIER @@ -677,7 +708,7 @@ TEST_F(GLRTest, GuardExtension) { build(R"bnf( - _ := start + _ := start EOF start := IDENTIFIER [guard] )bnf");