diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h @@ -105,7 +105,11 @@ bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const { assert(isToken(Terminal)); assert(isNonterminal(Nonterminal)); - return FollowSets.test(tok::NUM_TOKENS * Nonterminal + + // tok::unknown is a sentinel value used in recovery: can follow anything. + if (tok::unknown) + return true; + return Terminal == tokenSymbol(tok::unknown) || + FollowSets.test(tok::NUM_TOKENS * Nonterminal + symbolToToken(Terminal)); } diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -597,6 +597,10 @@ std::vector Heads = {GSS.addNode(/*State=*/StartState, /*ForestNode=*/nullptr, {})}; + // Invariant: Heads is partitioned by source: {shifted | reduced}. + // HeadsPartition is the index of the first head formed by reduction. + // We use this to discard and recreate the reduced heads during recovery. + unsigned HeadsPartition = 0; std::vector NextHeads; auto MaybeGC = [&, Roots(std::vector{}), I(0u)]() mutable { assert(NextHeads.empty() && "Running GC at the wrong time!"); @@ -619,8 +623,17 @@ // If we weren't able to consume the token, try to skip over some tokens // so we can keep parsing. if (NextHeads.empty()) { - // FIXME: Heads may not be fully reduced, because our reductions were - // constrained by lookahead (but lookahead is meaningless to recovery). + // The reduction in the previous round was constrained by lookahead. + // On valid code this only rejects dead ends, but on broken code we should + // consider all possibilities. + // + // We discard all heads formed by reduction, and recreate them without + // this constraint. This may duplicate some nodes, but it's rare. + LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. " + "Re-reducing without lookahead."); + Heads.resize(HeadsPartition); + Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown)); + glrRecover(Heads, I, Params, Lang, NextHeads); if (NextHeads.empty()) // FIXME: Ensure the `_ := start-symbol` rules have a fallback @@ -632,6 +645,7 @@ // Form nonterminals containing the token we just consumed. SymbolID Lookahead = I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol(); + HeadsPartition = NextHeads.size(); Reduce(NextHeads, Lookahead); // Prepare for the next token. std::swap(Heads, NextHeads); diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -604,6 +604,33 @@ "[ 5, end) └─} := tok[5]\n"); } +TEST_F(GLRTest, RecoverUnrestrictedReduce) { + // Here, ! is not in any rule and therefore not in the follow set of `word`. + // We would not normally reduce `word := IDENTIFIER`, but do so for recovery. + + build(R"bnf( + _ := sentence + + word := IDENTIFIER + sentence := word word [recover=AcceptAnyTokenInstead] + )bnf"); + + clang::LangOptions LOptions; + const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions); + TestLang.Table = LRTable::buildSLR(TestLang.G); + TestLang.RecoveryStrategies.try_emplace( + extensionID("AcceptAnyTokenInstead"), + [](Token::Index Start, const TokenStream &Stream) { return Start + 1; }); + + const ForestNode &Parsed = + glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang); + EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), + "[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n" + "[ 0, 1) ├─word := IDENTIFIER\n" + "[ 0, 1) │ └─IDENTIFIER := tok[0]\n" + "[ 1, end) └─word := \n"); +} + TEST_F(GLRTest, NoExplicitAccept) { build(R"bnf( _ := test