diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -143,6 +143,25 @@ void glrReduce(std::vector &Heads, SymbolID Lookahead, const ParseParams &Params, const Language &Lang); +// Heuristically recover from a state where no further parsing is possible. +// +// OldHeads is the parse state at TokenIndex. +// This function consumes zero or more tokens by advancing TokenIndex, +// and places any recovery states created in NewHeads. +// +// On failure, NewHeads is empty and TokenIndex is unchanged. +// +// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens, +// there is a risk of the parser falling into an infinite loop, creating an +// endless sequence of recovery nodes. +// Generally it is safe for recovery to match 0 tokens against sequence symbols +// like `statement-seq`, as the grammar won't permit another statement-seq +// immediately afterwards. However recovery strategies for `statement` should +// consume at least one token, as statements may be adjacent in the input. +void glrRecover(llvm::ArrayRef OldHeads, + unsigned &TokenIndex, const ParseParams &Params, + const Language &Lang, std::vector &NewHeads); + } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h @@ -82,9 +82,12 @@ assert(SID < NumTerminals); return static_cast(SID); } -inline SymbolID tokenSymbol(tok::TokenKind TK) { +inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) { return TokenFlag | static_cast(TK); } +// Error recovery strategies. +// FIXME: these should be provided as extensions instead. 
+enum class RecoveryStrategy : uint8_t { None, Braces }; // An extension is a piece of native code specific to a grammar that modifies // the behavior of annotated rules. One ExtensionID is assigned for each unique @@ -108,7 +111,7 @@ // length to 9 (this is the longest sequence in cxx grammar). static constexpr unsigned SizeBits = 4; static constexpr unsigned MaxElements = 9; - static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit"); + static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit"); static_assert(SizeBits + SymbolBits <= 16, "Must be able to store symbol ID + size efficiently"); @@ -124,6 +127,14 @@ // being set for this rule. ExtensionID Guard = 0; + // Specifies the index within Sequence eligible for error recovery. + // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we + // should recover by finding the matching brace, and forcing stmt-seq to match + // everything between braces. + // For now, only a single strategy at a single point is possible. + uint8_t RecoveryIndex = -1; + RecoveryStrategy Recovery = RecoveryStrategy::None; + llvm::ArrayRef seq() const { return llvm::ArrayRef(Sequence, Size); } diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h @@ -137,8 +137,20 @@ SymbolID Label; }; + // A possible error recovery: choose to match some tokens against a symbol. + // + // e.g. a state that contains + // stmt := { . stmt-seq [recover=braces] } + // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }. + struct Recovery { + StateID Src; // The state we are in when encountering the error. + RecoveryStrategy Strategy; // Heuristic choosing the tokens to match. + SymbolID Result; // The symbol that is produced. 
+ }; + llvm::ArrayRef states() const { return States; } llvm::ArrayRef edges() const { return Edges; } + llvm::ArrayRef recoveries() const { return Recoveries; } llvm::ArrayRef> startStates() const { return StartStates; } @@ -147,12 +159,15 @@ private: LRGraph(std::vector States, std::vector Edges, + std::vector Recoveries, std::vector> StartStates) : States(std::move(States)), Edges(std::move(Edges)), - StartStates(std::move(StartStates)) {} + Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) { + } std::vector States; std::vector Edges; + std::vector Recoveries; std::vector> StartStates; }; diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h @@ -42,7 +42,6 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/Support/Capacity.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" #include #include @@ -68,6 +67,11 @@ using StateID = uint16_t; static constexpr unsigned StateBits = 13; + struct Recovery { + RecoveryStrategy Strategy; + SymbolID Result; + }; + // Returns the state after we reduce a nonterminal. // Expected to be called by LR parsers. // If the nonterminal is invalid here, returns None. @@ -105,6 +109,12 @@ symbolToToken(Terminal)); } + // Looks up available recovery actions if we stopped parsing in this state. + llvm::ArrayRef getRecovery(StateID State) const { + return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State], + Recoveries.data() + RecoveryOffset[State + 1]); + } + // Returns the state from which the LR parser should start to parse the input // tokens as the given StartSymbol. // @@ -147,6 +157,8 @@ llvm::DenseMap> Reduce; // FollowSets[NT] is the set of terminals that can follow the nonterminal. 
std::vector> FollowSets; + // Recovery options available at each state. + std::vector> Recoveries; LRTable build() &&; }; @@ -251,6 +263,11 @@ // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token) // as an index: Nonterminal * NUM_TOKENS + Token. llvm::BitVector FollowSets; + + // Recovery stores all recovery actions from all states. + // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]). + std::vector RecoveryOffset; + std::vector Recoveries; }; } // namespace pseudo diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -25,6 +25,172 @@ namespace clang { namespace pseudo { +namespace { + +Token::Index findRecoveryEndpoint(RecoveryStrategy Strategy, + const GSS::Node *RecoveryNode, + const TokenStream &Tokens) { + assert(Strategy == RecoveryStrategy::Braces); + const ForestNode *LBrace = RecoveryNode->Payload; + assert(LBrace->kind() == ForestNode::Terminal && + LBrace->symbol() == tokenSymbol(tok::l_brace)); + if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair()) + return Tokens.index(*RBrace); + return Token::Invalid; +} + +} // namespace + +void glrRecover(llvm::ArrayRef OldHeads, + unsigned &TokenIndex, const ParseParams &Params, + const Language &Lang, + std::vector &NewHeads) { + LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n"); + // Describes a possibility to recover by forcibly interpreting a range of + // tokens around the cursor as a nonterminal that we expected to see. + struct PlaceholderRecovery { + // The token prior to the nonterminal which is being recovered. + // This is the start of the region we're skipping, so higher Position is better. + Token::Index Position; + // The nonterminal which will be created in order to recover. + SymbolID Symbol; + // The heuristic used to choose the bounds of the nonterminal to recover. 
+ RecoveryStrategy Strategy; + + // The GSS head where we are expecting the recovered nonterminal. + const GSS::Node *RecoveryNode; + // Payload of nodes on the way back from the OldHead to the recovery node. + // These represent the partial parse that is being discarded. + // They should become the children of the opaque recovery node. + // FIXME: internal structure of opaque nodes is not implemented. + // + // There may be multiple paths leading to the same recovery node, we choose + // one arbitrarily. + std::vector DiscardedParse; + }; + std::vector Options; + + // Find recovery options by walking up the stack. + // + // This is similar to exception handling: we walk up the "frames" of nested + // rules being parsed until we find one that has a "handler" which allows us + // to determine the node bounds without parsing it. + // + // Unfortunately there's a significant difference: the stack contains both + // "upward" nodes (ancestor parses) and "leftward" ones. + // e.g. when parsing `{ if (1) ? }` as compound-stmt, the stack contains: + // stmt := IF ( expr ) . stmt - current state, we should recover here! + // stmt := IF ( expr . ) stmt - (left, no recovery here) + // stmt := IF ( . expr ) stmt - left, we should NOT recover here! + // stmt := IF . ( expr ) stmt - (left, no recovery here) + // stmt-seq := . stmt - up, we might recover here + // compound-stmt := { . stmt-seq } - up, we should recover here! + // + // It's not obvious how to avoid collecting "leftward" recovery options. + // I think the distinction is ill-defined after merging items into states. + // For now, we have to take this into account when defining recovery rules. + // (e.g. in the expr recovery above, stay inside the parentheses). + // FIXME: find a more satisfying way to avoid such false recovery. + // FIXME: Add a test for spurious recovery once tests can define strategies. 
+ std::vector Path; + llvm::DenseSet Seen; + auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) { + if (!Seen.insert(N).second) + return; + for (auto Strategy : Lang.Table.getRecovery(N->State)) { + Options.push_back(PlaceholderRecovery{ + NextTok, + Strategy.Result, + Strategy.Strategy, + N, + Path, + }); + LLVM_DEBUG(llvm::dbgs() + << "Option: recover " << Lang.G.symbolName(Strategy.Result) + << " at token " << NextTok << "\n"); + } + Path.push_back(N->Payload); + for (const GSS::Node *Parent : N->parents()) + WalkUp(Parent, N->Payload->startTokenIndex(), WalkUp); + Path.pop_back(); + }; + for (auto *N : OldHeads) + WalkUp(N, TokenIndex, WalkUp); + + // Now we select the option(s) we will use to recover. + // + // We prefer options starting further right, as these discard less code + // (e.g. we prefer to recover inner scopes rather than outer ones). + // The options also need to agree on an endpoint, so the parser has a + // consistent position afterwards. + // + // So conceptually we're sorting by the tuple (start, end), though we avoid + // computing `end` for options that can't be winners. + + // Consider options starting further right first. + // Don't drop the others yet though, we may still use them if preferred fails. + llvm::stable_sort(Options, [&](const auto &L, const auto &R) { + return L.Position > R.Position; + }); + + // We may find multiple winners, but they will have the same range. + llvm::Optional RecoveryRange; + std::vector BestOptions; + for (const PlaceholderRecovery &Option : Options) { + // If this starts further left than options we've already found, then + // we'll never find anything better. Skip computing End for the rest. + if (RecoveryRange && Option.Position < RecoveryRange->Begin) + break; + + auto End = + findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Params.Code); + // Recovery may not take the parse backwards. 
+ if (End == Token::Invalid || End < TokenIndex) + continue; + if (RecoveryRange) { + // If this is worse than our previous options, ignore it. + if (RecoveryRange->End < End) + continue; + // If this is an improvement over our previous options, then drop them. + if (RecoveryRange->End > End) + BestOptions.clear(); + } + // Create recovery nodes and heads for them in the GSS. These may be + // discarded if a better recovery is later found, but this path isn't hot. + RecoveryRange = {Option.Position, End}; + BestOptions.push_back(&Option); + } + + if (BestOptions.empty()) { + LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size() + << " strategies\n"); + return; + } + + // We've settled on a set of recovery options, so create their nodes and + // advance the cursor. + LLVM_DEBUG({ + llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":"; + for (const auto *Option : BestOptions) + llvm::dbgs() << " " << Lang.G.symbolName(Option->Symbol); + llvm::dbgs() << "\n"; + }); + // FIXME: in general, we might have the same Option->Symbol multiple times, + // and we risk creating redundant Forest and GSS nodes. + // We also may inadvertently set up the next glrReduce to create a sequence + // node duplicating an opaque node that we're creating here. + // There are various options, including simply breaking ties between options. + // For now it's obscure enough to ignore. 
+ for (const PlaceholderRecovery *Option : BestOptions) { + const ForestNode &Placeholder = + Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin); + const GSS::Node *NewHead = Params.GSStack.addNode( + *Lang.Table.getGoToState(Option->RecoveryNode->State, Option->Symbol), + &Placeholder, {Option->RecoveryNode}); + NewHeads.push_back(NewHead); + } + TokenIndex = RecoveryRange->End; +} using StateID = LRTable::StateID; @@ -32,8 +198,9 @@ std::vector ParentStates; for (const auto *Parent : N.parents()) ParentStates.push_back(llvm::formatv("{0}", Parent->State)); - OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State, - N.Payload->symbol(), llvm::join(ParentStates, ", ")); + OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State, + N.Payload ? N.Payload->symbol() : 0, + llvm::join(ParentStates, ", ")); return OS; } @@ -420,8 +587,8 @@ } // namespace -const ForestNode &glrParse( const ParseParams &Params, SymbolID StartSymbol, - const Language& Lang) { +const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, + const Language &Lang) { GLRReduce Reduce(Params, Lang); assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal"); llvm::ArrayRef Terminals = Params.Forest.createTerminals(Params.Code); @@ -444,15 +611,29 @@ GSS.gc(std::move(Roots)); }; // Each iteration fully processes a single token. - for (unsigned I = 0; I < Terminals.size(); ++I) { + for (unsigned I = 0; I < Terminals.size();) { LLVM_DEBUG(llvm::dbgs() << llvm::formatv( "Next token {0} (id={1})\n", Lang.G.symbolName(Terminals[I].symbol()), Terminals[I].symbol())); // Consume the token. glrShift(Heads, Terminals[I], Params, Lang, NextHeads); + + // If we weren't able to consume the token, try to skip over some tokens + // so we can keep parsing. + if (NextHeads.empty()) { + // FIXME: Heads may not be fully reduced, because our reductions were + // constrained by lookahead (but lookahead is meaningless to recovery). 
+ glrRecover(Heads, I, Params, Lang, NextHeads); + if (NextHeads.empty()) + // FIXME: Ensure the `_ := start-symbol` rules have a fallback + // error-recovery strategy attached. Then this condition can't happen. + return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); + } else + ++I; + // Form nonterminals containing the token we just consumed. - SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof) - : Terminals[I + 1].symbol(); + SymbolID Lookahead = + I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol(); Reduce(NextHeads, Lookahead); // Prepare for the next token. std::swap(Heads, NextHeads); @@ -461,23 +642,36 @@ } LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n")); + // The parse was successful if we're in state `_ := start-symbol .` auto AcceptState = Lang.Table.getGoToState(StartState, StartSymbol); assert(AcceptState.hasValue() && "goto must succeed after start symbol!"); - const ForestNode *Result = nullptr; - for (const auto *Head : Heads) { - if (Head->State == AcceptState) { - assert(Head->Payload->symbol() == StartSymbol); - assert(Result == nullptr && "multiple results!"); - Result = Head->Payload; + auto SearchForAccept = [&](llvm::ArrayRef Heads) { + const ForestNode *Result = nullptr; + for (const auto *Head : Heads) { + if (Head->State == *AcceptState) { + assert(Head->Payload->symbol() == StartSymbol); + assert(Result == nullptr && "multiple results!"); + Result = Head->Payload; + } } - } - if (Result) + return Result; + }; + if (auto *Result = SearchForAccept(Heads)) + return *Result; + // Failed to parse the input, attempt to run recovery. + // FIXME: this awkwardly repeats the recovery in the loop, when shift fails. + // More elegant is to include EOF in the token stream, and make the + // augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF + // would not be a special case: it shows up as a failure to shift the EOF + // token. 
+ unsigned I = Terminals.size(); + glrRecover(Heads, I, Params, Lang, NextHeads); + Reduce(NextHeads, tokenSymbol(tok::eof)); + if (auto *Result = SearchForAccept(NextHeads)) return *Result; + // We failed to parse the input, returning an opaque forest node for recovery. - // - // FIXME: We will need to invoke our generic error-recovery handlers when we - // reach EOF without reaching accept state, and involving the eof - // token in the above main for-loopmay be the best way to reuse the code). + // FIXME: as above, we can add fallback error handling so this is impossible. return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); } @@ -488,9 +682,10 @@ } const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol, + llvm::ArrayRef Parents) { Node *Result = new (allocate(Parents.size())) - Node({State, GCParity, static_cast(Parents.size())}); + Node({State, GCParity, static_cast(Parents.size())}); Alive.push_back(Result); ++NodesCreated; Result->Payload = Symbol; diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp --- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp @@ -59,8 +59,11 @@ llvm::raw_string_ostream OS(Result); const Rule &R = T->Rules[RID]; OS << symbolName(R.Target) << " :="; - for (SymbolID SID : R.seq()) - OS << " " << symbolName(SID); + for (unsigned I = 0; I < R.Size; ++I) { + OS << " " << symbolName(R.Sequence[I]); + if (R.RecoveryIndex == I) + OS << " [recover=" << static_cast(R.Recovery) << "]"; + } if (R.Guard) OS << " [guard=" << T->AttributeValues[R.Guard] << "]"; return Result; diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp --- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp @@ -106,6 +106,17 @@ assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in 
RuleID bits!"); + // Wherever RHS contains { foo }, mark foo for brace-recovery. + // FIXME: this should be grammar annotations instead. + for (auto &Rule : T->Rules) { + for (unsigned I = 2; I < Rule.Size; ++I) + if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) && + Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) && + !isToken(Rule.Sequence[I - 1])) { + Rule.Recovery = RecoveryStrategy::Braces; + Rule.RecoveryIndex = I - 1; + } + } const auto &SymbolOrder = getTopologicalOrder(T.get()); llvm::stable_sort( T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) { diff --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp @@ -120,6 +120,20 @@ return Results; } +std::vector> +availableRecovery(const State &S, const Grammar &G) { + std::vector> Result; + for (const Item &I : S.Items) { + const auto &Rule = G.lookupRule(I.rule()); + if (I.dot() != Rule.RecoveryIndex) + continue; + Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]}); + } + llvm::sort(Result); + Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); + return Result; +} + } // namespace std::string Item::dump(const Grammar &G) const { @@ -130,9 +144,10 @@ Results.push_back(G.symbolName(SID)); return Results; }; - return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target), + return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target), llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "), - llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " ")) + llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "), + Rule.RecoveryIndex == DotPos ? 
" [recovery]" : "") .str(); } @@ -181,6 +196,11 @@ Edges.push_back({Src, Dst, Label}); } + void insertRecovery(StateID Src, RecoveryStrategy Strategy, + SymbolID Result) { + Recoveries.push_back({Src, Strategy, Result}); + } + // Returns a state with the given id. const State &find(StateID ID) const { assert(ID < States.size()); @@ -194,9 +214,10 @@ LRGraph build() && { States.shrink_to_fit(); Edges.shrink_to_fit(); + Recoveries.shrink_to_fit(); llvm::sort(StartStates); StartStates.shrink_to_fit(); - return LRGraph(std::move(States), std::move(Edges), + return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries), std::move(StartStates)); } @@ -205,6 +226,7 @@ llvm::DenseMap StatesIndex; std::vector States; std::vector Edges; + std::vector Recoveries; const Grammar &G; std::vector> StartStates; } Builder(G); @@ -225,15 +247,16 @@ } while (!PendingStates.empty()) { - auto CurrentStateID = PendingStates.back(); + auto StateID = PendingStates.back(); PendingStates.pop_back(); - for (auto Next : - nextAvailableKernelItems(Builder.find(CurrentStateID), G)) { + for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) { auto Insert = Builder.insert(Next.second); if (Insert.second) // new state, insert to the pending queue. PendingStates.push_back(Insert.first); - Builder.insertEdge(CurrentStateID, Insert.first, Next.first); + Builder.insertEdge(StateID, Insert.first, Next.first); } + for (auto Recovery : availableRecovery(Builder.find(StateID), G)) + Builder.insertRecovery(StateID, Recovery.first, Recovery.second); } return std::move(Builder).build(); } diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp @@ -65,6 +65,25 @@ } Table.ReduceOffset.push_back(Table.Reduces.size()); + // Error recovery entries: sort (no dups already), and build offset lookup. 
+ llvm::sort(Recoveries, [&](const auto &L, const auto &R) { + return std::tie(L.first, L.second.Result, L.second.Strategy) < + std::tie(R.first, R.second.Result, R.second.Strategy); + }); + Table.Recoveries.reserve(Recoveries.size()); + for (const auto &R : Recoveries) + Table.Recoveries.push_back({R.second.Strategy, R.second.Result}); + Table.RecoveryOffset = std::vector(NumStates + 1, 0); + unsigned SortedIndex = 0; + for (StateID State = 0; State < NumStates; ++State) { + Table.RecoveryOffset[State] = SortedIndex; + while (SortedIndex < Recoveries.size() && + Recoveries[SortedIndex].first == State) + SortedIndex++; + } + Table.RecoveryOffset[NumStates] = SortedIndex; + assert(SortedIndex == Recoveries.size()); + return Table; } @@ -74,6 +93,10 @@ Build.StartStates = Graph.startStates(); for (const auto &T : Graph.edges()) Build.Transition.try_emplace({T.Src, T.Label}, T.Dst); + for (const auto &Entry : Graph.recoveries()) + Build.Recoveries.push_back( + {Entry.Src, Recovery{Entry.Strategy, Entry.Result}}); + Build.FollowSets = followSets(G); assert(Graph.states().size() <= (1 << StateBits) && "Graph states execceds the maximum limit!"); // Add reduce actions. 
diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp --- a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp +++ b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp @@ -2,7 +2,7 @@ class Foo { public: }; -// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification } +// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] } // CHECK-NEXT: ├─class-head := class-key class-head-name // CHECK-NEXT: │ ├─class-key~CLASS := tok[0] // CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1] diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp @@ -0,0 +1,13 @@ +// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s +auto x = { complete garbage }; +// CHECK: translation-unit~simple-declaration +// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0] +// CHECK-NEXT: ├─init-declarator-list~init-declarator +// CHECK-NEXT: │ ├─declarator~IDENTIFIER := tok[1] +// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer +// CHECK-NEXT: │ ├─= := tok[2] +// CHECK-NEXT: │ └─initializer-clause~braced-init-list +// CHECK-NEXT: │ ├─{ := tok[3] +// CHECK-NEXT: │ ├─initializer-list := +// CHECK-NEXT: │ └─} := tok[6] +// CHECK-NEXT: └─; := tok[7] diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -7,8 +7,9 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/GLR.h" -#include "clang-pseudo/Token.h" +#include "clang-pseudo/Bracket.h" #include "clang-pseudo/Language.h" +#include "clang-pseudo/Token.h" #include "clang-pseudo/grammar/Grammar.h" 
#include "clang/Basic/LangOptions.h" #include "clang/Basic/TokenKinds.h" @@ -33,11 +34,13 @@ using StateID = LRTable::StateID; using testing::AllOf; using testing::ElementsAre; +using testing::IsEmpty; using testing::UnorderedElementsAre; MATCHER_P(state, StateID, "") { return arg->State == StateID; } MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; } MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; } +MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; } testing::Matcher parents(llvm::ArrayRef Parents) { @@ -247,9 +250,9 @@ /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); const auto *GSSNode2 = GSStack.addNode( /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); - const auto *GSSNode3 = - GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode, - /*Parents=*/{GSSNode1}); + const auto *GSSNode3 = GSStack.addNode( + /*State=*/3, /*ForestNode=*/ClassNameNode, + /*Parents=*/{GSSNode1}); const auto *GSSNode4 = GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode, /*Parents=*/{GSSNode2}); @@ -366,6 +369,120 @@ EXPECT_THAT(Heads, ElementsAre(GSSNode1)); } +TEST_F(GLRTest, Recover) { + // Recovery while parsing "word" inside braces. + // Before: + // 0--1({)--2(?) + // After recovering a `word` at state 1: + // 0--3(word) // 3 is goto(1, word) + buildGrammar({"word"}, {}); + LRTable::Builder B(TestLang.G); + B.Transition[{StateID{1}, id("word")}] = StateID{3}; + B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("word")}}); + TestLang.Table = std::move(B).build(); + + auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); + auto *Question1 = &Arena.createTerminal(tok::question, 1); + const auto *Root = GSStack.addNode(0, nullptr, {}); + const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root}); + const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces}); + + // Need a token stream with paired braces so the strategy works. 
+ clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions); + pairBrackets(Tokens); + std::vector NewHeads; + + unsigned TokenIndex = 2; + glrRecover({AfterQuestion1}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, + NewHeads); + EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace"; + EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")), + parents({OpenedBraces}), start(1u)))); + EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque); + + // Test recovery failure: omit closing brace so strategy fails + TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions); + pairBrackets(NoRBrace); + NewHeads.clear(); + TokenIndex = 2; + glrRecover({AfterQuestion1}, TokenIndex, {NoRBrace, Arena, GSStack}, TestLang, + NewHeads); + EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure"; + EXPECT_THAT(NewHeads, IsEmpty()); +} + +TEST_F(GLRTest, RecoverRightmost) { + // In a nested block structure, we recover at the innermost possible block. + // Before: + // 0--1({)--1({)--1({) + // After recovering a `block` at inside the second braces: + // 0--1({)--2(body) // 2 is goto(1, body) + buildGrammar({"body"}, {}); + LRTable::Builder B(TestLang.G); + B.Transition[{StateID{1}, id("body")}] = StateID{2}; + B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("body")}}); + TestLang.Table = std::move(B).build(); + + clang::LangOptions LOptions; + // Innermost brace is unmatched, to test fallback to next brace. + TokenStream Tokens = cook(lex("{ { { ? 
} }", LOptions), LOptions); + Tokens.tokens()[0].Pair = 5; + Tokens.tokens()[1].Pair = 4; + Tokens.tokens()[4].Pair = 1; + Tokens.tokens()[5].Pair = 0; + + auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0); + auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1); + auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2); + const auto *Root = GSStack.addNode(0, nullptr, {}); + const auto *In1 = GSStack.addNode(1, Brace1, {Root}); + const auto *In2 = GSStack.addNode(1, Brace2, {In1}); + const auto *In3 = GSStack.addNode(1, Brace3, {In2}); + + unsigned TokenIndex = 3; + std::vector NewHeads; + glrRecover({In3}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, NewHeads); + EXPECT_EQ(TokenIndex, 5u); + EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")), + parents({In2}), start(2u)))); +} + +TEST_F(GLRTest, RecoverAlternatives) { + // Recovery inside braces with multiple equally good options + // Before: + // 0--1({) + // After recovering either `word` or `number` inside the braces: + // 0--1({)--2(word) // 2 is goto(1, word) + // └--3(number) // 3 is goto(1, number) + buildGrammar({"number", "word"}, {}); + LRTable::Builder B(TestLang.G); + B.Transition[{StateID{1}, id("number")}] = StateID{2}; + B.Transition[{StateID{1}, id("word")}] = StateID{3}; + B.Recoveries.push_back( + {StateID{1}, {RecoveryStrategy::Braces, id("number")}}); + B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("word")}}); + TestLang.Table = std::move(B).build(); + auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); + const auto *Root = GSStack.addNode(0, nullptr, {}); + const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root}); + + clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("{ ? 
}", LOptions), LOptions); + pairBrackets(Tokens); + std::vector NewHeads; + unsigned TokenIndex = 1; + + glrRecover({OpenedBraces}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, + NewHeads); + EXPECT_EQ(TokenIndex, 2u); + EXPECT_THAT(NewHeads, + UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")), + parents({OpenedBraces}), start(1u)), + AllOf(state(3), parsedSymbolID(id("word")), + parents({OpenedBraces}), start(1u)))); +} + TEST_F(GLRTest, PerfectForestNodeSharing) { // Run the GLR on a simple grammar and test that we build exactly one forest // node per (SymbolID, token range). @@ -435,6 +552,40 @@ "[ 0, end) └─IDENTIFIER := tok[0]\n"); } +TEST_F(GLRTest, RecoveryEndToEnd) { + // Simple example of brace-based recovery showing: + // - recovered region includes tokens both ahead of and behind the cursor + // - multiple possible recovery rules + // - recovery from outer scopes is rejected + build(R"bnf( + _ := block + + block := { block } + block := { numbers } + numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT + )bnf"); + TestLang.Table = LRTable::buildSLR(TestLang.G); + clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions); + pairBrackets(Tokens); + + const ForestNode &Parsed = + glrParse({Tokens, Arena, GSStack}, id("block"), TestLang); + EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), + "[ 0, end) block := { block [recover=1] }\n" + "[ 0, 1) ├─{ := tok[0]\n" + "[ 1, 5) ├─block := \n" + "[ 1, 5) │ ├─block := { block [recover=1] }\n" + "[ 1, 2) │ │ ├─{ := tok[1]\n" + "[ 2, 4) │ │ ├─block := \n" + "[ 4, 5) │ │ └─} := tok[4]\n" + "[ 1, 5) │ └─block := { numbers [recover=1] }\n" + "[ 1, 2) │ ├─{ := tok[1]\n" + "[ 2, 4) │ ├─numbers := \n" + "[ 4, 5) │ └─} := tok[4]\n" + "[ 5, end) └─} := tok[5]\n"); +} + TEST_F(GLRTest, NoExplicitAccept) { build(R"bnf( _ := test