diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
--- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -144,6 +144,26 @@ void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
                const ParseParams &Params);
 
+// Heuristically recover from a state where no further parsing is possible.
+//
+// OldHeads is the parse state at TokenIndex.
+// This function consumes zero or more tokens (advancing TokenIndex),
+// and places any recovery states created in NewHeads.
+//
+// On failure, NewHeads is empty and TokenIndex is advanced by one token.
+//
+// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
+// there is a risk of the parser falling into an infinite loop, creating an
+// endless sequence of recovery nodes.
+// Generally it is safe for recovery to match 0 tokens against sequence symbols
+// like `statement-seq`, as the grammar won't permit another statement-seq
+// immediately afterwards. However recovery strategies for `statement` should
+// consume at least one token, as statements may be adjacent in the input.
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const TokenStream &Tokens,
+                const ParseParams &Params,
+                std::vector<const GSS::Node *> &NewHeads);
+
 } // namespace pseudo
 } // namespace clang
 
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
@@ -81,9 +81,12 @@
   assert(SID < NumTerminals);
   return static_cast<tok::TokenKind>(SID);
 }
-inline SymbolID tokenSymbol(tok::TokenKind TK) {
+inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
   return TokenFlag | static_cast<SymbolID>(TK);
 }
+// Error recovery strategies.
+// FIXME: these should be provided as extensions instead.
+enum class RecoveryStrategy : uint8_t { None, Braces };
 
 // An extension is a piece of native code specific to a grammar that modifies
 // the behavior of annotated rules. One ExtensionID is assigned for each unique
@@ -107,7 +110,7 @@
   // length to 9 (this is the longest sequence in cxx grammar).
   static constexpr unsigned SizeBits = 4;
   static constexpr unsigned MaxElements = 9;
-  static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit");
+  static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
   static_assert(SizeBits + SymbolBits <= 16,
                 "Must be able to store symbol ID + size efficiently");
 
@@ -123,6 +126,13 @@
   // being set for this rule.
   ExtensionID Guard = 0;
 
+  // Specifies the index within Sequence eligible for error recovery.
+  // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
+  // should recover by finding the matching brace, and forcing stmt-seq to
+  // match everything between braces.
+  uint8_t RecoveryIndex = -1;
+  RecoveryStrategy Recovery = RecoveryStrategy::None;
+
   llvm::ArrayRef<SymbolID> seq() const {
     return llvm::ArrayRef<SymbolID>(Sequence, Size);
   }
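The rule annotations above can be read back through the existing Grammar API. A minimal sketch, assuming only the APIs visible in this patch (Grammar::lookupRule, Grammar::symbolName, and the new Rule fields); the helper name dumpRecoveryPoint is hypothetical, not part of the patch:

```cpp
#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/Support/raw_ostream.h"

// Illustrative only: report a rule's recovery point, if it has one.
// For a rule like `stmt := { stmt-seq [recover=Braces] }`, RecoveryIndex is 1
// and seq()[1] is the stmt-seq symbol that recovery may force to cover the
// brace contents.
void dumpRecoveryPoint(const clang::pseudo::Grammar &G,
                       clang::pseudo::RuleID RID) {
  const auto &R = G.lookupRule(RID);
  if (R.Recovery == clang::pseudo::RecoveryStrategy::None)
    return; // No recovery annotation on this rule.
  llvm::errs() << G.symbolName(R.Target) << " recovers "
               << G.symbolName(R.seq()[R.RecoveryIndex]) << " at index "
               << static_cast<unsigned>(R.RecoveryIndex) << "\n";
}
```

Grammar::dumpRule (changed in Grammar.cpp below) prints the same information as a `[recover=...]` suffix on the rule.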
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
@@ -137,8 +137,20 @@
     SymbolID Label;
   };
 
+  // A possible error recovery: choose to match some tokens against a symbol.
+  //
+  // e.g. a state that contains
+  //   stmt := { . stmt-seq [recover=braces] }
+  // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
+  struct Recovery {
+    StateID Src; // The state we are in when encountering the error.
+    RecoveryStrategy Strategy; // Heuristic choosing the tokens to match.
+    SymbolID Result; // The symbol that is produced.
+  };
+
   llvm::ArrayRef<State> states() const { return States; }
   llvm::ArrayRef<Edge> edges() const { return Edges; }
+  llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
   llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
     return StartStates;
   }
@@ -147,12 +159,15 @@ private:
   LRGraph(std::vector<State> States, std::vector<Edge> Edges,
+          std::vector<Recovery> Recoveries,
           std::vector<std::pair<SymbolID, StateID>> StartStates)
       : States(std::move(States)), Edges(std::move(Edges)),
-        StartStates(std::move(StartStates)) {}
+        Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
+  }
 
   std::vector<State> States;
   std::vector<Edge> Edges;
+  std::vector<Recovery> Recoveries;
   std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -123,6 +123,11 @@
     uint16_t Value : ValueBits;
   };
 
+  struct Recovery {
+    RecoveryStrategy Strategy;
+    SymbolID Result;
+  };
+
   // Returns all available actions for the given state on a terminal.
   // Expected to be called by LR parsers.
   llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
@@ -139,6 +144,12 @@
   // Returns empty if no available actions in the table.
   llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
 
+  // Looks up available recovery actions if we stopped parsing in this state.
+  llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
+    return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State],
+                              Recoveries.data() + RecoveryOffset[State + 1]);
+  }
+
   // Returns the state from which the LR parser should start to parse the input
   // tokens as the given StartSymbol.
   //
@@ -169,8 +180,14 @@
     SymbolID Symbol;
     Action Act;
   };
-  // Build a specifid table for testing purposes.
-  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
+  struct RecoveryEntry {
+    StateID State;
+    RecoveryStrategy Strategy;
+    SymbolID Result;
+  };
+  // Build a specified table for testing purposes.
+  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>,
+                               llvm::ArrayRef<RecoveryEntry> = {});
 
 private:
   // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
@@ -188,6 +205,11 @@
   std::vector<Action> Actions;
   // A sorted table, storing the start state for each target parsing symbol.
   std::vector<std::pair<SymbolID, StateID>> StartStates;
+
+  // Recovery stores all recovery actions from all states.
+  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
+  std::vector<uint32_t> RecoveryOffset;
+  std::vector<Recovery> Recoveries;
 };
 
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
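A minimal sketch of how the RecoveryOffset/Recoveries layout behaves through buildForTests and getRecovery, mirroring the unit tests later in this patch; the function recoveryLayoutSketch and the names G and Body are placeholders, not part of the patch:

```cpp
#include "clang-pseudo/grammar/Grammar.h"
#include "clang-pseudo/grammar/LRTable.h"
#include <cassert>

// Illustrative only: state 1 carries a single Braces recovery producing
// `Body`; every other state carries none. RecoveryOffset is a prefix sum of
// size NumStates + 1 over the flat Recoveries array, so getRecovery() is a
// constant-time slice.
void recoveryLayoutSketch(const clang::pseudo::Grammar &G,
                          clang::pseudo::SymbolID Body) {
  using clang::pseudo::LRTable;
  using clang::pseudo::RecoveryStrategy;
  LRTable Table = LRTable::buildForTests(
      G.table(), /*Entries=*/{{/*State=*/1, Body, LRTable::Action::goTo(2)}},
      /*Recoveries=*/{{/*State=*/1, RecoveryStrategy::Braces, Body}});
  assert(Table.getRecovery(/*State=*/0).empty());
  assert(Table.getRecovery(/*State=*/1).size() == 1);
  assert(Table.getRecovery(/*State=*/1).front().Result == Body);
}
```

LRTableBuild.cpp below fills RecoveryOffset in exactly this shape: Recoveries is sorted by source state, and RecoveryOffset[S] .. RecoveryOffset[S+1] brackets the entries for state S.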
diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -24,6 +24,156 @@
 namespace clang {
 namespace pseudo {
 
+namespace {
+
+llvm::Optional<Token::Index>
+findRecoveryEndpoint(RecoveryStrategy Strategy,
+                     const GSS::Node *RecoveryNode,
+                     const TokenStream &Tokens) {
+  assert(Strategy == RecoveryStrategy::Braces);
+  const ForestNode *LBrace = RecoveryNode->Payload;
+  assert(LBrace->kind() == ForestNode::Terminal &&
+         LBrace->symbol() == tokenSymbol(tok::l_brace));
+  if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair())
+    return Tokens.index(*RBrace);
+  return llvm::None;
+}
+
+} // namespace
+
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const TokenStream &Tokens,
+                const ParseParams &Params,
+                std::vector<const GSS::Node *> &NewHeads) {
+  LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
+  // Describes a possibility to recover by forcibly interpreting a range of
+  // tokens around the cursor as a nonterminal that we expected to see.
+  struct PlaceholderRecovery {
+    // The token index where the recovered nonterminal will begin.
+    // This starts the region we're skipping, so higher Position is better.
+    Token::Index Position;
+    // The nonterminal which will be created in order to recover.
+    SymbolID Symbol;
+    // The heuristic used to choose the bounds of the nonterminal to recover.
+    RecoveryStrategy Strategy;
+
+    // The GSS head where we are expecting the recovered nonterminal.
+    const GSS::Node *RecoveryNode;
+    // Payload of nodes on the way back from the OldHead to the recovery node.
+    // These represent the partial parse that is being discarded.
+    // They should become the children of the opaque recovery node.
+    //
+    // There may be multiple paths leading to the same recovery node, we choose
+    // one arbitrarily.
+    std::vector<const ForestNode *> Path;
+  };
+  std::vector<PlaceholderRecovery> Options;
+
+  // Find recovery options by walking up the stack.
+  //
+  // This is similar to exception handling: we walk up the "frames" of nested
+  // rules being parsed until we find one that has a "handler" which allows us
+  // to determine the node bounds without parsing it.
+  //
+  // Unfortunately there's a significant difference: the stack contains both
+  // "upward" nodes (ancestor parses) and "leftward" ones.
+  // e.g. when parsing `int(2 + ?)`, the stack contains:
+  //   expr := expr + . expr    - which we're currently parsing
+  //   expr := type ( . expr )  - (up) we should recover this outer expr
+  //   expr := . type ( expr )  - (up+left) we should not recover this type!
+  //
+  // It's not obvious how to avoid collecting the latter as a recovery option.
+  // I think the distinction is ill-defined after merging items into states.
+  // For now, we have to take this into account when defining recovery rules.
+  // FIXME: find a more satisfying way to avoid such false recovery.
+  std::vector<const ForestNode *> Path;
+  llvm::DenseSet<const GSS::Node *> Seen;
+  auto DFS = [&](const GSS::Node *N, Token::Index NextTok, auto &DFS) {
+    if (!Seen.insert(N).second)
+      return;
+    for (auto Strategy : Params.Table.getRecovery(N->State)) {
+      Options.push_back(PlaceholderRecovery{
+          NextTok,
+          Strategy.Result,
+          Strategy.Strategy,
+          N,
+          Path,
+      });
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Option: recover " << Params.G.symbolName(Strategy.Result)
+                 << " at token " << NextTok << "\n");
+    }
+    Path.push_back(N->Payload);
+    for (const GSS::Node *Parent : N->parents())
+      DFS(Parent, N->Payload->startTokenIndex(), DFS);
+    Path.pop_back();
+  };
+  for (auto *N : llvm::reverse(OldHeads))
+    DFS(N, TokenIndex, DFS);
+
+  // Now we select the option(s) we will use to recover.
+  //
+  // We prefer options starting further right, as these discard less code
+  // (e.g. we prefer to recover inner scopes rather than outer ones).
+  // The options also need to agree on an endpoint, so the parser has a
+  // consistent position afterwards.
+  //
+  // So conceptually we're sorting by the tuple (start, end), though we avoid
+  // computing `end` for options that can't be winners.
+
+  // Consider options starting further right first.
+  // Don't drop the others yet though, we may still use them if preferred fails.
+  llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
+    return L.Position > R.Position;
+  });
+
+  assert(NewHeads.empty()); // We may repeatedly populate and clear it.
+  llvm::Optional<Token::Range> RecoveryRange;
+  for (const PlaceholderRecovery &Option : Options) {
+    // If this starts further left than options we've already found, then
+    // we'll never find anything better. Skip computing End for the rest.
+    if (RecoveryRange && Option.Position < RecoveryRange->Begin)
+      break;
+
+    auto End =
+        findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Tokens);
+    // Only consider recovery that advances the parse.
+    if (!End || *End <= TokenIndex)
+      continue;
+    if (RecoveryRange) {
+      // If this is worse than our previous options, ignore it.
+      if (RecoveryRange->End < *End)
+        continue;
+      // If this is an improvement over our previous options, then drop them.
+      if (RecoveryRange->End > *End)
+        NewHeads.clear();
+    }
+    // Create recovery nodes and heads for them in the GSS. These may be
+    // discarded if a better recovery is later found, but this path isn't hot.
+    RecoveryRange = {Option.Position, *End};
+    const ForestNode &Placeholder =
+        Params.Forest.createOpaque(Option.Symbol, Option.Position);
+    const GSS::Node *NewHead = Params.GSStack.addNode(
+        Params.Table.getGoToState(Option.RecoveryNode->State, Option.Symbol),
+        &Placeholder, {Option.RecoveryNode});
+    NewHeads.push_back(NewHead);
+  }
+
+  // Advance the cursor, whether recovery succeeded or not.
+  if (RecoveryRange) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
+      for (const auto *Head : NewHeads)
+        llvm::dbgs() << " " << Params.G.symbolName(Head->Payload->symbol());
+      llvm::dbgs() << "\n";
+    });
+    TokenIndex = RecoveryRange->End;
+  } else {
+    LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
                            << " strategies\n");
+    ++TokenIndex;
+  }
+}
 
 using StateID = LRTable::StateID;
 
@@ -31,8 +181,9 @@
   std::vector<std::string> ParentStates;
   for (const auto *Parent : N.parents())
     ParentStates.push_back(llvm::formatv("{0}", Parent->State));
-  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
-                      N.Payload->symbol(), llvm::join(ParentStates, ", "));
+  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
+                      N.Payload ? N.Payload->symbol() : 0,
+                      llvm::join(ParentStates, ", "));
   return OS;
 }
 
@@ -426,15 +577,27 @@
     GSS.gc(std::move(Roots));
   };
   // Each iteration fully processes a single token.
-  for (unsigned I = 0; I < Terminals.size(); ++I) {
+  for (unsigned I = 0; I < Terminals.size();) {
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
                    "Next token {0} (id={1})\n",
                    G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
     // Consume the token.
     glrShift(Heads, Terminals[I], Params, NextHeads);
+
+    // If we weren't able to consume the token, try to skip over some tokens
+    // so we can keep parsing.
+    if (NextHeads.empty()) {
+      glrRecover(Heads, I, Tokens, Params, NextHeads);
+      if (NextHeads.empty())
+        // FIXME: Ensure the `_ := start-symbol` rules have a fallback
+        // error-recovery strategy attached. Then this condition can't happen.
+        return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
+    } else
+      ++I;
+
     // Form nonterminals containing the token we just consumed.
-    SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof)
-                                                   : Terminals[I + 1].symbol();
+    SymbolID Lookahead =
+        I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
     Reduce(NextHeads, Lookahead);
     // Prepare for the next token.
     std::swap(Heads, NextHeads);
@@ -443,22 +606,35 @@
   }
   LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
 
+  // The parse was successful if we're in state `_ := start-symbol .`
   StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
-  const ForestNode *Result = nullptr;
-  for (const auto *Head : Heads) {
-    if (Head->State == AcceptState) {
-      assert(Head->Payload->symbol() == StartSymbol);
-      assert(Result == nullptr && "multiple results!");
-      Result = Head->Payload;
+  auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
+    const ForestNode *Result = nullptr;
+    for (const auto *Head : Heads) {
+      if (Head->State == AcceptState) {
+        assert(Head->Payload->symbol() == StartSymbol);
+        assert(Result == nullptr && "multiple results!");
+        Result = Head->Payload;
+      }
     }
-  }
-  if (Result)
+    return Result;
+  };
+  if (auto *Result = SearchForAccept(Heads))
     return *Result;
+
+  // Failed to parse the input, attempt to run recovery.
+  // FIXME: this awkwardly repeats the recovery in the loop when shift fails.
+  // More elegant is to include EOF in the token stream, and make the
+  // augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
+  // would not be a special case: it would show up as a failure to shift the
+  // EOF token.
+ unsigned I = Terminals.size(); + glrRecover(Heads, I, Tokens, Params, NextHeads); + Reduce(NextHeads, tokenSymbol(tok::eof)); + if (auto *Result = SearchForAccept(NextHeads)) + return *Result; + // We failed to parse the input, returning an opaque forest node for recovery. - // - // FIXME: We will need to invoke our generic error-recovery handlers when we - // reach EOF without reaching accept state, and involving the eof - // token in the above main for-loopmay be the best way to reuse the code). + // FIXME: as above, we can add fallback error handling so this is impossible. return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); } @@ -469,9 +645,10 @@ } const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol, + llvm::ArrayRef Parents) { Node *Result = new (allocate(Parents.size())) - Node({State, GCParity, static_cast(Parents.size())}); + Node({State, GCParity, static_cast(Parents.size())}); Alive.push_back(Result); ++NodesCreated; Result->Payload = Symbol; diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp --- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp @@ -59,8 +59,11 @@ llvm::raw_string_ostream OS(Result); const Rule &R = T->Rules[RID]; OS << symbolName(R.Target) << " :="; - for (SymbolID SID : R.seq()) - OS << " " << symbolName(SID); + for (unsigned I = 0; I < R.Size; ++I) { + OS << " " << symbolName(R.Sequence[I]); + if (R.RecoveryIndex == I) + OS << " [recover=" << static_cast(R.Recovery) << "]"; + } if (R.Guard) OS << " [guard=" << T->AttributeValues[R.Guard] << "]"; return Result; diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp --- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp @@ -106,6 +106,17 @@ assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in RuleID bits!"); + // Wherever RHS contains { foo }, mark foo for brace-recovery. + // FIXME: this should be grammar annotations instead. 
+ for (auto &Rule : T->Rules) { + for (unsigned I = 2; I < Rule.Size; ++I) + if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) && + Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) && + !isToken(Rule.Sequence[I - 1])) { + Rule.Recovery = RecoveryStrategy::Braces; + Rule.RecoveryIndex = I - 1; + } + } const auto &SymbolOrder = getTopologicalOrder(T.get()); llvm::stable_sort( T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) { diff --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp @@ -120,6 +120,20 @@ return Results; } +std::vector> +availableRecovery(const State &S, const Grammar &G) { + std::vector> Result; + for (const Item &I : S.Items) { + const auto &Rule = G.lookupRule(I.rule()); + if (I.dot() != Rule.RecoveryIndex) + continue; + Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]}); + } + llvm::sort(Result); + Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); + return Result; +} + } // namespace std::string Item::dump(const Grammar &G) const { @@ -130,9 +144,10 @@ Results.push_back(G.symbolName(SID)); return Results; }; - return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target), + return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target), llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "), - llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " ")) + llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "), + Rule.RecoveryIndex == DotPos ? " [recovery]" : "") .str(); } @@ -181,6 +196,11 @@ Edges.push_back({Src, Dst, Label}); } + void insertRecovery(StateID Src, RecoveryStrategy Strategy, + SymbolID Result) { + Recoveries.push_back({Src, Strategy, Result}); + } + // Returns a state with the given id. const State &find(StateID ID) const { assert(ID < States.size()); @@ -194,9 +214,10 @@ LRGraph build() && { States.shrink_to_fit(); Edges.shrink_to_fit(); + Recoveries.shrink_to_fit(); llvm::sort(StartStates); StartStates.shrink_to_fit(); - return LRGraph(std::move(States), std::move(Edges), + return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries), std::move(StartStates)); } @@ -205,6 +226,7 @@ llvm::DenseMap StatesIndex; std::vector States; std::vector Edges; + std::vector Recoveries; const Grammar &G; std::vector> StartStates; } Builder(G); @@ -225,15 +247,16 @@ } while (!PendingStates.empty()) { - auto CurrentStateID = PendingStates.back(); + auto StateID = PendingStates.back(); PendingStates.pop_back(); - for (auto Next : - nextAvailableKernelItems(Builder.find(CurrentStateID), G)) { + for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) { auto Insert = Builder.insert(Next.second); if (Insert.second) // new state, insert to the pending queue. 
PendingStates.push_back(Insert.first); - Builder.insertEdge(CurrentStateID, Insert.first, Next.first); + Builder.insertEdge(StateID, Insert.first, Next.first); } + for (auto Recovery : availableRecovery(Builder.find(StateID), G)) + Builder.insertRecovery(StateID, Recovery.first, Recovery.second); } return std::move(Builder).build(); } diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp @@ -10,6 +10,7 @@ #include "clang-pseudo/grammar/LRGraph.h" #include "clang-pseudo/grammar/LRTable.h" #include "clang/Basic/TokenKinds.h" +#include "llvm/Support/raw_ostream.h" #include namespace llvm { @@ -40,8 +41,9 @@ class LRTable::Builder { public: - Builder(llvm::ArrayRef> StartStates) - : StartStates(StartStates) {} + Builder(llvm::ArrayRef Recoveries, + llvm::ArrayRef> StartStates) + : Recoveries(Recoveries), StartStates(StartStates) {} bool insert(Entry E) { return Entries.insert(std::move(E)).second; } LRTable build(const GrammarTable >, unsigned NumStates) && { @@ -86,16 +88,42 @@ ++SortedIndex; } Table.StartStates = std::move(StartStates); + + // Error recovery entries: sort (no dups already), and build offset lookup. + llvm::sort(Recoveries, + [&](const LRGraph::Recovery &L, const LRGraph::Recovery &R) { + return std::tie(L.Src, L.Result, L.Strategy) < + std::tie(R.Src, R.Result, R.Strategy); + }); + Table.Recoveries.reserve(Recoveries.size()); + for (const auto &R : Recoveries) + Table.Recoveries.push_back({R.Strategy, R.Result}); + Table.RecoveryOffset = std::vector(NumStates + 1, 0); + SortedIndex = 0; + for (StateID State = 0; State < NumStates; ++State) { + Table.RecoveryOffset[State] = SortedIndex; + while (SortedIndex < Recoveries.size() && + Recoveries[SortedIndex].Src == State) + SortedIndex++; + } + Table.RecoveryOffset[NumStates] = SortedIndex; + assert(SortedIndex == Recoveries.size()); + return Table; } private: llvm::DenseSet Entries; + std::vector Recoveries; std::vector> StartStates; }; LRTable LRTable::buildForTests(const GrammarTable >, - llvm::ArrayRef Entries) { + llvm::ArrayRef Entries, + llvm::ArrayRef Recoveries) { + std::vector GRecoveries; + for (const auto &R : Recoveries) + GRecoveries.push_back({R.State, R.Strategy, R.Result}); StateID MaxState = 0; for (const auto &Entry : Entries) { MaxState = std::max(MaxState, Entry.State); @@ -104,7 +132,7 @@ if (Entry.Act.kind() == LRTable::Action::GoTo) MaxState = std::max(MaxState, Entry.Act.getGoToState()); } - Builder Build({}); + Builder Build(GRecoveries, {}); for (const Entry &E : Entries) Build.insert(E); return std::move(Build).build(GT, /*NumStates=*/MaxState + 1); @@ -112,7 +140,7 @@ LRTable LRTable::buildSLR(const Grammar &G) { auto Graph = LRGraph::buildLR0(G); - Builder Build(Graph.startStates()); + Builder Build(Graph.recoveries(), Graph.startStates()); for (const auto &T : Graph.edges()) { Action Act = isToken(T.Label) ? 
Action::shift(T.Dst) : Action::goTo(T.Dst); Build.insert({T.Src, T.Label, Act}); diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp --- a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp +++ b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp @@ -2,7 +2,7 @@ class Foo { public: }; -// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification } +// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] } // CHECK-NEXT: ├─class-head := class-key class-head-name // CHECK-NEXT: │ ├─class-key~CLASS := tok[0] // CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1] diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp @@ -0,0 +1,13 @@ +// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s +auto x = { complete garbage }; +// CHECK: translation-unit~simple-declaration +// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0] +// CHECK-NEXT: ├─init-declarator-list~init-declarator +// CHECK-NEXT: │ ├─declarator~IDENTIFIER := tok[1] +// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer +// CHECK-NEXT: │ ├─= := tok[2] +// CHECK-NEXT: │ └─initializer-clause~braced-init-list +// CHECK-NEXT: │ ├─{ := tok[3] +// CHECK-NEXT: │ ├─initializer-list := +// CHECK-NEXT: │ └─} := tok[6] +// CHECK-NEXT: └─; := tok[7] diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/GLR.h" +#include "clang-pseudo/Bracket.h" #include "clang-pseudo/Token.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang/Basic/LangOptions.h" @@ -31,11 +32,13 @@ using Action = LRTable::Action; using testing::AllOf; +using testing::IsEmpty; using testing::UnorderedElementsAre; MATCHER_P(state, StateID, "") { return arg->State == StateID; } MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; } MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; } +MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; } testing::Matcher parents(llvm::ArrayRef Parents) { @@ -101,8 +104,8 @@ // 0---1---4 // └---2---┘ // └---3---5 - auto *GSSNode0 = - GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); + auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, + /*Parents=*/{}); auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0}); auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr, @@ -151,8 +154,8 @@ Action::reduce(ruleFor("enum-name"))}, }); - const auto *GSSNode0 = - GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); + const auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, + /*Parents=*/{}); const auto *GSSNode1 = GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0}); @@ -180,8 +183,8 @@ auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0); auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0); - const auto *GSSNode2 = - GSStack.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, 
/*Parents=*/{}); + const auto *GSSNode2 = GSStack.addNode( + /*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{}); const auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{}); const auto *GSSNode4 = GSStack.addNode( @@ -227,15 +230,17 @@ auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1); auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1); - const auto *GSSNode0 = - GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); + const auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, + /*Parents=*/{}); const auto *GSSNode1 = GSStack.addNode( - /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); + /*State=*/1, /*ForestNode=*/CVQualifierNode, + /*Parents=*/{GSSNode0}); const auto *GSSNode2 = GSStack.addNode( - /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); - const auto *GSSNode3 = - GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode, - /*Parents=*/{GSSNode1}); + /*State=*/2, /*ForestNode=*/CVQualifierNode, + /*Parents=*/{GSSNode0}); + const auto *GSSNode3 = GSStack.addNode( + /*State=*/3, /*ForestNode=*/ClassNameNode, + /*Parents=*/{GSSNode1}); const auto *GSSNode4 = GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode, /*Parents=*/{GSSNode2}); @@ -283,20 +288,20 @@ auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0); auto *StartTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1); - const auto *GSSNode0 = - GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); - const auto *GSSNode1 = - GSStack.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode, - /*Parents=*/{GSSNode0}); + const auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, + /*Parents=*/{}); + const auto *GSSNode1 = GSStack.addNode( + /*State=*/1, /*ForestNode=*/ClassNameNode, + /*Parents=*/{GSSNode0}); const auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode, /*Parents=*/{GSSNode0}); - const auto *GSSNode3 = - GSStack.addNode(/*State=*/3, /*ForestNode=*/StartTerminal, - /*Parents=*/{GSSNode1}); - const auto *GSSNode4 = - GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal, - /*Parents=*/{GSSNode2}); + const auto *GSSNode3 = GSStack.addNode( + /*State=*/3, /*ForestNode=*/StartTerminal, + /*Parents=*/{GSSNode1}); + const auto *GSSNode4 = GSStack.addNode( + /*State=*/4, /*ForestNode=*/StartTerminal, + /*Parents=*/{GSSNode2}); // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! LRTable Table = LRTable::buildForTests( @@ -325,6 +330,122 @@ "[ 1, end) └─* := tok[1]\n"); } +TEST_F(GLRTest, Recover) { + // Recovery while parsing "word" inside braces. + // Before: + // 0--1({)--2(?) + // After recovering a `word` at state 1: + // 0--3(word) // 3 is goto(1, word) + buildGrammar({"word"}, {}); + LRTable Table = LRTable::buildForTests( + G->table(), {{/*State=*/1, id("word"), Action::goTo(3)}}, + /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}}); + + auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); + auto *Question1 = &Arena.createTerminal(tok::question, 1); + const auto *Root = GSStack.addNode(0, nullptr, {}); + const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root}); + const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces}); + + // Need a token stream with paired braces so the strategy works. + clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("{ ? ? ? 
}", LOptions), LOptions); + pairBrackets(Tokens); + std::vector NewHeads; + + unsigned TokenIndex = 2; + glrRecover({AfterQuestion1}, TokenIndex, Tokens, {*G, Table, Arena, GSStack}, + NewHeads); + EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace"; + EXPECT_THAT(NewHeads, ElementsAre( + AllOf(state(3), parsedSymbolID(id("word")), + parents({OpenedBraces}), start(1u)))); + EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque); + + // Test recovery failure: omit closing brace so strategy fails + TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions); + pairBrackets(NoRBrace); + NewHeads.clear(); + TokenIndex = 2; + glrRecover({AfterQuestion1}, TokenIndex, NoRBrace, + {*G, Table, Arena, GSStack}, NewHeads); + EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default"; + EXPECT_THAT(NewHeads, IsEmpty()); +} + +TEST_F(GLRTest, RecoverRightmost) { + // In a nested block structure, we recover at the innermost possible block. + // Before: + // 0--1({)--1({)--1({) + // After recovering a `block` at inside the second braces: + // 0--1({)--2(body) // 2 is goto(1, body) + buildGrammar({"body"}, {}); + LRTable Table = LRTable::buildForTests( + G->table(), {{/*State=*/1, id("body"), Action::goTo(2)}}, + /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}}); + + clang::LangOptions LOptions; + // Innermost brace is unmatched, to test fallback to next brace. + TokenStream Tokens = cook(lex("{ { { ? ? } }", LOptions), LOptions); + Tokens.tokens()[0].Pair = 5; + Tokens.tokens()[1].Pair = 4; + Tokens.tokens()[4].Pair = 1; + Tokens.tokens()[5].Pair = 0; + + auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0); + auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1); + auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2); + const auto *Root = GSStack.addNode(0, nullptr, {}); + const auto *In1 = GSStack.addNode(1, Brace1, {Root}); + const auto *In2 = GSStack.addNode(1, Brace2, {In1}); + const auto *In3 = GSStack.addNode(1, Brace3, {In2}); + + unsigned TokenIndex = 3; + std::vector NewHeads; + glrRecover({In3}, TokenIndex, Tokens, {*G, Table, Arena, GSStack}, NewHeads); + EXPECT_EQ(TokenIndex, 5u); + EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")), + parents({In2}), start(2u)))); +} + +TEST_F(GLRTest, RecoverAlternatives) { + // Recovery inside braces with multiple equally good options + // Before: + // 0--1({) + // After recovering either `word` or `number` inside the braces: + // 0--1({)--2(word) // 2 is goto(1, word) + // └--3(number) // 3 is goto(1, number) + buildGrammar({"number", "word"}, {}); + LRTable Table = LRTable::buildForTests( + G->table(), + { + {/*State=*/1, id("number"), Action::goTo(2)}, + {/*State=*/1, id("word"), Action::goTo(3)}, + }, + /*Recovery=*/{ + {/*State=*/1, RecoveryStrategy::Braces, id("number")}, + {/*State=*/1, RecoveryStrategy::Braces, id("word")}, + }); + auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); + const auto *Root = GSStack.addNode(0, nullptr, {}); + const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root}); + + clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("{ ? 
}", LOptions), LOptions); + pairBrackets(Tokens); + std::vector NewHeads; + unsigned TokenIndex = 1; + + glrRecover({OpenedBraces}, TokenIndex, Tokens, {*G, Table, Arena, GSStack}, + NewHeads); + EXPECT_EQ(TokenIndex, 2u); + EXPECT_THAT(NewHeads, + UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")), + parents({OpenedBraces}), start(1u)), + AllOf(state(3), parsedSymbolID(id("word")), + parents({OpenedBraces}), start(1u)))); +} + TEST_F(GLRTest, PerfectForestNodeSharing) { // Run the GLR on a simple grammar and test that we build exactly one forest // node per (SymbolID, token range). @@ -395,6 +516,40 @@ "[ 0, end) └─IDENTIFIER := tok[0]\n"); } +TEST_F(GLRTest, RecoveryEndToEnd) { + // Simple example of brace-based recovery showing: + // - recovered region includes tokens both ahead of and behind the cursor + // - multiple possible recovery rules + // - recovery from outer scopes is rejected + build(R"bnf( + _ := block + + block := { block } + block := { numbers } + numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT + )bnf"); + auto LRTable = LRTable::buildSLR(*G); + clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions); + pairBrackets(Tokens); + + const ForestNode &Parsed = + glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("block")); + EXPECT_EQ(Parsed.dumpRecursive(*G), + "[ 0, end) block := { block [recover=1] }\n" + "[ 0, 1) ├─{ := tok[0]\n" + "[ 1, 5) ├─block := \n" + "[ 1, 5) │ ├─block := { block [recover=1] }\n" + "[ 1, 2) │ │ ├─{ := tok[1]\n" + "[ 2, 4) │ │ ├─block := \n" + "[ 4, 5) │ │ └─} := tok[4]\n" + "[ 1, 5) │ └─block := { numbers [recover=1] }\n" + "[ 1, 2) │ ├─{ := tok[1]\n" + "[ 2, 4) │ ├─numbers := \n" + "[ 4, 5) │ └─} := tok[4]\n" + "[ 5, end) └─} := tok[5]\n"); +} + TEST_F(GLRTest, NoExplicitAccept) { build(R"bnf( _ := test