diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h @@ -9,6 +9,7 @@ #ifndef CLANG_PSEUDO_GRAMMAR_LANGUAGE_H #define CLANG_PSEUDO_GRAMMAR_LANGUAGE_H +#include "clang-pseudo/Token.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRTable.h" @@ -28,13 +29,25 @@ using RuleGuard = llvm::function_ref RHS, const TokenStream &)>; +// A recovery strategy determines a region of code to skip when parsing fails. +// +// For example, given `class-def := CLASS IDENT { body [recover=Brackets] }`, +// if parsing fails while attempting to parse `body`, we may skip up to the +// matching `}` and assume everything between was a `body`. +// +// The provided index is the token where the skipped region begins. +// Returns the (excluded) end of the range, or Token::Invalid for no recovery. +using RecoveryStrategy = + llvm::function_ref; + // Specify a language that can be parsed by the pseduoparser. struct Language { Grammar G; LRTable Table; - // Binding "guard" extension id to a piece of C++ code. + // Binding extension ids to corresponding implementations. llvm::DenseMap Guards; + llvm::DenseMap RecoveryStrategies; // FIXME: add clang::LangOptions. // FIXME: add default start symbols. diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h @@ -85,9 +85,6 @@ inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) { return TokenFlag | static_cast(TK); } -// Error recovery strategies. -// FIXME: these should be provided as extensions instead. -enum class RecoveryStrategy : uint8_t { None, Braces }; // An extension is a piece of native code specific to a grammar that modifies // the behavior of annotated rules. One ExtensionID is assigned for each unique @@ -133,7 +130,7 @@ // everything between braces. // For now, only a single strategy at a single point is possible. uint8_t RecoveryIndex = -1; - RecoveryStrategy Recovery = RecoveryStrategy::None; + ExtensionID Recovery = 0; llvm::ArrayRef seq() const { return llvm::ArrayRef(Sequence, Size); diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h @@ -144,7 +144,7 @@ // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }. struct Recovery { StateID Src; // The state we are in when encountering the error. - RecoveryStrategy Strategy; // Heuristic choosing the tokens to match. + ExtensionID Strategy; // Heuristic choosing the tokens to match. SymbolID Result; // The symbol that is produced. }; diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h @@ -68,7 +68,7 @@ static constexpr unsigned StateBits = 13; struct Recovery { - RecoveryStrategy Strategy; + ExtensionID Strategy; SymbolID Result; }; diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -27,15 +27,13 @@ namespace pseudo { namespace { -Token::Index findRecoveryEndpoint(RecoveryStrategy Strategy, - const GSS::Node *RecoveryNode, - const TokenStream &Tokens) { - assert(Strategy == RecoveryStrategy::Braces); - const ForestNode *LBrace = RecoveryNode->Payload; - assert(LBrace->kind() == ForestNode::Terminal && - LBrace->symbol() == tokenSymbol(tok::l_brace)); - if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair()) - return Tokens.index(*RBrace); +Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin, + const TokenStream &Tokens, + const Language &Lang) { + assert(Strategy != 0); + assert(Begin > 0); + if (auto S = Lang.RecoveryStrategies.lookup(Strategy)) + return S(Begin, Tokens); return Token::Invalid; } @@ -55,7 +53,7 @@ // The nonterminal which will be created in order to recover. SymbolID Symbol; // The heuristic used to choose the bounds of the nonterminal to recover. - RecoveryStrategy Strategy; + ExtensionID Strategy; // The GSS head where we are expecting the recovered nonterminal. const GSS::Node *RecoveryNode; @@ -142,8 +140,8 @@ if (RecoveryRange && Option.Position < RecoveryRange->Begin) break; - auto End = - findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Params.Code); + auto End = findRecoveryEndpoint(Option.Strategy, Option.Position, + Params.Code, Lang); // Recovery may not take the parse backwards. if (End == Token::Invalid || End < TokenIndex) continue; diff --git a/clang-tools-extra/pseudo/lib/cli/CLI.cpp b/clang-tools-extra/pseudo/lib/cli/CLI.cpp --- a/clang-tools-extra/pseudo/lib/cli/CLI.cpp +++ b/clang-tools-extra/pseudo/lib/cli/CLI.cpp @@ -40,8 +40,12 @@ for (const auto &Diag : Diags) llvm::errs() << Diag << "\n"; auto Table = LRTable::buildSLR(G); - return new Language{std::move(G), std::move(Table), - llvm::DenseMap()}; + return new Language{ + std::move(G), + std::move(Table), + llvm::DenseMap(), + llvm::DenseMap(), + }; }(); return *Lang; } diff --git a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp --- a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp +++ b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp @@ -35,9 +35,24 @@ } llvm::DenseMap buildGuards() { - return llvm::DenseMap( - {{(ExtensionID)Extension::Override, guardOverride}, - {(ExtensionID)Extension::Final, guardFinal}}); + return { + {(ExtensionID)Extension::Override, guardOverride}, + {(ExtensionID)Extension::Final, guardFinal}, + }; +} + +Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) { + assert(Begin > 0); + const Token &Left = Tokens.tokens()[Begin - 1]; + if (const Token *Right = Left.pair()) + return Tokens.index(*Right); + return Token::Invalid; +} + +llvm::DenseMap buildRecoveryStrategies() { + return { + {(ExtensionID)Extension::Brackets, recoverBrackets}, + }; } } // namespace @@ -48,8 +63,12 @@ auto G = Grammar::parseBNF(CXXBNF, Diags); assert(Diags.empty()); LRTable Table = LRTable::buildSLR(G); - const Language *PL = - new Language{std::move(G), std::move(Table), buildGuards()}; + const Language *PL = new Language{ + std::move(G), + std::move(Table), + buildGuards(), + buildRecoveryStrategies(), + }; return *PL; }(); return CXXLanguage; diff --git a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf --- a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf +++ b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf @@ -287,7 +287,7 @@ labeled-statement := CASE constant-expression : statement labeled-statement := DEFAULT : statement expression-statement := expression_opt ; -compound-statement := { statement-seq_opt } +compound-statement := { statement-seq_opt [recover=Brackets] } statement-seq := statement statement-seq := statement-seq statement selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement @@ -458,7 +458,9 @@ initializer-clause := braced-init-list #! Allow mixed designated/non-designated init-list. # This is standard C, and accepted by clang and others as an extension. -braced-init-list := { initializer-list ,_opt } +# FIXME: Decouple recovery from is-there-a-trailing-comma! +braced-init-list := { initializer-list [recover=Brackets] } +braced-init-list := { initializer-list , } braced-init-list := { } initializer-list := initializer-list-item initializer-list := initializer-list , initializer-list-item @@ -533,7 +535,7 @@ private-module-fragment := module-keyword : PRIVATE ; declaration-seq_opt # gram.class -class-specifier := class-head { member-specification_opt } +class-specifier := class-head { member-specification_opt [recover=Brackets] } class-head := class-key class-head-name class-virt-specifier_opt base-clause_opt class-head := class-key base-clause_opt class-head-name := nested-name-specifier_opt class-name diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp --- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp @@ -62,7 +62,7 @@ for (unsigned I = 0; I < R.Size; ++I) { OS << " " << symbolName(R.Sequence[I]); if (R.RecoveryIndex == I) - OS << " [recover=" << static_cast(R.Recovery) << "]"; + OS << " [recover=" << T->AttributeValues[R.Recovery] << "]"; } if (R.Guard) OS << " [guard=" << T->AttributeValues[R.Guard] << "]"; diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp --- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp @@ -106,17 +106,6 @@ assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in RuleID bits!"); - // Wherever RHS contains { foo }, mark foo for brace-recovery. - // FIXME: this should be grammar annotations instead. - for (auto &Rule : T->Rules) { - for (unsigned I = 2; I < Rule.Size; ++I) - if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) && - Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) && - !isToken(Rule.Sequence[I - 1])) { - Rule.Recovery = RecoveryStrategy::Braces; - Rule.RecoveryIndex = I - 1; - } - } const auto &SymbolOrder = getTopologicalOrder(T.get()); llvm::stable_sort( T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) { @@ -266,13 +255,18 @@ "Didn't find the attribute in AttrValues!"); return It - T.AttributeValues.begin(); }; - for (const auto &KV : Spec.Sequence.back().Attributes) { - if (KV.first == "guard") { - R.Guard = LookupExtensionID(KV.second); - continue; + for (unsigned I = 0; I < Spec.Sequence.size(); ++I) { + for (const auto &KV : Spec.Sequence[I].Attributes) { + if (KV.first == "guard") { + R.Guard = LookupExtensionID(KV.second); + } else if (KV.first == "recover") { + R.Recovery = LookupExtensionID(KV.second); + R.RecoveryIndex = I; + } else { + Diagnostics.push_back( + llvm::formatv("Unknown attribute '{0}'", KV.first).str()); + } } - Diagnostics.push_back( - llvm::formatv("Unknown attribute '{0}'", KV.first).str()); } } diff --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp @@ -120,9 +120,9 @@ return Results; } -std::vector> +std::vector> availableRecovery(const State &S, const Grammar &G) { - std::vector> Result; + std::vector> Result; for (const Item &I : S.Items) { const auto &Rule = G.lookupRule(I.rule()); if (I.dot() != Rule.RecoveryIndex) @@ -196,8 +196,7 @@ Edges.push_back({Src, Dst, Label}); } - void insertRecovery(StateID Src, RecoveryStrategy Strategy, - SymbolID Result) { + void insertRecovery(StateID Src, ExtensionID Strategy, SymbolID Result) { Recoveries.push_back({Src, Strategy, Result}); } diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp --- a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp +++ b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp @@ -2,7 +2,7 @@ class Foo { public: }; -// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] } +// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=Brackets] } // CHECK-NEXT: ├─class-head := class-key class-head-name // CHECK-NEXT: │ ├─class-key~CLASS := tok[0] // CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1] diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp --- a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp +++ b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp @@ -1,4 +1,4 @@ -// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s +// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s auto x = { complete garbage }; // CHECK: translation-unit~simple-declaration // CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0] diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -48,6 +48,17 @@ testing::UnorderedElementsAreArray(Parents)); } +Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) { + EXPECT_GT(Begin, 0u); + const Token &Left = Code.tokens()[Begin - 1]; + EXPECT_EQ(Left.Kind, tok::l_brace); + if (const auto* Right = Left.pair()) { + EXPECT_EQ(Right->Kind, tok::r_brace); + return Code.index(*Right); + } + return Token::Invalid; +} + class GLRTest : public ::testing::Test { public: void build(llvm::StringRef GrammarBNF) { @@ -375,11 +386,12 @@ // 0--1({)--2(?) // After recovering a `word` at state 1: // 0--3(word) // 3 is goto(1, word) - buildGrammar({"word"}, {}); + buildGrammar({"word", "top"}, {"top := { word [recover=Braces] }"}); LRTable::Builder B(TestLang.G); B.Transition[{StateID{1}, id("word")}] = StateID{3}; - B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("word")}}); + B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}}); TestLang.Table = std::move(B).build(); + TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces); auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); auto *Question1 = &Arena.createTerminal(tok::question, 1); @@ -418,11 +430,12 @@ // 0--1({)--1({)--1({) // After recovering a `block` at inside the second braces: // 0--1({)--2(body) // 2 is goto(1, body) - buildGrammar({"body"}, {}); + buildGrammar({"body", "top"}, {"top := { body [recover=Braces] }"}); LRTable::Builder B(TestLang.G); B.Transition[{StateID{1}, id("body")}] = StateID{2}; - B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("body")}}); + B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("body")}}); TestLang.Table = std::move(B).build(); + TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces); clang::LangOptions LOptions; // Innermost brace is unmatched, to test fallback to next brace. @@ -455,13 +468,17 @@ // After recovering either `word` or `number` inside the braces: // 0--1({)--2(word) // 2 is goto(1, word) // └--3(number) // 3 is goto(1, number) - buildGrammar({"number", "word"}, {}); + buildGrammar({"number", "word", "top"}, + { + "top := { number [recover=Braces] }", + "top := { word [recover=Braces] }", + }); LRTable::Builder B(TestLang.G); B.Transition[{StateID{1}, id("number")}] = StateID{2}; B.Transition[{StateID{1}, id("word")}] = StateID{3}; - B.Recoveries.push_back( - {StateID{1}, {RecoveryStrategy::Braces, id("number")}}); - B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("word")}}); + B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("number")}}); + B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}}); + TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces); TestLang.Table = std::move(B).build(); auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); const auto *Root = GSStack.addNode(0, nullptr, {}); @@ -560,11 +577,12 @@ build(R"bnf( _ := block - block := { block } - block := { numbers } + block := { block [recover=Braces] } + block := { numbers [recover=Braces] } numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT )bnf"); TestLang.Table = LRTable::buildSLR(TestLang.G); + TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces); clang::LangOptions LOptions; TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions); pairBrackets(Tokens); @@ -572,14 +590,14 @@ const ForestNode &Parsed = glrParse({Tokens, Arena, GSStack}, id("block"), TestLang); EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), - "[ 0, end) block := { block [recover=1] }\n" + "[ 0, end) block := { block [recover=Braces] }\n" "[ 0, 1) ├─{ := tok[0]\n" "[ 1, 5) ├─block := \n" - "[ 1, 5) │ ├─block := { block [recover=1] }\n" + "[ 1, 5) │ ├─block := { block [recover=Braces] }\n" "[ 1, 2) │ │ ├─{ := tok[1]\n" "[ 2, 4) │ │ ├─block := \n" "[ 4, 5) │ │ └─} := tok[4]\n" - "[ 1, 5) │ └─block := { numbers [recover=1] }\n" + "[ 1, 5) │ └─block := { numbers [recover=Braces] }\n" "[ 1, 2) │ ├─{ := tok[1]\n" "[ 2, 4) │ ├─numbers := \n" "[ 4, 5) │ └─} := tok[4]\n"