diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h @@ -34,6 +34,14 @@ // Return true if the guard is satisfied. using RuleGuard = llvm::function_ref; +struct RecoveryParams { + // The position in the stream where the node being recovered will start. + Token::Index Begin; + // The position in the stream where the parse failed. + // Begin <= Cursor <= End (for a valid recovery). + Token::Index Cursor; + const TokenStream &Tokens; +}; // A recovery strategy determines a region of code to skip when parsing fails. // // For example, given `class-def := CLASS IDENT { body [recover=Brackets] }`, @@ -43,7 +51,7 @@ // The provided index is the token where the skipped region begins. // Returns the (excluded) end of the range, or Token::Invalid for no recovery. using RecoveryStrategy = - llvm::function_ref; + llvm::function_ref; // Specify a language that can be parsed by the pseduoparser. struct Language { diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h @@ -82,6 +82,11 @@ assert(Kind != tok::eof); return *(this + 1); } + /// Returns the previous token in the stream. this may not be a sentinel. + const Token &prev() const { + assert(Kind != tok::eof); + return *(this - 1); + } /// Returns the next token in the stream, skipping over comments. const Token &nextNC() const { const Token *T = this; diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -28,12 +28,12 @@ namespace { Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin, + Token::Index Cursor, const TokenStream &Tokens, const Language &Lang) { assert(Strategy != 0); - assert(Begin > 0); if (auto S = Lang.RecoveryStrategies.lookup(Strategy)) - return S(Begin, Tokens); + return S({Begin, Cursor, Tokens}); return Token::Invalid; } @@ -141,7 +141,7 @@ break; auto End = findRecoveryEndpoint(Option.Strategy, Option.Position, - Params.Code, Lang); + TokenIndex, Params.Code, Lang); // Recovery may not take the parse backwards. if (End == Token::Invalid || End < TokenIndex) continue; diff --git a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp --- a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp +++ b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp @@ -252,21 +252,122 @@ #undef SYMBOL_GUARD } -Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) { - assert(Begin > 0); - const Token &Left = Tokens.tokens()[Begin - 1]; +Token::Index recoverBrackets(const RecoveryParams &P) { + assert(P.Begin > 0); + const Token &Left = P.Tokens.tokens()[P.Begin - 1]; assert(Left.Kind == tok::l_brace || Left.Kind == tok::l_paren || Left.Kind == tok::l_square); if (const Token *Right = Left.pair()) { - assert(Tokens.index(*Right) > Begin - 1); - return Tokens.index(*Right); + assert(P.Tokens.index(*Right) > P.Begin - 1); + return P.Tokens.index(*Right); } return Token::Invalid; } +Token::Index recoverNextDeclaration(const RecoveryParams &P) { + if (P.Begin == P.Tokens.tokens().size()) + return Token::Invalid; + + // Declarations that don't end in semicolons: + // function-definition + // EXPORT {...} + // EXTERN string-literal {...} + // INLINE_opt NAMESPACE IDENTIFIER {...} + // INLINE_opt NAMESPACE {...} + // NAMESPACE enclosing-namespace-specifier :: INLINE_opt IDENTIFIER {...} + // Functions are hard to do anything about: + // - if the declarator is good and the braces are present, parsing succeeds + // - if the declarator is bad, it's hard to know it's a function + // - if the braces are missing, we don't know where to recover + // So We only try to detect the others. + bool AcceptBraces = false; + auto ConsiderForBraces = [&](tok::TokenKind K) { + if (K == tok::kw_export || K == tok::kw_extern || K == tok::kw_namespace) + AcceptBraces = true; + }; + + // Some tokens are pretty good indicators of a declaration starting, when + // they appear at the beginning of a line. + auto LikelyStartsDeclaration = [&](tok::TokenKind K) -> bool { + switch (K) { + // Things you might be declaring. + case tok::kw_template: + case tok::kw_using: + case tok::kw_namespace: + case tok::kw_typedef: + case tok::kw_class: + case tok::kw_struct: + case tok::kw_union: + // Types & other decl-specifiers + case tok::kw_void: + case tok::kw_auto: + case tok::kw_int: + case tok::kw_char: + case tok::kw_float: + case tok::kw_double: + case tok::kw_long: + case tok::kw_signed: + case tok::kw_unsigned: + case tok::kw_const: + case tok::kw_volatile: + case tok::kw_static: + case tok::kw_extern: + // Modules + case tok::kw_import: + case tok::kw_export: + // Class protection + case tok::kw_public: + case tok::kw_private: + case tok::kw_protected: + return true; + default: + return false; + } + }; + + // The indentation level where the declaration started hints at where a new + // sibling declaration might start. + unsigned OrigIndent = P.Tokens.tokens()[P.Begin].Indent; + + // Walk over tokens at the appropriate bracket nesting level. + for (const Token *T = &P.Tokens.tokens()[P.Begin].next(); T->Kind != tok::eof; + T = &T->next()) { + // Handle (paired) brackets. + if (const Token *B = T->pair()) { + // Closing bracket not paired within the recovery range. + if (B < T) { + assert(P.Tokens.index(*B) < P.Begin && "Why didn't we skip this one?"); + assert(P.Tokens.index(*T) >= P.Cursor && + "How did we parse an unmatched closing bracket?!"); + return P.Tokens.index(*T); + } + // Opening bracket: is this a semicolonless statement? + if (T->Kind == tok::l_brace && AcceptBraces && + P.Tokens.index(*B) >= P.Cursor) + return P.Tokens.index(*B); + // Skip over opening/closing bracket pair. + T = B; + continue; + } + + // Semicolons terminate statements. + if (T->Kind == tok::semi) + return P.Tokens.index(*T) + 1; + ConsiderForBraces(T->Kind); + + // Stop if it looks like a new declaration is starting on the next line. + if (T->Indent == OrigIndent && T->Line != T->prev().Line && + LikelyStartsDeclaration(T->Kind)) + return P.Tokens.index(*T); + } + // If we reached EOF, then assume the declaration runs right to EOF. + return P.Tokens.tokens().size(); +} + llvm::DenseMap buildRecoveryStrategies() { return { {(ExtensionID)Extension::Brackets, recoverBrackets}, + {(ExtensionID)Extension::NextDeclaration, recoverNextDeclaration}, }; } diff --git a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf --- a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf +++ b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf @@ -309,8 +309,7 @@ declaration-statement := block-declaration # gram.dcl -declaration-seq := declaration -declaration-seq := declaration-seq declaration +declaration-seq := declaration [recover=NextDeclaration] declaration-seq_opt declaration := block-declaration declaration := nodeclspec-function-declaration declaration := function-definition @@ -535,13 +534,12 @@ module-partition := : module-name-qualifier_opt IDENTIFIER module-name-qualifier := IDENTIFIER . module-name-qualifier := module-name-qualifier IDENTIFIER . -export-declaration := EXPORT declaration -export-declaration := EXPORT ( declaration-seq_opt ) +export-declaration := EXPORT { declaration-seq_opt } export-declaration := export-keyword module-import-declaration -module-import-declaration := import-keyword module-name -module-import-declaration := import-keyword module-partition +module-import-declaration := import-keyword module-name ; +module-import-declaration := import-keyword module-partition ; # FIXME: we don't have header-name in the grammar. Handle these in PP? -# module-import-declaration := import-keyword header-name +# module-import-declaration := import-keyword header-name ; global-module-fragment := module-keyword ; declaration-seq_opt private-module-fragment := module-keyword : PRIVATE ; declaration-seq_opt @@ -554,7 +552,7 @@ class-key := CLASS class-key := STRUCT class-key := UNION -member-specification := member-declaration member-specification_opt +member-specification := member-declaration [recover=NextDeclaration] member-specification_opt member-specification := access-specifier : member-specification_opt member-declaration := decl-specifier-seq_opt member-declarator-list_opt ; member-declaration := function-definition diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-declarations.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-declarations.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/test/cxx/recovery-declarations.cpp @@ -0,0 +1,22 @@ +// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s + +this is not a valid declaration; +// CHECK: declaration := + +int x[0]; +// CHECK: simple-declaration := decl-specifier-seq init-declarator-list ; +// CHECK-NEXT: ├─decl-specifier-seq~INT := tok[7] +// CHECK-NEXT: ├─init-declarator-list~noptr-declarator := noptr-declarator [ constant-expression ] + +also_not_ok ??? @@@ (; +// CHECK: declaration := + +class Y{}; +// CHECK: simple-declaration := decl-specifier-seq ; +// CHECK-NEXT: ├─decl-specifier-seq~class-specifier := class-head { } + +// FIXME: support recovery of incomplete scopes +// namespace foo { + +class class class class function +// CHECK: declaration := diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -48,13 +48,13 @@ testing::UnorderedElementsAreArray(Parents)); } -Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) { - EXPECT_GT(Begin, 0u); - const Token &Left = Code.tokens()[Begin - 1]; +Token::Index recoverBraces(const RecoveryParams &P) { + EXPECT_GT(P.Begin, 0u); + const Token &Left = P.Tokens.tokens()[P.Begin - 1]; EXPECT_EQ(Left.Kind, tok::l_brace); if (const auto* Right = Left.pair()) { EXPECT_EQ(Right->Kind, tok::r_brace); - return Code.index(*Right); + return P.Tokens.index(*Right); } return Token::Invalid; }