diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h @@ -34,6 +34,14 @@ // Return true if the guard is satisfied. using RuleGuard = llvm::function_ref; +struct RecoveryParams { + // The position in the stream where the node being recovered will start. + Token::Index Begin; + // The position in the stream where the parse failed. + // Begin <= Cursor <= End (for a valid recovery). + Token::Index Cursor; + const TokenStream &Tokens; +}; // A recovery strategy determines a region of code to skip when parsing fails. // // For example, given `class-def := CLASS IDENT { body [recover=Brackets] }`, @@ -43,7 +51,7 @@ // The provided index is the token where the skipped region begins. // Returns the (excluded) end of the range, or Token::Invalid for no recovery. using RecoveryStrategy = - llvm::function_ref; + llvm::function_ref; // Specify a language that can be parsed by the pseduoparser. struct Language { diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/cxx/Recovery.h b/clang-tools-extra/pseudo/include/clang-pseudo/cxx/Recovery.h new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/include/clang-pseudo/cxx/Recovery.h @@ -0,0 +1,33 @@ +//===--- Recovery.h - C++ specific recovery heuristics ------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_PSEUDO_CXX_RECOVERY_H +#define CLANG_PSEUDO_CXX_RECOVERY_H + +#include "clang-pseudo/Language.h" + +namespace clang::pseudo::cxx { + +// Heuristically finds the end of range of a declaration if parsing fails while +// attempting to parse a declaration. +// +// Returns an *excluded* end of token range. Given the broken code below, +// +// int a = ? garbage ?; +// double; +// +// We failed to parse the first declaration, the function returns the index of +// `double` token. We skip up to the first `;` token and assume tokens in +// [P.Begin, recoverNextDeclaration(P)) range was a declaration. +// +// Returns Token::Invalid if we can't find an end point. +Token::Index recoverNextDeclaration(const RecoveryParams &P); + +} // namespace clang::pseudo::cxx + +#endif // CLANG_PSEUDO_CXX_RECOVERY_H diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -28,11 +28,12 @@ namespace { Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin, + Token::Index Cursor, const TokenStream &Tokens, const Language &Lang) { assert(Strategy != 0); if (auto S = Lang.RecoveryStrategies.lookup(Strategy)) - return S(Begin, Tokens); + return S({Begin, Cursor, Tokens}); return Token::Invalid; } @@ -142,7 +143,7 @@ break; auto End = findRecoveryEndpoint(Option.Strategy, Option.Position, - Params.Code, Lang); + TokenIndex, Params.Code, Lang); // Recovery may not take the parse backwards. if (End == Token::Invalid || End < TokenIndex) continue; diff --git a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt --- a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt @@ -4,6 +4,7 @@ add_clang_library(clangPseudoCXX CXX.cpp + Recovery.cpp DEPENDS cxx_gen diff --git a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp --- a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp +++ b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp @@ -9,6 +9,7 @@ #include "clang-pseudo/cxx/CXX.h" #include "clang-pseudo/Forest.h" #include "clang-pseudo/Language.h" +#include "clang-pseudo/cxx/Recovery.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRTable.h" #include "clang/Basic/CharInfo.h" @@ -410,14 +411,14 @@ #undef SYMBOL_GUARD } -Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) { - assert(Begin > 0); - const Token &Left = Tokens.tokens()[Begin - 1]; +Token::Index recoverBrackets(const RecoveryParams &P) { + assert(P.Begin > 0); + const Token &Left = P.Tokens.tokens()[P.Begin - 1]; assert(Left.Kind == tok::l_brace || Left.Kind == tok::l_paren || Left.Kind == tok::l_square); if (const Token *Right = Left.pair()) { - assert(Tokens.index(*Right) > Begin - 1); - return Tokens.index(*Right); + assert(P.Tokens.index(*Right) > P.Begin - 1); + return P.Tokens.index(*Right); } return Token::Invalid; } @@ -425,6 +426,7 @@ llvm::DenseMap buildRecoveryStrategies() { return { {Extension::Brackets, recoverBrackets}, + {Extension::NextDeclaration, recoverNextDeclaration}, }; } diff --git a/clang-tools-extra/pseudo/lib/cxx/Recovery.cpp b/clang-tools-extra/pseudo/lib/cxx/Recovery.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/lib/cxx/Recovery.cpp @@ -0,0 +1,121 @@ +//===--- Recovery.cpp - ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/cxx/Recovery.h" +#include "clang-pseudo/Language.h" +#include "clang/Basic/TokenKinds.h" + +namespace clang::pseudo::cxx { + +Token::Index recoverNextDeclaration(const RecoveryParams &P) { + if (P.Begin == P.Tokens.tokens().size()) + return Token::Invalid; + // Declarations that don't end in semicolons: + // function-definition + // EXPORT {...} + // EXTERN string-literal {...} + // INLINE_opt NAMESPACE IDENTIFIER {...} + // INLINE_opt NAMESPACE {...} + // NAMESPACE enclosing-namespace-specifier :: INLINE_opt IDENTIFIER {...} + // Functions are hard to do anything about: + // - if the declarator is good and the braces are present, parsing succeeds + // - if the declarator is bad, it's hard to know it's a function + // - if the braces are missing, we don't know where to recover + // So we only try to detect the others. + bool AcceptBraces = false; + auto ConsiderForBraces = [&](tok::TokenKind K) { + if (K == tok::kw_export || K == tok::kw_extern || K == tok::kw_namespace) + AcceptBraces = true; + }; + + // Some tokens are pretty good indicators of a declaration starting, when + // they appear at the beginning of a line. + auto LikelyStartsDeclaration = [&](tok::TokenKind K) -> bool { + switch (K) { + // Things you might be declaring. + case tok::kw_template: + case tok::kw_using: + case tok::kw_namespace: + case tok::kw_typedef: + case tok::kw_class: + case tok::kw_struct: + case tok::kw_union: + case tok::kw_friend: + // Types & other decl-specifiers + // FIXME: consider IDENTIFIER + case tok::kw_void: + case tok::kw_auto: + case tok::kw_int: + case tok::kw_char: + case tok::kw_float: + case tok::kw_double: + case tok::kw_long: + case tok::kw_signed: + case tok::kw_unsigned: + case tok::kw_const: + case tok::kw_volatile: + case tok::kw_static: + case tok::kw_extern: + case tok::kw_inline: + case tok::kw_explicit: + case tok::kw_virtual: + // Modules + case tok::kw_import: + case tok::kw_export: + // Class protection + case tok::kw_public: + case tok::kw_private: + case tok::kw_protected: + return true; + default: + return false; + } + }; + + // The indentation level where the declaration started hints at where a new + // sibling declaration might start. + unsigned OrigIndent = P.Tokens.tokens()[P.Begin].Indent; + + // Walk over tokens at the appropriate bracket nesting level. + for (const Token *T = &P.Tokens.tokens()[P.Begin]; T->Kind != tok::eof; + T = &T->next()) { + auto TIndex = P.Tokens.index(*T); + // Handle (paired) brackets. + if (const Token *PairedT = T->pair()) { + // Closing bracket not paired within the recovery range. + auto PairedTIndex = P.Tokens.index(*PairedT); + if (PairedT < T) { + assert(PairedTIndex < P.Begin && "Why didn't we skip this one?"); + assert(TIndex >= P.Cursor && + "How did we parse an unmatched closing bracket?!"); + return TIndex; + } + // Opening bracket: is this a semicolonless statement? + if (T->Kind == tok::l_brace && AcceptBraces && PairedTIndex >= P.Cursor) + return PairedTIndex + 1; + // Skip over opening/closing bracket pair. + T = PairedT; + continue; + } + ConsiderForBraces(T->Kind); + + // Semicolons terminate statements. + if (T->Kind == tok::semi && TIndex >= P.Cursor) + return TIndex + 1; + + // Stop if it looks like a new declaration is starting on the next line. + if (TIndex > P.Cursor // ensure cursor is moving forward after recovery + && (TIndex > 0 && T->Line != T->prev().Line) // start on the next line. + && T->Indent == OrigIndent && LikelyStartsDeclaration(T->Kind)) + return TIndex; + } + // If we reached EOF, then assume the declaration runs right to EOF. + return P.Tokens.tokens().size(); +} + +} // namespace clang::pseudo::cxx diff --git a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf --- a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf +++ b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf @@ -304,8 +304,8 @@ declaration-statement := block-declaration # gram.dcl -declaration-seq := declaration -declaration-seq := declaration-seq declaration +declaration-seq := declaration [recover=NextDeclaration] +declaration-seq := declaration-seq declaration [recover=NextDeclaration] declaration := block-declaration declaration := nodeclspec-function-declaration declaration := function-definition @@ -554,7 +554,7 @@ class-key := CLASS class-key := STRUCT class-key := UNION -member-specification := member-declaration member-specification_opt +member-specification := member-declaration [recover=NextDeclaration] member-specification_opt member-specification := access-specifier : member-specification_opt member-declaration := decl-specifier-seq member-declarator-list_opt ; member-declaration := member-declarator-list ; diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-declarations.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-declarations.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/test/cxx/recovery-declarations.cpp @@ -0,0 +1,30 @@ +// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s + +this is not a valid declaration; +// CHECK: declaration := + +int x[0]; +// CHECK: simple-declaration := decl-specifier-seq init-declarator-list ; +// CHECK-NEXT: ├─decl-specifier-seq~INT := tok[7] +// CHECK-NEXT: ├─init-declarator-list~noptr-declarator := noptr-declarator [ constant-expression ] + +also_not_ok ??? @@@ (; +// CHECK: declaration := + +class Y{}; +// CHECK: simple-declaration := decl-specifier-seq ; +// CHECK-NEXT: ├─decl-specifier-seq~class-specifier := class-head { } + +// FIXME: support recovery of incomplete scopes +// namespace foo { + +class class class class function +// CHECK: declaration := + +export { +; +? +// CHECK: declaration-seq := declaration-seq declaration +// CHECK-NEXT: ├─declaration-seq~; := tok[34] +// CHECK-NEXT: └─declaration := +} diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt --- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt +++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt @@ -5,6 +5,7 @@ add_custom_target(ClangPseudoUnitTests) add_unittest(ClangPseudoUnitTests ClangPseudoTests BracketTest.cpp + CXXRecoveryTest.cpp CXXTest.cpp DirectiveTreeTest.cpp DisambiguateTest.cpp diff --git a/clang-tools-extra/pseudo/unittests/CXXRecoveryTest.cpp b/clang-tools-extra/pseudo/unittests/CXXRecoveryTest.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/unittests/CXXRecoveryTest.cpp @@ -0,0 +1,110 @@ +//===--- CXXRecoveryTest.cpp - ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Bracket.h" +#include "clang-pseudo/Language.h" +#include "clang-pseudo/Token.h" +#include "clang-pseudo/cxx/Recovery.h" + +#include "clang/Basic/LangOptions.h" +#include "llvm/Testing/Support/Annotations.h" +#include "gtest/gtest.h" + +namespace clang::pseudo { + +namespace { + +std::pair +extract(const llvm::Annotations &Test) { + assert(Test.points().size() == 2 && "Expect two ^s in the test annotations"); + LangOptions Opts = clang::pseudo::genericLangOpts(); + std::string Code = Test.code().str(); + TokenStream Cook = cook(lex(Code, Opts), Opts); + pairBrackets(Cook); + + RecoveryParams P{0, Token::Invalid, Cook}; + auto StartOffset = [&](const Token &T) -> size_t { + return T.Data - Code.data(); + }; + for (const auto &T : Cook.tokens()) { + if (StartOffset(T) == Test.points().front()) { + P.Cursor = Cook.index(T); + break; + } + } + assert(P.Cursor != Token::Invalid && "no cursor position provided"); + Token::Index Expect = Token::Invalid; + if (Test.points().back() > StartOffset(Cook.tokens().back())) + Expect = Cook.tokens().size(); // EOF + else { + for (const auto &T : Cook.tokens()) { + if (StartOffset(T) == Test.points().back()) { + Expect = Cook.index(T); + } + } + assert(Expect != Token::Invalid); + } + return {P, Expect}; +} + +TEST(CXXRecoveryTest, Declaration) { + // We have 3 different positions in the TokenStream: + // - Begin: where we start to parse the code (always 0 in the test) + // - Cursor: where the parse failed (specified by the first ^) + // - End: expected ends of the range (specified by the second ^) + const char *Tests[] = { + R"cpp( + namespace ^??? { } + ^Foo foo; + )cpp", + R"cpp( + auto lambda = [] ^xxx () xxx {} ; + ^Foo foo; + )cpp", + R"cpp( + LLVM_DISCARD int ^bar() {} + ^int; + )cpp", + // Recovery should at least consume 1 token. + R"cpp( + int + ^int + ^ + )cpp", + // FIXME: we skip too much, Foo is a better end point. + R"cpp( + LLVM_DISCARD int ^bar() {} + Foo foo(); + ^int; + )cpp", + R"cpp( + void Foo::AcquireLease() ^LOCKS_EXCLUDED(mutex_) {} + + Foo abc(); + ^ + )cpp", + // FIXME: skipp too much, we should end at private. + R"cpp( + class Foo { + public: + void abc() ^ERRROR + private: + int a; + }; + ^ + )cpp", + }; + for (const auto *T : Tests) { + llvm::Annotations Code(T); + const auto &[RP, ExpectedEnd] = extract(Code); + EXPECT_EQ(pseudo::cxx::recoverNextDeclaration(RP), ExpectedEnd) + << Code.code(); + } +} +} // namespace +} // namespace clang::pseudo diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -48,13 +48,13 @@ testing::UnorderedElementsAreArray(Parents)); } -Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) { - EXPECT_GT(Begin, 0u); - const Token &Left = Code.tokens()[Begin - 1]; +Token::Index recoverBraces(const RecoveryParams &P) { + EXPECT_GT(P.Begin, 0u); + const Token &Left = P.Tokens.tokens()[P.Begin - 1]; EXPECT_EQ(Left.Kind, tok::l_brace); if (const auto* Right = Left.pair()) { EXPECT_EQ(Right->Kind, tok::r_brace); - return Code.index(*Right); + return P.Tokens.index(*Right); } return Token::Invalid; } @@ -613,7 +613,7 @@ TestLang.Table = LRTable::buildSLR(TestLang.G); TestLang.RecoveryStrategies.try_emplace( extensionID("Skip"), - [](Token::Index Start, const TokenStream &) { return Start; }); + [](const RecoveryParams& P) { return P.Begin; }); clang::LangOptions LOptions; TokenStream Tokens = cook(lex("foo", LOptions), LOptions); @@ -641,7 +641,7 @@ TestLang.Table = LRTable::buildSLR(TestLang.G); TestLang.RecoveryStrategies.try_emplace( extensionID("AcceptAnyTokenInstead"), - [](Token::Index Start, const TokenStream &Stream) { return Start + 1; }); + [](const RecoveryParams& P) { return P.Begin + 1; }); const ForestNode &Parsed = glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang); @@ -660,9 +660,9 @@ )bnf"); TestLang.Table = LRTable::buildSLR(TestLang.G); bool fallback_recovered = false; - auto fallback = [&](Token::Index Start, const TokenStream & Code) { + auto fallback = [&](const RecoveryParams& P) { fallback_recovered = true; - return Code.tokens().size(); + return P.Tokens.tokens().size(); }; TestLang.RecoveryStrategies.try_emplace( extensionID("Fallback"), @@ -694,7 +694,7 @@ TestLang.Table = LRTable::buildSLR(TestLang.G); TestLang.RecoveryStrategies.try_emplace( extensionID("Skip"), - [](Token::Index Start, const TokenStream &) { return Start; }); + [](const RecoveryParams& P) { return P.Begin; }); clang::LangOptions LOptions; TokenStream Tokens = cook(lex("main", LOptions), LOptions);