diff --git a/clang/include/clang/Tooling/Syntax/Nodes.h b/clang/include/clang/Tooling/Syntax/Nodes.h --- a/clang/include/clang/Tooling/Syntax/Nodes.h +++ b/clang/include/clang/Tooling/Syntax/Nodes.h @@ -37,7 +37,6 @@ enum class NodeKind : uint16_t { Leaf, TranslationUnit, - TopLevelDeclaration, // Expressions UnknownExpression, @@ -57,7 +56,11 @@ ReturnStatement, RangeBasedForStatement, ExpressionStatement, - CompoundStatement + CompoundStatement, + + // Declarations + UnknownDeclaration, + SimpleDeclaration, }; /// For debugging purposes. llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, NodeKind K); @@ -102,20 +105,6 @@ } }; -/// FIXME: this node is temporary and will be replaced with nodes for various -/// 'declarations' and 'declarators' from the C/C++ grammar -/// -/// Represents any top-level declaration. Only there to give the syntax tree a -/// bit of structure until we implement syntax nodes for declarations and -/// declarators. -class TopLevelDeclaration final : public Tree { -public: - TopLevelDeclaration() : Tree(NodeKind::TopLevelDeclaration) {} - static bool classof(const Node *N) { - return N->kind() == NodeKind::TopLevelDeclaration; - } -}; - /// A base class for all expressions. Note that expressions are not statements, /// even though they are in clang. class Expression : public Tree { @@ -313,6 +302,38 @@ syntax::Leaf *rbrace(); }; +/// A declaration that can appear at the top-level. Note that this does *not* +/// correspond 1-to-1 to clang::Decl. Syntax trees distinguish between top-level +/// declarations (e.g. namespace definitions) and declarators (e.g. variables, +/// typedefs, etc.). Declarators are stored inside SimpleDeclaration. +class Declaration : public Tree { +public: + Declaration(NodeKind K) : Tree(K) {} + static bool classof(const Node *N) { + return NodeKind::UnknownDeclaration <= N->kind() && + N->kind() <= NodeKind::SimpleDeclaration; + } +}; + +/// Declaration of an unknown kind, e.g. not yet supported in syntax trees. +class UnknownDeclaration final : public Declaration { +public: + UnknownDeclaration() : Declaration(NodeKind::UnknownDeclaration) {} + static bool classof(const Node *N) { + return N->kind() == NodeKind::UnknownDeclaration; + } +}; + +/// Groups multiple declarators (e.g. variables, typedefs, etc.) together. All +/// grouped declarators share the same declaration specifiers (e.g. 'int' or +/// 'typedef'). +class SimpleDeclaration final : public Declaration { +public: + SimpleDeclaration() : Declaration(NodeKind::SimpleDeclaration) {} + static bool classof(const Node *N) { + return N->kind() == NodeKind::SimpleDeclaration; + } +}; } // namespace syntax } // namespace clang #endif diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// #include "clang/Tooling/Syntax/BuildTree.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclBase.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Stmt.h" #include "clang/Basic/LLVM.h" @@ -56,6 +58,14 @@ /// Range. void foldNode(llvm::ArrayRef Range, syntax::Tree *New); + /// Must be called with the range of each `DeclaratorDecl`. Ensures the + /// corresponding declarator nodes are covered by `SimpleDeclaration`. + void noticeDeclaratorRange(llvm::ArrayRef Range); + + /// Notifies that we should not consume trailing semicolon when computing + /// token range of \p D. + void noticeDeclaratorWithoutSemicolon(Decl *D); + /// Mark the \p Child node with a corresponding \p Role. All marked children /// should be consumed by foldNode. /// (!) when called on expressions (clang::Expr is derived from clang::Stmt), @@ -94,7 +104,14 @@ return llvm::makeArrayRef(findToken(First), std::next(findToken(Last))); } llvm::ArrayRef getRange(const Decl *D) const { - return getRange(D->getBeginLoc(), D->getEndLoc()); + auto Tokens = getRange(D->getBeginLoc(), D->getEndLoc()); + if (llvm::isa(D)) + return Tokens; + if (DeclsWithoutSemicolons.count(D)) + return Tokens; + // FIXME: do not consume trailing semicolon on function definitions. + // Most declarations own a semicolon in syntax trees, but not int clang AST. + return withTrailingSemicolon(Tokens); } llvm::ArrayRef getExprRange(const Expr *E) const { return getRange(E->getBeginLoc(), E->getEndLoc()); @@ -108,14 +125,22 @@ // Some statements miss a trailing semicolon, e.g. 'return', 'continue' and // all statements that end with those. Consume this semicolon here. - // - // (!) statements never consume 'eof', so looking at the next token is ok. + if (Tokens.back().kind() == tok::semi) + return Tokens; + return withTrailingSemicolon(Tokens); + } + +private: + llvm::ArrayRef + withTrailingSemicolon(llvm::ArrayRef Tokens) const { + assert(!Tokens.empty()); + assert(Tokens.back().kind() != tok::eof); + // (!) we never consume 'eof', so looking at the next token is ok. if (Tokens.back().kind() != tok::semi && Tokens.end()->kind() == tok::semi) return llvm::makeArrayRef(Tokens.begin(), Tokens.end() + 1); return Tokens; } -private: /// Finds a token starting at \p L. The token must exist. const syntax::Token *findToken(SourceLocation L) const; @@ -136,6 +161,8 @@ {&T, NodeAndRole{new (A.allocator()) syntax::Leaf(&T)}}); } + ~Forest() { assert(DelayedFolds.empty()); } + void assignRole(llvm::ArrayRef Range, syntax::NodeRole Role) { assert(!Range.empty()); @@ -148,30 +175,46 @@ It->second.Role = Role; } - /// Add \p Node to the forest and fill its children nodes based on the \p - /// NodeRange. - void foldChildren(llvm::ArrayRef NodeTokens, + /// Add \p Node to the forest and attach child nodes based on \p Tokens. + void foldChildren(llvm::ArrayRef Tokens, syntax::Tree *Node) { - assert(!NodeTokens.empty()); - assert(Node->firstChild() == nullptr && "node already has children"); - - auto *FirstToken = NodeTokens.begin(); - auto BeginChildren = Trees.lower_bound(FirstToken); - assert(BeginChildren != Trees.end() && - BeginChildren->first == FirstToken && - "fold crosses boundaries of existing subtrees"); - auto EndChildren = Trees.lower_bound(NodeTokens.end()); - assert((EndChildren == Trees.end() || - EndChildren->first == NodeTokens.end()) && - "fold crosses boundaries of existing subtrees"); + // Execute delayed folds inside `Tokens`. + auto BeginExecuted = DelayedFolds.lower_bound(Tokens.begin()); + auto It = BeginExecuted; + for (; It != DelayedFolds.end() && It->second.End <= Tokens.end(); ++It) + foldChildrenEager(llvm::makeArrayRef(It->first, It->second.End), + It->second.Node); + DelayedFolds.erase(BeginExecuted, It); + + // Attach children to `Node`. + foldChildrenEager(Tokens, Node); + } - // (!) we need to go in reverse order, because we can only prepend. - for (auto It = EndChildren; It != BeginChildren; --It) - Node->prependChildLowLevel(std::prev(It)->second.Node, - std::prev(It)->second.Role); + /// Schedule a call to `foldChildren` that will only be executed when + /// containing node is folded. The range of delayed nodes can be extended by + /// calling `extendDelayedFold`. Only one delayed node for each starting + /// token is allowed. + void foldChildrenDelayed(llvm::ArrayRef Tokens, + syntax::Tree *Node) { + assert(!Tokens.empty()); + bool Inserted = + DelayedFolds.insert({Tokens.begin(), DelayedFold{Tokens.end(), Node}}) + .second; + (void)Inserted; + assert(Inserted && "Multiple delayed folds start at the same token"); + } - Trees.erase(BeginChildren, EndChildren); - Trees.insert({FirstToken, NodeAndRole(Node)}); + /// If there a delayed fold, starting at `ExtendedRange.begin()`, extends + /// its endpoint to `ExtendedRange.end()` and returns true. + /// Otherwise, returns false. + bool extendDelayedFold(llvm::ArrayRef ExtendedRange) { + assert(!ExtendedRange.empty()); + auto It = DelayedFolds.find(ExtendedRange.data()); + if (It == DelayedFolds.end()) + return false; + assert(It->second.End <= ExtendedRange.end()); + It->second.End = ExtendedRange.end(); + return true; } // EXPECTS: all tokens were consumed and are owned by a single root node. @@ -199,6 +242,30 @@ } private: + /// Implementation detail of `foldChildren`, does acutal folding ignoring + /// delayed folds. + void foldChildrenEager(llvm::ArrayRef Tokens, + syntax::Tree *Node) { + assert(Node->firstChild() == nullptr && "node already has children"); + + auto *FirstToken = Tokens.begin(); + auto BeginChildren = Trees.lower_bound(FirstToken); + assert((BeginChildren == Trees.end() || + BeginChildren->first == FirstToken) && + "fold crosses boundaries of existing subtrees"); + auto EndChildren = Trees.lower_bound(Tokens.end()); + assert( + (EndChildren == Trees.end() || EndChildren->first == Tokens.end()) && + "fold crosses boundaries of existing subtrees"); + + // (!) we need to go in reverse order, because we can only prepend. + for (auto It = EndChildren; It != BeginChildren; --It) + Node->prependChildLowLevel(std::prev(It)->second.Node, + std::prev(It)->second.Role); + + Trees.erase(BeginChildren, EndChildren); + Trees.insert({FirstToken, NodeAndRole(Node)}); + } /// A with a role that should be assigned to it when adding to a parent. struct NodeAndRole { explicit NodeAndRole(syntax::Node *Node) @@ -212,6 +279,13 @@ /// FIXME: storing the end tokens is redundant. /// FIXME: the key of a map is redundant, it is also stored in NodeForRange. std::map Trees; + + /// See documentation of `foldChildrenDelayed` for details. + struct DelayedFold { + const syntax::Token *End = nullptr; + syntax::Tree *Node = nullptr; + }; + std::map DelayedFolds; }; /// For debugging purposes. @@ -219,6 +293,7 @@ syntax::Arena &Arena; Forest Pending; + llvm::DenseSet DeclsWithoutSemicolons; }; namespace { @@ -229,20 +304,30 @@ bool shouldTraversePostOrder() const { return true; } - bool TraverseDecl(Decl *D) { - if (!D || isa(D)) - return RecursiveASTVisitor::TraverseDecl(D); - if (!llvm::isa(D->getDeclContext())) - return true; // Only build top-level decls for now, do not recurse. - return RecursiveASTVisitor::TraverseDecl(D); + bool WalkUpFromDeclaratorDecl(DeclaratorDecl *D) { + // Ensure declarators are covered by SimpleDeclaration. + Builder.noticeDeclaratorRange(Builder.getRange(D)); + // FIXME: build nodes for the declarator too. + return true; + } + bool WalkUpFromTypedefNameDecl(TypedefNameDecl *D) { + // Also a declarator. + Builder.noticeDeclaratorRange(Builder.getRange(D)); + // FIXME: build nodes for the declarator too. + return true; } bool VisitDecl(Decl *D) { - assert(llvm::isa(D->getDeclContext()) && - "expected a top-level decl"); assert(!D->isImplicit()); Builder.foldNode(Builder.getRange(D), - new (allocator()) syntax::TopLevelDeclaration()); + new (allocator()) syntax::UnknownDeclaration()); + return true; + } + + bool WalkUpFromTagDecl(TagDecl *C) { + // Avoid building UnknownDeclaration here, syntatically 'struct X {}' and + // similar are part of declaration specifiers and do not introduce a new + // top-level declaration. return true; } @@ -290,7 +375,11 @@ } bool TraverseStmt(Stmt *S) { - if (auto *E = llvm::dyn_cast_or_null(S)) { + if (auto *DS = llvm::dyn_cast_or_null(S)) { + // We want to consume the semicolon, make sure SimpleDeclaration does not. + for (auto *D : DS->decls()) + Builder.noticeDeclaratorWithoutSemicolon(D); + } else if (auto *E = llvm::dyn_cast_or_null(S)) { // (!) do not recurse into subexpressions. // we do not have syntax trees for expressions yet, so we only want to see // the first top-level expression. @@ -431,6 +520,19 @@ Pending.foldChildren(Range, New); } +void syntax::TreeBuilder::noticeDeclaratorRange( + llvm::ArrayRef Range) { + // FIXME: consume trailing semicolon in non-statement positions. + if (Pending.extendDelayedFold(Range)) + return; + Pending.foldChildrenDelayed(Range, + new (allocator()) syntax::SimpleDeclaration); +} + +void syntax::TreeBuilder::noticeDeclaratorWithoutSemicolon(Decl *D) { + DeclsWithoutSemicolons.insert(D); +} + void syntax::TreeBuilder::markChildToken(SourceLocation Loc, tok::TokenKind Kind, NodeRole Role) { if (Loc.isInvalid()) diff --git a/clang/lib/Tooling/Syntax/Nodes.cpp b/clang/lib/Tooling/Syntax/Nodes.cpp --- a/clang/lib/Tooling/Syntax/Nodes.cpp +++ b/clang/lib/Tooling/Syntax/Nodes.cpp @@ -16,8 +16,6 @@ return OS << "Leaf"; case NodeKind::TranslationUnit: return OS << "TranslationUnit"; - case NodeKind::TopLevelDeclaration: - return OS << "TopLevelDeclaration"; case NodeKind::UnknownExpression: return OS << "UnknownExpression"; case NodeKind::UnknownStatement: @@ -50,6 +48,10 @@ return OS << "ExpressionStatement"; case NodeKind::CompoundStatement: return OS << "CompoundStatement"; + case NodeKind::UnknownDeclaration: + return OS << "UnknownDeclaration"; + case NodeKind::SimpleDeclaration: + return OS << "SimpleDeclaration"; } llvm_unreachable("unknown node kind"); } diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp --- a/clang/unittests/Tooling/Syntax/TreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp @@ -130,7 +130,7 @@ )cpp", R"txt( *: TranslationUnit -|-TopLevelDeclaration +|-SimpleDeclaration | |-int | |-main | |-( @@ -138,7 +138,7 @@ | `-CompoundStatement | |-{ | `-} -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-foo |-( @@ -157,7 +157,7 @@ )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-int |-main |-( @@ -202,7 +202,7 @@ )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -224,7 +224,7 @@ {"void test() { int a = 10; }", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -232,16 +232,18 @@ `-CompoundStatement |-{ |-DeclarationStatement - | |-int - | |-a - | |-= - | |-10 + | |-SimpleDeclaration + | | |-int + | | |-a + | | |-= + | | `-UnknownExpression + | | `-10 | `-; `-} )txt"}, {"void test() { ; }", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -263,7 +265,7 @@ )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -299,7 +301,7 @@ )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -329,7 +331,7 @@ )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-int |-test |-( @@ -352,7 +354,7 @@ )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -360,18 +362,21 @@ `-CompoundStatement |-{ |-DeclarationStatement - | |-int - | |-a - | |-[ - | |-3 - | |-] + | |-SimpleDeclaration + | | |-int + | | |-a + | | |-[ + | | |-UnknownExpression + | | | `-3 + | | `-] | `-; |-RangeBasedForStatement | |-for | |-( - | |-int - | |-x - | |-: + | |-SimpleDeclaration + | | |-int + | | |-x + | | `-: | |-UnknownExpression | | `-a | |-) @@ -384,7 +389,7 @@ // counterpart. {"void main() { foo: return 100; }", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-main |-( @@ -411,7 +416,7 @@ )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -444,7 +449,35 @@ | | `-) | `-; `-} -)txt"}}; +)txt"}, + // Multiple declarators group into a single SimpleDeclaration. + {R"cpp( + int *a, b; + )cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-int + |-* + |-a + |-, + |-b + `-; + )txt"}, + {R"cpp( + typedef int *a, b; + )cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-typedef + |-int + |-* + |-a + |-, + |-b + `-; + )txt"}}; for (const auto &T : Cases) { auto *Root = buildTree(T.first);