diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/syntax/SyntaxTree.h b/clang-tools-extra/pseudo/include/clang-pseudo/syntax/SyntaxTree.h
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/syntax/SyntaxTree.h
@@ -0,0 +1,44 @@
+#ifndef CLANG_PSEUDO_SYNTAX_SYNTAX_TREE_H
+#define CLANG_PSEUDO_SYNTAX_SYNTAX_TREE_H
+
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Token.h"
+#include "clang/Tooling/Syntax/Nodes.h"
+#include <cassert>
+#include <string>
+
+namespace clang {
+namespace syntax {
+
+/// A syntax-tree leaf backed by a token of the pseudo parser.
+/// The leaf stores only a pointer to the token; it does not own it.
+class PLeaf : public clang::syntax::Leaf {
+public:
+  /// \p Tok must be non-null (asserted).
+  explicit PLeaf(const pseudo::Token *Tok) : Leaf(NodeKind::PLeaf), Tok(Tok) {
+    assert(Tok != nullptr);
+  }
+  static bool classof(const Node *N) { return N->getKind() == NodeKind::PLeaf; }
+  /// Returns the token that was passed to the constructor.
+  const pseudo::Token *getToken() const { return Tok; }
+
+private:
+  const pseudo::Token *Tok;
+};
+} // namespace syntax
+
+namespace pseudo {
+
+/// Renders the tree rooted at \p T as text. \p T must not be null.
+std::string dumpSyntaxTree(const syntax::Node *T);
+
+/// Builds a syntax tree for the parse forest rooted at \p Node.
+/// \p Tokens is the token stream the forest refers to; the nodes of the
+/// resulting tree are allocated in \p Arena.
+syntax::Node *buildSyntaxTree(llvm::BumpPtrAllocator &Arena,
+                              const ForestNode &Node,
+                              const TokenStream &Tokens);
+} // namespace pseudo
+} // namespace clang
+
+#endif
diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt
--- a/clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory(cli)
 add_subdirectory(cxx)
 add_subdirectory(grammar)
+add_subdirectory(syntax)
 
 set(LLVM_LINK_COMPONENTS Support)
 
diff --git a/clang-tools-extra/pseudo/lib/syntax/Build.cpp b/clang-tools-extra/pseudo/lib/syntax/Build.cpp
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/pseudo/lib/syntax/Build.cpp
@@ -0,0 +1,218 @@
+
+
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/cxx/CXX.h"
+#include "clang-pseudo/grammar/Grammar.h"
+#include "clang-pseudo/syntax/SyntaxTree.h"
+#include "clang/Tooling/Syntax/Nodes.h"
+#include "clang/Tooling/Syntax/Tree.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace syntax {
+/// Converts a pseudo-parser forest into clang::syntax nodes allocated in a
+/// caller-provided BumpPtrAllocator.
+class TreeBuilder {
+public:
+  explicit TreeBuilder(llvm::BumpPtrAllocator &Arena) : Arena(Arena) {}
+
+  /// Creates one PLeaf per token of \p Tokens, converts the forest rooted at
+  /// \p Node, and returns the first node that conversion produced.
+  syntax::Node *build(const pseudo::ForestNode &Node,
+                      const pseudo::TokenStream &Tokens) {
+    Leaves.reserve(Tokens.tokens().size());
+    for (const auto &T : Tokens.tokens()) {
+      auto *L = new (Arena) syntax::PLeaf(&T);
+      L->setRole(NodeRole::Unknown);
+      L->Original = L->CanModify = true;
+      Leaves.push_back(L);
+    }
+
+    auto Roots = build(&Node, Tokens.tokens().size());
+    // front() on an empty vector is undefined behaviour, so check first.
+    assert(!Roots.empty() && "forest produced no syntax nodes");
+    return Roots.front();
+  }
+
+  /// Builds every element of \p RHS and appends the resulting nodes, in order,
+  /// as children of \p Parent. The last element is built with end index
+  /// \p End; every other element ends where its successor starts.
+  void buildForRHS(Tree *Parent,
+                   llvm::ArrayRef<const pseudo::ForestNode *> RHS,
+                   pseudo::Token::Index End) {
+    for (size_t I = 0; I < RHS.size(); ++I) {
+      for (auto *Child :
+           build(RHS[I],
+                 I + 1 == RHS.size() ? End : RHS[I + 1]->startTokenIndex())) {
+        // FIXME: setup roles properly.
+        Child->setRole(NodeRole::Unknown);
+        Child->Original = Child->CanModify = true;
+        Parent->appendChildLowLevel(Child);
+      }
+    }
+  }
+
+  /// Converts \p Node into zero or more syntax nodes. A terminal maps to its
+  /// pre-built leaf; symbols with a dedicated syntax::Tree class produce that
+  /// one tree; any other sequence is flattened into its elements' nodes.
+  std::vector<syntax::Node *> build(const pseudo::ForestNode *Node,
+                                    pseudo::Token::Index End) {
+    using cxx = pseudo::cxx::Symbol;
+    const auto CXXSymbol = static_cast<cxx>(Node->symbol());
+
+    if (Node->kind() == pseudo::ForestNode::Terminal) {
+      assert(pseudo::isToken(Node->symbol()));
+      return {Leaves[Node->startTokenIndex()]};
+    }
+    if (Node->kind() == pseudo::ForestNode::Ambiguous) {
+      // FIXME: disambiguate. For now the first alternative is always used.
+      return build(Node->alternatives()[0], End);
+    }
+    // FIXME: handle opaque nodes!
+    const auto &Sequence = Node->elements();
+
+    if (CXXSymbol == cxx::translation_unit) {
+      auto *TU = new (Arena) syntax::TranslationUnit();
+      buildForRHS(TU, Sequence, End);
+      return {TU};
+    }
+    if (CXXSymbol == cxx::simple_declaration) {
+      auto *SD = new (Arena) syntax::SimpleDeclaration();
+      buildForRHS(SD, Sequence, End);
+      return {SD};
+    }
+    if (CXXSymbol == cxx::compound_statement) {
+      auto *CS = new (Arena) syntax::CompoundStatement();
+      buildForRHS(CS, Sequence, End);
+      return {CS};
+    }
+
+    if (Sequence.size() > 1) {
+      switch (CXXSymbol) {
+      case pseudo::cxx::Symbol::additive_expression:
+      case pseudo::cxx::Symbol::and_expression:
+      case pseudo::cxx::Symbol::assignment_expression:
+      case pseudo::cxx::Symbol::compare_expression:
+      case pseudo::cxx::Symbol::constraint_logical_and_expression:
+      case pseudo::cxx::Symbol::constraint_logical_or_expression:
+      case pseudo::cxx::Symbol::equality_expression:
+      case pseudo::cxx::Symbol::exclusive_or_expression:
+      case pseudo::cxx::Symbol::inclusive_or_expression:
+      case pseudo::cxx::Symbol::logical_and_expression:
+      case pseudo::cxx::Symbol::logical_or_expression:
+      case pseudo::cxx::Symbol::multiplicative_expression:
+      case pseudo::cxx::Symbol::pm_expression:
+      case pseudo::cxx::Symbol::relational_expression:
+      case pseudo::cxx::Symbol::shift_expression: {
+        auto *BOE = new (Arena) syntax::BinaryOperatorExpression();
+        buildForRHS(BOE, Sequence, End);
+        return {BOE};
+      }
+      default:
+        break;
+      }
+    }
+
+    // No dedicated tree node for this symbol: flatten the sequence into the
+    // nodes of its elements. This is also the fallback for unsupported syntax.
+    std::vector<syntax::Node *> Results;
+    for (size_t I = 0; I < Sequence.size(); ++I) {
+      for (auto *E :
+           build(Sequence[I], I + 1 == Sequence.size()
+                                  ? End
+                                  : Sequence[I + 1]->startTokenIndex())) {
+        Results.push_back(E);
+      }
+    }
+    return Results;
+  }
+
+  llvm::BumpPtrAllocator &Arena;
+  std::vector<syntax::PLeaf *> Leaves;
+};
+} // namespace syntax
+namespace pseudo {
+
+/// Prints the text of the token behind \p L, or "<eof>" for the eof token.
+static void dumpLeaf(raw_ostream &OS, const syntax::PLeaf *L) {
+  assert(L);
+  const auto *Token = L->getToken();
+  assert(Token);
+  // Handle 'eof' separately, calling text() on it produces an empty string.
+  if (Token->Kind == tok::eof)
+    OS << "<eof>";
+  else
+    OS << Token->text();
+}
+
+/// Prints \p N and, recursively, its children, one node per line. Bit I of
+/// \p IndentMask is set when the ancestor at depth I has further siblings,
+/// in which case a '|' guide is drawn in that column.
+static void dumpNode(raw_ostream &OS, const syntax::Node *N,
+                     llvm::BitVector IndentMask) {
+  auto DumpExtraInfo = [&OS](const syntax::Node *N) {
+    if (N->getRole() != syntax::NodeRole::Unknown)
+      OS << " " << N->getRole();
+    if (!N->isOriginal())
+      OS << " synthesized";
+    if (!N->canModify())
+      OS << " unmodifiable";
+  };
+
+  assert(N);
+  if (const auto *L = dyn_cast<syntax::PLeaf>(N)) {
+    OS << "'";
+    dumpLeaf(OS, L);
+    OS << "'";
+    DumpExtraInfo(N);
+    OS << "\n";
+    return;
+  }
+
+  const auto *T = cast<syntax::Tree>(N);
+  OS << T->getKind();
+  DumpExtraInfo(N);
+  OS << "\n";
+
+  for (const syntax::Node &It : T->getChildren()) {
+    // Every column is two characters wide so that nested levels line up.
+    for (unsigned Idx = 0; Idx < IndentMask.size(); ++Idx) {
+      if (IndentMask[Idx])
+        OS << "| ";
+      else
+        OS << "  ";
+    }
+    if (!It.getNextSibling()) {
+      OS << "`-";
+      IndentMask.push_back(false);
+    } else {
+      OS << "|-";
+      IndentMask.push_back(true);
+    }
+    dumpNode(OS, &It, IndentMask);
+    IndentMask.pop_back();
+  }
+}
+
+std::string dumpSyntaxTree(const syntax::Node *T) {
+  std::string Str;
+  llvm::raw_string_ostream OS(Str);
+  dumpNode(OS, T, /*IndentMask=*/{});
+  OS.flush();
+  return Str;
+}
+/// Builds the syntax tree for \p Node via a TreeBuilder that uses \p Arena.
+syntax::Node *buildSyntaxTree(llvm::BumpPtrAllocator &Arena,
+                              const ForestNode &Node,
+                              const TokenStream &Tokens) {
+  return syntax::TreeBuilder(Arena).build(Node, Tokens);
+}
+} // namespace pseudo
+} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/syntax/CMakeLists.txt b/clang-tools-extra/pseudo/lib/syntax/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/pseudo/lib/syntax/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_library(clangPseudoSyntax
+  Build.cpp
+
+  LINK_LIBS
+  clangSyntaxTree
+  clangPseudo
+  clangPseudoGrammar
+  )
+
diff --git a/clang-tools-extra/pseudo/tool/CMakeLists.txt b/clang-tools-extra/pseudo/tool/CMakeLists.txt
--- a/clang-tools-extra/pseudo/tool/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/tool/CMakeLists.txt
@@ -13,6 +13,7 @@
   PRIVATE
   clangPseudo
   clangPseudoGrammar
+  clangPseudoSyntax
   clangPseudoCLI
   )
 
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -14,6 +14,7 @@
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang-pseudo/grammar/LRGraph.h"
 #include "clang-pseudo/grammar/LRTable.h"
+#include "clang-pseudo/syntax/SyntaxTree.h"
 #include "clang/Basic/LangOptions.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
@@ -117,8 +118,12 @@
       auto &Root = glrParse(*ParseableStream,
                             clang::pseudo::ParseParams{Lang, Arena, GSS},
                             *StartSymID);
-      if (PrintForest)
+      if (PrintForest) {
         llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/true);
+        llvm::BumpPtrAllocator A;
+        llvm::outs() << clang::pseudo::dumpSyntaxTree(
+            clang::pseudo::buildSyntaxTree(A, Root, *ParseableStream));
+      }
 
       if (PrintStatistics) {
         llvm::outs() << "Forest bytes: " << Arena.bytes()
diff --git a/clang/include/clang/Tooling/Syntax/Nodes.h b/clang/include/clang/Tooling/Syntax/Nodes.h
--- 
a/clang/include/clang/Tooling/Syntax/Nodes.h
+++ b/clang/include/clang/Tooling/Syntax/Nodes.h
@@ -31,6 +31,7 @@
 /// blocks of enumerator constants must correspond to the inheritance hierarchy
 /// of syntax::Node.
 enum class NodeKind : uint16_t {
+  PLeaf,
 #define CONCRETE_NODE(Kind, Base) Kind,
 #include "clang/Tooling/Syntax/Nodes.inc"
 };
diff --git a/clang/lib/Tooling/Syntax/Nodes.cpp b/clang/lib/Tooling/Syntax/Nodes.cpp
--- a/clang/lib/Tooling/Syntax/Nodes.cpp
+++ b/clang/lib/Tooling/Syntax/Nodes.cpp
@@ -12,6 +12,9 @@
 
 raw_ostream &syntax::operator<<(raw_ostream &OS, NodeKind K) {
   switch (K) {
+  case NodeKind::PLeaf:
+    // Return directly, as the CONCRETE_NODE cases below do; don't leave the switch.
+    return OS << "PLeaf";
 #define CONCRETE_NODE(Kind, Parent)                                            \
   case NodeKind::Kind:                                                         \
     return OS << #Kind;
diff --git a/clang/lib/Tooling/Syntax/Synthesis.cpp b/clang/lib/Tooling/Syntax/Synthesis.cpp
--- a/clang/lib/Tooling/Syntax/Synthesis.cpp
+++ b/clang/lib/Tooling/Syntax/Synthesis.cpp
@@ -62,6 +62,7 @@
 // Allocates the concrete syntax `Tree` according to its `NodeKind`.
 syntax::Tree *allocateTree(syntax::Arena &A, syntax::NodeKind Kind) {
   switch (Kind) {
+  case syntax::NodeKind::PLeaf:
   case syntax::NodeKind::OLeaf:
   case syntax::NodeKind::Leaf:
     assert(false);