diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h @@ -0,0 +1,178 @@ +//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A parse forest represents a set of possible parse trees efficiently, it is +// produced by the GLR parser. +// +// Despite the name, its data structure is a tree-like DAG with a single root. +// Multiple ways to parse the same tokens are presented as an ambiguous node +// with all possible interpretations as children. +// Common sub-parses are shared: if two interpretations both parse "1 + 1" as +// "expr := expr + expr", they will share a Sequence node representing the expr. +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Grammar.h" +#include "clang-pseudo/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Allocator.h" +#include + +namespace clang { +namespace pseudo { + +// A node represents ways to parse a sequence of tokens, it interprets a fixed +// range of tokens as a fixed grammar symbol. +// +// There are different kinds of nodes, some nodes have "children" (stored in a +// trailing array) and have pointers to them. "Children" has different semantics +// depending on the node kinds. For an Ambiguous node, it means all +// possible interpretations; for a Sequence node, it means each symbol on the +// right hand side of the production rule. +// +// Since this is a node in a DAG, a node may have multiple parents. And a node +// doesn't have parent pointers. +class alignas(class ForestNode *) ForestNode { +public: + enum Kind : uint8_t { + // A Terminal node is a single terminal symbol bound to a token. + Terminal, + // A Sequence node is a nonterminal symbol parsed from a grammar rule, + // elements() are the parses of each symbol on the RHS of the rule. + // If the rule is A := X Y Z, the node is for nonterminal A, and elements() + // are [X, Y, Z]. + Sequence, + // An Ambiguous node exposes multiple ways to interpret the code as the + // same symbol, alternatives() are all possible parses. + Ambiguous, + // An Opaque node is a placeholder. It asserts that tokens match a symbol, + // without saying how. + // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid + // code). + Opaque, + }; + Kind kind() const { return K; } + + SymbolID symbol() const { return Symbol; } + + // The start of the token range, it is a poistion within a token stream. + Token::Index startTokenIndex() const { return StartIndex; } + + // Returns the corresponding grammar rule. + // REQUIRES: this is a Sequence node. + RuleID rule() const { + assert(kind() == Sequence); + return Data & ((1 << RuleBits) - 1); + } + // Returns the parses of each element on the RHS of the rule. + // REQUIRES: this is a Sequence node; + llvm::ArrayRef elements() const { + assert(kind() == Sequence); + return children(Data >> RuleBits); + }; + + // Returns all possible interpretations of the code. + // REQUIRES: this is an Ambiguous node. + llvm::ArrayRef alternatives() const { + assert(kind() == Ambiguous); + return children(Data); + } + + std::string dump(const Grammar &) const; + std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const; + +private: + friend class ForestArena; + + ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data) + : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {} + + ForestNode(const ForestNode &) = delete; + ForestNode &operator=(const ForestNode &) = delete; + ForestNode(ForestNode &&) = delete; + ForestNode &operator=(ForestNode &&) = delete; + + static uint16_t sequenceData(RuleID Rule, + llvm::ArrayRef Elements) { + assert(Rule < (1 << RuleBits)); + assert(Elements.size() < (1 << (16 - RuleBits))); + return Rule | Elements.size() << RuleBits; + } + static uint16_t + ambiguousData(llvm::ArrayRef Alternatives) { + return Alternatives.size(); + } + + // Retrieves the trailing array. + llvm::ArrayRef children(uint16_t Num) const { + return llvm::makeArrayRef(reinterpret_cast(this + 1), + Num); + } + + Token::Index StartIndex; + Kind K : 4; + SymbolID Symbol : SymbolBits; + // Sequence - child count : 4 | RuleID : RuleBits (12) + // Ambiguous - child count : 16 + // Terminal, Opaque - unused + uint16_t Data; + // An array of ForestNode* following the object. +}; +// ForestNode may not be destroyed (for BumpPtrAllocator). +static_assert(std::is_trivially_destructible(), ""); + +// A memory arena for the parse forest. +class ForestArena { +public: + llvm::ArrayRef createTerminals(const TokenStream &Code); + ForestNode &createSequence(SymbolID SID, RuleID RID, + llvm::ArrayRef Elements) { + assert(!Elements.empty()); + return create(ForestNode::Sequence, SID, + Elements.front()->startTokenIndex(), + ForestNode::sequenceData(RID, Elements), Elements); + } + ForestNode &createAmbiguous(SymbolID SID, + llvm::ArrayRef Alternatives) { + assert(!Alternatives.empty()); + assert(llvm::all_of(Alternatives, + [SID](const ForestNode *Alternative) { + return SID == Alternative->symbol(); + }) && + "Ambiguous alternatives must represent the same symbol!"); + return create(ForestNode::Ambiguous, SID, + Alternatives.front()->startTokenIndex(), + ForestNode::ambiguousData(Alternatives), Alternatives); + } + ForestNode &createOpaque(SymbolID SID, Token::Index Start) { + return create(ForestNode::Opaque, SID, Start, 0, {}); + } + + size_t nodeCount() const { return NodeCount; } + size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); } + +private: + ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start, + uint16_t Data, + llvm::ArrayRef Elements) { + ++NodeCount; + ForestNode *New = new (Arena.Allocate( + sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *), + alignof(ForestNode))) ForestNode(K, SID, Start, Data); + if (!Elements.empty()) + llvm::copy(Elements, reinterpret_cast(New + 1)); + return *New; + } + + llvm::BumpPtrAllocator Arena; + uint32_t NodeCount = 0; +}; + +} // namespace pseudo +} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt --- a/clang-tools-extra/pseudo/lib/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt @@ -2,6 +2,7 @@ add_clang_library(clangPseudo DirectiveMap.cpp + Forest.cpp Grammar.cpp GrammarBNF.cpp Lex.cpp diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/lib/Forest.cpp @@ -0,0 +1,126 @@ +//===--- Forest.cpp - Parse forest ------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Forest.h" +#include "clang-pseudo/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/FormatVariadic.h" + +namespace clang { +namespace pseudo { + +std::string ForestNode::dump(const Grammar &G) const { + switch (kind()) { + case Ambiguous: + return llvm::formatv("{0} := ", G.symbolName(symbol())); + case Terminal: + return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()), + startTokenIndex()); + case Sequence: + return G.dumpRule(rule()); + case Opaque: + return llvm::formatv("{0} := ", G.symbolName(symbol())); + } + llvm_unreachable("Unhandled node kind!"); +} + +std::string ForestNode::dumpRecursive(const Grammar &G, + bool Abbreviated) const { + // Count visits of nodes so we can mark those seen multiple times. + llvm::DenseMap VisitCounts; + std::function CountVisits = + [&](const ForestNode *P) { + if (VisitCounts[P]++ > 0) + return; // Don't count children as multiply visited. + if (P->kind() == Ambiguous) + llvm::for_each(P->alternatives(), CountVisits); + else if (P->kind() == Sequence) + llvm::for_each(P->elements(), CountVisits); + }; + CountVisits(this); + + // We print a "#" for nonterminal forest nodes that are being dumped + // multiple times. + llvm::DenseMap ReferenceIds; + std::string Result; + constexpr Token::Index KEnd = std::numeric_limits::max(); + std::function)> + Dump = [&](const ForestNode *P, unsigned Level, Token::Index End, + llvm::Optional ElidedParent) { + llvm::ArrayRef Children; + auto EndOfElement = [&](size_t ChildIndex) { + return ChildIndex + 1 == Children.size() + ? End + : Children[ChildIndex + 1]->startTokenIndex(); + }; + if (P->kind() == Ambiguous) { + Children = P->alternatives(); + } else if (P->kind() == Sequence) { + Children = P->elements(); + if (Abbreviated) { + if (P->startTokenIndex() == End) + return; + for (size_t I = 0; I < Children.size(); ++I) + if (Children[I]->startTokenIndex() == P->startTokenIndex() && + EndOfElement(I) == End) { + return Dump( + Children[I], Level, End, + /*ElidedParent=*/ElidedParent.getValueOr(P->symbol())); + } + } + } + + // FIXME: pretty ascii trees + if (End == KEnd) + Result += llvm::formatv("[{0,3}, end) ", P->startTokenIndex()); + else + Result += llvm::formatv("[{0,3}, {1,3}) ", P->startTokenIndex(), End); + Result.append(2 * Level, ' '); + if (ElidedParent.hasValue()) { + Result += G.symbolName(*ElidedParent); + Result += "~"; + } + Result.append(P->dump(G)); + + if (VisitCounts.find(P)->getSecond() > 1 && + P->kind() != ForestNode::Terminal) { + // The first time, print as #1. Later, =#1. + auto It = ReferenceIds.try_emplace(P, ReferenceIds.size() + 1); + Result += + llvm::formatv(" {0}#{1}", It.second ? "" : "=", It.first->second); + } + Result.push_back('\n'); + + ++Level; + for (size_t I = 0; I < Children.size(); ++I) + Dump(Children[I], Level, + P->kind() == Sequence ? EndOfElement(I) : End, llvm::None); + }; + Dump(this, 0, KEnd, llvm::None); + return Result; +} + +llvm::ArrayRef +ForestArena::createTerminals(const TokenStream &Code) { + ForestNode *Terminals = Arena.Allocate(Code.tokens().size()); + size_t Index = 0; + for (const auto &T : Code.tokens()) { + new (&Terminals[Index]) + ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind), + /*Start=*/Index, /*TerminalData*/ 0); + ++Index; + } + NodeCount = Index; + return llvm::makeArrayRef(Terminals, Index); +} + +} // namespace pseudo +} // namespace clang diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt --- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt +++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt @@ -5,6 +5,7 @@ add_custom_target(ClangPseudoUnitTests) add_unittest(ClangPseudoUnitTests ClangPseudoTests DirectiveMapTest.cpp + ForestTest.cpp GrammarTest.cpp LRTableTest.cpp TokenTest.cpp diff --git a/clang-tools-extra/pseudo/unittests/ForestTest.cpp b/clang-tools-extra/pseudo/unittests/ForestTest.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/unittests/ForestTest.cpp @@ -0,0 +1,130 @@ +//===--- ForestTest.cpp - Test Forest dump ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Forest.h" +#include "clang-pseudo/Token.h" +#include "clang/Basic/LangOptions.h" +#include "llvm/ADT/StringRef.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +namespace clang { +namespace pseudo { +namespace { + +// FIXME: extract to a TestGrammar class to allow code sharing among tests. +class ForestTest : public ::testing::Test { +public: + void build(llvm::StringRef BNF) { + Diags.clear(); + G = Grammar::parseBNF(BNF, Diags); + } + + SymbolID symbol(llvm::StringRef Name) const { + for (unsigned I = 0; I < NumTerminals; ++I) + if (G->table().Terminals[I] == Name) + return tokenSymbol(static_cast(I)); + for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID) + if (G->table().Nonterminals[ID].Name == Name) + return ID; + ADD_FAILURE() << "No such symbol found: " << Name; + return 0; + } + + RuleID ruleFor(llvm::StringRef NonterminalName) const { + auto RuleRange = G->table().Nonterminals[symbol(NonterminalName)].RuleRange; + if (RuleRange.End - RuleRange.Start == 1) + return G->table().Nonterminals[symbol(NonterminalName)].RuleRange.Start; + ADD_FAILURE() << "Expected a single rule for " << NonterminalName + << ", but it has " << RuleRange.End - RuleRange.Start + << " rule!\n"; + return 0; + } + +protected: + std::unique_ptr G; + std::vector Diags; +}; + +TEST_F(ForestTest, DumpBasic) { + build(R"cpp( + _ := add-expression + add-expression := id-expression + id-expression + id-expression := IDENTIFIER + )cpp"); + ASSERT_TRUE(Diags.empty()); + ForestArena Arena; + const auto &TS = + cook(lex("a + b", clang::LangOptions()), clang::LangOptions()); + + auto T = Arena.createTerminals(TS); + ASSERT_EQ(T.size(), 3u); + const auto *Left = &Arena.createSequence( + symbol("id-expression"), ruleFor("id-expression"), {&T.front()}); + const auto *Right = &Arena.createSequence(symbol("id-expression"), + ruleFor("id-expression"), {&T[2]}); + + const auto *Add = + &Arena.createSequence(symbol("add-expression"), 1, {Left, &T[1], Right}); + EXPECT_EQ(Add->dumpRecursive(*G, true), + "[ 0, end) add-expression := id-expression + id-expression\n" + "[ 0, 1) id-expression~IDENTIFIER := tok[0]\n" + "[ 1, 2) + := tok[1]\n" + "[ 2, end) id-expression~IDENTIFIER := tok[2]\n"); + EXPECT_EQ(Add->dumpRecursive(*G, false), + "[ 0, end) add-expression := id-expression + id-expression\n" + "[ 0, 1) id-expression := IDENTIFIER\n" + "[ 0, 1) IDENTIFIER := tok[0]\n" + "[ 1, 2) + := tok[1]\n" + "[ 2, end) id-expression := IDENTIFIER\n" + "[ 2, end) IDENTIFIER := tok[2]\n"); +} + +TEST_F(ForestTest, DumpAmbiguousAndRefs) { + build(R"cpp( + _ := type # rule 0 + type := class-type # rule 1 + type := enum-type # rule 2 + class-type := shared-type + enum-type := shared-type + shared-type := IDENTIFIER)cpp"); + ASSERT_TRUE(Diags.empty()); + ForestArena Arena; + const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions()); + + auto Terminals = Arena.createTerminals(TS); + ASSERT_EQ(Terminals.size(), 1u); + + const auto *SharedType = &Arena.createSequence( + symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()}); + const auto *ClassType = &Arena.createSequence( + symbol("class-type"), ruleFor("class-type"), {SharedType}); + const auto *Enumtype = &Arena.createSequence( + symbol("enum-type"), ruleFor("enum-type"), {SharedType}); + const auto *Alternative1 = + &Arena.createSequence(symbol("type"), /*RuleID=*/1, {ClassType}); + const auto *Alternative2 = + &Arena.createSequence(symbol("type"), /*RuleID=*/2, {Enumtype}); + const auto *Type = + &Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2}); + EXPECT_EQ(Type->dumpRecursive(*G), + "[ 0, end) type := \n" + "[ 0, end) class-type := shared-type\n" + "[ 0, end) class-type := shared-type\n" + "[ 0, end) shared-type := IDENTIFIER #1\n" + "[ 0, end) IDENTIFIER := tok[0]\n" + "[ 0, end) enum-type := shared-type\n" + "[ 0, end) enum-type := shared-type\n" + "[ 0, end) shared-type := IDENTIFIER =#1\n" + "[ 0, end) IDENTIFIER := tok[0]\n"); +} + +} // namespace +} // namespace pseudo +} // namespace clang