diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Forest.h b/clang/include/clang/Tooling/Syntax/Pseudo/Forest.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Forest.h @@ -0,0 +1,135 @@ +//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Parse forest is the output of the GLR parser. +// +// For an ambiguous grammar, there might be multiple parse trees generated from +// for the given input. Forest is a DAG which represent numberous possible in a +// space-efficient manner. Common subtrees are shared -- if two or more trees +// treat the token range [1, 3) as an Expression, then there is a single shared +// Expression node representing the subparse in the forest. +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Allocator.h" +#include <cstdint> + +namespace clang { +namespace syntax { +namespace pseudo { + +// A node in a forest. +class ForestNode { +public: + enum Kind : uint8_t { + // A Terminal node is a single terminal symbol bound to a token. + Terminal, + // A Sequence node is a nonterminal symbol parsed with a grammar rule. + // elements() are the parses of each symbol on the RHS of the rule. + Sequence, + // An Ambiguous node exposes multiple ways to match the code to the symbol. + // alternatives() are the possible parses, we should choose one. + Ambiguous, + }; + Kind kind() const { return K; } + + SymbolID symbol() const { return Symbol; } + + // The parses for each element in the RHS of the rule. + // REQUIRES: this is a Sequence node. + RuleID rule() const { + assert(kind() == Sequence); + return Data_ & ((1 << RuleBits) - 1); + } + // REQUIRES: this is a Sequence node; + llvm::ArrayRef<const ForestNode *> elements() const { + assert(kind() == Sequence); + return children(Data_ >> RuleBits); + }; + uint32_t startLoc() const { return StartLoc; } + + // The possible interpretations of the code. + // REQUIRES: this is an Ambiguous node. + llvm::ArrayRef<const ForestNode *> alternatives() const { + assert(kind() == Ambiguous); + return children(Data_); + } + + std::string Dump(const Grammar &) const; + std::string DumpRecursive(const Grammar &, bool abbreviated = false) const; + +private: + friend class ForestArena; + ForestNode(Kind K, SymbolID Symbol, Token::Index StartLoc, uint16_t Data) + : StartLoc(StartLoc), K(K), Symbol(Symbol), Data_(Data) {} + + llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const { + return llvm::makeArrayRef( + reinterpret_cast<const ForestNode *const *>(this + 1), Num); + } + static uint16_t SequenceData(RuleID rule, + llvm::ArrayRef<const ForestNode *> elements) { + assert(rule < (1 << RuleBits)); + assert(elements.size() < (1 << (16 - RuleBits))); + return rule | elements.size() << RuleBits; + } + Token::Index StartLoc; + Kind K : 4; + SymbolID Symbol : SymbolBits; + // Sequences - child count : 4 | RuleID : 12 + // Ambiguous - child count : 16 + // Terminal - unused + uint16_t Data_; + // A trailing array of Node* . +}; + +// Node may not be destroyed (for BumpPtrAllocator). +static_assert(std::is_trivially_destructible<ForestNode>(), ""); + +// A memory arena for the parse forest. +class ForestArena { +public: + size_t nodeNum() const { return NodeNum; } + size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); } + + ForestNode &createSequence(SymbolID SID, RuleID RID, Token::Index Start, + llvm::ArrayRef<const ForestNode *> Elements) { + return create(ForestNode::Sequence, SID, Start, + ForestNode::SequenceData(RID, Elements), Elements); + } + + void init(const TokenStream &Code); + const ForestNode &terminal(Token::Index Index) const { + assert(Terminals && "Terminals are not intialized!"); + return Terminals[Index]; + } + +private: + ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start, + uint16_t Data, + llvm::ArrayRef<const ForestNode *> Elements) { + ++NodeNum; + ForestNode *New = new (Arena.Allocate( + sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *), + alignof(ForestNode))) ForestNode(K, SID, Start, Data); + if (!Elements.empty()) + llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1)); + return *New; + } + + llvm::BumpPtrAllocator Arena; + ForestNode *Terminals = nullptr; + uint32_t NodeNum = 0; +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h b/clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h @@ -0,0 +1,148 @@ +//===--- GLRParser.h - Implement a standard GLR parser -----------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This implements a standard Generalized LR (GLR) parsing algorithm. +// +// The GLR parser behaves as a normal LR parser until it encounters a conflict. +// To handle a conflict (where there are multiple actions could perform), the +// parser will simulate nondeterminism by doing a breadth-first search +// over all the possibilities. +// +// Basic mechanisims of the GLR parser: +// - A number of processes are operated in parallel. +// - Each process has its own parsing stack and behaves as a standard +// determinism LR parser. +// - When a process encounters a conflict, it will be fork (one for each +// avaiable action). +// - When a process encounters an error, it is abandoned. +// - All process are synchronized by the lookahead token: they perfrom shift +// action at the same time, which means some processes need wait until other +// processes have performed all reduce actions. +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Forest.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include "llvm/Support/Allocator.h" +#include <vector> + +namespace clang { +namespace syntax { +namespace pseudo { + +// An implementation of a directed acyclic graph (DAG), used as a +// graph-structured stack (GSS) in the GLR parser. +// +// GSS is an efficient data structure to represent multiple active stacks, it +// employs a stack-combination optimization to avoid potentially exponential +// growth of the stack: +// - combing equal stack prefixes -- A new stack doesn't need to have a full +// copy of its parent’s stack. They share a common prefix. +// - combing euqal stack suffices -- as there are a finite number of DFA's +// state the parser can be in. A set of heads can be in the same state +// though they may have different parses, these heads can be merged, +// resulting a single head. +// +// E.g. we have two active stacks: +// 0 -> 1 -> 2 +// | ^ head1, representing a stack [2, 1, 0] +// ` -> 3 +// ^ head2, representing a stack [3, 1, 0] +struct Graph { + // Represents a node in the graph. + struct Node { + // The parsing state presented by the graph node. + LRTable::StateID State : LRTable::StateBits; + static constexpr unsigned PredecessorBits = 3; + // Number of the predecessors of the node. + // u is the predecessor of v, if u -> v. + unsigned PredecessorCount : PredecessorBits; + // The forest node for a termina/nonterminal symbol. + // The symbol correponds to the label of edges which leads to current node + // from the predecessor nodes. + const ForestNode *Parsed = nullptr; + + llvm::ArrayRef<const Node *> predecessors() const { + return llvm::makeArrayRef(reinterpret_cast<const Node *const *>(this + 1), + PredecessorCount); + }; + + bool operator==(const Node &L) const { + return State == L.State && predecessors() == L.predecessors(); + } + // A trailing array of Node*. + }; + + // Creates a new node in the graph. + const Node *addNode(LRTable::StateID State, + const ::clang::syntax::pseudo::ForestNode *Symbol, + llvm::ArrayRef<const Node *> Predecessors) { + assert(Predecessors.size() < (1 << Node::PredecessorBits) && + "Too many predecessors to fit in PredecessorBits!"); + ++NodeCount; + Node *Result = new (Arena.Allocate( + sizeof(Node) + Predecessors.size() * sizeof(Node *), alignof(Node))) + Node({State, static_cast<unsigned>(Predecessors.size())}); + Result->Parsed = Symbol; + if (!Predecessors.empty()) + llvm::copy(Predecessors, reinterpret_cast<const Node **>(Result + 1)); + return Result; + } + + size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); } + size_t nodeCount() const { return NodeCount; } + +private: + llvm::BumpPtrAllocator Arena; + unsigned NodeCount = 0; +}; + +class GLRParser { +public: + GLRParser(const LRTable &T, const Grammar &G, ForestArena &Arena) + : ParsingTable(T), G(G), ParsedForest(Arena) {} + + const ForestNode *parse(const TokenStream &Code); + + const Graph &getGSS() const { return GSS; } + +private: + // Return a list of active stack heads. + std::vector<const Graph::Node *> performShift(Token::Index Lookahead); + void performReduction(const Token &Lookahead); + + void addActions(const Graph::Node *Head, const Token &Lookahead); + + const LRTable &ParsingTable; + const Grammar &G; + + // An active stack head can have multiple avaialble actions (reduce/reduce + // actions, reduce/shift actions) + // Frontier is to track all avaiable actions from all active stack heads. + struct Frontier { + // A corresponding stack head. + const Graph::Node *Head = nullptr; + // An action associated with the Head. + const LRTable::Action *PerformAction = nullptr; + }; + // A list of active shift actions. + std::vector<Frontier> ShiftList; + // A list of active reduce actions. + std::vector<Frontier> ReduceList; + // A list of active accept action. + std::vector<Frontier> AcceptLists; + + Graph GSS; + ForestArena &ParsedForest; +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -8,6 +8,8 @@ LRGraph.cpp LRTable.cpp LRTableBuild.cpp + Forest.cpp + GLRParser.cpp Token.cpp LINK_LIBS diff --git a/clang/lib/Tooling/Syntax/Pseudo/Forest.cpp b/clang/lib/Tooling/Syntax/Pseudo/Forest.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/Forest.cpp @@ -0,0 +1,125 @@ +//===--- Forest.cpp - Parse forest ------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Forest.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" + +namespace clang { +namespace syntax { +namespace pseudo { + +std::string ForestNode::Dump(const Grammar &G) const { + switch (kind()) { + case Ambiguous: + return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol())); + case Terminal: + return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()), startLoc()); + case Sequence: + return G.dumpRule(rule()); + } + llvm_unreachable("unhandle node kind!"); +} + +std::string ForestNode::DumpRecursive(const Grammar &G, + bool Abbreviated) const { + // Count visits of nodes so we can mark those seen multiple times. + llvm::DenseMap<const ForestNode *, unsigned> Visits; + std::function<void(const ForestNode *)> CountVisits = + [&](const ForestNode *P) { + if (Visits[P]++ > 0) + return; // Don't count children as multiply visited. + if (P->kind() == Ambiguous) + llvm::for_each(P->alternatives(), CountVisits); + else if (P->kind() == Sequence) + llvm::for_each(P->elements(), CountVisits); + }; + CountVisits(this); + + llvm::DenseMap<const ForestNode *, unsigned> Ids; + std::string Result; + constexpr unsigned kEnd = std::numeric_limits<unsigned>::max(); + std::function<void(const ForestNode *, unsigned, unsigned, + llvm::Optional<SymbolID>)> + Dump = [&](const ForestNode *P, unsigned Level, unsigned End, + llvm::Optional<SymbolID> ElidedParent) { + // absl::Span<const Node* const> children; + llvm::ArrayRef<const ForestNode *> children; + auto end_of_element = [&](unsigned child_index) { + return child_index + 1 == children.size() + ? End + : children[child_index + 1]->startLoc(); + }; + if (P->kind() == Ambiguous) { + children = P->alternatives(); + } else if (P->kind() == Sequence) { + children = P->elements(); + if (Abbreviated) { + if (P->startLoc() == End) + return; + for (unsigned i = 0; i < children.size(); ++i) + if (children[i]->startLoc() == P->startLoc() && + end_of_element(i) == End) { + return Dump( + children[i], Level, End, + /*elided_parent=*/ElidedParent.getValueOr(P->symbol())); + } + } + } + + // FIXME: pretty ascii trees + if (End == kEnd) + Result += llvm::formatv("[{0},end) ", P->startLoc()); + else + Result += llvm::formatv("[{0},{1}) ", P->startLoc(), End); + Result.append(2 * Level, ' '); + if (ElidedParent.hasValue()) { + Result += G.symbolName(*ElidedParent); + Result += "~"; + } + Result.append(P->Dump(G)); + if (Visits.find(P)->getSecond() > 1 && + P->kind() != ForestNode::Terminal) { + // The first time, print as #1. Later, =#1. + auto id = Ids.try_emplace(P, Ids.size() + 1); + Result += + llvm::formatv(" {0}#{1}", id.second ? "" : "=", id.first->second); + } + Result.push_back('\n'); + + ++Level; + for (unsigned i = 0; i < children.size(); ++i) + Dump(children[i], Level, + P->kind() == Sequence ? end_of_element(i) : End, llvm::None); + }; + Dump(this, 0, kEnd, llvm::None); + return Result; +} + +void ForestArena::init(const TokenStream &Tokens) { + Arena.Reset(); // clean the arena. + NodeNum = 0; + // List of leaves is prepopulated, it's convenient and we need them anyway. + Terminals = Arena.Allocate<ForestNode>(Tokens.tokens().size()); + size_t Index = 0; + for (const auto &T : Tokens.tokens()) { + new (&Terminals[Index]) + ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind), + /*begin=*/Index, /*TerminalData*/ 0); + ++Index; + } + NodeNum = Tokens.tokens().size(); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp b/clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp @@ -0,0 +1,330 @@ +//===--- GLRParser.cpp -----------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/GLRParser.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include <memory> +#include <tuple> + +#define DEBUG_TYPE "GLRParser.cpp" + +namespace clang { +namespace syntax { +namespace pseudo { + +using StateID = LRTable::StateID; + +const ForestNode *GLRParser::parse(const TokenStream &Code) { + ParsedForest.init(Code); + const Token *Lookahead = &Code.tokens().front(); + addActions(GSS.addNode(/*StartState*/ 0, nullptr, {}), *Lookahead); + + while (!ShiftList.empty() || !ReduceList.empty()) { + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + "Lookahead token {0} (id: {1} text: '{2}')\n", + G.symbolName(tokenSymbol(Lookahead->Kind)), + tokenSymbol(Lookahead->Kind), Lookahead->text())); + + performReduction(*Lookahead); + auto NewHeads = performShift(Code.index(*Lookahead)); + + if (Lookahead->Kind != tok::eof) + ++Lookahead; + for (const auto &AS : NewHeads) + addActions(AS, *Lookahead); + } + + if (!AcceptLists.empty()) { + // FIXME: supporting multiple accepted symbols. It should be fine now, as we + // only have one production for the start symbol `_`. This would become a + // problem when we support parsing any code snippet rather than the + // translation unit. + assert(AcceptLists.size() == 1); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Accept: {0} accepted results:\n", + AcceptLists.size())); + for (const auto &A : AcceptLists) + LLVM_DEBUG(llvm::dbgs() + << " - " << G.symbolName(A.Head->Parsed->symbol()) << "\n"); + return AcceptLists.front().Head->Parsed; + } + return nullptr; +} + +static std::vector<std::string> +ToStateString2(llvm::ArrayRef<const Graph::Node *> A) { + std::vector<std::string> States; + for (const auto &N : A) + States.push_back(llvm::formatv("state {0}", N->State)); + return States; +} + +std::vector<const Graph::Node *> +GLRParser::performShift(Token::Index Lookahead) { + assert(ReduceList.empty() && + "Reduce actions must be performed before shift actions"); + if (ShiftList.empty()) + return {}; + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + " Perform Shift ({0} active heads):\n", ShiftList.size())); + + const pseudo::ForestNode *Leaf = &ParsedForest.terminal(Lookahead); + // New heads after performing all the shifts. + std::vector<const Graph::Node *> NewHeads; + + // Merge the stack -- if multiple stack heads are going to shift a same + // state, we perform the shift only once by combining these heads. + // + // E.g. we have two heads (2, 3) in the GSS, and state 4 is to be shifted from + // state 2 and state 3: + // 0 -> 1 -> 2 + // ` -> 3 + // After the shift action, the GSS looks like below, state 4 becomes the new + // head: + // 0 -> 1 -> 2 -> 4 + // ` -> 3 ---^ + // + // Shifts are partitioned by the shift state, so each partition (per loop + // iteration) corresponds to a "perform" shift. + llvm::sort(ShiftList, [](const Frontier &L, const Frontier &R) { + assert(L.PerformAction->kind() == LRTable::Action::Shift && + R.PerformAction->kind() == LRTable::Action::Shift); + return std::forward_as_tuple(L.PerformAction->getShiftState(), L.Head) < + std::forward_as_tuple(R.PerformAction->getShiftState(), R.Head); + }); + auto Partition = llvm::makeArrayRef(ShiftList); + while (!Partition.empty()) { + StateID NextState = Partition.front().PerformAction->getShiftState(); + auto Batch = Partition.take_while([&NextState](const Frontier &A) { + return A.PerformAction->getShiftState() == NextState; + }); + assert(!Batch.empty()); + // Predecessors of the new head in GSS. + std::vector<const Graph::Node *> Predecessors; + llvm::for_each(Batch, [&Predecessors](const Frontier &F) { + assert(llvm::find(Predecessors, F.Head) == Predecessors.end() && + "Unexpected duplicated stack heads during shift!"); + Predecessors.push_back(F.Head); + }); + const auto *Head = GSS.addNode(NextState, Leaf, Predecessors); + LLVM_DEBUG(llvm::dbgs() + << llvm::formatv(" - state {0} -> state {1}\n", + Partition.front().Head->State, NextState)); + + NewHeads.push_back(Head); + // Next iteration for next partition. + Partition = Partition.drop_front(Batch.size()); + } + ShiftList.clear(); + return NewHeads; +} + +// Enumerate all reduce paths on the stack by traversing from the given Head in +// the GSS. +static void enumerateReducePath(const Graph::Node *Head, unsigned PathLength, + std::vector<const Graph::Node *> &PathStorage, + std::function<void()> CB) { + assert(PathStorage.empty() && "PathStorage must be empty!"); + std::function<void(const Graph::Node *, unsigned)> enumPath = + [&CB, &PathStorage, &enumPath](const Graph::Node *Current, + unsigned Length) -> void { + assert(Length > 0); + PathStorage.push_back(Current); + if (--Length == 0) + return CB(); + + for (const auto *Next : Current->predecessors()) + enumPath(Next, Length); + PathStorage.pop_back(); + }; + enumPath(Head, PathLength); +} + +// Perform reduction recursively until we don't have reduce actions with +// heads. +void GLRParser::performReduction(const Token &Lookahead) { + if (!ReduceList.empty()) + LLVM_DEBUG(llvm::dbgs() << " Performing **Reduce**\n"); + + // Reduce can manipulate the GSS in following way: + // + // 1) Split -- + // 1.1 when a stack head has mutiple reduce actions, the head is + // made to split to accommodate the various possiblities. + // E.g. + // 0 -> 1 (ID) + // After performing reduce of production rules (class-name := ID, + // enum-name := ID), the GSS now has two new heads: + // 0 -> 2 (class-name) + // `-> 3 (enum-name) + // + // 1.2 when a stack head has a reduce action with multiple reduce + // paths, the head is to split. + // E.g. + // ... -> 1(...) -> 3 (INT) + // ^ + // ... -> 2(...) ---| + // + // After the reduce action (simple-type-specifier := INT), the GSS looks + // like: + // ... -> 1(...) -> 4 (simple-type-specifier) + // ... -> 2(...) -> 5 (simple-type-specifier) + // + // 2) Merge -- if multiple heads turn out to be identical after + // reduction (new heads have the same state, and point to the same + // predecessors), these heads are merged and treated as a single head. + // This is usually where ambiguity happens. + // + // E.g. + // 0 -> 2 (class-name) + // ` -> 3 (enum-name) + // After reduction of rules (type-name := class-name | enum-name), the GSS + // has the following form: + // 0 -> 4 (type-name) + // The type-name forest node in the new head 4 is ambiguous, which has two + // parses (type-name -> class-name -> id, type-name -> enum-name -> id). + + // Store all newly-created stack heads for tracking ambiguities. + std::vector<const Graph::Node *> CreatedHeads; + while (!ReduceList.empty()) { + auto RA = std::move(ReduceList.back()); + ReduceList.pop_back(); + + RuleID ReduceRuleID = RA.PerformAction->getReduceRule(); + const Rule &ReduceRule = G.lookupRule(ReduceRuleID); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + " !reduce rule {0}: {1} head: {2}\n", ReduceRuleID, + G.dumpRule(ReduceRuleID), RA.Head->State)); + + std::vector<const Graph::Node *> ReducePath; + enumerateReducePath(RA.Head, ReduceRule.Size, ReducePath, [&]() { + LLVM_DEBUG( + llvm::dbgs() << llvm::formatv( + " stack path: {0}, bases: {1}\n", + llvm::join(ToStateString2(ReducePath), " -> "), + llvm::join(ToStateString2(ReducePath.back()->predecessors()), + ", "))); + + // A reduce is a back-and-forth operation in the stack. + // For example, we reduce a rule "declaration := decl-specifier-seq ;" on + // the linear stack: + // + // 0 -> 1(decl-specifier-seq) -> 3(;) + // ^ Base ^ Head + // <--- ReducePath: [3,1] ----> + // + // 1. back -- pop |ReduceRuleLength| nodes (ReducePath) in the stack; + // 2. forth -- push a new node in the stack and mark it as a head; + // 0 -> 4(declaration) + // ^ Head + // + // It becomes tricky if a reduce path has multiple bases, we want to merge + // them if their next state is the same. Similiar to above performShift, + // we partition the bases by their next state, and process each partition + // per loop iteration. + struct BaseInfo { + // An intermediate head after the stack has poped |ReducePath| nodes. + const Graph::Node *Base = nullptr; + // The final state after reduce. + // It is getGoToState(Base->State, ReduceSymbol). + StateID NextState; + }; + std::vector<BaseInfo> Bases; + for (const Graph::Node *Base : ReducePath.back()->predecessors()) + Bases.push_back( + {Base, ParsingTable.getGoToState(Base->State, ReduceRule.Target)}); + llvm::sort(Bases, [](const BaseInfo &L, const BaseInfo &R) { + return std::forward_as_tuple(L.NextState, L.Base) < + std::forward_as_tuple(R.NextState, R.Base); + }); + + llvm::ArrayRef<BaseInfo> Partition = llvm::makeArrayRef(Bases); + while (!Partition.empty()) { + StateID NextState = Partition.front().NextState; + // Predecessors of the new stack head. + std::vector<const Graph::Node *> Predecessors; + auto Batch = Partition.take_while([&](const BaseInfo &TB) { + if (NextState != TB.NextState) + return false; + Predecessors.push_back(TB.Base); + return true; + }); + assert(!Batch.empty()); + Partition = Partition.drop_front(Batch.size()); + + // Check ambiguities. + auto It = llvm::find_if(CreatedHeads, [&](const Graph::Node *Head) { + return Head->Parsed->symbol() == ReduceRule.Target && + Head->predecessors() == llvm::makeArrayRef(Predecessors); + }); + if (It != CreatedHeads.end()) { + // This should be guaranteed by checking the equalivent of + // predecessors and reduce nonterminal symbol! + assert(NextState == (*It)->State); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + " found ambiguity, merged in state {0} (forest " + "'{1}')\n", + (*It)->State, G.symbolName((*It)->Parsed->symbol()))); + // FIXME: create ambiguous foreset node! + continue; + } + + // Create a corresponding sequence forest node for the reduce rule. + std::vector<const ForestNode *> ForestChildren; + for (const Graph::Node *PN : llvm::reverse(ReducePath)) + ForestChildren.push_back(PN->Parsed); + const ForestNode &ForestNode = ParsedForest.createSequence( + ReduceRule.Target, RA.PerformAction->getReduceRule(), + ForestChildren.front()->startLoc(), ForestChildren); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + " after reduce: {0} -> state {1} ({2})\n", + llvm::join(ToStateString2(Predecessors), ", "), + NextState, G.symbolName(ReduceRule.Target))); + + // Create a new stack head. + const Graph::Node *Head = + GSS.addNode(NextState, &ForestNode, Predecessors); + CreatedHeads.push_back(Head); + + // Actions that are enabled by this reduce. + addActions(Head, Lookahead); + } + }); + } +} + +void GLRParser::addActions(const Graph::Node *Head, const Token &Lookahead) { + for (const auto &Action : + ParsingTable.getActions(Head->State, tokenSymbol(Lookahead.Kind))) { + switch (Action.kind()) { + case LRTable::Action::Shift: + ShiftList.push_back({Head, &Action}); + break; + case LRTable::Action::Reduce: + ReduceList.push_back({Head, &Action}); + break; + case LRTable::Action::Accept: + AcceptLists.push_back({Head, &Action}); + break; + default: + llvm_unreachable("unexpected action kind!"); + } + } +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp --- a/clang/tools/clang-pseudo/ClangPseudo.cpp +++ b/clang/tools/clang-pseudo/ClangPseudo.cpp @@ -8,6 +8,7 @@ #include "clang/Basic/LangOptions.h" #include "clang/Tooling/Syntax/Pseudo/DirectiveMap.h" +#include "clang/Tooling/Syntax/Pseudo/GLRParser.h" #include "clang/Tooling/Syntax/Pseudo/Grammar.h" #include "clang/Tooling/Syntax/Pseudo/LRGraph.h" #include "clang/Tooling/Syntax/Pseudo/LRTable.h" @@ -29,6 +30,8 @@ desc("Print the LR graph for the grammar")); static opt<bool> PrintTable("print-table", desc("Print the LR table for the grammar")); +static opt<std::string> ParseFile("parse", desc("Parse a C++ source file"), + init("")); static opt<std::string> Source("source", desc("Source file")); static opt<bool> PrintSource("print-source", desc("Print token stream")); static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info")); @@ -69,6 +72,26 @@ if (PrintTable) llvm::outs() << clang::syntax::pseudo::LRTable::buildSLR(*G).dumpForTests( *G); + if (ParseFile.getNumOccurrences()) { + std::string Code = readOrDie(ParseFile); + const auto &T = clang::syntax::pseudo::LRTable::buildSLR(*G); + clang::LangOptions Opts; + Opts.CPlusPlus = 1; + + auto RawTokens = clang::syntax::pseudo::lex(Code, Opts); + auto Tokens = clang::syntax::pseudo::stripComments(cook(RawTokens, Opts)); + clang::syntax::pseudo::ForestArena Arena; + clang::syntax::pseudo::GLRParser Parser(T, *G, Arena); + const auto *Root = Parser.parse(Tokens); + if (Root) { + llvm::outs() << "parsed successfully!\n"; + llvm::outs() << "Forest bytes: " << Arena.bytes() + << " nodes: " << Arena.nodeNum() << "\n"; + llvm::outs() << "GSS bytes: " << Parser.getGSS().bytes() + << " nodes: " << Parser.getGSS().nodeCount() << "\n"; + // llvm::outs() << Root->DumpRecursive(*G, true); + } + } return 0; }