diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -17,6 +17,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef CLANG_PSEUDO_FOREST_H
+#define CLANG_PSEUDO_FOREST_H
+
 #include "clang-pseudo/Grammar.h"
 #include "clang-pseudo/Token.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -176,3 +179,5 @@
 
 } // namespace pseudo
 } // namespace clang
+
+#endif
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h
@@ -0,0 +1,151 @@
+//===--- GLRParser.h - Implement a standard GLR parser -----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a standard Generalized LR (GLR) parsing algorithm.
+//
+// The GLR parser behaves as a normal LR parser until it encounters a conflict.
+// To handle a conflict (where there are multiple available actions), the
+// parser will simulate nondeterminism by doing a breadth-first search
+// over all the possibilities.
+//
+// Basic mechanisms of the GLR parser:
+//  - A number of processes are operated in parallel.
+//  - Each process has its own parsing stack and behaves as a standard
+//    deterministic LR parser.
+//  - When a process encounters a conflict, it is forked (one fork for each
+//    available action).
+//  - When a process encounters an error, it is abandoned.
+//  - All processes are synchronized by the lookahead token: they perform the
+//    shift action at the same time, which means some processes need to wait
+//    until other processes have performed all reduce actions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_GLRPARSER_H
+#define CLANG_PSEUDO_GLRPARSER_H
+
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Grammar.h"
+#include "clang-pseudo/LRTable.h"
+#include "clang-pseudo/Token.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+
+// An implementation of a directed acyclic graph (DAG), used as a
+// graph-structured stack (GSS) in the GLR parser.
+//
+// GSS is an efficient data structure to represent multiple active stacks, it
+// employs a stack-combination optimization to avoid potentially exponential
+// growth of the stack:
+//  - combining equal stack prefixes -- A new stack doesn't need to have a full
+//    copy of its parent's stack. They share a common prefix.
+//  - combining equal stack suffixes -- as there are a finite number of DFA
+//    states the parser can be in. A set of heads can be in the same state
+//    though they may have different parses, these heads can be merged,
+//    resulting in a single head.
+//
+// E.g. we have two active stacks:
+//   0 -> 1 -> 2
+//        |    ^ head1, representing a stack [2, 1, 0]
+//        ` -> 3
+//             ^ head2, representing a stack [3, 1, 0]
+struct Graph {
+  // Represents a node in the graph.
+  struct Node {
+    // The parsing state represented by the graph node.
+    LRTable::StateID State : LRTable::StateBits;
+    static constexpr unsigned PredecessorBits = 3;
+    // Number of the predecessors of the node.
+    // u is the predecessor of v, if u -> v.
+    unsigned PredecessorCount : PredecessorBits;
+    // The forest node for a terminal/nonterminal symbol.
+    // The symbol corresponds to the label of edges which lead to the current
+    // node from the predecessor nodes.
+    const ForestNode *Parsed = nullptr;
+
+    llvm::ArrayRef<const Node *> predecessors() const {
+      return llvm::makeArrayRef(reinterpret_cast<const Node *const *>(this + 1),
+                                PredecessorCount);
+    };
+
+    bool operator==(const Node &L) const {
+      return State == L.State && predecessors() == L.predecessors();
+    }
+    // A trailing array of Node*.
+  };
+
+  // Creates a new node in the graph.
+  const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
+                      llvm::ArrayRef<const Node *> Predecessors) {
+    assert(Predecessors.size() < (1 << Node::PredecessorBits) &&
+           "Too many predecessors to fit in PredecessorBits!");
+    ++NodeCount;
+    Node *Result = new (Arena.Allocate(
+        sizeof(Node) + Predecessors.size() * sizeof(Node *), alignof(Node)))
+        Node({State, static_cast<unsigned>(Predecessors.size())});
+    Result->Parsed = Symbol;
+    if (!Predecessors.empty())
+      llvm::copy(Predecessors, reinterpret_cast<const Node **>(Result + 1));
+    return Result;
+  }
+
+  size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
+  size_t nodeCount() const { return NodeCount; }
+
+private:
+  llvm::BumpPtrAllocator Arena;
+  unsigned NodeCount = 0;
+};
+
+class GLRParser {
+public:
+  GLRParser(const LRTable &T, const Grammar &G, ForestArena &Arena)
+      : ParsingTable(T), G(G), ParsedForest(Arena) {}
+
+  const ForestNode *parse(const TokenStream &Code);
+
+  const Graph &getGSS() const { return GSS; }
+
+private:
+  // Return a list of active stack heads.
+  std::vector<const Graph::Node *> performShift(Token::Index Lookahead);
+  void performReduction(const Token &Lookahead);
+
+  void addActions(const Graph::Node *Head, const Token &Lookahead);
+
+  const LRTable &ParsingTable;
+  const Grammar &G;
+
+  // An active stack head can have multiple available actions (reduce/reduce
+  // actions, reduce/shift actions).
+  // Frontier is to track all available actions from all active stack heads.
+  struct Frontier {
+    // A corresponding stack head.
+    const Graph::Node *Head = nullptr;
+    // An action associated with the Head.
+    const LRTable::Action *PerformAction = nullptr;
+  };
+  // A list of active shift actions.
+  std::vector<Frontier> ShiftList;
+  // A list of active reduce actions.
+  std::vector<Frontier> ReduceList;
+  // A list of active accept actions.
+  std::vector<Frontier> AcceptLists;
+
+  Graph GSS;
+  ForestArena &ParsedForest;
+  llvm::ArrayRef<ForestNode> Terminals;
+};
+
+} // namespace pseudo
+} // namespace clang
+
+#endif
diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt
--- a/clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_clang_library(clangPseudo
   DirectiveMap.cpp
   Forest.cpp
+  GLRParser.cpp
   Grammar.cpp
   GrammarBNF.cpp
   Lex.cpp
diff --git a/clang-tools-extra/pseudo/lib/GLRParser.cpp b/clang-tools-extra/pseudo/lib/GLRParser.cpp
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/pseudo/lib/GLRParser.cpp
@@ -0,0 +1,332 @@
+//===--- GLRParser.cpp -----------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/GLRParser.h"
+#include "clang-pseudo/Grammar.h"
+#include "clang-pseudo/LRTable.h"
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <functional>
+#include <vector>
+
+#define DEBUG_TYPE "GLRParser.cpp"
+
+namespace clang {
+namespace pseudo {
+
+using StateID = LRTable::StateID;
+
+const ForestNode *GLRParser::parse(const TokenStream &Code) {
+  Terminals = ParsedForest.createTerminals(Code);
+  const Token *Lookahead = &Code.tokens().front();
+  addActions(GSS.addNode(/*StartState*/ 0, nullptr, {}), *Lookahead);
+
+  while (!ShiftList.empty() || !ReduceList.empty()) {
+    LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                   "Lookahead token {0} (id: {1} text: '{2}')\n",
+                   G.symbolName(tokenSymbol(Lookahead->Kind)),
+                   tokenSymbol(Lookahead->Kind), Lookahead->text()));
+
+    performReduction(*Lookahead);
+    auto NewHeads = performShift(Code.index(*Lookahead));
+
+    if (Lookahead->Kind != tok::eof)
+      ++Lookahead;
+    for (const auto &AS : NewHeads)
+      addActions(AS, *Lookahead);
+  }
+
+  if (!AcceptLists.empty()) {
+    // FIXME: support multiple accepted symbols. It should be fine now, as we
+    // only have one production for the start symbol `_`. This would become a
+    // problem when we support parsing any code snippet rather than the
+    // translation unit.
+    assert(AcceptLists.size() == 1);
+    LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Accept: {0} accepted results:\n",
+                                             AcceptLists.size()));
+    for (const auto &A : AcceptLists)
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  - " << G.symbolName(A.Head->Parsed->symbol()) << "\n");
+    return AcceptLists.front().Head->Parsed;
+  }
+  return nullptr;
+}
+
+std::vector<const Graph::Node *>
+GLRParser::performShift(Token::Index Lookahead) {
+  assert(ReduceList.empty() &&
+         "Reduce actions must be performed before shift actions");
+  if (ShiftList.empty())
+    return {};
+  LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                 "  Perform Shift ({0} active heads):\n", ShiftList.size()));
+
+  const pseudo::ForestNode *Leaf = &Terminals[Lookahead];
+  // New heads after performing all the shifts.
+  std::vector<const Graph::Node *> NewHeads;
+
+  // Merge the stack -- if multiple stack heads are going to shift the same
+  // state, we perform the shift only once by combining these heads.
+  //
+  // E.g. we have two heads (2, 3) in the GSS, and state 4 is to be shifted from
+  // state 2 and state 3:
+  //   0 -> 1 -> 2
+  //        ` -> 3
+  // After the shift action, the GSS looks like below, state 4 becomes the new
+  // head:
+  //   0 -> 1 -> 2 -> 4
+  //        ` -> 3 ---^
+  //
+  // Shifts are partitioned by the shift state, so each partition (per loop
+  // iteration) corresponds to a performed shift.
+  llvm::sort(ShiftList, [](const Frontier &L, const Frontier &R) {
+    assert(L.PerformAction->kind() == LRTable::Action::Shift &&
+           R.PerformAction->kind() == LRTable::Action::Shift);
+    return std::forward_as_tuple(L.PerformAction->getShiftState(), L.Head) <
+           std::forward_as_tuple(R.PerformAction->getShiftState(), R.Head);
+  });
+  auto Partition = llvm::makeArrayRef(ShiftList);
+  while (!Partition.empty()) {
+    StateID NextState = Partition.front().PerformAction->getShiftState();
+    auto Batch = Partition.take_while([&NextState](const Frontier &A) {
+      return A.PerformAction->getShiftState() == NextState;
+    });
+    assert(!Batch.empty());
+    // Predecessors of the new head in GSS.
+    std::vector<const Graph::Node *> Predecessors;
+    llvm::for_each(Batch, [&Predecessors](const Frontier &F) {
+      assert(llvm::find(Predecessors, F.Head) == Predecessors.end() &&
+             "Unexpected duplicated stack heads during shift!");
+      Predecessors.push_back(F.Head);
+    });
+    const auto *Head = GSS.addNode(NextState, Leaf, Predecessors);
+    LLVM_DEBUG(llvm::dbgs()
+               << llvm::formatv("  - state {0} -> state {1}\n",
+                                Partition.front().Head->State, NextState));
+
+    NewHeads.push_back(Head);
+    // Next iteration for next partition.
+    Partition = Partition.drop_front(Batch.size());
+  }
+  ShiftList.clear();
+  return NewHeads;
+}
+
+static std::vector<std::string>
+getStateString(llvm::ArrayRef<const Graph::Node *> A) {
+  std::vector<std::string> States;
+  for (const auto &N : A)
+    States.push_back(llvm::formatv("state {0}", N->State));
+  return States;
+}
+
+// Enumerate all reduce paths on the stack by traversing from the given Head in
+// the GSS.
+static void enumerateReducePath(const Graph::Node *Head, unsigned PathLength,
+                                std::vector<const Graph::Node *> &PathStorage,
+                                std::function<void()> CB) {
+  assert(PathStorage.empty() && "PathStorage must be empty!");
+  std::function<void(const Graph::Node *, unsigned)> EnumPath =
+      [&CB, &PathStorage, &EnumPath](const Graph::Node *Current,
+                                     unsigned RemainingLength) -> void {
+    assert(RemainingLength > 0);
+
+    --RemainingLength;
+    PathStorage.push_back(Current);
+    if (RemainingLength == 0) {
+      CB();
+    } else {
+      for (const auto *Next : Current->predecessors())
+        EnumPath(Next, RemainingLength);
+    }
+    PathStorage.pop_back();
+  };
+  EnumPath(Head, PathLength);
+}
+
+// Perform reduction recursively until we don't have reduce actions with
+// heads.
+void GLRParser::performReduction(const Token &Lookahead) {
+  if (!ReduceList.empty())
+    LLVM_DEBUG(llvm::dbgs() << "  Performing **Reduce**\n");
+
+  // Reduce can manipulate the GSS in following way:
+  //
+  //  1) Split --
+  //     1.1 when a stack head has multiple reduce actions, the head is
+  //     made to split to accommodate the various possibilities.
+  //     E.g.
+  //       0 -> 1 (ID)
+  //     After performing reduce of production rules (class-name := ID,
+  //     enum-name := ID), the GSS now has two new heads:
+  //       0 -> 2 (class-name)
+  //       `-> 3 (enum-name)
+  //
+  //     1.2 when a stack head has a reduce action with multiple reduce
+  //     paths, the head is split.
+  //     E.g.
+  //       ... -> 1(...) -> 3 (INT)
+  //                        ^
+  //       ... -> 2(...) ---|
+  //
+  //     After the reduce action (simple-type-specifier := INT), the GSS looks
+  //     like:
+  //       ... -> 1(...) -> 4 (simple-type-specifier)
+  //       ... -> 2(...) -> 5 (simple-type-specifier)
+  //
+  //  2) Merge -- if multiple heads turn out to be identical after
+  //     reduction (new heads have the same state, and point to the same
+  //     predecessors), these heads are merged and treated as a single head.
+  //     This is usually where ambiguity happens.
+  //
+  //     E.g.
+  //       0 -> 2 (class-name)
+  //       ` -> 3 (enum-name)
+  //     After reduction of rules (type-name := class-name | enum-name), the
+  //     GSS has the following form:
+  //       0 -> 4 (type-name)
+  //     The type-name forest node in the new head 4 is ambiguous, which has
+  //     two parses (type-name -> class-name -> id,
+  //     type-name -> enum-name -> id).
+
+  // Store all newly-created stack heads for tracking ambiguities.
+  std::vector<const Graph::Node *> CreatedHeads;
+  while (!ReduceList.empty()) {
+    auto RA = std::move(ReduceList.back());
+    ReduceList.pop_back();
+
+    RuleID ReduceRuleID = RA.PerformAction->getReduceRule();
+    const Rule &ReduceRule = G.lookupRule(ReduceRuleID);
+    LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                   "  !reduce rule {0}: {1} head: {2}\n", ReduceRuleID,
+                   G.dumpRule(ReduceRuleID), RA.Head->State));
+
+    std::vector<const Graph::Node *> ReducePath;
+    enumerateReducePath(RA.Head, ReduceRule.Size, ReducePath, [&]() {
+      LLVM_DEBUG(
+          llvm::dbgs() << llvm::formatv(
+              "    stack path: {0}, bases: {1}\n",
+              llvm::join(getStateString(ReducePath), " -> "),
+              llvm::join(getStateString(ReducePath.back()->predecessors()),
+                         ", ")));
+      assert(ReducePath.size() == ReduceRule.Size &&
+             "Reduce path's length must equal to the reduce rule size");
+      // A reduce is a back-and-forth operation in the stack.
+      // For example, we reduce a rule "declaration := decl-specifier-seq ;" on
+      // the linear stack:
+      //
+      //   0 -> 1(decl-specifier-seq) -> 3(;)
+      //   ^ Base                        ^ Head
+      //        <--- ReducePath: [3,1] ---->
+      //
+      //   1. back -- pop |ReduceRuleLength| nodes (ReducePath) in the stack;
+      //   2. forth -- push a new node in the stack and mark it as a head;
+      //   0 -> 4(declaration)
+      //        ^ Head
+      //
+      // It becomes tricky if a reduce path has multiple bases, we want to
+      // merge them if their next state is the same. Similar to the above
+      // performShift, we partition the bases by their next state, and process
+      // each partition per loop iteration.
+      struct BaseInfo {
+        // An intermediate head after the stack has popped |ReducePath| nodes.
+        const Graph::Node *Base = nullptr;
+        // The final state after reduce.
+        // It is getGoToState(Base->State, ReduceSymbol).
+        StateID NextState;
+      };
+      std::vector<BaseInfo> Bases;
+      for (const Graph::Node *Base : ReducePath.back()->predecessors())
+        Bases.push_back(
+            {Base, ParsingTable.getGoToState(Base->State, ReduceRule.Target)});
+      llvm::sort(Bases, [](const BaseInfo &L, const BaseInfo &R) {
+        return std::forward_as_tuple(L.NextState, L.Base) <
+               std::forward_as_tuple(R.NextState, R.Base);
+      });
+
+      llvm::ArrayRef<BaseInfo> Partition = llvm::makeArrayRef(Bases);
+      while (!Partition.empty()) {
+        StateID NextState = Partition.front().NextState;
+        // Predecessors of the new stack head.
+        std::vector<const Graph::Node *> Predecessors;
+        auto Batch = Partition.take_while([&](const BaseInfo &TB) {
+          if (NextState != TB.NextState)
+            return false;
+          Predecessors.push_back(TB.Base);
+          return true;
+        });
+        assert(!Batch.empty());
+        Partition = Partition.drop_front(Batch.size());
+
+        // Check ambiguities.
+        auto It = llvm::find_if(CreatedHeads, [&](const Graph::Node *Head) {
+          return Head->Parsed->symbol() == ReduceRule.Target &&
+                 Head->predecessors() == llvm::makeArrayRef(Predecessors);
+        });
+        if (It != CreatedHeads.end()) {
+          // This should be guaranteed by checking the equivalence of
+          // predecessors and the reduce nonterminal symbol!
+          assert(NextState == (*It)->State);
+          LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                         "    found ambiguity, merged in state {0} (forest "
+                         "'{1}')\n",
+                         (*It)->State, G.symbolName((*It)->Parsed->symbol())));
+          // FIXME: create ambiguous forest node!
+          continue;
+        }
+
+        // Create a corresponding sequence forest node for the reduce rule.
+        std::vector<const ForestNode *> ForestChildren;
+        for (const Graph::Node *PN : llvm::reverse(ReducePath))
+          ForestChildren.push_back(PN->Parsed);
+        const ForestNode &ForestNode = ParsedForest.createSequence(
+            ReduceRule.Target, RA.PerformAction->getReduceRule(),
+            ForestChildren);
+        LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                       "    after reduce: {0} -> state {1} ({2})\n",
+                       llvm::join(getStateString(Predecessors), ", "),
+                       NextState, G.symbolName(ReduceRule.Target)));
+
+        // Create a new stack head.
+        const Graph::Node *Head =
+            GSS.addNode(NextState, &ForestNode, Predecessors);
+        CreatedHeads.push_back(Head);
+
+        // Actions that are enabled by this reduce.
+        addActions(Head, Lookahead);
+      }
+    });
+  }
+}
+
+void GLRParser::addActions(const Graph::Node *Head, const Token &Lookahead) {
+  for (const auto &Action :
+       ParsingTable.getActions(Head->State, tokenSymbol(Lookahead.Kind))) {
+    switch (Action.kind()) {
+    case LRTable::Action::Shift:
+      ShiftList.push_back({Head, &Action});
+      break;
+    case LRTable::Action::Reduce:
+      ReduceList.push_back({Head, &Action});
+      break;
+    case LRTable::Action::Accept:
+      AcceptLists.push_back({Head, &Action});
+      break;
+    default:
+      llvm_unreachable("unexpected action kind!");
+    }
+  }
+}
+
+} // namespace pseudo
+} // namespace clang
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang-pseudo/DirectiveMap.h"
+#include "clang-pseudo/GLRParser.h"
 #include "clang-pseudo/Grammar.h"
 #include "clang-pseudo/LRGraph.h"
 #include "clang-pseudo/LRTable.h"
@@ -29,6 +30,8 @@
                             desc("Print the LR graph for the grammar"));
 static opt<bool> PrintTable("print-table",
                             desc("Print the LR table for the grammar"));
+static opt<std::string> ParseFile("parse", desc("Parse a C++ source file"),
+                                  init(""));
 static opt<std::string> Source("source",
                                desc("Source file"));
 static opt<bool> PrintSource("print-source", desc("Print token stream"));
 static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
@@ -67,6 +70,28 @@
     llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G);
   if (PrintTable)
     llvm::outs() << clang::pseudo::LRTable::buildSLR(*G).dumpForTests(*G);
+
+  if (ParseFile.getNumOccurrences()) {
+    std::string Code = readOrDie(ParseFile);
+    const auto &T = clang::pseudo::LRTable::buildSLR(*G);
+    clang::LangOptions Opts;
+    Opts.CPlusPlus = 1;
+
+    auto RawTokens = clang::pseudo::lex(Code, Opts);
+    auto Tokens = clang::pseudo::stripComments(cook(RawTokens, Opts));
+    clang::pseudo::ForestArena Arena;
+    clang::pseudo::GLRParser Parser(T, *G, Arena);
+    const auto *Root = Parser.parse(Tokens);
+    if (Root) {
+      llvm::outs() << "parsed successfully!\n";
+      llvm::outs() << "Forest bytes: " << Arena.bytes()
+                   << " nodes: " << Arena.nodeCount() << "\n";
+      llvm::outs() << "GSS bytes: " << Parser.getGSS().bytes()
+                   << " nodes: " << Parser.getGSS().nodeCount() << "\n";
+      llvm::outs() << Root->dumpRecursive(*G, true);
+    }
+  }
+
   return 0;
 }