diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h @@ -17,6 +17,9 @@ // //===----------------------------------------------------------------------===// +#ifndef CLANG_PSEUDO_FOREST_H +#define CLANG_PSEUDO_FOREST_H + #include "clang-pseudo/Grammar.h" #include "clang-pseudo/Token.h" #include "llvm/ADT/ArrayRef.h" @@ -176,3 +179,5 @@ } // namespace pseudo } // namespace clang + +#endif diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h @@ -0,0 +1,160 @@ +//===--- GLRParser.h - Implement a standard GLR parser -----------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This implements a standard Generalized LR (GLR) parsing algorithm. +// +// The GLR parser behaves as a normal LR parser until it encounters a conflict. +// To handle a conflict (where there are multiple actions could perform), the +// parser will simulate nondeterminism by doing a breadth-first search +// over all the possibilities. +// +// Basic mechanisims of the GLR parser: +// - A number of processes are operated in parallel. +// - Each process has its own parsing stack and behaves as a standard +// determinism LR parser. +// - When a process encounters a conflict, it will be fork (one for each +// avaiable action). +// - When a process encounters an error, it is abandoned. +// - All process are synchronized by the lookahead token: they perfrom shift +// action at the same time, which means some processes need wait until other +// processes have performed all reduce actions. +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_PSEUDO_GLRPARSER_H +#define CLANG_PSEUDO_GLRPARSER_H + +#include "clang-pseudo/Forest.h" +#include "clang-pseudo/Grammar.h" +#include "clang-pseudo/LRTable.h" +#include "llvm/Support/Allocator.h" +#include + +namespace clang { +namespace pseudo { + +// A Graph-Structured Stack represents all parse stacks of a GLR parser. +// +// Each node stores a parse state, the last parsed ForestNode, and the parent +// node. There may be several heads (top of stack), and the parser operates by: +// - shift: pushing terminal symbols on top of the stack +// - reduce: replace N symbols on top of the stack with one nonterminal +// +// The structure is a DAG rather than a stack: +// - GLR allows multiple actions (conflicts) on the same head, producing forks +// where several nodes have the same parent +// - The parser merges nodes with the same (state, ForestNode), producing joins +// where one node has multiple parents +// +// The parser is responsible for creating nodes and keeping track of the set of +// heads. The GSS class is mostly an arena for them. +struct GSS { + // A node represents a partial parse of the input up to some point. + // + // It is the equivalent of a frame in an LR parse stack. + // Like such a frame, it has an LR parse state and an AST node representing + // the last parsed symbol (a ForestNode in our case). + // Unlike a regular LR stack frame, it may have multiple parents. + // + // Nodes are not exactly pushed and popped on the stack: pushing is just + // allocating a new head node with a parent pointer to the old head. Popping + // is just forgetting about a node and remembering its parent instead. + struct alignas(struct Node *) Node { + // LR state describing how parsing should continue from this head. + LRTable::StateID State; + // Number of the parents of this node, which are stored as trailing objects. + // The parents hold previous parsed symbols, and may resume control after + // this node is reduced. + unsigned ParentCount; + // The parse node for the last parsed symbol. + // This symbol appears on the left of the dot in the parse state's items. + // (In the literature, the node is attached to the *edge* to the parent). + const ForestNode *Payload = nullptr; + + // FIXME: Most nodes live a fairly short time, and are simply discarded. + // Is it worth refcounting them (we have empty padding) and returning to a + // freelist, to keep the working set small? + + llvm::ArrayRef parents() const { + return llvm::makeArrayRef(reinterpret_cast(this + 1), + ParentCount); + }; + // Parents are stored as a trailing array of Node*. + }; + + // Allocates a new node in the graph. + const Node *addNode(LRTable::StateID State, const ForestNode *Symbol, + llvm::ArrayRef Parents) { + ++NodeCount; + Node *Result = new (Arena.Allocate( + sizeof(Node) + Parents.size() * sizeof(Node *), alignof(Node))) + Node({State, static_cast(Parents.size())}); + Result->Payload = Symbol; + if (!Parents.empty()) + llvm::copy(Parents, reinterpret_cast(Result + 1)); + return Result; + } + + size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); } + size_t nodeCount() const { return NodeCount; } + +private: + llvm::BumpPtrAllocator Arena; + unsigned NodeCount = 0; +}; + +// FIXME: what's the useful public interface here? +class GLRParser { +public: + GLRParser(const LRTable &T, const Grammar &G, ForestArena &Arena) + : ParsingTable(T), G(G), Forest(Arena) {} + + // FIXME: provide start symbol too? + const ForestNode *parse(llvm::ArrayRef Terminals); + + const GSS &getGSS() const { return GSS; } + +private: + // Apply PendingShift actions, producing NewHeads. + void performShift(const ForestNode *Tok, + std::vector *NewHeads); + void performReduction(SymbolID Next); + + void addActions(const GSS::Node *Head, SymbolID Next); + + const LRTable &ParsingTable; + const Grammar &G; + + // An active stack head can have multiple available actions (reduce/reduce + // actions, reduce/shift actions) + // A step is any one action applied to any one stack head. + struct Step { + // A corresponding stack head. + const GSS::Node *Head = nullptr; + // An action associated with the Head. + LRTable::Action Action = LRTable::Action::sentinel(); + }; + // A list of active shift actions. + std::vector PendingShift; + // A list of active reduce actions. + std::vector PendingReduce; + // A list of active accept action. + std::vector PendingAccept; + + // Temporary storage we reuse instead of reallocating each time. + std::vector TempParents; + + GSS GSS; + ForestArena &Forest; + llvm::ArrayRef Terminals; +}; + +} // namespace pseudo +} // namespace clang + +#endif diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt --- a/clang-tools-extra/pseudo/lib/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt @@ -3,6 +3,7 @@ add_clang_library(clangPseudo DirectiveMap.cpp Forest.cpp + GLRParser.cpp Grammar.cpp GrammarBNF.cpp Lex.cpp diff --git a/clang-tools-extra/pseudo/lib/GLRParser.cpp b/clang-tools-extra/pseudo/lib/GLRParser.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/lib/GLRParser.cpp @@ -0,0 +1,303 @@ +//===--- GLRParser.cpp -----------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/GLRParser.h" +#include "clang-pseudo/Grammar.h" +#include "clang-pseudo/LRTable.h" +#include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include + +#define DEBUG_TYPE "GLRParser.cpp" + +namespace clang { +namespace pseudo { + +using StateID = LRTable::StateID; + +const ForestNode *GLRParser::parse(llvm::ArrayRef Terminals) { + std::vector NewHeads = { + GSS.addNode(/*State=*/0, nullptr, {})}; + for (const ForestNode &Terminal : Terminals) { + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n", + G.symbolName(Terminal.symbol()), + Terminal.symbol())); + for (const auto &AS : NewHeads) + addActions(AS, Terminal.symbol()); + NewHeads.clear(); + performReduction(Terminal.symbol()); + performShift(&Terminal, &NewHeads); + } + for (const auto &AS : NewHeads) + addActions(AS, tokenSymbol(tok::eof)); + performReduction(tokenSymbol(tok::eof)); + + // FIXME: recover if we don't accept, don't return null! + if (!PendingAccept.empty()) { + assert(PendingAccept.size() == 1); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Accept: {0} accepted results:\n", + PendingAccept.size())); + for (const auto &A : PendingAccept) + LLVM_DEBUG(llvm::dbgs() + << " - " << G.symbolName(A.Head->Payload->symbol()) << "\n"); + return PendingAccept.front().Head->Payload; + } + return nullptr; +} + +void GLRParser::performShift(const ForestNode *Tok, + std::vector *NewHeads) { + assert(Tok->kind() == ForestNode::Terminal); + assert(PendingReduce.empty() && + "Reduce actions must be performed before shift actions"); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({0} active heads):\n", + G.symbolName(Tok->kind()), + PendingShift.size())); + + // Merge the stack -- if multiple stack heads will reach the same state after + // shifting a token, we shift only once by combining these heads. + // + // E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4: + // 0---1---2 + // └---3 + // After the shift action, the GSS is: + // 0---1---2---4 + // └---3---┘ + // + // We group pending shifts by their target state so we can merge them. + llvm::stable_sort(PendingShift, [](const Step &L, const Step &R) { + return L.Action.getShiftState() < R.Action.getShiftState(); + }); + auto Rest = llvm::makeArrayRef(PendingShift); + while (!Rest.empty()) { + // Collect the batch of PendingShift that have compatible shift states. + // Their heads become TempParents, the parents of the new GSS node. + StateID NextState = Rest.front().Action.getShiftState(); + for (TempParents.clear(); + !Rest.empty() && Rest.front().Action.getShiftState() == NextState; + Rest = Rest.drop_front()) { + assert(!llvm::is_contained(TempParents, Rest.front().Head) && + "shift: duplicated stack heads!"); + TempParents.push_back(Rest.front().Head); + } + LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)", + NextState, TempParents.size())); + NewHeads->push_back(GSS.addNode(NextState, Tok, TempParents)); + } + PendingShift.clear(); +} + +static std::vector +describeStates(llvm::ArrayRef A) { + std::vector States; + for (const auto &N : A) + States.push_back(llvm::formatv("S{0}", N->State)); + return States; +} + +// Enumerate all reduce paths on the stack by traversing from the given Head in +// the GSS. +static void enumerateReducePath(const GSS::Node *Head, unsigned PathLength, + std::vector &PathStorage, + std::function CB) { + assert(PathStorage.empty() && "PathStorage must be empty!"); + std::function EnumPath = + [&CB, &PathStorage, &EnumPath](const GSS::Node *Current, + unsigned RemainingLength) -> void { + assert(RemainingLength > 0); + + --RemainingLength; + PathStorage.push_back(Current); + if (RemainingLength == 0) { + CB(); + } else { + for (const auto *Next : Current->parents()) + EnumPath(Next, RemainingLength); + } + PathStorage.pop_back(); + }; + EnumPath(Head, PathLength); +} + +// Perform reduction recursively until we don't have reduce actions with +// heads. +void GLRParser::performReduction(SymbolID NextTok) { + if (!PendingReduce.empty()) + LLVM_DEBUG(llvm::dbgs() << " Performing **Reduce**\n"); + + // Reduce can manipulate the GSS in following way: + // + // 1) Split -- + // 1.1 when a stack head has mutiple reduce actions, the head is + // made to split to accommodate the various possiblities. + // E.g. + // 0 -> 1 (ID) + // After performing reduce of production rules (class-name := ID, + // enum-name := ID), the GSS now has two new heads: + // 0 -> 2 (class-name) + // `-> 3 (enum-name) + // + // 1.2 when a stack head has a reduce action with multiple reduce + // paths, the head is to split. + // E.g. + // ... -> 1(...) -> 3 (INT) + // ^ + // ... -> 2(...) ---| + // + // After the reduce action (simple-type-specifier := INT), the GSS looks + // like: + // ... -> 1(...) -> 4 (simple-type-specifier) + // ... -> 2(...) -> 5 (simple-type-specifier) + // + // 2) Merge -- if multiple heads turn out to be identical after + // reduction (new heads have the same state, and point to the same + // parents), these heads are merged and treated as a single head. + // This is usually where ambiguity happens. + // + // E.g. + // 0 -> 2 (class-name) + // ` -> 3 (enum-name) + // After reduction of rules (type-name := class-name | enum-name), the GSS + // has the following form: + // 0 -> 4 (type-name) + // The type-name forest node in the new head 4 is ambiguous, which has two + // parses (type-name -> class-name -> id, type-name -> enum-name -> id). + + // Store all newly-created stack heads for tracking ambiguities. + std::vector CreatedHeads; + while (!PendingReduce.empty()) { + auto RA = std::move(PendingReduce.back()); + PendingReduce.pop_back(); + + RuleID ReduceRuleID = RA.Action.getReduceRule(); + const Rule &ReduceRule = G.lookupRule(ReduceRuleID); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + " !reduce rule {0}: {1} head: {2}\n", ReduceRuleID, + G.dumpRule(ReduceRuleID), RA.Head->State)); + + std::vector ReducePath; + enumerateReducePath(RA.Head, ReduceRule.Size, ReducePath, [&]() { + LLVM_DEBUG( + llvm::dbgs() << llvm::formatv( + " stack path: {0}, bases: {1}\n", + llvm::join(describeStates(ReducePath), " -> "), + llvm::join(describeStates(ReducePath.back()->parents()), ", "))); + assert(ReducePath.size() == ReduceRule.Size && + "Reduce path's length must equal to the reduce rule size"); + // A reduce is a back-and-forth operation in the stack. + // For example, we reduce a rule "declaration := decl-specifier-seq ;" on + // the linear stack: + // + // 0 -> 1(decl-specifier-seq) -> 3(;) + // ^ Base ^ Head + // <--- ReducePath: [3,1] ----> + // + // 1. back -- pop |ReduceRuleLength| nodes (ReducePath) in the stack; + // 2. forth -- push a new node in the stack and mark it as a head; + // 0 -> 4(declaration) + // ^ Head + // + // It becomes tricky if a reduce path has multiple bases, we want to merge + // them if their next state is the same. Similiar to above performShift, + // we partition the bases by their next state, and process each partition + // per loop iteration. + struct BaseInfo { + // An intermediate head after the stack has poped |ReducePath| nodes. + const GSS::Node *Base = nullptr; + // The final state after reduce. + // It is getGoToState(Base->State, ReduceSymbol). + StateID NextState; + }; + std::vector Bases; + for (const GSS::Node *Base : ReducePath.back()->parents()) + Bases.push_back( + {Base, ParsingTable.getGoToState(Base->State, ReduceRule.Target)}); + llvm::stable_sort(Bases, [](const BaseInfo &L, const BaseInfo &R) { + return std::forward_as_tuple(L.NextState, L.Base) < + std::forward_as_tuple(R.NextState, R.Base); + }); + + llvm::ArrayRef Partition = llvm::makeArrayRef(Bases); + while (!Partition.empty()) { + StateID NextState = Partition.front().NextState; + // Parents of the new stack head. + std::vector Parents; + auto Batch = Partition.take_while([&](const BaseInfo &TB) { + if (NextState != TB.NextState) + return false; + Parents.push_back(TB.Base); + return true; + }); + assert(!Batch.empty()); + Partition = Partition.drop_front(Batch.size()); + + // Check ambiguities. + auto It = llvm::find_if(CreatedHeads, [&](const GSS::Node *Head) { + return Head->Payload->symbol() == ReduceRule.Target && + Head->parents() == llvm::makeArrayRef(Parents); + }); + if (It != CreatedHeads.end()) { + // This should be guaranteed by checking the equalivent of + // parents and reduce nonterminal symbol! + assert(NextState == (*It)->State); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + " found ambiguity, merged in state {0} (forest " + "'{1}')\n", + (*It)->State, G.symbolName((*It)->Payload->symbol()))); + // FIXME: create ambiguous foreset node! + continue; + } + + // Create a corresponding sequence forest node for the reduce rule. + std::vector ForestChildren; + for (const GSS::Node *PN : llvm::reverse(ReducePath)) + ForestChildren.push_back(PN->Payload); + const ForestNode &ForestNode = Forest.createSequence( + ReduceRule.Target, RA.Action.getReduceRule(), ForestChildren); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + " after reduce: {0} -> state {1} ({2})\n", + llvm::join(describeStates(Parents), ", "), NextState, + G.symbolName(ReduceRule.Target))); + + // Create a new stack head. + const GSS::Node *Head = GSS.addNode(NextState, &ForestNode, Parents); + CreatedHeads.push_back(Head); + + // Actions that are enabled by this reduce. + addActions(Head, NextTok); + } + }); + } +} + +void GLRParser::addActions(const GSS::Node *Head, SymbolID NextTok) { + for (const auto &Action : ParsingTable.getActions(Head->State, NextTok)) { + switch (Action.kind()) { + case LRTable::Action::Shift: + PendingShift.push_back({Head, Action}); + break; + case LRTable::Action::Reduce: + PendingReduce.push_back({Head, Action}); + break; + case LRTable::Action::Accept: + PendingAccept.push_back({Head, Action}); + break; + default: + llvm_unreachable("unexpected action kind!"); + } + } +} + +} // namespace pseudo +} // namespace clang diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/DirectiveMap.h" +#include "clang-pseudo/GLRParser.h" #include "clang-pseudo/Grammar.h" #include "clang-pseudo/LRGraph.h" #include "clang-pseudo/LRTable.h" @@ -35,6 +36,8 @@ static opt PrintDirectiveMap("print-directive-map", desc("Print directive structure of source code")); +static opt PrintStats("print-stats", desc("Print processing statistics")); +static opt PrintForest("print-forest", desc("Print parse forest")); static std::string readOrDie(llvm::StringRef Path) { llvm::ErrorOr> Text = @@ -50,6 +53,29 @@ int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); + clang::LangOptions LangOpts; // FIXME: use real options. + LangOpts.CPlusPlus = 1; + llvm::Optional RawStream; + llvm::Optional DirectiveStructure; + llvm::Optional ParseableStream; + if (Source.getNumOccurrences()) { + std::string Text = readOrDie(Source); + RawStream = clang::pseudo::lex(Text, LangOpts); + + if (PrintSource) + RawStream->print(llvm::outs()); + if (PrintTokens) + llvm::outs() << *RawStream; + + DirectiveStructure = clang::pseudo::DirectiveMap::parse(*RawStream); + if (PrintDirectiveMap) + llvm::outs() << *DirectiveStructure; + + ParseableStream = clang::pseudo::stripComments(cook(*RawStream, LangOpts)); + if (PrintTokens) + llvm::outs() << "Cooked:\n" << *ParseableStream; + } + if (Grammar.getNumOccurrences()) { std::string Text = readOrDie(Grammar); std::vector Diags; @@ -65,23 +91,30 @@ llvm::outs() << G->dump(); if (PrintGraph) llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G); + auto LRTable = clang::pseudo::LRTable::buildSLR(*G); if (PrintTable) - llvm::outs() << clang::pseudo::LRTable::buildSLR(*G).dumpForTests(*G); - return 0; - } + llvm::outs() << LRTable.dumpForTests(*G); - if (Source.getNumOccurrences()) { - std::string Text = readOrDie(Source); - clang::LangOptions LangOpts; // FIXME: use real options. - auto Stream = clang::pseudo::lex(Text, LangOpts); - auto Structure = clang::pseudo::DirectiveMap::parse(Stream); + if (ParseableStream) { + clang::pseudo::ForestArena Arena; + clang::pseudo::GLRParser Parser(LRTable, *G, Arena); + const auto *Root = Parser.parse(Arena.createTerminals(*ParseableStream)); + if (Root) { + llvm::outs() << "parsed successfully!\n"; + if (PrintForest) + llvm::outs() << Root->dumpRecursive(*G, true); + } else { + llvm::outs() << "parse failed!\n"; + } + if (PrintStats) { + llvm::outs() << "Forest bytes: " << Arena.bytes() + << " nodes: " << Arena.nodeCount() << "\n"; + llvm::outs() << "GSS bytes: " << Parser.getGSS().bytes() + << " nodes: " << Parser.getGSS().nodeCount() << "\n"; + } + } - if (PrintDirectiveMap) - llvm::outs() << Structure; - if (PrintSource) - Stream.print(llvm::outs()); - if (PrintTokens) - llvm::outs() << Stream; + return 0; } return 0;