diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h @@ -17,6 +17,9 @@ // //===----------------------------------------------------------------------===// +#ifndef CLANG_PSEUDO_FOREST_H +#define CLANG_PSEUDO_FOREST_H + #include "clang-pseudo/Grammar.h" #include "clang-pseudo/Token.h" #include "llvm/ADT/ArrayRef.h" @@ -176,3 +179,5 @@ } // namespace pseudo } // namespace clang + +#endif diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h @@ -0,0 +1,161 @@ +//===--- GLRParser.h - Implement a standard GLR parser -----------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This implements a standard Generalized LR (GLR) parsing algorithm. +// +// The GLR parser behaves as a normal LR parser until it encounters a conflict. +// To handle a conflict (where there are multiple actions could perform), the +// parser will simulate nondeterminism by doing a breadth-first search +// over all the possibilities. +// +// Basic mechanisims of the GLR parser: +// - A number of processes are operated in parallel. +// - Each process has its own parsing stack and behaves as a standard +// determinism LR parser. +// - When a process encounters a conflict, it will be fork (one for each +// avaiable action). +// - When a process encounters an error, it is abandoned. +// - All process are synchronized by the lookahead token: they perfrom shift +// action at the same time, which means some processes need wait until other +// processes have performed all reduce actions. +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_PSEUDO_GLRPARSER_H +#define CLANG_PSEUDO_GLRPARSER_H + +#include "clang-pseudo/Forest.h" +#include "clang-pseudo/Grammar.h" +#include "clang-pseudo/LRTable.h" +#include "llvm/Support/Allocator.h" +#include + +namespace clang { +namespace pseudo { + +// A Graph-Structured Stack represents all parse stacks of a GLR parser. +// +// Each node stores a parse state, the last parsed ForestNode, and the parent +// node. There may be several heads (top of stack), and the parser operates by: +// - shift: pushing terminal symbols on top of the stack +// - reduce: replace N symbols on top of the stack with one nonterminal +// +// The structure is a DAG rather than a stack: +// - GLR allows multiple actions (conflicts) on the same head, producing forks +// where several nodes have the same parent +// - The parser merges nodes with the same (state, ForestNode), producing joins +// where one node has multiple parents +// +// The parser is responsible for creating nodes and keeping track of the set of +// heads. The GSS class is mostly an arena for them. +struct GSS { + // A node represents a partial parse of the input up to some point. + // + // It is the equivalent of a frame in an LR parse stack. + // Like such a frame, it has an LR parse state and an AST node representing + // the last parsed symbol (a ForestNode in our case). + // Unlike a regular LR stack frame, it may have multiple parents. + // + // Nodes are not exactly pushed and popped on the stack: pushing is just + // allocating a new head node with a parent pointer to the old head. Popping + // is just forgetting about a node and remembering its parent instead. + struct alignas(struct Node *) Node { + // LR state describing how parsing should continue from this head. + LRTable::StateID State; + // Number of the parents of this node, which are stored as trailing objects. + // The parents hold previous parsed symbols, and may resume control after + // this node is reduced. + unsigned ParentCount; + // The parse node for the last parsed symbol. + // This symbol appears on the left of the dot in the parse state's items. + // (In the literature, the node is attached to the *edge* to the parent). + const ForestNode *Payload = nullptr; + + // FIXME: Most nodes live a fairly short time, and are simply discarded. + // Is it worth refcounting them (we have empty padding) and returning to a + // freelist, to keep the working set small? + + llvm::ArrayRef parents() const { + return llvm::makeArrayRef(reinterpret_cast(this + 1), + ParentCount); + }; + // Parents are stored as a trailing array of Node*. + }; + + // Allocates a new node in the graph. + const Node *addNode(LRTable::StateID State, const ForestNode *Symbol, + llvm::ArrayRef Parents) { + ++NodeCount; + Node *Result = new (Arena.Allocate( + sizeof(Node) + Parents.size() * sizeof(Node *), alignof(Node))) + Node({State, static_cast(Parents.size())}); + Result->Payload = Symbol; + if (!Parents.empty()) + llvm::copy(Parents, reinterpret_cast(Result + 1)); + return Result; + } + + size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); } + size_t nodeCount() const { return NodeCount; } + +private: + llvm::BumpPtrAllocator Arena; + unsigned NodeCount = 0; +}; + +// FIXME: what's the useful public interface here? +class GLRParser { +public: + GLRParser(const LRTable &T, const Grammar &G, ForestArena &Arena) + : ParsingTable(T), G(G), Forest(Arena) {} + + // FIXME: provide start symbol too? + const ForestNode *parse(llvm::ArrayRef Terminals); + + const GSS &getGSS() const { return GSS; } + +private: + // Apply PendingShift actions, producing NewHeads. + void performShift(const ForestNode *Tok, + std::vector *NewHeads); + void performReduction(SymbolID Next); + + void addActions(const GSS::Node *Head, SymbolID Next); + + const LRTable &ParsingTable; + const Grammar &G; + + // An active stack head can have multiple available actions (reduce/reduce + // actions, reduce/shift actions) + // A step is any one action applied to any one stack head. + struct Step { + // A corresponding stack head. + const GSS::Node *Head = nullptr; + // An action associated with the Head. + LRTable::Action Action = LRTable::Action::sentinel(); + }; + // A list of active shift actions. + std::vector PendingShift; + // A list of active reduce actions. + std::vector PendingReduce; + // A list of active accept action. + std::vector PendingAccept; + + // Temporary storage we reuse instead of reallocating each time. + std::vector TempGSSNodes; + std::vector TempForestNodes; + + GSS GSS; + ForestArena &Forest; + llvm::ArrayRef Terminals; +}; + +} // namespace pseudo +} // namespace clang + +#endif diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt --- a/clang-tools-extra/pseudo/lib/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt @@ -3,6 +3,7 @@ add_clang_library(clangPseudo DirectiveMap.cpp Forest.cpp + GLRParser.cpp Grammar.cpp GrammarBNF.cpp Lex.cpp diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp --- a/clang-tools-extra/pseudo/lib/Forest.cpp +++ b/clang-tools-extra/pseudo/lib/Forest.cpp @@ -46,15 +46,23 @@ }; CountVisits(this); + // The box-drawing characters that should be added as a child is rendered. + struct LineDecoration { + std::string Prefix; // Prepended to every line. + llvm::StringRef First; // added to the child's line. + llvm::StringRef Subsequent; // added to descendants' lines. + }; + // We print a "#" for nonterminal forest nodes that are being dumped // multiple times. llvm::DenseMap ReferenceIds; std::string Result; constexpr Token::Index KEnd = std::numeric_limits::max(); - std::function)> - Dump = [&](const ForestNode *P, unsigned Level, Token::Index End, - llvm::Optional ElidedParent) { + std::function, + LineDecoration &LineDec)> + Dump = [&](const ForestNode *P, Token::Index End, + llvm::Optional ElidedParent, + LineDecoration LineDec) { llvm::ArrayRef Children; auto EndOfElement = [&](size_t ChildIndex) { return ChildIndex + 1 == Children.size() @@ -72,18 +80,19 @@ if (Children[I]->startTokenIndex() == P->startTokenIndex() && EndOfElement(I) == End) { return Dump( - Children[I], Level, End, - /*ElidedParent=*/ElidedParent.getValueOr(P->symbol())); + Children[I], End, + /*ElidedParent=*/ElidedParent.getValueOr(P->symbol()), + LineDec); } } } - // FIXME: pretty ascii trees if (End == KEnd) Result += llvm::formatv("[{0,3}, end) ", P->startTokenIndex()); else Result += llvm::formatv("[{0,3}, {1,3}) ", P->startTokenIndex(), End); - Result.append(2 * Level, ' '); + Result += LineDec.Prefix; + Result += LineDec.First; if (ElidedParent.hasValue()) { Result += G.symbolName(*ElidedParent); Result += "~"; @@ -99,12 +108,23 @@ } Result.push_back('\n'); - ++Level; - for (size_t I = 0; I < Children.size(); ++I) - Dump(Children[I], Level, - P->kind() == Sequence ? EndOfElement(I) : End, llvm::None); + auto OldPrefixSize = LineDec.Prefix.size(); + LineDec.Prefix += LineDec.Subsequent; + for (size_t I = 0; I < Children.size(); ++I) { + if (I == Children.size() - 1) { + LineDec.First = "└─"; + LineDec.Subsequent = " "; + } else { + LineDec.First = "├─"; + LineDec.Subsequent = "│ "; + } + Dump(Children[I], P->kind() == Sequence ? EndOfElement(I) : End, + llvm::None, LineDec); + } + LineDec.Prefix.resize(OldPrefixSize); }; - Dump(this, 0, KEnd, llvm::None); + LineDecoration LineDec; + Dump(this, KEnd, llvm::None, LineDec); return Result; } diff --git a/clang-tools-extra/pseudo/lib/GLRParser.cpp b/clang-tools-extra/pseudo/lib/GLRParser.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/lib/GLRParser.cpp @@ -0,0 +1,332 @@ +//===--- GLRParser.cpp -----------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/GLRParser.h" +#include "clang-pseudo/Grammar.h" +#include "clang-pseudo/LRTable.h" +#include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include +#include +#include + +#define DEBUG_TYPE "GLRParser.cpp" + +namespace clang { +namespace pseudo { + +using StateID = LRTable::StateID; + +const ForestNode *GLRParser::parse(llvm::ArrayRef Terminals) { + std::vector NewHeads = { + GSS.addNode(/*State=*/0, nullptr, {})}; + for (const ForestNode &Terminal : Terminals) { + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n", + G.symbolName(Terminal.symbol()), + Terminal.symbol())); + for (const auto &AS : NewHeads) + addActions(AS, Terminal.symbol()); + NewHeads.clear(); + performReduction(Terminal.symbol()); + performShift(&Terminal, &NewHeads); + } + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n")); + for (const auto &AS : NewHeads) + addActions(AS, tokenSymbol(tok::eof)); + performReduction(tokenSymbol(tok::eof)); + + // FIXME: recover if we don't accept, don't return null! + if (!PendingAccept.empty()) { + assert(PendingAccept.size() == 1); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Accept: {0} accepted results:\n", + PendingAccept.size())); + for (const auto &A : PendingAccept) + LLVM_DEBUG(llvm::dbgs() + << " - " << G.symbolName(A.Head->Payload->symbol()) << "\n"); + return PendingAccept.front().Head->Payload; + } + return nullptr; +} + +void GLRParser::performShift(const ForestNode *Tok, + std::vector *NewHeads) { + assert(Tok->kind() == ForestNode::Terminal); + assert(PendingReduce.empty() && + "Reduce actions must be performed before shift actions"); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n", + G.symbolName(Tok->symbol()), + PendingShift.size())); + + // Merge the stack -- if multiple stack heads will reach the same state after + // shifting a token, we shift only once by combining these heads. + // + // E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4: + // 0---1---2 + // └---3 + // After the shift action, the GSS is: + // 0---1---2---4 + // └---3---┘ + // + // We group pending shifts by their target state so we can merge them. + llvm::stable_sort(PendingShift, [](const Step &L, const Step &R) { + return L.Action.getShiftState() < R.Action.getShiftState(); + }); + auto Rest = llvm::makeArrayRef(PendingShift); + while (!Rest.empty()) { + // Collect the batch of PendingShift that have compatible shift states. + // Their heads become TempParents, the parents of the new GSS node. + StateID NextState = Rest.front().Action.getShiftState(); + auto &Parents = TempGSSNodes; + for (Parents.clear(); + !Rest.empty() && Rest.front().Action.getShiftState() == NextState; + Rest = Rest.drop_front()) { + assert(!llvm::is_contained(Parents, Rest.front().Head) && + "shift: duplicated stack heads!"); + Parents.push_back(Rest.front().Head); + } + LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n", + NextState, Parents.size())); + NewHeads->push_back(GSS.addNode(NextState, Tok, Parents)); + } + PendingShift.clear(); +} + +namespace { +// A KeyedQueue yields pairs of keys and values in order of the keys. +template +using KeyedQueue = + std::priority_queue, + std::vector>, llvm::less_first>; + +template void sortAndUnique(std::vector &Vec) { + llvm::sort(Vec); + Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end()); +} + +} // namespace + +// Perform reduces until no more are possible. +// +// Generally this means walking up from the heads gathering ForestNodes that +// will match the RHS of the rule we're reducing into a sequence ForestNode, +// and ending up at a base node. +// Then we push a new GSS node onto that base, taking care to: +// - pack alternative sequence ForestNodes into an ambiguous ForestNode. +// - use the same GSS node for multiple heads if the parse state matches. +// +// Examples of reduction: +// Before (simple): +// 0--1(expr)--2(semi) +// After reducing 2 by `stmt := expr semi`: +// 0--3(stmt) // 3 is goto(0, stmt) +// +// Before (splitting due to R/R conflict): +// 0--1(IDENTIFIER) +// After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`: +// 0--2(class-name) // 2 is goto(0, class-name) +// └--3(enum-name) // 3 is goto(0, enum-name) +// +// Before (splitting due to multiple bases): +// 0--2(class-name)--4(STAR) +// └--3(enum-name)---┘ +// After reducing 4 by ptr-operator := STAR: +// 0--2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator) +// └--3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator) +// +// Before (joining due to same goto state): +// 0--1(A)--3(C) +// └--2(B)--4(D) +// After reducing 3 by E := C and 4 by E := D +// 0--1(A)--5(E) // 5 is goto(1, E) and goto(2, E) +// └--2(B)--┘ +// +// Before (joining due to same base): +// 0--1(A)--3(STAR) +// └--2(B)--4(STAR) +// After reducing 3 by C := A STAR and 2 by C := B STAR: +// 0--5(C) // 5 is goto(0, C) +void GLRParser::performReduction(SymbolID NextTok) { + // There are two interacting complications: + // 1. Performing one reduce can unlock new reduces on the newly-created head. + // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes). + // This means we must have unlocked all the reduces that contribute to it. + // 2b. Similarly, the new GSS nodes must be complete (have all parents). + // + // We define a "family" of reduces as those that produce the same symbol and + // cover the same range of tokens. These are exactly the set of reductions + // whose sequence nodes would be covered by the same ambiguous node. + // We wish to process a whole family at a time (to satisfy complication 2), + // and can address complication 1 by carefully ordering the families: + // - Process families covering fewer tokens first. + // A reduce can't depend on a longer reduce! + // - For equal token ranges: if S := T, process T families before S families. + // Parsing T can't depend on an equal-length S, as the grammar is acyclic. + // + // This isn't quite enough: we don't know the token length of the reduction + // until we walk up the stack to perform the pop. + // So we perform the pop part upfront, and place the push specification on + // priority queues such that we can retrieve a family at a time. + + // A reduction family is characterized by its token range and symbol produced. + // It is used as a key in the priority queues to group pushes by family. + struct Family { + Token::Index Start; + SymbolID Symbol; + // Rule must produce Symbol and can otherwise be arbitrary. + // RuleIDs have the topological order based on the acyclic grammar. + // FIXME: should SymbolIDs be so ordered instead? + RuleID Rule; + + bool operator==(const Family &Other) const { + return Start == Other.Start && Symbol == Other.Symbol; + } + // The larger Family is the one that should be processed first. + bool operator<(const Family &Other) const { + if (Start != Other.Start) + return Start < Other.Start; + if (Symbol != Other.Symbol) + return Rule > Other.Rule; + assert(*this == Other); + return false; + } + }; + + // The base nodes are the heads after popping the GSS nodes we are reducing. + // We don't care which rule yielded each base. If Family.Symbol is S, the + // base includes an item X := ... • S ... and since the grammar is + // context-free, *all* parses of S are valid here. + // FIXME: reuse the queues across calls instead of reallocating. + KeyedQueue Bases; + + // A sequence is the ForestNode payloads of the GSS nodes we are reducing. + // These are the RHS of the rule, the RuleID is stored in the Family. + // They specify a sequence ForestNode we may build (but we dedup first). + using Sequence = llvm::SmallVector; + KeyedQueue Sequences; + + Sequence TempSequence; + // Pop walks up the parent chain(s) for a reduction from Head by to Rule. + // Once we reach the end, record the bases and sequences. + auto Pop = [&](const GSS::Node *Head, RuleID RID) { + LLVM_DEBUG(llvm::dbgs() << " Pop " << G.dumpRule(RID) << "\n"); + const auto &Rule = G.lookupRule(RID); + Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID}; + TempSequence.resize_for_overwrite(Rule.Size); + auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) { + if (I == Rule.Size) { + F.Start = TempSequence.front()->startTokenIndex(); + Bases.emplace(F, N); + LLVM_DEBUG(llvm::dbgs() << " --> base at S" << N->State << "\n"); + Sequences.emplace(F, TempSequence); + return; + } + TempSequence[Rule.Size - 1 - I] = N->Payload; + for (const GSS::Node *Parent : N->parents()) + DFS(Parent, I + 1, DFS); + }; + DFS(Head, 0, DFS); + }; + auto PopPending = [&] { + for (const Step &Pending : PendingReduce) + Pop(Pending.Head, Pending.Action.getReduceRule()); + PendingReduce.clear(); + }; + + std::vector> FamilyBases; + std::vector> FamilySequences; + // Main reduction loop: + // - pop as much as we can + // - push one family + // - this produces new heads which may enable more pops + PopPending(); + while (!Bases.empty()) { + // We should always have bases and sequences for the same families. + Family F = Bases.top().first; + assert(!Sequences.empty()); + assert(Sequences.top().first == F); + + LLVM_DEBUG(llvm::dbgs() << " Push " << G.symbolName(F.Symbol) + << " from token " << F.Start << "\n"); + + // Grab the sequences for this family. + FamilySequences.clear(); + do { + FamilySequences.emplace_back(Sequences.top().first.Rule, + Sequences.top().second); + Sequences.pop(); + } while (!Sequences.empty() && Sequences.top().first == F); + // Build a forest node for each unique sequence. + sortAndUnique(FamilySequences); + auto &SequenceNodes = TempForestNodes; + SequenceNodes.clear(); + for (const auto &SequenceSpec : FamilySequences) + SequenceNodes.push_back(&Forest.createSequence( + F.Symbol, SequenceSpec.first, SequenceSpec.second)); + // Wrap in an ambiguous node if needed. + const ForestNode *Parsed = + SequenceNodes.size() == 1 + ? SequenceNodes.front() + : &Forest.createAmbiguous(F.Symbol, SequenceNodes); + LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(G) << "\n"); + + // Grab the bases for this family. + // As well as deduplicating them, we'll group by the goto state. + FamilyBases.clear(); + do { + FamilyBases.emplace_back( + ParsingTable.getGoToState(Bases.top().second->State, F.Symbol), + Bases.top().second); + Bases.pop(); + } while (!Bases.empty() && Bases.top().first == F); + sortAndUnique(FamilyBases); + // Create a GSS node for each unique goto state. + llvm::ArrayRef BasesLeft = FamilyBases; + while (!BasesLeft.empty()) { + StateID NextState = BasesLeft.front().first; + auto &Parents = TempGSSNodes; + Parents.clear(); + for (const auto &Base : BasesLeft) { + if (Base.first != NextState) + break; + Parents.push_back(Base.second); + } + BasesLeft = BasesLeft.drop_front(Parents.size()); + + const GSS::Node *NewHead = GSS.addNode(NextState, Parsed, Parents); + addActions(NewHead, NextTok); + } + PopPending(); + } + assert(Sequences.empty()); +} + +void GLRParser::addActions(const GSS::Node *Head, SymbolID NextTok) { + for (const auto &Action : ParsingTable.getActions(Head->State, NextTok)) { + switch (Action.kind()) { + case LRTable::Action::Shift: + PendingShift.push_back({Head, Action}); + break; + case LRTable::Action::Reduce: + PendingReduce.push_back({Head, Action}); + break; + case LRTable::Action::Accept: + PendingAccept.push_back({Head, Action}); + break; + default: + llvm_unreachable("unexpected action kind!"); + } + } +} + +} // namespace pseudo +} // namespace clang diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/DirectiveMap.h" +#include "clang-pseudo/GLRParser.h" #include "clang-pseudo/Grammar.h" #include "clang-pseudo/LRGraph.h" #include "clang-pseudo/LRTable.h" @@ -35,6 +36,8 @@ static opt PrintDirectiveMap("print-directive-map", desc("Print directive structure of source code")); +static opt PrintStats("print-stats", desc("Print processing statistics")); +static opt PrintForest("print-forest", desc("Print parse forest")); static std::string readOrDie(llvm::StringRef Path) { llvm::ErrorOr> Text = @@ -50,6 +53,29 @@ int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); + clang::LangOptions LangOpts; // FIXME: use real options. + LangOpts.CPlusPlus = 1; + llvm::Optional RawStream; + llvm::Optional DirectiveStructure; + llvm::Optional ParseableStream; + if (Source.getNumOccurrences()) { + std::string Text = readOrDie(Source); + RawStream = clang::pseudo::lex(Text, LangOpts); + + if (PrintSource) + RawStream->print(llvm::outs()); + if (PrintTokens) + llvm::outs() << *RawStream; + + DirectiveStructure = clang::pseudo::DirectiveMap::parse(*RawStream); + if (PrintDirectiveMap) + llvm::outs() << *DirectiveStructure; + + ParseableStream = clang::pseudo::stripComments(cook(*RawStream, LangOpts)); + if (PrintTokens) + llvm::outs() << "Cooked:\n" << *ParseableStream; + } + if (Grammar.getNumOccurrences()) { std::string Text = readOrDie(Grammar); std::vector Diags; @@ -65,23 +91,30 @@ llvm::outs() << G->dump(); if (PrintGraph) llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G); + auto LRTable = clang::pseudo::LRTable::buildSLR(*G); if (PrintTable) - llvm::outs() << clang::pseudo::LRTable::buildSLR(*G).dumpForTests(*G); - return 0; - } + llvm::outs() << LRTable.dumpForTests(*G); - if (Source.getNumOccurrences()) { - std::string Text = readOrDie(Source); - clang::LangOptions LangOpts; // FIXME: use real options. - auto Stream = clang::pseudo::lex(Text, LangOpts); - auto Structure = clang::pseudo::DirectiveMap::parse(Stream); + if (ParseableStream) { + clang::pseudo::ForestArena Arena; + clang::pseudo::GLRParser Parser(LRTable, *G, Arena); + const auto *Root = Parser.parse(Arena.createTerminals(*ParseableStream)); + if (Root) { + llvm::outs() << "parsed successfully!\n"; + if (PrintForest) + llvm::outs() << Root->dumpRecursive(*G, /*Abbreviated=*/true); + } else { + llvm::outs() << "parse failed!\n"; + } + if (PrintStats) { + llvm::outs() << "Forest bytes: " << Arena.bytes() + << " nodes: " << Arena.nodeCount() << "\n"; + llvm::outs() << "GSS bytes: " << Parser.getGSS().bytes() + << " nodes: " << Parser.getGSS().nodeCount() << "\n"; + } + } - if (PrintDirectiveMap) - llvm::outs() << Structure; - if (PrintSource) - Stream.print(llvm::outs()); - if (PrintTokens) - llvm::outs() << Stream; + return 0; } return 0;