diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -157,6 +157,10 @@
     return create(ForestNode::Opaque, SID, Start, 0, {});
   }
 
+  ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) {
+    return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {});
+  }
+
   size_t nodeCount() const { return NodeCount; }
   size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }
 
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -0,0 +1,164 @@
+//===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a standard Generalized LR (GLR) parsing algorithm.
+//
+// The GLR parser behaves as a normal LR parser until it encounters a conflict.
+// To handle a conflict (where multiple actions could be performed), the parser
+// simulates nondeterminism by doing a breadth-first search over all the
+// possibilities.
+//
+// Basic mechanisms of the GLR parser:
+//  - A number of processes are run in parallel.
+//  - Each process has its own parsing stack and behaves as a standard
+//    deterministic LR parser.
+//  - When a process encounters a conflict, it is forked (one fork for each
+//    available action).
+//  - When a process encounters an error, it is abandoned.
+//  - All processes are synchronized by the lookahead token: they perform the
+//    shift action at the same time, which means some processes need to wait
+//    until other processes have performed all reduce actions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_GLR_H
+#define CLANG_PSEUDO_GLR_H
+
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Grammar.h"
+#include "clang-pseudo/LRTable.h"
+#include "llvm/Support/Allocator.h"
+#include <functional>
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+
+// A Graph-Structured Stack efficiently represents all parse stacks of a GLR
+// parser.
+//
+// Each node stores a parse state, the last parsed ForestNode, and its parent
+// node(s). There may be several heads (tops of stack), and the parser operates
+// by:
+// - shift: pushing terminal symbols on top of the stack
+// - reduce: replacing N symbols on top of the stack with one nonterminal
+//
+// The structure is a DAG rather than a linear stack:
+// - GLR allows multiple actions (conflicts) on the same head, producing forks
+//   where several nodes have the same parent
+// - The parser merges nodes with the same (state, ForestNode), producing joins
+//   where one node has multiple parents
+//
+// The parser is responsible for creating nodes and keeping track of the set of
+// heads. The GSS class is mostly an arena for them.
+struct GSS {
+  // A node represents a partial parse of the input up to some point.
+  //
+  // It is the equivalent of a frame in an LR parse stack.
+  // Like such a frame, it has an LR parse state and a syntax-tree node
+  // representing the last parsed symbol (a ForestNode in our case).
+  // Unlike a regular LR stack frame, it may have multiple parents.
+  //
+  // Nodes are not exactly pushed and popped on the stack: pushing is just
+  // allocating a new head node with a parent pointer to the old head. Popping
+  // is just forgetting about a node and remembering its parent instead.
+  struct alignas(struct Node *) Node {
+    // LR state describing how parsing should continue from this head.
+    LRTable::StateID State;
+    // The number of parents of this node.
+    // The parents hold previously parsed symbols, and may resume control after
+    // this node is reduced.
+    unsigned ParentCount;
+    // The parse node for the last parsed symbol.
+    // This symbol appears on the left of the dot in the parse state's items.
+    // (In the literature, the node is attached to the *edge* to the parent).
+    const ForestNode *Payload = nullptr;
+
+    // FIXME: Most nodes live a fairly short time, and are simply discarded.
+    // Is it worth refcounting them (we have empty padding) and returning to a
+    // freelist, to keep the working set small?
+
+    llvm::ArrayRef<const Node *> parents() const {
+      return llvm::makeArrayRef(reinterpret_cast<const Node *const *>(this + 1),
+                                ParentCount);
+    };
+    // Parents are stored as a trailing array of Node*.
+  };
+
+  // Allocates a new node in the graph.
+  const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
+                      llvm::ArrayRef<const Node *> Parents) {
+    ++NodeCount;
+    Node *Result = new (Arena.Allocate(
+        sizeof(Node) + Parents.size() * sizeof(Node *), alignof(Node)))
+        Node({State, static_cast<unsigned>(Parents.size())});
+    Result->Payload = Symbol;
+    if (!Parents.empty())
+      llvm::copy(Parents, reinterpret_cast<const Node **>(Result + 1));
+    return Result;
+  }
+
+  size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
+  size_t nodeCount() const { return NodeCount; }
+
+private:
+  llvm::BumpPtrAllocator Arena;
+  unsigned NodeCount = 0;
+};
+
+// Parameters for the GLR parsing.
+struct ParseParams {
+  // The grammar of the language we're going to parse.
+  const Grammar &G;
+  // The LR table which GLR uses to parse the input; it should correspond to
+  // the Grammar G.
+  const LRTable &Table;
+
+  // Arenas for the data structures used by the GLR algorithm.
+  ForestArena &Forest; // Storage for the output forest.
+  GSS &GSS;            // Storage for parsing stacks.
+};
+// Parses the given token stream with the GLR algorithm, and returns a forest
+// node of the start symbol.
+//
+// If the parsing fails, we model it as an opaque node in the forest.
+//
+// FIXME: add support for variant start symbols.
+const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params);
+
+// An active stack head can have multiple available actions (reduce/reduce
+// actions, reduce/shift actions).
+// A step is any one action applied to any one stack head.
+struct ParseStep {
+  // A specific stack head.
+  const GSS::Node *Head = nullptr;
+  // An action associated with the head.
+  LRTable::Action Action = LRTable::Action::sentinel();
+};
+// A callback invoked whenever a new GSS head is created during the GLR parsing
+// process (in glrShift or glrReduce).
+using NewHeadCallback = std::function<void(const GSS::Node *)>;
+// Applies all PendingShift actions on a given GSS state; newly-created heads
+// are passed to the callback.
+//
+// When this function returns, PendingShift is empty.
+//
+// Exposed for testing only.
+void glrShift(std::vector<ParseStep> &PendingShift, const ForestNode &NextTok,
+              const ParseParams &Params, NewHeadCallback NewHeadCB);
+// Applies PendingReduce actions, until no more reduce actions are available.
+// +// When this function returns, PendingReduce is empty. Calls to NewHeadCB may +// add elements to PendingReduce +// +// Exposed for testing only. +void glrReduce(std::vector &PendingReduce, const ParseParams &Params, + NewHeadCallback NewHeadCB); + +} // namespace pseudo +} // namespace clang + +#endif // CLANG_PSEUDO_GLR_H diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt --- a/clang-tools-extra/pseudo/lib/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt @@ -3,6 +3,7 @@ add_clang_library(clangPseudo DirectiveTree.cpp Forest.cpp + GLR.cpp Grammar.cpp GrammarBNF.cpp Lex.cpp diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -0,0 +1,369 @@ +//===--- GLR.cpp -----------------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/GLR.h" +#include "clang-pseudo/Grammar.h" +#include "clang-pseudo/LRTable.h" +#include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include +#include + +#define DEBUG_TYPE "GLR.cpp" + +namespace clang { +namespace pseudo { + +using StateID = LRTable::StateID; + +const ForestNode &glrParse(const TokenStream &Tokens, + const ParseParams &Params) { + llvm::ArrayRef Terminals = Params.Forest.createTerminals(Tokens); + auto &G = Params.G; + auto &GSS = Params.GSS; + + // Lists of active shift, reduce, accept actions. + std::vector PendingShift, PendingReduce, PendingAccept; + auto AddSteps = [&](const GSS::Node *Head, SymbolID NextTok) { + for (const auto &Action : Params.Table.getActions(Head->State, NextTok)) { + switch (Action.kind()) { + case LRTable::Action::Shift: + PendingShift.push_back({Head, Action}); + break; + case LRTable::Action::Reduce: + PendingReduce.push_back({Head, Action}); + break; + case LRTable::Action::Accept: + PendingAccept.push_back({Head, Action}); + break; + default: + llvm_unreachable("unexpected action kind!"); + } + } + }; + + std::vector NewHeads = { + GSS.addNode(/*State=*/0, /*ForestNode*/ nullptr, {})}; + for (const ForestNode &Terminal : Terminals) { + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n", + G.symbolName(Terminal.symbol()), + Terminal.symbol())); + for (const auto *Head : NewHeads) + AddSteps(Head, Terminal.symbol()); + NewHeads.clear(); + glrReduce(PendingReduce, Params, + [&](const GSS::Node * NewHead) { + // A reduce will enable more steps. + AddSteps(NewHead, Terminal.symbol()); + }); + + glrShift(PendingShift, Terminal, Params, + [&](const GSS::Node *NewHead) { NewHeads.push_back(NewHead); }); + } + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n")); + for (const auto *Heads : NewHeads) + AddSteps(Heads, tokenSymbol(tok::eof)); + glrReduce(PendingReduce, Params, + [&](const GSS::Node * NewHead) { + // A reduce will enable more steps. 
+ AddSteps(NewHead, tokenSymbol(tok::eof)); + }); + + if (!PendingAccept.empty()) { + LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Accept: {0} accepted result:\n", + PendingAccept.size())); + for (const auto &Accept : PendingAccept) + LLVM_DEBUG(llvm::dbgs() + << " - " << G.symbolName(Accept.Head->Payload->symbol()) + << "\n"); + assert(PendingAccept.size() == 1); + return *PendingAccept.front().Head->Payload; + } + // We failed to parse the input, returning an opaque forest node for recovery. + auto RulesForStart = G.rulesFor(G.startSymbol()); + // FIXME: support multiple start symbols. + assert(RulesForStart.size() == 1 && RulesForStart.front().Size == 1 && + "start symbol _ must have exactly one rule"); + return Params.Forest.createOpaque(RulesForStart.front().Sequence[0], 0); +} + +// Apply all pending shift actions. +// In theory, LR parsing doesn't have shift/shift conflicts on a single head. +// But we may have multiple active heads, and each head has a shift action. +// +// We merge the stack -- if multiple heads will reach the same state after +// shifting a token, we shift only once by combining these heads. +// +// E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4: +// 0---1---2 +// └---3 +// After the shift action, the GSS is: +// 0---1---2---4 +// └---3---┘ +void glrShift(std::vector &PendingShift, const ForestNode &NewTok, + const ParseParams &Params, NewHeadCallback NewHeadCB) { + assert(NewTok.kind() == ForestNode::Terminal); + assert(llvm::all_of(PendingShift, + [](const ParseStep &Step) { + return Step.Action.kind() == LRTable::Action::Shift; + }) && + "Pending shift actions must be shift actions"); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n", + Params.G.symbolName(NewTok.symbol()), + PendingShift.size())); + + // We group pending shifts by their target state so we can merge them. + llvm::stable_sort(PendingShift, [](const ParseStep &L, const ParseStep &R) { + return L.Action.getShiftState() < R.Action.getShiftState(); + }); + auto Rest = llvm::makeArrayRef(PendingShift); + llvm::SmallVector Parents; + while (!Rest.empty()) { + // Collect the batch of PendingShift that have compatible shift states. + // Their heads become TempParents, the parents of the new GSS node. + StateID NextState = Rest.front().Action.getShiftState(); + + Parents.clear(); + for (const auto &Base : Rest) { + if (Base.Action.getShiftState() != NextState) + break; + Parents.push_back(Base.Head); + } + Rest = Rest.drop_front(Parents.size()); + + LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n", + NextState, Parents.size())); + NewHeadCB(Params.GSS.addNode(NextState, &NewTok, Parents)); + } + PendingShift.clear(); +} + +namespace { +// A KeyedQueue yields pairs of keys and values in order of the keys. +template +using KeyedQueue = + std::priority_queue, + std::vector>, llvm::less_first>; + +template void sortAndUnique(std::vector &Vec) { + llvm::sort(Vec); + Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end()); +} +} // namespace + +// Perform reduces until no more are possible. +// +// Generally this means walking up from the heads gathering ForestNodes that +// will match the RHS of the rule we're reducing into a sequence ForestNode, +// and ending up at a base node. +// Then we push a new GSS node onto that base, taking care to: +// - pack alternative sequence ForestNodes into an ambiguous ForestNode. +// - use the same GSS node for multiple heads if the parse state matches. 
+//
+// Examples of reduction:
+//   Before (simple):
+//     0--1(expr)--2(semi)
+//   After reducing 2 by `stmt := expr semi`:
+//     0--3(stmt)                // 3 is goto(0, stmt)
+//
+//   Before (splitting due to R/R conflict):
+//     0--1(IDENTIFIER)
+//   After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`:
+//     0--2(class-name)          // 2 is goto(0, class-name)
+//     └--3(enum-name)           // 3 is goto(0, enum-name)
+//
+//   Before (splitting due to multiple bases):
+//     0--2(class-name)--4(STAR)
+//     └--3(enum-name)---┘
+//   After reducing 4 by `ptr-operator := STAR`:
+//     0--2(class-name)--5(ptr-operator)  // 5 is goto(2, ptr-operator)
+//     └--3(enum-name)---6(ptr-operator)  // 6 is goto(3, ptr-operator)
+//
+//   Before (joining due to same goto state, multiple bases):
+//     0--1(cv-qualifier)--3(class-name)
+//     └--2(cv-qualifier)--4(enum-name)
+//   After reducing 3 by `type-name := class-name` and
+//                  4 by `type-name := enum-name`:
+//     0--1(cv-qualifier)--5(type-name)  // 5 is goto(1, type-name) and
+//     └--2(cv-qualifier)--┘             //      goto(2, type-name)
+//
+//   Before (joining due to same goto state, the same base):
+//     0--1(class-name)--3(STAR)
+//     └--2(enum-name)--4(STAR)
+//   After reducing 3 by `pointer := class-name STAR` and
+//                  4 by `pointer := enum-name STAR`:
+//     0--5(pointer)             // 5 is goto(0, pointer)
+void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
+               NewHeadCallback NewHeadCB) {
+  // There are two interacting complications:
+  // 1.  Performing one reduce can unlock new reduces on the newly-created head.
+  // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
+  //     This means we must have unlocked all the reduces that contribute to it.
+  // 2b. Similarly, the new GSS nodes must be complete (have all parents).
+  //
+  // We define a "family" of reduces as those that produce the same symbol and
+  // cover the same range of tokens. These are exactly the set of reductions
+  // whose sequence nodes would be covered by the same ambiguous node.
+  // We wish to process a whole family at a time (to satisfy complication 2),
+  // and can address complication 1 by carefully ordering the families:
+  // - Process families covering fewer tokens first.
+  //   A reduce can't depend on a longer reduce!
+  // - For equal token ranges: if S := T, process T families before S families.
+  //   Parsing T can't depend on an equal-length S, as the grammar is acyclic.
+  //
+  // This isn't quite enough: we don't know the token length of the reduction
+  // until we walk up the stack to perform the pop.
+  // So we perform the pop part upfront, and place the push specification on
+  // priority queues such that we can retrieve a family at a time.
+
+  // A reduction family is characterized by its token range and the symbol it
+  // produces. It is used as a key in the priority queues to group pushes by
+  // family.
+  struct Family {
+    // The start of the token range of the reduce.
+    Token::Index Start;
+    SymbolID Symbol;
+    // Rule must produce Symbol and can otherwise be arbitrary.
+    // RuleIDs have the topological order based on the acyclic grammar.
+    // FIXME: should SymbolIDs be so ordered instead?
+    RuleID Rule;
+
+    bool operator==(const Family &Other) const {
+      return Start == Other.Start && Symbol == Other.Symbol;
+    }
+    // The larger Family is the one that should be processed first.
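+    // For example, when reducing at token 4, a family with Start=3 (covering
+    // one token) is processed before a family with Start=1 (covering three
+    // tokens); for families with equal ranges, a `foo := IDENTIFIER` family is
+    // processed before a `test := foo` family (see the GLRReduceOrder test).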
+ bool operator<(const Family &Other) const { + if (Start != Other.Start) + return Start < Other.Start; + if (Symbol != Other.Symbol) + return Rule > Other.Rule; + assert(*this == Other); + return false; + } + }; + + // The base nodes are the heads after popping the GSS nodes we are reducing. + // We don't care which rule yielded each base. If Family.Symbol is S, the + // base includes an item X := ... • S ... and since the grammar is + // context-free, *all* parses of S are valid here. + // FIXME: reuse the queues across calls instead of reallocating. + KeyedQueue Bases; + + // A sequence is the ForestNode payloads of the GSS nodes we are reducing. + // These are the RHS of the rule, the RuleID is stored in the Family. + // They specify a sequence ForestNode we may build (but we dedup first). + using Sequence = llvm::SmallVector; + KeyedQueue Sequences; + + Sequence TempSequence; + // Pop walks up the parent chain(s) for a reduction from Head by to Rule. + // Once we reach the end, record the bases and sequences. + auto Pop = [&](const GSS::Node *Head, RuleID RID) { + LLVM_DEBUG(llvm::dbgs() << " Pop " << Params.G.dumpRule(RID) << "\n"); + const auto &Rule = Params.G.lookupRule(RID); + Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID}; + TempSequence.resize_for_overwrite(Rule.Size); + auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) { + if (I == Rule.Size) { + F.Start = TempSequence.front()->startTokenIndex(); + Bases.emplace(F, N); + LLVM_DEBUG(llvm::dbgs() << " --> base at S" << N->State << "\n"); + Sequences.emplace(F, TempSequence); + return; + } + TempSequence[Rule.Size - 1 - I] = N->Payload; + for (const GSS::Node *Parent : N->parents()) + DFS(Parent, I + 1, DFS); + }; + DFS(Head, 0, DFS); + }; + auto PopPending = [&] { + for (const ParseStep &Pending : PendingReduce) + Pop(Pending.Head, Pending.Action.getReduceRule()); + PendingReduce.clear(); + }; + + std::vector> FamilyBases; + std::vector> FamilySequences; + + std::vector TempGSSNodes; + std::vector TempForestNodes; + + // Main reduction loop: + // - pop as much as we can + // - process one family at a time, forming a forest node + // - produces new GSS heads which may enable more pops + PopPending(); + while (!Bases.empty()) { + // We should always have bases and sequences for the same families. + Family F = Bases.top().first; + assert(!Sequences.empty()); + assert(Sequences.top().first == F); + + LLVM_DEBUG(llvm::dbgs() << " Push " << Params.G.symbolName(F.Symbol) + << " from token " << F.Start << "\n"); + + // Grab the sequences for this family. + FamilySequences.clear(); + do { + FamilySequences.emplace_back(Sequences.top().first.Rule, + Sequences.top().second); + Sequences.pop(); + } while (!Sequences.empty() && Sequences.top().first == F); + // Build a forest node for each unique sequence. + sortAndUnique(FamilySequences); + auto &SequenceNodes = TempForestNodes; + SequenceNodes.clear(); + for (const auto &SequenceSpec : FamilySequences) + SequenceNodes.push_back(&Params.Forest.createSequence( + F.Symbol, SequenceSpec.first, SequenceSpec.second)); + // Wrap in an ambiguous node if needed. + const ForestNode *Parsed = + SequenceNodes.size() == 1 + ? SequenceNodes.front() + : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes); + LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Params.G) << "\n"); + + // Grab the bases for this family. + // As well as deduplicating them, we'll group by the goto state. 
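+    // For example, if this family has bases in states S2 and S7 and
+    // goto(S2, Symbol) == goto(S7, Symbol) == S9, both bases become parents of
+    // one new head in state S9; bases whose goto states differ produce
+    // separate heads, mirroring the state-based grouping in glrShift.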
+ FamilyBases.clear(); + do { + FamilyBases.emplace_back( + Params.Table.getGoToState(Bases.top().second->State, F.Symbol), + Bases.top().second); + Bases.pop(); + } while (!Bases.empty() && Bases.top().first == F); + sortAndUnique(FamilyBases); + // Create a GSS node for each unique goto state. + llvm::ArrayRef BasesLeft = FamilyBases; + while (!BasesLeft.empty()) { + StateID NextState = BasesLeft.front().first; + auto &Parents = TempGSSNodes; + Parents.clear(); + for (const auto &Base : BasesLeft) { + if (Base.first != NextState) + break; + Parents.push_back(Base.second); + } + BasesLeft = BasesLeft.drop_front(Parents.size()); + + // Invoking the callback for new heads, a real GLR parser may add new + // reduces to the PendingReduce queue! + NewHeadCB(Params.GSS.addNode(NextState, Parsed, Parents)); + } + PopPending(); + } + assert(Sequences.empty()); +} + +} // namespace pseudo +} // namespace clang diff --git a/clang-tools-extra/pseudo/test/glr.cpp b/clang-tools-extra/pseudo/test/glr.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/test/glr.cpp @@ -0,0 +1,23 @@ +// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s + +void foo() { + T* a; // a multiply expression or a pointer declaration? +// CHECK: statement-seq~statement := +// CHECK-NEXT: ├─statement~expression-statement := expression ; +// CHECK-NEXT: │ ├─expression~multiplicative-expression := multiplicative-expression * pm-expression +// CHECK-NEXT: │ │ ├─multiplicative-expression~IDENTIFIER := tok[5] +// CHECK-NEXT: │ │ ├─* := tok[6] +// CHECK-NEXT: │ │ └─pm-expression~IDENTIFIER := tok[7] +// CHECK-NEXT: │ └─; := tok[8] +// CHECK-NEXT: └─statement~simple-declaration := decl-specifier-seq init-declarator-list ; +// CHECK-NEXT: ├─decl-specifier-seq~simple-type-specifier := +// CHECK-NEXT: │ ├─simple-type-specifier~type-name := +// CHECK-NEXT: │ │ ├─type-name~IDENTIFIER := tok[5] +// CHECK-NEXT: │ │ ├─type-name~IDENTIFIER := tok[5] +// CHECK-NEXT: │ │ └─type-name~IDENTIFIER := tok[5] +// CHECK-NEXT: │ └─simple-type-specifier~IDENTIFIER := tok[5] +// CHECK-NEXT: ├─init-declarator-list~ptr-declarator := ptr-operator ptr-declarator +// CHECK-NEXT: │ ├─ptr-operator~* := tok[6] +// CHECK-NEXT: │ └─ptr-declarator~IDENTIFIER := tok[7] +// CHECK-NEXT: └─; := tok[8] +} diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/DirectiveTree.h" +#include "clang-pseudo/GLR.h" #include "clang-pseudo/Grammar.h" #include "clang-pseudo/LRGraph.h" #include "clang-pseudo/LRTable.h" @@ -35,6 +36,8 @@ static opt PrintDirectiveTree("print-directive-tree", desc("Print directive structure of source code")); +static opt PrintStatistics("print-statistics", desc("Print GLR parser statistics")); +static opt PrintForest("print-forest", desc("Print parse forest")); static std::string readOrDie(llvm::StringRef Path) { llvm::ErrorOr> Text = @@ -50,6 +53,28 @@ int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); + clang::LangOptions LangOpts; // FIXME: use real options. 
+ LangOpts.CPlusPlus = 1; + std::string SourceText; + llvm::Optional RawStream; + llvm::Optional DirectiveStructure; + llvm::Optional ParseableStream; + if (Source.getNumOccurrences()) { + SourceText = readOrDie(Source); + RawStream = clang::pseudo::lex(SourceText, LangOpts); + DirectiveStructure = clang::pseudo::DirectiveTree::parse(*RawStream); + clang::pseudo::chooseConditionalBranches(*DirectiveStructure, *RawStream); + + if (PrintDirectiveTree) + llvm::outs() << DirectiveStructure; + if (PrintSource) + RawStream->print(llvm::outs()); + if (PrintTokens) + llvm::outs() << RawStream; + + ParseableStream = clang::pseudo::stripComments(cook(*RawStream, LangOpts)); + } + if (Grammar.getNumOccurrences()) { std::string Text = readOrDie(Grammar); std::vector Diags; @@ -65,24 +90,26 @@ llvm::outs() << G->dump(); if (PrintGraph) llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G); + auto LRTable = clang::pseudo::LRTable::buildSLR(*G); if (PrintTable) - llvm::outs() << clang::pseudo::LRTable::buildSLR(*G).dumpForTests(*G); - return 0; - } + llvm::outs() << LRTable.dumpForTests(*G); - if (Source.getNumOccurrences()) { - std::string Text = readOrDie(Source); - clang::LangOptions LangOpts; // FIXME: use real options. - auto Stream = clang::pseudo::lex(Text, LangOpts); - auto Structure = clang::pseudo::DirectiveTree::parse(Stream); - clang::pseudo::chooseConditionalBranches(Structure, Stream); + if (ParseableStream) { + clang::pseudo::ForestArena Arena; + clang::pseudo::GSS GSS; + auto &Root = + glrParse(*ParseableStream, + clang::pseudo::ParseParams{*G, LRTable, Arena, GSS}); + if (PrintForest) + llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true); - if (PrintDirectiveTree) - llvm::outs() << Structure; - if (PrintSource) - Stream.print(llvm::outs()); - if (PrintTokens) - llvm::outs() << Stream; + if (PrintStatistics) { + llvm::outs() << "Forest bytes: " << Arena.bytes() + << " nodes: " << Arena.nodeCount() << "\n"; + llvm::outs() << "GSS bytes: " << GSS.bytes() + << " nodes: " << GSS.nodeCount() << "\n"; + } + } } return 0; diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt --- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt +++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt @@ -6,6 +6,7 @@ add_unittest(ClangPseudoUnitTests ClangPseudoTests DirectiveTreeTest.cpp ForestTest.cpp + GLRTest.cpp GrammarTest.cpp LRTableTest.cpp TokenTest.cpp diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -0,0 +1,393 @@ +//===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/GLR.h" +#include "clang-pseudo/Grammar.h" +#include "clang-pseudo/Token.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +namespace clang { +namespace pseudo { +namespace { + +using Action = LRTable::Action; +using std::placeholders::_1; +using std::placeholders::_2; +using std::placeholders::_3; +using testing::AllOf; + +MATCHER_P(state, StateID, "") { return arg->State == StateID; } +MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; } +MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; } + + +// llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const NewHeadResult &R) { +// std::vector ParentStates; +// for (const auto &P : R.Parents) +// ParentStates.push_back(llvm::formatv("{0}", P->State)); +// OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", R.State, +// R.Parsed->symbol(), llvm::join(ParentStates, " ")); +// return OS; +// } + +testing::Matcher +parents(llvm::ArrayRef Parents) { + return testing::Property(&GSS::Node::parents, + testing::UnorderedElementsAreArray(Parents)); +} + +class GLRTest : public ::testing::Test { +public: + void build(llvm::StringRef GrammarBNF) { + std::vector Diags; + G = Grammar::parseBNF(GrammarBNF, Diags); + } + + void buildGrammar(std::vector Nonterminals, + std::vector Rules) { + Nonterminals.push_back("_"); + llvm::sort(Nonterminals); + Nonterminals.erase(std::unique(Nonterminals.begin(), Nonterminals.end()), + Nonterminals.end()); + std::string FakeTestBNF; + for (const auto &NT : Nonterminals) + FakeTestBNF += llvm::formatv("{0} := {1}\n", "_", NT); + FakeTestBNF += llvm::join(Rules, "\n"); + build(FakeTestBNF); + } + + SymbolID id(llvm::StringRef Name) const { + for (unsigned I = 0; I < NumTerminals; ++I) + if (G->table().Terminals[I] == Name) + return tokenSymbol(static_cast(I)); + for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID) + if (G->table().Nonterminals[ID].Name == Name) + return ID; + ADD_FAILURE() << "No such symbol found: " << Name; + return 0; + } + + RuleID ruleFor(llvm::StringRef NonterminalName) const { + auto RuleRange = G->table().Nonterminals[id(NonterminalName)].RuleRange; + if (RuleRange.End - RuleRange.Start == 1) + return G->table().Nonterminals[id(NonterminalName)].RuleRange.Start; + ADD_FAILURE() << "Expected a single rule for " << NonterminalName + << ", but it has " << RuleRange.End - RuleRange.Start + << " rule!\n"; + return 0; + } + + NewHeadCallback captureNewHeads() { + return [this](const GSS::Node *NewHead) { + NewHeadResults.push_back(NewHead); + }; + }; + +protected: + std::unique_ptr G; + ForestArena Arena; + GSS GSS; + std::vector NewHeadResults; +}; + +TEST_F(GLRTest, ShiftMergingHeads) { + // Given a test case where we have two heads 1, 2, 3 in the GSS, the heads 1, + // 2 have shift actions to reach state 4, and the head 3 has a shift action to + // reach state 5: + // 0--1 + // └--2 + // └--3 + // After the shift action, the GSS (with new heads 4, 5) is: + // 0---1---4 + // └---2---┘ + // └---3---5 + auto *GSSNode0 = + GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); + auto *GSSNode1 = + GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0}); + auto *GSSNode2 = + 
GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0}); + auto *GSSNode3 = + GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0}); + + buildGrammar({}, {}); // Create a fake empty grammar. + LRTable T = LRTable::buildForTests(G->table(), /*Entries=*/{}); + + ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0); + std::vector PendingShift = { + {GSSNode1, Action::shift(4)}, + {GSSNode3, Action::shift(5)}, + {GSSNode2, Action::shift(4)}, + }; + glrShift(PendingShift, SemiTerminal, {*G, T, Arena, GSS}, + captureNewHeads()); + + EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre( + AllOf(state(4), parsedSymbol(&SemiTerminal), + parents({GSSNode1, GSSNode2})), + AllOf(state(5), parsedSymbol(&SemiTerminal), + parents({GSSNode3})))); +} + +TEST_F(GLRTest, ReduceConflictsSplitting) { + // Before (splitting due to R/R conflict): + // 0--1(IDENTIFIER) + // After reducing 1 by `class-name := IDENTIFIER` and + // `enum-name := IDENTIFIER`: + // 0--2(class-name) // 2 is goto(0, class-name) + // └--3(enum-name) // 3 is goto(0, enum-name) + buildGrammar({"class-name", "enum-name"}, + {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"}); + + LRTable Table = LRTable::buildForTests( + G->table(), {{/*State=*/0, id("class-name"), Action::goTo(2)}, + {/*State=*/0, id("enum-name"), Action::goTo(3)}}); + + const auto *GSSNode0 = + GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); + const auto *GSSNode1 = + GSS.addNode(3, &Arena.createTerminal(tok::identifier, 0), {GSSNode0}); + + std::vector PendingReduce = { + {GSSNode1, Action::reduce(ruleFor("class-name"))}, + {GSSNode1, Action::reduce(ruleFor("enum-name"))}}; + glrReduce(PendingReduce, {*G, Table, Arena, GSS}, + captureNewHeads()); + // Verify + EXPECT_THAT(NewHeadResults, + testing::UnorderedElementsAre( + AllOf(state(2), parsedSymbolID(id("class-name")), + parents({GSSNode0})), + AllOf(state(3), parsedSymbolID(id("enum-name")), + parents({GSSNode0})))); +} + +TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) { + // Before (splitting due to multiple bases): + // 2(class-name)--4(*) + // 3(enum-name)---┘ + // After reducing 4 by `ptr-operator := *`: + // 2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator) + // 3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator) + buildGrammar({"ptr-operator", "class-name", "enum-name"}, + {"ptr-operator := *"}); + + auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0); + auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0); + + const auto *GSSNode2 = + GSS.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{}); + const auto *GSSNode3 = + GSS.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{}); + const auto *GSSNode4 = GSS.addNode( + /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1), + /*Parents=*/{GSSNode2, GSSNode3}); + + LRTable Table = LRTable::buildForTests( + G->table(), + {{/*State=*/2, id("ptr-operator"), Action::goTo(/*NextState=*/5)}, + {/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)}}); + std::vector PendingReduce = { + {GSSNode4, Action::reduce(ruleFor("ptr-operator"))}}; + glrReduce(PendingReduce, {*G, Table, Arena, GSS}, + captureNewHeads()); + + EXPECT_THAT(NewHeadResults, + testing::UnorderedElementsAre( + AllOf(state(5), parsedSymbolID(id("ptr-operator")), + parents({GSSNode2})), + AllOf(state(6), parsedSymbolID(id("ptr-operator")), + parents({GSSNode3})))); + // Verify that the payload of the two new heads is 
shared, only a single + // ptr-operator node is created in the forest. + EXPECT_EQ(NewHeadResults[0]->Payload, NewHeadResults[1]->Payload); +} + +TEST_F(GLRTest, ReduceJoiningWithMultipleBases) { + // Before (joining due to same goto state, multiple bases): + // 0--1(cv-qualifier)--3(class-name) + // └--2(cv-qualifier)--4(enum-name) + // After reducing 3 by `type-name := class-name` and + // 4 by `type-name := enum-name`: + // 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and + // └--2(cv-qualifier)--┘ // goto(2, type-name) + buildGrammar({"type-name", "class-name", "enum-name", "cv-qualifier"}, + {"type-name := class-name", "type-name := enum-name"}); + + auto *CVQualifierNode = + &Arena.createOpaque(id("cv-qualifier"), /*TokenIndex=*/0); + auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1); + auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1); + + const auto *GSSNode0 = + GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); + const auto *GSSNode1 = GSS.addNode( + /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); + const auto *GSSNode2 = GSS.addNode( + /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); + const auto *GSSNode3 = GSS.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode, + /*Parents=*/{GSSNode1}); + const auto *GSSNode4 = GSS.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode, + /*Parents=*/{GSSNode2}); + + LRTable Table = LRTable::buildForTests( + G->table(), + {{/*State=*/1, id("type-name"), Action::goTo(/*NextState=*/5)}, + {/*State=*/2, id("type-name"), Action::goTo(/*NextState=*/5)}}); + // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! + std::vector PendingReduce = { + { + GSSNode3, Action::reduce(/*RuleID=*/0) // type-name := class-name + }, + { + GSSNode4, Action::reduce(/*RuleID=*/1) // type-name := enum-name + }}; + glrReduce(PendingReduce, {*G, Table, Arena, GSS}, + captureNewHeads()); + + // Verify that the stack heads are joint at state 5 after reduces. + EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(AllOf( + state(5), parsedSymbolID(id("type-name")), + parents({GSSNode1, GSSNode2})))); + // Verify that we create an ambiguous ForestNode of two parses of `type-name`. 
+ EXPECT_EQ(NewHeadResults.front()->Payload->dumpRecursive(*G), + "[ 1, end) type-name := \n" + "[ 1, end) ├─type-name := class-name\n" + "[ 1, end) │ └─class-name := \n" + "[ 1, end) └─type-name := enum-name\n" + "[ 1, end) └─enum-name := \n"); +} + +TEST_F(GLRTest, ReduceJoiningWithSameBase) { + // Before (joining due to same goto state, the same base): + // 0--1(class-name)--3(*) + // └--2(enum-name)--4(*) + // After reducing 3 by `pointer := class-name *` and + // 2 by `pointer := enum-name *`: + // 0--5(pointer) // 5 is goto(0, pointer) + buildGrammar({"pointer", "class-name", "enum-name"}, + {"pointer := class-name *", "pointer := enum-name *"}); + + auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0); + auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0); + auto *StartTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1); + + const auto *GSSNode0 = + GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); + const auto *GSSNode1 = GSS.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode, + /*Parents=*/{GSSNode0}); + const auto *GSSNode2 = GSS.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode, + /*Parents=*/{GSSNode0}); + const auto *GSSNode3 = GSS.addNode(/*State=*/3, /*ForestNode=*/StartTerminal, + /*Parents=*/{GSSNode1}); + const auto *GSSNode4 = GSS.addNode(/*State=*/4, /*ForestNode=*/StartTerminal, + /*Parents=*/{GSSNode2}); + + LRTable Table = LRTable::buildForTests( + G->table(), {{/*State=*/0, id("pointer"), Action::goTo(5)}}); + // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! + std::vector PendingReduce = { + { + GSSNode3, Action::reduce(/*RuleID=*/0) // pointer := class-name * + }, + { + GSSNode4, Action::reduce(/*RuleID=*/1) // pointer := enum-name * + }}; + glrReduce(PendingReduce, {*G, Table, Arena, GSS}, + captureNewHeads()); + + EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre( + AllOf(state(5), parsedSymbolID(id("pointer")), + parents({GSSNode0})))); + EXPECT_EQ(NewHeadResults.front()->Payload->dumpRecursive(*G), + "[ 0, end) pointer := \n" + "[ 0, end) ├─pointer := class-name *\n" + "[ 0, 1) │ ├─class-name := \n" + "[ 1, end) │ └─* := tok[1]\n" + "[ 0, end) └─pointer := enum-name *\n" + "[ 0, 1) ├─enum-name := \n" + "[ 1, end) └─* := tok[1]\n"); +} + +TEST_F(GLRTest, PerfectForestNodeSharing) { + // Run the GLR on a simple grammar and test that we build exactly one forest + // node per (SymbolID, token range). + + // This is a grmammar where the original parsing-stack-based forest node + // sharing approach will fail. In its LR0 graph, it has two states containing + // item `expr := • IDENTIFIER`, and both have different goto states on the + // nonterminal `expr`. + build(R"bnf( + _ := test + + test := { expr + test := { IDENTIFIER + test := left-paren expr + left-paren := { + expr := IDENTIFIER + )bnf"); + clang::LangOptions LOptions; + const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions); + auto LRTable = LRTable::buildSLR(*G); + + const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSS}); + // Verify that there is no duplicated sequence node of `expr := IDENTIFIER` + // in the forest, see the `#1` and `=#1` in the dump string. 
+ EXPECT_EQ(Parsed.dumpRecursive(*G), + "[ 0, end) test := \n" + "[ 0, end) ├─test := { expr\n" + "[ 0, 1) │ ├─{ := tok[0]\n" + "[ 1, end) │ └─expr := IDENTIFIER #1\n" + "[ 1, end) │ └─IDENTIFIER := tok[1]\n" + "[ 0, end) ├─test := { IDENTIFIER\n" + "[ 0, 1) │ ├─{ := tok[0]\n" + "[ 1, end) │ └─IDENTIFIER := tok[1]\n" + "[ 0, end) └─test := left-paren expr\n" + "[ 0, 1) ├─left-paren := {\n" + "[ 0, 1) │ └─{ := tok[0]\n" + "[ 1, end) └─expr := IDENTIFIER =#1\n" + "[ 1, end) └─IDENTIFIER := tok[1]\n"); +} + +TEST_F(GLRTest, GLRReduceOrder) { + // Given the following grammar, and the input `IDENTIFIER`, reductions should + // be performed in the following order: + // 1. foo := IDENTIFIER + // 2. { test := IDENTIFIER, test := foo } + // foo should be reduced first, so that in step 2 we have completed reduces + // for test, and form an ambiguous forest node. + build(R"bnf( + _ := test + + test := IDENTIFIER + test := foo + foo := IDENTIFIER + )bnf"); + clang::LangOptions LOptions; + const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions); + auto LRTable = LRTable::buildSLR(*G); + + const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSS}); + EXPECT_EQ(Parsed.dumpRecursive(*G), + "[ 0, end) test := \n" + "[ 0, end) ├─test := IDENTIFIER\n" + "[ 0, end) │ └─IDENTIFIER := tok[0]\n" + "[ 0, end) └─test := foo\n" + "[ 0, end) └─foo := IDENTIFIER\n" + "[ 0, end) └─IDENTIFIER := tok[0]\n"); +} + +} // namespace +} // namespace pseudo +} // namespace clang
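
For orientation, here is a minimal end-to-end sketch of how the pieces added by this patch fit together. It only uses calls that appear above (Grammar::parseBNF, LRTable::buildSLR, lex/cook from Token.h, glrParse, ForestNode::dumpRecursive); the grammar text and the input string are invented for illustration, and error handling is omitted.

#include "clang-pseudo/GLR.h"
#include "clang-pseudo/Grammar.h"
#include "clang-pseudo/LRTable.h"
#include "clang-pseudo/Token.h"
#include "clang/Basic/LangOptions.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
#include <vector>

int main() {
  using namespace clang::pseudo;
  // A toy grammar: the start symbol `_` derives a single-identifier expression.
  std::vector<std::string> Diags;
  auto G = Grammar::parseBNF(R"bnf(
    _ := expr
    expr := IDENTIFIER
  )bnf", Diags);
  if (!Diags.empty())
    return 1;

  // Lex and cook the input, as ClangPseudo.cpp and GLRTest.cpp do.
  clang::LangOptions LangOpts;
  LangOpts.CPlusPlus = 1;
  const TokenStream &Tokens = cook(lex("foo", LangOpts), LangOpts);

  // Build the SLR table and run the GLR parser; the two arenas own all nodes.
  auto Table = LRTable::buildSLR(*G);
  ForestArena Forest;
  GSS GSS;
  const ForestNode &Root = glrParse(Tokens, {*G, Table, Forest, GSS});
  llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true);
  return 0;
}

This mirrors the flow in ClangPseudo.cpp: lex and cook the source, build the table once per grammar, then hand both arenas to glrParse, which returns the forest root (or an opaque node if the parse fails).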