diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Forest.h b/clang/include/clang/Tooling/Syntax/Pseudo/Forest.h --- a/clang/include/clang/Tooling/Syntax/Pseudo/Forest.h +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Forest.h @@ -81,6 +81,11 @@ assert(elements.size() < (1 << (16 - RuleBits))); return rule | elements.size() << RuleBits; } + static uint16_t + AmbiguousData(llvm::ArrayRef alternatives) { + return alternatives.size(); + } + Token::Index StartLoc; Kind K : 4; SymbolID Symbol : SymbolBits; @@ -106,6 +111,14 @@ ForestNode::SequenceData(RID, Elements), Elements); } + ForestNode &createAmbiguous(SymbolID symbol, + llvm::ArrayRef alternatives) { + assert(!alternatives.empty()); + return create(ForestNode::Ambiguous, symbol, + alternatives.front()->startLoc(), + ForestNode::AmbiguousData(alternatives), alternatives); + } + void init(const TokenStream &Code); const ForestNode &terminal(Token::Index Index) const { assert(Terminals && "Terminals are not intialized!"); diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h b/clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h --- a/clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h +++ b/clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h @@ -81,9 +81,9 @@ }; // Creates a new node in the graph. - const Node *addNode(LRTable::StateID State, - const ::clang::syntax::pseudo::ForestNode *Symbol, - llvm::ArrayRef Predecessors) { + Node *addNode(LRTable::StateID State, + const ::clang::syntax::pseudo::ForestNode *Symbol, + llvm::ArrayRef Predecessors) { assert(Predecessors.size() < (1 << Node::PredecessorBits) && "Too many predecessors to fit in PredecessorBits!"); ++NodeCount; diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h --- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h @@ -158,6 +158,10 @@ struct Nonterminal { std::string Name; + // Value of topological order of the nonterminal. + // For nonterminals A and B, if A := B (or transitively), then + // A.TopologicalOrder > B.TopologicalOrder. + unsigned TopologicalOrder = 0; // Corresponding rules that construct the non-terminal, it is a [start, end) // index range of the Rules table. struct { diff --git a/clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp b/clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp --- a/clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp +++ b/clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp @@ -18,6 +18,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include +#include #include #define DEBUG_TYPE "GLRParser.cpp" @@ -156,72 +157,135 @@ enumPath(Head, PathLength); } -// Perform reduction recursively until we don't have reduce actions with -// heads. +// Perform reductions recursively until there is no available reductions. +// +// Reductions can manipulate the GSS in following way: +// +// 1) Split -- +// 1.1 when a stack head has mutiple reduce actions, the head is +// made to split to accommodate the various possiblities. +// E.g. +// 0 -> 1 (ID) +// After performing reduce of production rules (class-name := ID, +// enum-name := ID), the GSS now has two new heads: +// 0 -> 2 (class-name) +// `-> 3 (enum-name) +// +// 1.2 when a stack head has a reduce action with multiple bases, the head +// will be split if each base leads to different states. +// E.g. +// ... -> 1(...) -> 3 (INT) +// ^ +// ... -> 2(...) ---| +// +// After the reduce action (simple-type-specifier := INT), the GSS looks +// like: +// ... -> 1(...) -> 4 (simple-type-specifier) +// ... -> 2(...) -> 5 (simple-type-specifier) +// +// 2) Merge -- if multiple heads turn out to be identical after +// reduction (new heads have the same state, and point to the same +// predecessors), these heads are merged and treated as a single head. +// This is usually where ambiguity happens. +// +// E.g. +// 0 -> 2 (class-name) +// ` -> 3 (enum-name) +// After reduction of rules (type-name := class-name | enum-name), the GSS +// has the following form: +// 0 -> 4 (type-name) +// The type-name forest node in the new head 4 is ambiguous, which has two +// parses (type-name -> class-name -> id, type-name -> enum-name -> id). void GLRParser::performReduction(const Token &Lookahead) { if (!ReduceList.empty()) LLVM_DEBUG(llvm::dbgs() << " Performing **Reduce**\n"); - // Reduce can manipulate the GSS in following way: - // - // 1) Split -- - // 1.1 when a stack head has mutiple reduce actions, the head is - // made to split to accommodate the various possiblities. - // E.g. - // 0 -> 1 (ID) - // After performing reduce of production rules (class-name := ID, - // enum-name := ID), the GSS now has two new heads: - // 0 -> 2 (class-name) - // `-> 3 (enum-name) + // Reductions per reduce path. + struct ReduceAction { + std::vector ReducePath; + RuleID ReduceRuleID; + }; + auto OrderCmp = [this](const ReduceAction &L, const ReduceAction &R) { + auto LBegin = L.ReducePath.back()->Parsed->startLoc(); + auto RBegin = R.ReducePath.back()->Parsed->startLoc(); + if (LBegin == RBegin) + return G.table() + .Nonterminals[G.lookupRule(L.ReduceRuleID).Target] + .TopologicalOrder > + G.table() + .Nonterminals[G.lookupRule(R.ReduceRuleID).Target] + .TopologicalOrder; + return LBegin < RBegin; + }; + auto Equal = [this](const ReduceAction &L, const ReduceAction &R) { + return L.ReducePath.back()->Parsed->startLoc() == + R.ReducePath.back()->Parsed->startLoc() && + G.lookupRule(L.ReduceRuleID).Target == + G.lookupRule(R.ReduceRuleID).Target; + }; + + // Forest node is unmutable. To create an amgbiguous forest node, we need to + // know all alternatives in advance. // - // 1.2 when a stack head has a reduce action with multiple reduce - // paths, the head is to split. - // E.g. - // ... -> 1(...) -> 3 (INT) - // ^ - // ... -> 2(...) ---| + // All Reductions must be performed in a careful order, so that we can gather + // all ambiguous alternatives as a batch, and process them as a single pass. // - // After the reduce action (simple-type-specifier := INT), the GSS looks - // like: - // ... -> 1(...) -> 4 (simple-type-specifier) - // ... -> 2(...) -> 5 (simple-type-specifier) + // Reductions is stored in a priority queue with a sorted order according to: + // Rule 1: Reductions which span fewer tokens are processed first; + // Rule 2: If two reductions A := X and B := Y span the same tokens, + // A := X is processed first if topological order of nonterminal + // A is less than nonterminal B (That is to say: if there is a + // production rule B := A in the grammar, the reduction A := X + // should come first because it will enable a new reduction B := A); // - // 2) Merge -- if multiple heads turn out to be identical after - // reduction (new heads have the same state, and point to the same - // predecessors), these heads are merged and treated as a single head. - // This is usually where ambiguity happens. + // Each iteration, we construct a batch from the priority queue. Reductions in + // the batch span the same tokens and reduce to the same nonterminal. // - // E.g. - // 0 -> 2 (class-name) - // ` -> 3 (enum-name) - // After reduction of rules (type-name := class-name | enum-name), the GSS - // has the following form: - // 0 -> 4 (type-name) - // The type-name forest node in the new head 4 is ambiguous, which has two - // parses (type-name -> class-name -> id, type-name -> enum-name -> id). + // Local ambiguity packing (if present) is guaranteed in each batch. + std::priority_queue, + decltype(OrderCmp)> + OrderedReduceList(OrderCmp); + auto addToOrderedReduceList = [&OrderedReduceList, + this](decltype(ReduceList) &ReduceList) { + std::vector ReducePath; + for (const auto &RA : ReduceList) { + RuleID ReduceRuleID = RA.PerformAction->getReduceRule(); + const Rule &ReduceRule = G.lookupRule(ReduceRuleID); + enumerateReducePath(RA.Head, ReduceRule.Size, ReducePath, [&]() { + OrderedReduceList.push({ReducePath, ReduceRuleID}); + }); + } + ReduceList.clear(); + }; + addToOrderedReduceList(ReduceList); - // Store all newly-created stack heads for tracking ambiguities. - std::vector CreatedHeads; - while (!ReduceList.empty()) { - auto RA = std::move(ReduceList.back()); - ReduceList.pop_back(); + while (!OrderedReduceList.empty()) { + std::vector Batch; + do { + Batch.push_back(OrderedReduceList.top()); + OrderedReduceList.pop(); + } while (!OrderedReduceList.empty() && + Equal(OrderedReduceList.top(), Batch.front())); - RuleID ReduceRuleID = RA.PerformAction->getReduceRule(); - const Rule &ReduceRule = G.lookupRule(ReduceRuleID); - LLVM_DEBUG(llvm::dbgs() << llvm::formatv( - " !reduce rule {0}: {1} head: {2}\n", ReduceRuleID, - G.dumpRule(ReduceRuleID), RA.Head->State)); + // newly-created GSS node -> corresponding forest node. + // If there are more thant 1 forest nodes, it means we hit ambiguities. + // Used to assemble the ambiguous forest node at the end. + llvm::DenseMap> + BuiltForestNodes; + // Track whether we hit ambiguities, determined by the equality of + // predecessors. + std::vector CreatedGSSNodes; + + for (const auto &RA : Batch) { + SymbolID ReduceSymbolID = G.lookupRule(RA.ReduceRuleID).Target; + // Create a corresponding sequence forest node for the reduce rule. + std::vector ForestChildren; + for (const Graph::Node *PN : llvm::reverse(RA.ReducePath)) + ForestChildren.push_back(PN->Parsed); + const ForestNode &ForestNode = ParsedForest.createSequence( + ReduceSymbolID, RA.ReduceRuleID, ForestChildren.front()->startLoc(), + ForestChildren); - std::vector ReducePath; - enumerateReducePath(RA.Head, ReduceRule.Size, ReducePath, [&]() { - LLVM_DEBUG( - llvm::dbgs() << llvm::formatv( - " stack path: {0}, bases: {1}\n", - llvm::join(getStateString(ReducePath), " -> "), - llvm::join(getStateString(ReducePath.back()->predecessors()), - ", "))); - assert(ReducePath.size() == ReduceRule.Size && - "Reduce path's length must equal to the reduce rule size"); // A reduce is a back-and-forth operation in the stack. // For example, we reduce a rule "declaration := decl-specifier-seq ;" on // the linear stack: @@ -232,24 +296,26 @@ // // 1. back -- pop |ReduceRuleLength| nodes (ReducePath) in the stack; // 2. forth -- push a new node in the stack and mark it as a head; - // 0 -> 4(declaration) - // ^ Head // - // It becomes tricky if a reduce path has multiple bases, we want to merge - // them if their next state is the same. Similiar to above performShift, - // we partition the bases by their next state, and process each partition - // per loop iteration. + // 0 -> 4(declaration) + // ^ Head + // + // Each RA corresponds to a single reduce path, but a reduce path can have + // multiple Bases, which could split the stack (depends on whether their + // next state is identical). + // Similiar to above `performShift`, we partition the Bases by their + // next state, and process each partition. struct BaseInfo { // An intermediate head after the stack has poped |ReducePath| nodes. const Graph::Node *Base = nullptr; - // The final state after reduce. - // It is getGoToState(Base->State, ReduceSymbol). + // The final state after reduction. + // The value is getGoToState(Base->State, ReduceSymbol). StateID NextState; }; std::vector Bases; - for (const Graph::Node *Base : ReducePath.back()->predecessors()) + for (const Graph::Node *Base : RA.ReducePath.back()->predecessors()) Bases.push_back( - {Base, ParsingTable.getGoToState(Base->State, ReduceRule.Target)}); + {Base, ParsingTable.getGoToState(Base->State, ReduceSymbolID)}); llvm::sort(Bases, [](const BaseInfo &L, const BaseInfo &R) { return std::forward_as_tuple(L.NextState, L.Base) < std::forward_as_tuple(R.NextState, R.Base); @@ -269,44 +335,64 @@ assert(!Batch.empty()); Partition = Partition.drop_front(Batch.size()); + // Not needed, as it is created outside of the partition-loop. + // Create a corresponding sequence forest node for the reduce rule. + // std::vector ForestChildren; + // for (const Graph::Node *PN : llvm::reverse(RA.ReducePath)) + // ForestChildren.push_back(PN->Parsed); + // const ForestNode &ForestNode = ParsedForest.createSequence( + // ReduceSymbolID, RA.ReduceRuleID, + // ForestChildren.front()->startLoc(), ForestChildren); + LLVM_DEBUG(llvm::dbgs() << llvm::formatv( + " after reduce: {0} -> state {1} ({2})\n", + llvm::join(getStateString(Predecessors), ", "), + NextState, G.symbolName(ReduceSymbolID))); + // Check ambiguities. - auto It = llvm::find_if(CreatedHeads, [&](const Graph::Node *Head) { - return Head->Parsed->symbol() == ReduceRule.Target && - Head->predecessors() == llvm::makeArrayRef(Predecessors); - }); - if (It != CreatedHeads.end()) { - // This should be guaranteed by checking the equalivent of - // predecessors and reduce nonterminal symbol! + // FIXME: this is a linear scan, it might be too slow. + auto It = + llvm::find_if(CreatedGSSNodes, [&](const Graph::Node *Created) { + // Guaranteed by the side-effect of partition. + assert(llvm::is_sorted(Created->predecessors()) && + llvm::is_sorted(llvm::makeArrayRef(Predecessors))); + // Guaranteed by the Batch, where all reductions are reduced to + // a same nonterminal. + assert(Created->Parsed->symbol() == ReduceSymbolID); + return Created->predecessors() == + llvm::makeArrayRef(Predecessors); + }); + if (It != CreatedGSSNodes.end()) { + // This is guaranteed by the equality of predecessors and target + // nonterminal of reduction rule! assert(NextState == (*It)->State); LLVM_DEBUG(llvm::dbgs() << llvm::formatv( " found ambiguity, merged in state {0} (forest " "'{1}')\n", - (*It)->State, G.symbolName((*It)->Parsed->symbol()))); - // FIXME: create ambiguous foreset node! + NextState, G.symbolName((*It)->Parsed->symbol()))); + BuiltForestNodes[*It].push_back(&ForestNode); continue; } - // Create a corresponding sequence forest node for the reduce rule. - std::vector ForestChildren; - for (const Graph::Node *PN : llvm::reverse(ReducePath)) - ForestChildren.push_back(PN->Parsed); - const ForestNode &ForestNode = ParsedForest.createSequence( - ReduceRule.Target, RA.PerformAction->getReduceRule(), - ForestChildren.front()->startLoc(), ForestChildren); - LLVM_DEBUG(llvm::dbgs() << llvm::formatv( - " after reduce: {0} -> state {1} ({2})\n", - llvm::join(getStateString(Predecessors), ", "), - NextState, G.symbolName(ReduceRule.Target))); - - // Create a new stack head. - const Graph::Node *Head = - GSS.addNode(NextState, &ForestNode, Predecessors); - CreatedHeads.push_back(Head); + // Create a new GSS node. + Graph::Node *Head = GSS.addNode(NextState, &ForestNode, Predecessors); + CreatedGSSNodes.push_back(Head); + BuiltForestNodes[Head].push_back(&ForestNode); - // Actions that are enabled by this reduce. + // Actions that are enabled by this reduction. addActions(Head, Lookahead); } - }); + } + // We're good to assmeble the ambiguous forest node if any. + for (auto It : BuiltForestNodes) { + if (It.second.size() > 1) { + It.first->Parsed = &ParsedForest.createAmbiguous( + It.second.front()->symbol(), It.getSecond()); + continue; + } + assert(It.first->Parsed == It.getSecond().front()); + } + // OK, now add newly-enabled reductions to the ordered list; + addToOrderedReduceList(ReduceList); } } diff --git a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp --- a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp +++ b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp @@ -100,11 +100,52 @@ ++RulePos; T->Nonterminals[SID].RuleRange = {Start, RulePos}; } + calculateDependencyOrder(T.get()); auto G = std::make_unique(std::move(T)); diagnoseGrammar(*G); return G; } + void calculateDependencyOrder(GrammarTable *T) const { + llvm::DenseMap> DependencyGraph; + for (const auto &Rule : T->Rules) { + // A := B, A depends on B. + if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0])) + DependencyGraph[Rule.Target].insert(Rule.Sequence[0]); + } + std::vector Order; + // Each nonterminal state flows: NotVisited -> Visiting -> Visited. + enum State { + NotVisited, + Visiting, + Visited, + }; + std::vector VisitStates(T->Nonterminals.size(), NotVisited); + std::function DFS = [&](SymbolID SID) -> void { + if (VisitStates[SID] == Visited) + return; + if (VisitStates[SID] == Visiting) { + Diagnostics.push_back( + llvm::formatv("The grammar is cyclic, see symbol {0}\n", + T->Nonterminals[SID].Name)); + return; + } + VisitStates[SID] = Visiting; + auto It = DependencyGraph.find(SID); + if (It != DependencyGraph.end()) { + for (SymbolID Dep : (It->getSecond())) + DFS(Dep); + } + VisitStates[SID] = Visited; + Order.push_back(SID); + }; + for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID) + DFS(ID); + for (size_t I = 0; I < Order.size(); ++I) { + T->Nonterminals[Order[I]].TopologicalOrder = I; + } + } + private: // Text representation of a BNF grammar rule. struct RuleSpec { diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp --- a/clang/tools/clang-pseudo/ClangPseudo.cpp +++ b/clang/tools/clang-pseudo/ClangPseudo.cpp @@ -89,7 +89,7 @@ << " nodes: " << Arena.nodeNum() << "\n"; llvm::outs() << "GSS bytes: " << Parser.getGSS().bytes() << " nodes: " << Parser.getGSS().nodeCount() << "\n"; - // llvm::outs() << Root->DumpRecursive(*G, true); + llvm::outs() << Root->DumpRecursive(*G, false); } } return 0;