diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h b/clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h @@ -0,0 +1,177 @@ +//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// LR parsers are bottom-up parsers -- they scan the input from left to right, +// and collect the right-hand side of a production rule (called handle) on top +// of the stack, then replace (reduce) the handle with the nonterminal defined +// by the production rule. +// +// This file defines LRGraph, a deterministic handle-finding finite-state +// automaton, which is a key component in LR parsers to recognize any of +// handles in the grammar efficiently. We build the LR table (ACTION and GOTO +// Table) based on the LRGraph. +// +// LRGraph can be constructed for any context-free grammars. +// Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but +// interpretation of the FSA is nondeterminsitic -- we might in a state where +// we can continue searching an handle and identify a handle (called +// shift/reduce conflicts), or identify more than one handle (callled +// reduce/reduce conflicts). +// +// LRGraph is a common model for all variants of LR automatons, from the most +// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead +// in making decisions. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H +#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/Hashing.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { + +// An LR item -- a grammar rule with a dot at some position of the body. +// e.g. a production rule A := X Y yields 3 items: +// A := . X Y +// A := X . Y +// A := X Y . +// An item indicates how much of a production rule has been recognized at a +// position (described by dot), for example, A := X . Y indicates that we have +// recognized the X part from the input, and we hope next to see the input +// derivable from Y. +class Item { +public: + static Item start(RuleID ID, const Grammar &G) { + Item I; + I.RID = ID; + I.RuleLength = G.lookupRule(ID).Size; + return I; + } + static Item sentinel(RuleID ID) { + Item I; + I.RID = ID; + return I; + } + + RuleID rule() const { return RID; } + uint8_t dot() const { return DotPos; } + + bool hasNext() const { return DotPos < RuleLength; } + SymbolID next(const Grammar &G) const { + assert(hasNext()); + return G.lookupRule(RID).Sequence[DotPos]; + } + + Item advance() const { + assert(hasNext()); + Item I = *this; + ++I.DotPos; + return I; + } + + std::string dump(const Grammar &G) const; + + bool operator==(const Item &I) const { + return DotPos == I.DotPos && RID == I.RID; + } + bool operator<(const Item &I) const { + return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos); + } + friend llvm::hash_code hash_value(const Item &I) { + return llvm::hash_combine(I.RID, I.DotPos); + } + +private: + RuleID RID = 0; + uint8_t DotPos = 0; + uint8_t RuleLength = 0; // the length of rule body. +}; + +// A state represents a node in the LR automaton graph. It is an item set, which +// contains all possible rules that the LR parser may be parsing in that state. +// +// Conceptually, If we knew in advance what we're parsing, at any point we're +// partway through parsing a production, sitting on a stack of partially parsed +// productions. But because we don't know, there could be *several* productions +// we're partway through. The set of possibilities is the parser state, and we +// precompute all the transitions between these states. +struct State { + // A full set of items (including non-kernel items) representing the state, + // in a canonical order (see SortByNextSymbol in the cpp file). + std::vector Items; + + std::string dump(const Grammar &G, unsigned Indent = 0) const; +}; + +// LRGraph is a deterministic finite state automaton for LR parsing. +// +// Intuitively, an LR automaton is a transition graph. The graph has a +// collection of nodes, called States. Each state corresponds to a particular +// item set, which represents a condition that could occur duing the process of +// parsing a production. Edges are directed from one state to another. Each edge +// is labeled by a grammar symbol (terminal or nonterminal). +// +// LRGraph is used to construct the LR parsing table which is a core +// data-structure driving the LR parser. +class LRGraph { +public: + // StateID is the index in States table. + using StateID = uint16_t; + + // Constructs an LR(0) automaton. + static LRGraph buildLR0(const Grammar &); + + // An edge in the LR graph, it represents a transition in the LR automaton. + // If the parser is at state Src, with a lookahead Label, then it + // transits to state Dst. + struct Edge { + StateID Src, Dst; + SymbolID Label; + }; + + llvm::ArrayRef states() const { return States; } + llvm::ArrayRef edges() const { return Edges; } + + std::string dumpForTests(const Grammar &) const; + +private: + LRGraph(std::vector States, std::vector Edges) + : States(std::move(States)), Edges(std::move(Edges)) {} + + std::vector States; + std::vector Edges; +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +namespace llvm { +// Support clang::syntax::pseudo::Item as DenseMap keys. +template <> struct DenseMapInfo { + static inline clang::syntax::pseudo::Item getEmptyKey() { + return clang::syntax::pseudo::Item::sentinel(-1); + } + static inline clang::syntax::pseudo::Item getTombstoneKey() { + return clang::syntax::pseudo::Item::sentinel(-2); + } + static unsigned getHashValue(const clang::syntax::pseudo::Item &I) { + return hash_value(I); + } + static bool isEqual(const clang::syntax::pseudo::Item &LHS, + const clang::syntax::pseudo::Item &RHS) { + return LHS == RHS; + } +}; +} // namespace llvm + +#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -3,6 +3,7 @@ add_clang_library(clangToolingSyntaxPseudo Grammar.cpp GrammarBNF.cpp + LRGraph.cpp LINK_LIBS clangBasic diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp @@ -0,0 +1,231 @@ +//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" + +using ItemSet = std::vector; + +namespace llvm { +// Support clang::syntax::pseudo::Item as DenseMap keys. +template <> struct DenseMapInfo { + static inline ItemSet getEmptyKey() { + return {DenseMapInfo::getEmptyKey()}; + } + static inline ItemSet getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey()}; + } + static unsigned getHashValue(const ItemSet &I) { + return llvm::hash_combine_range(I.begin(), I.end()); + } + static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) { + return LHS == RHS; + } +}; +} // namespace llvm + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +struct SortByNextSymbol { + SortByNextSymbol(const Grammar &G) : G(G) {} + bool operator()(const Item &L, const Item &R) { + if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G)) + return L.next(G) < R.next(G); + if (L.hasNext() != R.hasNext()) + return L.hasNext() < R.hasNext(); // a trailing dot is minimal. + return L < R; + } + const Grammar &G; +}; + +// Computes a closure of the given item set S: +// - extends the given S to contain all options for parsing next token; +// - nonterminals after a dot are recursively expanded into the begin-state +// of all production rules that produce that nonterminal; +// +// Given +// Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ] +// Input = [ E := . T ] +// returns [ E := . T, T := . n, T := . ( E ) ] +State closure(ItemSet Queue, const Grammar &G) { + llvm::DenseSet InQueue = {Queue.begin(), Queue.end()}; + // We reuse the passed-by-value Queue as the final result, as it's already + // initialized to the right elements. + size_t ItIndex = 0; + while (ItIndex < Queue.size()) { + const Item &ExpandingItem = Queue[ItIndex]; + ++ItIndex; + if (!ExpandingItem.hasNext()) + continue; + + SymbolID NextSym = ExpandingItem.next(G); + if (pseudo::isToken(NextSym)) + continue; + auto RRange = G.table().Nonterminals[NextSym].RuleRange; + for (RuleID RID = RRange.start; RID < RRange.end; ++RID) { + Item NewItem = Item::start(RID, G); + if (InQueue.insert(NewItem).second) // new + Queue.push_back(std::move(NewItem)); + } + } + Queue.shrink_to_fit(); + llvm::sort(Queue, SortByNextSymbol(G)); + return {std::move(Queue)}; +} + +// Returns all next (with a dot advanced) kernel item sets, partitioned by the +// advanced symbol. +// +// Given +// S = [ E := . a b, E := E . - T ] +// returns [ +// {id(a), [ E := a . b ]}, +// {id(-), [ E := E - . T ]} +// ] +std::vector> +nextAvailableKernelItems(const State &S, const Grammar &G) { + std::vector> Results; + llvm::ArrayRef AllItems = S.Items; + AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); }); + while (!AllItems.empty()) { + SymbolID AdvancedSymbol = AllItems.front().next(G); + auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) { + assert(I.hasNext()); + return I.next(G) == AdvancedSymbol; + }); + assert(!Batch.empty()); + AllItems = AllItems.drop_front(Batch.size()); + + // Advance a dot over the Symbol. + ItemSet Next; + for (const Item &I : Batch) + Next.push_back(I.advance()); + // sort the set to keep order determinism for hash computation. + llvm::sort(Next); + Results.push_back({AdvancedSymbol, std::move(Next)}); + } + return Results; +} + +} // namespace + +std::string Item::dump(const Grammar &G) const { + const auto &Rule = G.lookupRule(RID); + auto ToNames = [&](llvm::ArrayRef Syms) { + std::vector Results; + for (auto SID : Syms) + Results.push_back(G.symbolName(SID)); + return Results; + }; + return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target), + llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "), + llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " ")) + .str(); +} + +std::string State::dump(const Grammar &G, unsigned Indent) const { + std::string Result; + llvm::raw_string_ostream OS(Result); + for (const auto &Item : Items) + OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G)); + return OS.str(); +} + +std::string LRGraph::dumpForTests(const Grammar &G) const { + std::string Result; + llvm::raw_string_ostream OS(Result); + OS << "States:\n"; + for (StateID ID = 0; ID < States.size(); ++ID) { + OS << llvm::formatv("State {0}\n", ID); + OS << States[ID].dump(G, /*Indent*/ 4); + } + for (const auto &E : Edges) { + OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label), + E.Dst); + } + return OS.str(); +} + +LRGraph LRGraph::buildLR0(const Grammar &G) { + class Builder { + public: + Builder(const Grammar &G) : G(G) {} + + // Adds a given state if not existed. + std::pair insert(ItemSet KernelItems) { + assert(llvm::is_sorted(KernelItems) && + "Item must be sorted before inserting to a hash map!"); + auto It = StatesIndex.find(KernelItems); + if (It != StatesIndex.end()) + return {It->second, false}; + States.push_back(closure(KernelItems, G)); + StateID NextStateID = States.size() - 1; + StatesIndex.insert({std::move(KernelItems), NextStateID}); + return {NextStateID, true}; + } + + void insertEdge(StateID Src, StateID Dst, SymbolID Label) { + Edges.push_back({Src, Dst, Label}); + } + + // Returns a state with the given id. + const State &find(StateID ID) const { + assert(ID < States.size()); + return States[ID]; + } + + LRGraph build() && { + States.shrink_to_fit(); + Edges.shrink_to_fit(); + return LRGraph(std::move(States), std::move(Edges)); + } + + private: + // Key is the **kernel** item sets. + llvm::DenseMap StatesIndex; + std::vector States; + std::vector Edges; + const Grammar &G; + } Builder(G); + + std::vector PendingStates; + // Initialize states with the start symbol. + auto RRange = G.table().Nonterminals[G.startSymbol()].RuleRange; + for (RuleID RID = RRange.start; RID < RRange.end; ++RID) { + auto StartState = std::vector{Item::start(RID, G)}; + auto Result = Builder.insert(std::move(StartState)); + assert(Result.second && "State must be new"); + PendingStates.push_back(Result.first); + } + + while (!PendingStates.empty()) { + auto CurrentStateID = PendingStates.back(); + PendingStates.pop_back(); + for (auto Next : + nextAvailableKernelItems(Builder.find(CurrentStateID), G)) { + auto Insert = Builder.insert(Next.second); + if (Insert.second) // new state, insert to the pending queue. + PendingStates.push_back(Insert.first); + Builder.insertEdge(CurrentStateID, Insert.first, Next.first); + } + } + return std::move(Builder).build(); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -4,6 +4,7 @@ add_clang_unittest(ClangPseudoTests GrammarTest.cpp + LRGraphTest.cpp ) clang_target_link_libraries(ClangPseudoTests diff --git a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp new file mode 100644 --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp @@ -0,0 +1,84 @@ +//===--- LRGraphTest.cpp - LRGraph tests -------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +TEST(LRGraph, Build) { + struct TestCase { + llvm::StringRef BNF; + llvm::StringRef ExpectedStates; + }; + + TestCase Cases[] = {{ + R"bnf( +_ := expr +expr := IDENTIFIER + )bnf", + R"(States: +State 0 + _ := • expr + expr := • IDENTIFIER +State 1 + _ := expr • +State 2 + expr := IDENTIFIER • +0 ->[expr] 1 +0 ->[IDENTIFIER] 2 +)"}, + {// A grammar with a S/R conflict in SLR table: + // (id-id)-id, or id-(id-id). + R"bnf( +_ := expr +expr := expr - expr # S/R conflict at state 4 on '-' token +expr := IDENTIFIER + )bnf", + R"(States: +State 0 + _ := • expr + expr := • expr - expr + expr := • IDENTIFIER +State 1 + _ := expr • + expr := expr • - expr +State 2 + expr := IDENTIFIER • +State 3 + expr := • expr - expr + expr := expr - • expr + expr := • IDENTIFIER +State 4 + expr := expr - expr • + expr := expr • - expr +0 ->[expr] 1 +0 ->[IDENTIFIER] 2 +1 ->[-] 3 +3 ->[expr] 4 +3 ->[IDENTIFIER] 2 +4 ->[-] 3 +)"}}; + for (const auto &C : Cases) { + std::vector Diags; + auto G = Grammar::parseBNF(C.BNF, Diags); + ASSERT_THAT(Diags, testing::IsEmpty()); + auto LR0 = LRGraph::buildLR0(*G); + EXPECT_EQ(LR0.dumpForTests(*G), C.ExpectedStates); + } +} + +} // namespace +} // namespace pseudo +} // namespace syntax +} // namespace clang