diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h --- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h @@ -38,6 +38,7 @@ #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include #include @@ -123,6 +124,17 @@ llvm::ArrayRef lookupRules(SymbolID SID) const; const Rule &lookupRule(RuleID RID) const; + // Returns all symbols. + std::vector symbols() const; + + // Returns the SymbolID of the augmented symbol '_' + SymbolID augmentedSymbol() const { return 0; } + + // Computes the FIRST(X) for all non-terminal symbol X. + std::vector> firstSets() const; + // Computes the FOLLOW(X) for all non-terminal symbol X. + std::vector> followSets() const; + // Get symbol (terminal or non-terminal) name. // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator). llvm::StringRef symbolName(SymbolID) const; @@ -153,7 +165,7 @@ // The rules are sorted (and thus grouped) by target symbol. // RuleID is the index of the vector. std::vector Rules; - // A table of terminals (aka tokens). It correspond to the clang::Token. + // A table of terminals (aka tokens). It corresponds to the clang::Token. // clang::tok::TokenKind is the index of the table. std::vector Terminals; // A table of nonterminals, sorted by name. diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/LRAutomaton.h b/clang/include/clang/Tooling/Syntax/Pseudo/LRAutomaton.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/LRAutomaton.h @@ -0,0 +1,119 @@ +//===--- LRAutomaton.h - Build an LR automaton ------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRAUTOMATON_H +#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRAUTOMATON_H + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "llvm/ADT/Hashing.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { + +// An LR item -- a grammar rule with a dot at some position of the body. +// e.g. a production rule A := xy yields 3 items: +// A := .xy +// A := x.y +// A := xy. +// An item indicates how much of a production rule has been recognized at a +// position (described by dot), for example, A := x.y indicates that we have +// recognized the x part from the input, and we hope next to see the input +// derivable from y. +class Item { +public: + Item(RuleID ID, uint8_t RuleLength, uint8_t DotPos) + : RID(ID), RuleLength(RuleLength), DotPos(DotPos) {} + + bool hasNext() const { return DotPos < RuleLength; } + SymbolID next(const Grammar &G) const { + assert(hasNext()); + return rule(G).Sequence[DotPos]; + } + + Item advance() const { + assert(hasNext()); + return Item(RID, RuleLength, DotPos + 1); + } + const pseudo::Rule &rule(const Grammar &G) const { return G.lookupRule(RID); } + RuleID ruleID() const { return RID; } + int dotPos() const { return DotPos; } + std::string dump(const Grammar &G) const; + + bool operator==(const Item &I) const { + return DotPos == I.DotPos && RID == I.RID; + } + bool operator<(const Item &I) const { + return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos); + } + friend llvm::hash_code hash_value(const Item &I) { + return llvm::hash_combine(I.RID, I.DotPos); + } + +private: + RuleID RID; + uint8_t RuleLength; // the length of rule body. + uint8_t DotPos; +}; + +using ItemSet = std::set; +// A state of the LR automaton is an item set, which contains all possible rules +// that the LR parser may be parsing in that state. +// +// Conceptually, If we knew in advance what we're parsing, at any point we're +// partway through parsing a production, sitting on a stack of partially parsed +// productions. But because we don't know, there could be *several* productions +// we're partway through. The set of possibilities is the parser state, and we +// precompute all the transitions between these states. +using State = ItemSet; +llvm::hash_code hash_code(const State &S); +std::string dump(const State &S, const Grammar &G); + +// A deterministic finite state automaton for LR parsing. +// +// Intuitively, an LR automaton is a transition graph. The graph has a +// collection of nodes, called States. Each state corresponds to a particular +// item set, which represents a condition that could occur duing the process of +// parsing a production. Edges are directed from one state to another. Each edge +// is labeled by a grammar symbol (terminal or non-terminal). +// +// The LR automaton is used to construct the LR parsing table (See comments in +// LRTable) which is used to drive the LR parser. +class LRAutomaton { +public: + // Constructs an LR(0) automaton. + static LRAutomaton build(const Grammar &); + + // Edge represents a transition in the LR automaton. + // If the parser is at state From, with a lookahead symbol Input, then it + // transit to state To. + struct Edge { + StateID From, To; + SymbolID Input; + }; + + llvm::ArrayRef states() const { return States; } + llvm::ArrayRef edges() const { return Edges; } + + std::string dumpForTests(const Grammar &) const; + +private: + LRAutomaton(std::vector States, std::vector Edges) + : States(std::move(States)), Edges(std::move(Edges)) {} + + std::vector States; + std::vector Edges; +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRAUTOMATON_H diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h @@ -0,0 +1,147 @@ +//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_TABLE_H +#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_TABLE_H + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/ArrayRef.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { + +using StateID = uint16_t; + +// A dense (thus memory-efficient) implementation of the LR parsing table. +// +// The LR parsing table is used to drive LR parsing. It is being consulted by LR +// parsers to make parsing decision. Conceptually, the parsing table answers the +// question "Given a lookahead token and current state on top of the stack, what +// is the next step?". +// +// The LR parsing table is constructed from a LR automaton, and it has two +// kinds of entries: +// - Action(I, a) specifies the next action (shift/reduce/accept) on state I +// under a terminal a. +// - GoTo(I, X) specifies a transition from state I under a non-terminal X. +// +// Unlike a typical LR parsing table, it allows conflicted actions -- an Action +// entry can have multiple available actions. +class LRTable { +public: + struct Action { + enum class Kind : uint8_t { + // Action table + Error = 0, + Shift, + Reduce, + Accept, + // GoTo table + GoTo, + }; + + bool isReduce() const { return K == Kind::Reduce; } + bool isShift() const { return K == Kind::Shift; } + bool isGoTo() const { return K == Kind::GoTo; } + bool isAccept() const { return K == Kind::Accept; } + StateID shift() const { + assert(isShift()); + return Shift; + } + StateID goTo() const { + assert(isGoTo()); + return GoToState; + } + RuleID reduce() const { + assert(isReduce()); + return Rule; + } + + bool operator==(const Action &L) const { + if (K != L.K) + return false; + switch (K) { + case Kind::Accept: + case Kind::Error: + return true; + case Kind::Shift: + return Shift == L.Shift; + case Kind::Reduce: + return Rule == L.Rule; + case Kind::GoTo: + return GoToState == L.GoToState; + } + llvm_unreachable("Unhandled Action::Kind"); + } + + Kind K = Kind::Error; + // Value + union { + StateID Shift; // For shift action + RuleID Rule; // For reduce action + StateID GoToState; // For go-to + }; + }; + + // Returns all available actions for the state From on a terminal. + llvm::ArrayRef actions(StateID From, SymbolID Token) const { + assert(pseudo::isToken(Token)); + auto Actions = find({From, Token}); + assert(Actions.size() > 0); + return Actions; + } + + // Returns the transisted state for the state From on a nonterminal. + StateID goTo(StateID From, SymbolID NonTerminal) const { + assert(pseudo::isNonterminal(NonTerminal)); + auto Goto = find({From, NonTerminal}); + assert(Goto.size() == 1 && Goto.front().isGoTo()); + return Goto.front().goTo(); + } + + size_t bytes() const { + return sizeof(*this) + Actions.capacity() * sizeof(Action) + + Index.capacity() * sizeof(IndexKey); + } + + std::string dumpStatistics() const; + std::string dumpForTests(const Grammar &G) const; + + // Build a SLR(1) parsing table, see the Dragron book. + static LRTable build(const Grammar &G); + +private: + struct IndexKey { + StateID From; + SymbolID Next; + friend bool operator==(const IndexKey &L, const IndexKey &R) { + return std::tie(L.From, L.Next) == std::tie(R.From, R.Next); + } + friend bool operator<(const IndexKey &L, const IndexKey &R) { + return std::tie(L.From, L.Next) < std::tie(R.From, R.Next); + } + }; + static_assert(sizeof(IndexKey) <= 4, "expected to be <= 4 bytes"); + + // Looks up available actions for the given Key. + // Returns empty if no available actions in the table. + llvm::ArrayRef find(IndexKey Key) const; + + // A sorted table of lookup keys, lookup results are stored in Actions. + std::vector Index; // Sorted + // A flat table of available actions, in parallel with Index. + std::vector Actions; +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_TABLE_H diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -3,6 +3,8 @@ add_clang_library(clangSyntaxPseudo Grammar.cpp GrammarBNF.cpp + LRTable.cpp + LRBuilder.cpp LINK_LIBS clangBasic diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp --- a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp +++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -35,6 +36,87 @@ return T->Rules[RID]; } +std::vector> Grammar::firstSets() const { + std::vector> FirstSets(T->Nonterminals.size()); + auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) { + assert(isNonterminal(Target)); + if (isToken(First)) + return FirstSets[Target].insert(First).second; + bool Changed = false; + for (SymbolID SID : FirstSets[First]) + Changed |= FirstSets[Target].insert(SID).second; + return Changed; + }; + + bool Changed = false; + do { + Changed = false; + for (const auto &R : T->Rules) { + assert(R.size() > 0); + Changed |= ExpandFirstSet(R.target(), R.seq().front()); + } + } while (Changed); + + return FirstSets; +} + +std::vector> Grammar::followSets() const { + auto FirstSets = firstSets(); + std::vector> FollowSets(T->Nonterminals.size()); + // Expand the follow set of a non-terminal symbol Y by adding all from the + // given symbol set. + auto ExpandFollowSet = [&FollowSets](SymbolID Y, + const llvm::DenseSet &ToAdd) { + assert(isNonterminal(Y)); + bool Changed = false; + for (SymbolID F : ToAdd) + Changed |= FollowSets[Y].insert(F).second; + return Changed; + }; + // Follow sets is computed based on the following 3 rules, the computation + // is completed at a fixed point where there is no more new symbols can be + // added to any of the follow sets. + // The algorithm doesn't consider nullable-nonterminal case, as we prohibit + // nullable-symbol in the grammar. + // + // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol. + FollowSets[augmentedSymbol()].insert(tokenSymbol(tok::eof)); + bool Changed = false; + do { + Changed = false; + for (const auto &R : T->Rules) { + // Rule 2: for a rule X := ...Yz, we add all symbols from FIRST(z) to + // FOLLOW(Y). + for (size_t i = 0; i + 1 < R.seq().size(); ++i) { + if (isToken(R.seq()[i])) + continue; + SymbolID Next = R.seq()[i + 1]; + // First set for a non-terminal is itself. + auto First = + isToken(Next) ? llvm::DenseSet{Next} : FirstSets[Next]; + Changed |= ExpandFollowSet(R.seq()[i], First); + } + SymbolID Z = R.seq().back(); + if (!isNonterminal(Z)) + continue; + // Rule 3: for a rule X := ....Z, we add all symbols from FOLLOW(X) to + // FOLLOW(Z). + Changed |= ExpandFollowSet(Z, FollowSets[R.target()]); + } + } while (Changed); + return FollowSets; +} + +std::vector Grammar::symbols() const { + std::vector Syms; + for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) + Syms.push_back(tokenSymbol(static_cast(Terminal))); + for (SymbolID Nonterminal = 0; Nonterminal < T->Nonterminals.size(); + ++Nonterminal) + Syms.push_back(Nonterminal); + return Syms; +} + llvm::StringRef Grammar::symbolName(SymbolID SID) const { if (isToken(SID)) return T->Terminals[symbolToToken(SID)]; diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRBuilder.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRBuilder.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/LRBuilder.cpp @@ -0,0 +1,240 @@ +//===--- LRBuilder.cpp - Build LRAutomaton and parsing tables ----*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/LRAutomaton.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { + +std::string Item::dump(const Grammar &G) const { + const auto &Rule = G.lookupRule(RID); + auto ToNames = [&](llvm::ArrayRef Syms) { + std::vector Results; + for (auto SID : Syms) + Results.push_back(G.symbolName(SID)); + return Results; + }; + return llvm::formatv("{0} := {1} • {2}\n", G.symbolName(Rule.target()), + llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "), + llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " ")) + .str(); +} + +llvm::hash_code hash_code(const State &S) { + return llvm::hash_combine_range(S.begin(), S.end()); +} + +std::string dump(const State &S, const Grammar &G) { + std::string Result; + llvm::raw_string_ostream OS(Result); + for (const auto &Item : S) + OS << llvm::formatv(" {0}", Item.dump(G)); + return OS.str(); +} + +// Computes a closure of the given item set S: +// - extends the given S to contain all options for parsing next token; +// - nonterminals after a dot are recursively expanded into the begin-state +// of all production rules that produce that nonterminal; +// +// Given +// Grammar rules = [ _ := E, E := E-T, E := T, T := n, T := (E) ] +// Input = [ E := .T ] +// returns [ E := .T, T := .n, T := .(E) ] +static State closure(ItemSet IS, const Grammar &G) { + std::vector ProcessingItems = {IS.begin(), IS.end()}; + while (!ProcessingItems.empty()) { + auto ProcessingItem = std::move(ProcessingItems.back()); + ProcessingItems.pop_back(); + + if (!ProcessingItem.hasNext()) + continue; + SymbolID NextSym = ProcessingItem.next(G); + + if (pseudo::isToken(NextSym)) + continue; + auto RRange = G.table().Nonterminals[NextSym].RuleRange; + for (RuleID RID = RRange.start; RID < RRange.end; ++RID) { + Item NewItem(RID, G.lookupRule(RID).size(), /*dot*/ 0); + if (IS.insert(NewItem).second) // new + ProcessingItems.push_back(std::move(NewItem)); + } + } + return IS; +} +// For all items A := a.Xb in S, returns a closure itemset with a dot +// advance of X. +static State advance(const State &S, SymbolID X, const Grammar &G) { + ItemSet AdvanceX; + for (const Item &I : S) { + if (I.hasNext() && I.next(G) == X) + AdvanceX.insert(I.advance()); + } + return closure(std::move(AdvanceX), G); +} + +std::string LRAutomaton::dumpForTests(const Grammar &G) const { + std::string Result; + llvm::raw_string_ostream OS(Result); + + OS << "States:\n"; + for (StateID ID = 0; ID < States.size(); ++ID) { + OS << llvm::formatv("State {0}\n", ID); + OS << dump(States[ID], G); + } + return OS.str(); +} + +LRAutomaton LRAutomaton::build(const Grammar &G) { + class Builder { + public: + // Adds a given state if not existed. + std::pair insert(State S) { + auto Hash = hash_code(S); + auto It = StatesIndex.find(Hash); + if (It != StatesIndex.end()) + return {It->second, false}; // no insertion took place. + States.push_back(std::move(S)); + StatesIndex.insert({Hash, States.size() - 1}); + return {States.size() - 1, true}; + } + void insertEdge(StateID From, StateID To, SymbolID Input) { + Edges.push_back({From, To, Input}); + } + // Returns a state with the given id. + const State &find(StateID ID) const { + assert(ID < States.size()); + return States[ID]; + } + + LRAutomaton build() && { + States.resize(States.size()); + Edges.resize(Edges.size()); + return LRAutomaton(std::move(States), std::move(Edges)); + } + + private: + llvm::DenseMap StatesIndex; + std::vector States; + std::vector Edges; + } Builder; + + std::vector PendingStates; + // Initialize with the augmented symbol states. + auto RRange = G.table().Nonterminals[G.augmentedSymbol()].RuleRange; + for (RuleID RID = RRange.start; RID < RRange.end; ++RID) { + const auto &Rule = G.lookupRule(RID); + assert(Rule.size() == 1 && + "augmented symbol rule must be _ := start_symbol."); + auto StartState = closure({{{RID, Rule.size(), 0}}}, G); + auto Result = Builder.insert(std::move(StartState)); + assert(Result.second && "State must be new"); + PendingStates.push_back(Result.first); + } + + while (!PendingStates.empty()) { + auto CurrentStateID = PendingStates.back(); + PendingStates.pop_back(); + for (SymbolID Move : G.symbols()) { + State NextState = advance(Builder.find(CurrentStateID), Move, G); + if (NextState.empty()) + continue; + auto Insert = Builder.insert(NextState); + if (Insert.second) // new state, insert to the processing. + PendingStates.push_back(Insert.first); + Builder.insertEdge(CurrentStateID, Insert.first, Move); + } + } + return std::move(Builder).build(); +} + +LRTable LRTable::build(const Grammar &G) { + class Builder { + public: + void addAction(StateID From, SymbolID Next, LRTable::Action New) { + auto It = Index.find({From, Next}); + if (It == Index.end()) { + StorageActions.push_back(New); + Index.insert({{From, Next}, {StorageActions.size() - 1}}); + return; + } + // Avoid duplicated actions. + if (llvm::any_of(It->second, [&](size_t index) { + return StorageActions[index] == New; + })) + return; + StorageActions.push_back(New); + It->second.push_back(StorageActions.size() - 1); + } + LRTable build() && { + LRTable Table; + Table.Actions.reserve(StorageActions.size()); + Table.Index.reserve(StorageActions.size()); + for (auto It : Index) { + for (auto Pos : It.second) { + Table.Actions.push_back(StorageActions[Pos]); + Table.Index.push_back(It.first); + } + } + assert(llvm::is_sorted(Table.Index)); + return Table; + } + + private: + std::vector StorageActions; + // We want the iteration to be ordered. + std::map> Index; + } Builder; + + auto Automaton = LRAutomaton::build(G); + for (const auto &T : Automaton.edges()) + Builder.addAction(T.From, T.Input, + LRTable::Action{isToken(T.Input) + ? LRTable::Action::Kind::Shift + : LRTable::Action::Kind::GoTo, + {T.To}}); + + auto FollowSets = G.followSets(); + for (StateID SID = 0; SID < Automaton.states().size(); ++SID) { + for (const Item &I : Automaton.states()[SID]) { + // If "_ := start_symbol ." in the State, set Action[State, eof] to + // accept. + if (G.lookupRule(I.ruleID()).target() == G.augmentedSymbol() && + !I.hasNext()) { + Builder.addAction(SID, tokenSymbol(tok::eof), + LRTable::Action{LRTable::Action::Kind::Accept, {}}); + continue; + } + if (!I.hasNext()) { + // If a state i has an item "A := x.", set Action[i, a] to reduce + // "A := x" for all a in FOLLOW(A). + for (SymbolID Follow : FollowSets[I.rule(G).target()]) { + assert(isToken(Follow)); + Builder.addAction( + SID, Follow, + LRTable::Action{LRTable::Action::Kind::Reduce, {I.ruleID()}}); + } + } + } + } + return std::move(Builder).build(); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp @@ -0,0 +1,79 @@ +//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace syntax { +namespace pseudo { + +std::string LRTable::dumpStatistics() const { + StateID NumOfStates = 0; + for (auto It : Index) + NumOfStates = std::max(It.From, NumOfStates); + return llvm::formatv(R"( +Statistics of the LR parsing table: + number of states: {0} + number of actions: {1} + size of the table (bytes): {2} +)", + NumOfStates, Actions.size(), bytes()) + .str(); +} + +std::string LRTable::dumpForTests(const Grammar &G) const { + std::string Result; + llvm::raw_string_ostream OS(Result); + StateID MaxState = 0; + for (const auto &It : Index) + MaxState = std::max(MaxState, It.From); + OS << "LRTable:\n"; + for (StateID S = 0; S <= MaxState; ++S) { + OS << llvm::formatv("State {0}\n", S); + for (SymbolID ID : G.symbols()) { + for (auto A : find({S, ID})) { + if (A.isShift()) + OS << llvm::formatv(" '{0}': shift, and go to state {1}\n", + G.symbolName(ID), A.shift()); + else if (A.isReduce()) + OS << llvm::formatv(" '{0}': reduce by rule {1} '{2}'\n", + G.symbolName(ID), A.reduce(), + G.dumpRule(A.reduce())); + else if (A.isAccept()) + OS << llvm::formatv(" '{0}': accept\n", G.symbolName(ID)); + } + } + for (SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID) { + if (find({S, ID}).empty()) + continue; + OS << llvm::formatv(" '{0}': go to state {1}\n", G.symbolName(ID), + goTo(S, ID)); + } + } + return OS.str(); +} + +llvm::ArrayRef LRTable::find(IndexKey Key) const { + assert(Index.size() == Actions.size()); + auto It = llvm::partition_point( + Index, [&Key](const IndexKey &E) { return E < Key; }); + if (It == Index.end()) + return {}; + size_t Start = It - Index.begin(); + auto End = Start; + while (End < Index.size() && Index[End] == Key) + ++End; + return llvm::makeArrayRef(&Actions[Start], &Actions[End]); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp --- a/clang/tools/clang-pseudo/ClangPseudo.cpp +++ b/clang/tools/clang-pseudo/ClangPseudo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" @@ -42,6 +43,8 @@ return 2; } llvm::errs() << llvm::formatv("grammar file {0} is parsed successfully\n", CheckGrammar); + llvm::errs() + << clang::syntax::pseudo::LRTable::build(*RSpecs).dumpStatistics(); return 0; } return 0; diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -5,6 +5,7 @@ add_custom_target(ClangPseudoUnitTests) add_unittest(ClangPseudoUnitTests ClangPseudoTests GrammarTests.cpp + LRBuilderTests.cpp ) clang_target_link_libraries(ClangPseudoTests diff --git a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp --- a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp +++ b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp @@ -1,4 +1,4 @@ -//===--- GrammarTests.cpp - grammar tests ------------------------*- C++-*-===// +//===--- GrammarTests.cpp - grammar tests -----------------------*- C++-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -19,6 +19,7 @@ using testing::AllOf; using testing::ElementsAre; using testing::IsEmpty; +using testing::Pair; using testing::UnorderedElementsAre; MATCHER_P(TargetID, SID, "") { return arg.target() == SID; } @@ -34,7 +35,7 @@ G = Grammar::parseBNF(BNF, Diags); } - SymbolID lookup(llvm::StringRef Name) const { + SymbolID id(llvm::StringRef Name) const { for (unsigned I = 0; I < NumTerminals; ++I) if (G->table().Terminals[I] == Name) return tokenSymbol(static_cast(I)); @@ -53,10 +54,10 @@ build("expression := IDENTIFIER + expression # comment"); EXPECT_THAT(Diags, IsEmpty()); - auto ExpectedRule = AllOf(TargetID(lookup("expression")), - Sequence(lookup("IDENTIFIER"), lookup("+"), - lookup("expression"))); - auto ExpressionID = lookup("expression"); + auto ExpectedRule = + AllOf(TargetID(id("expression")), + Sequence(id("IDENTIFIER"), id("+"), id("expression"))); + auto ExpressionID = id("expression"); // auto RuleID = 0; EXPECT_EQ(G->symbolName(ExpressionID), "expression"); EXPECT_THAT(G->lookupRules(ExpressionID), UnorderedElementsAre(ExpectedRule)); @@ -71,11 +72,10 @@ build("_ := CONST_opt INT ;_opt"); EXPECT_THAT(Diags, IsEmpty()); EXPECT_THAT(G->table().Rules, - UnorderedElementsAre( - Sequence(lookup("INT")), - Sequence(lookup("CONST"), lookup("INT")), - Sequence(lookup("CONST"), lookup("INT"), lookup(";")), - Sequence(lookup("INT"), lookup(";")))); + UnorderedElementsAre(Sequence(id("INT")), + Sequence(id("CONST"), id("INT")), + Sequence(id("CONST"), id("INT"), id(";")), + Sequence(id("INT"), id(";")))); } TEST_F(GrammarTest, Diagnostics) { @@ -97,6 +97,66 @@ "No rules for nonterminal: IDENFIFIE")); } +TEST_F(GrammarTest, FirstAndFollow) { + build( + R"bnf( +_ := expr +expr := expr - term +expr := term +term := IDENTIFIER +term := ( expr ) +)bnf"); + ASSERT_TRUE(Diags.empty()); + auto ToPair = [](std::vector> Input) { + std::vector>> Sets; + for (SymbolID ID = 0; ID < Input.size(); ++ID) + Sets.emplace_back(ID, std::move(Input[ID])); + return Sets; + }; + + EXPECT_THAT( + ToPair(G->firstSets()), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), + Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), + Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("("))))); + EXPECT_THAT( + ToPair(G->followSets()), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("EOF"))), + Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))), + Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))))); + + build(R"bnf( +# A simplfied C++ decl-specifier-seq. +_ := decl-specifier-seq +decl-specifier-seq := decl-specifier decl-specifier-seq +decl-specifier-seq := decl-specifier +decl-specifier := simple-type-specifier +decl-specifier := INLINE +simple-type-specifier := INT + )bnf"); + ASSERT_TRUE(Diags.empty()); + EXPECT_THAT( + ToPair(G->firstSets()), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))), + Pair(id("decl-specifier-seq"), + UnorderedElementsAre(id("INLINE"), id("INT"))), + Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))), + Pair(id("decl-specifier"), + UnorderedElementsAre(id("INLINE"), id("INT"))))); + EXPECT_THAT( + ToPair(G->followSets()), + UnorderedElementsAre( + Pair(id("_"), UnorderedElementsAre(id("EOF"))), + Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))), + Pair(id("decl-specifier"), + UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))), + Pair(id("simple-type-specifier"), + UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))))); +} + } // namespace } // namespace pseudo } // namespace syntax diff --git a/clang/unittests/Tooling/Syntax/Pseudo/LRBuilderTests.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRBuilderTests.cpp new file mode 100644 --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/LRBuilderTests.cpp @@ -0,0 +1,112 @@ +//===--- LRBuilderTests.cpp --------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRAutomaton.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +TEST(TableTest, BuildLRTable) { + struct TestCase { + llvm::StringRef BNF; + llvm::StringRef ExpectedStates; + llvm::StringRef ExpectedTable; + }; + + TestCase Cases[] = { + { + R"bnf( +_ := expr +expr := IDENTIFIER + )bnf", + R"(States: +State 0 + _ := • expr + expr := • IDENTIFIER +State 1 + expr := IDENTIFIER • +State 2 + _ := expr • +)", + R"(LRTable: +State 0 + 'IDENTIFIER': shift, and go to state 1 + 'expr': go to state 2 +State 1 + 'EOF': reduce by rule 1 'expr := IDENTIFIER' +State 2 + 'EOF': accept +)", + }, + { + // A grammar with a S/R conflict in SLR table: (id-id)-id, or + // id-(id-id). + R"bnf( +_ := expr +expr := expr - expr # S/R conflict at state 4 on '-' token +expr := IDENTIFIER + )bnf", + R"(States: +State 0 + _ := • expr + expr := • IDENTIFIER + expr := • expr - expr +State 1 + expr := IDENTIFIER • +State 2 + _ := expr • + expr := expr • - expr +State 3 + expr := • IDENTIFIER + expr := • expr - expr + expr := expr - • expr +State 4 + expr := expr • - expr + expr := expr - expr • +)", + R"(LRTable: +State 0 + 'IDENTIFIER': shift, and go to state 1 + 'expr': go to state 2 +State 1 + 'EOF': reduce by rule 1 'expr := IDENTIFIER' + '-': reduce by rule 1 'expr := IDENTIFIER' +State 2 + 'EOF': accept + '-': shift, and go to state 3 +State 3 + 'IDENTIFIER': shift, and go to state 1 + 'expr': go to state 4 +State 4 + 'EOF': reduce by rule 2 'expr := expr - expr' + '-': shift, and go to state 3 + '-': reduce by rule 2 'expr := expr - expr' +)", + }, + }; + for (const auto &C : Cases) { + std::vector Diags; + auto G = Grammar::parseBNF(C.BNF, Diags); + ASSERT_THAT(Diags, testing::IsEmpty()); + auto Automaton = LRAutomaton::build(*G); + auto Table = LRTable::build(*G); + EXPECT_EQ(Automaton.dumpForTests(*G), C.ExpectedStates); + EXPECT_EQ(Table.dumpForTests(*G), C.ExpectedTable); + } +} + +} // namespace +} // namespace pseudo +} // namespace syntax +} // namespace clang