diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h --- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h @@ -154,6 +154,8 @@ // It can be constructed dynamically (from compiling BNF file) or statically // (a compiled data-source). struct GrammarTable { + GrammarTable(); + struct Nonterminal { std::string Name; // Corresponding rules that construct the non-terminal, it is a [start, end) @@ -169,7 +171,7 @@ std::vector Rules; // A table of terminals (aka tokens). It corresponds to the clang::Token. // clang::tok::TokenKind is the index of the table. - std::vector Terminals; + llvm::ArrayRef Terminals; // A table of nonterminals, sorted by name. // SymbolID is the index of the table. std::vector Nonterminals; diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h @@ -0,0 +1,182 @@ +//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The LRTable (referred as LR parsing table in the LR literature) is the core +// component in LR parsers, it drives the LR parsers by specifying an action to +// take given the current state on the top of the stack and the current +// lookahead token. +// +// The LRTable can be described as a matrix where the rows represent +// the states of the LR graph, the columns represent the symbols of the +// grammar, and each entry of the matrix (called action) represents a +// state transition in the graph. +// +// Typically, based on the category of the grammar symbol, the LRTable is +// broken into two logically separate tables: +// - ACTION table with terminals as columns -- e.g ACTION[S, a] specifies +// next action (shift/reduce/accept/error) on state S under a lookahead +// terminal a +// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specify +// the state which we transist to from the state S with the nonterminal X +// +// LRTable is *performance-critial* as it is consulted frequently during a +// parse. In general, LRTable is very sparse (most of the entries are empty). +// For example, for the C++ language, the SLR table has ~1500 states and 650 +// symbols which results in a matrix having 975K entries, ~90% of entries are +// empty. +// +// This file implements a speed-and-space-efficient LRTable. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H +#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/ArrayRef.h" +#include +#include + +namespace clang { +namespace syntax { +namespace pseudo { + +// Represents the LR parsing table, which can efficiently the question "what is +// the next step given the lookahead token and current state on top of the +// stack?". +// +// This is a dense implementation, which only takes an amount of space that is +// proportional to the number of non-empty entries in the table. +// +// Unlike the typical LR parsing table which allows at most one available action +// per entry, conflicted actions are allowed in LRTable. The LRTable is designed +// to be used in nondeterministic LR parsers (e.g. GLR). +class LRTable { +public: + // StateID is only 13 bits wide. + using StateID = uint16_t; + static constexpr unsigned StateBits = 13; + + // Action represents the terminal and nonterminal actions, it combines the + // entry of the ACTION and GOTO tables from the LR literature. + class Action { + public: + enum Kind : uint8_t { + Sentinel = 0, + // Terminal actions, corresponding to entries of ACTION table. + + // Shift to state n: move forward with the lookahead, and push state n + // onto the state stack. + // A shift is a forward transition, and the value n is the next state that + // the parser is to enter. + Shift, + // Reduce by a rule: pop the state stack. + Reduce, + // Signals that we have parsed the input successfully. + Accept, + + // Nonterminal actions, corresponding to entry of GOTO table. + + // Go to state n: push state n onto the state stack. + // Similar to Shift, but it is a nonterminal forward transition. + GoTo, + }; + + static Action accept(RuleID RID) { return Action(Accept, RID); } + static Action goTo(StateID S) { return Action(GoTo, S); } + static Action shift(StateID S) { return Action(Shift, S); } + static Action reduce(RuleID RID) { return Action(Reduce, RID); } + static Action sentinel() { return Action(Sentinel, 0); } + + StateID getShiftState() const { + assert(kind() == Shift); + return Value; + } + StateID getGoToState() const { + assert(kind() == GoTo); + return Value; + } + RuleID getReduceRule() const { + assert(kind() == Reduce); + return Value; + } + Kind kind() const { return static_cast(K); } + + bool operator==(const Action &L) const { return opaque() == L.opaque(); } + uint16_t opaque() const { return K << ValueBits | Value; }; + + private: + Action(Kind K1, unsigned Value) : K(K1), Value(Value) {} + static constexpr unsigned ValueBits = StateBits; + static constexpr unsigned KindBits = 3; + static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID"); + static_assert(KindBits + ValueBits <= 16, + "Must be able to store kind and value efficiently"); + uint16_t K : KindBits; + // Either StateID or RuleID, depending on the Kind. + uint16_t Value : ValueBits; + }; + + // Returns all available actions for the given state on a terminal. + // Expected to be called by LR parsers. + llvm::ArrayRef getActions(StateID State, SymbolID Terminal) const; + // Returns the state after we reduce a nonterminal. + // Expected to be called by LR parsers. + StateID getGoToState(StateID State, SymbolID Nonterminal) const; + + // Looks up available actions. + // Returns empty if no available actions in the table. + llvm::ArrayRef find(StateID State, SymbolID Symbol) const; + + size_t bytes() const { + return sizeof(*this) + Actions.capacity() * sizeof(Action) + + States.capacity() * sizeof(StateID) + + NontermOffset.capacity() * sizeof(uint32_t) + + TerminalOffset.capacity() * sizeof(uint32_t); + } + + std::string dumpStatistics() const; + std::string dumpForTests(const Grammar &G) const; + + // Build a SLR(1) parsing table. + static LRTable buildSLR(const Grammar &G); + + class Builder; + // Represents an entry in the table, used for building the LRTable. + struct Entry { + StateID State; + SymbolID Symbol; + Action Act; + }; + // Build a specifid table for testing purposes. + static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef); + +private: + // Conceptually the LR table is a multimap from (State, SymbolID) => Action. + // Our physical representation is quite different for compactness. + + // Index is nonterminal SymbolID, value is the offset into States/Actions + // where the entries for this nonterminal begin. + // Give a non-terminal id, the corresponding half-open range of StateIdx is + // [NontermIdx[id], NontermIdx[id+1]). + std::vector NontermOffset; + // Similar to NontermOffset, but for terminals, index is tok::TokenKind. + std::vector TerminalOffset; + // Parallel to Actions, the value is State (rows of the matrix). + // Grouped by the SymbolID, and only subranges are sorted. + std::vector States; + // A flat list of available actions, sorted by (SymbolID, State). + std::vector Actions; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &); + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -4,7 +4,9 @@ Grammar.cpp GrammarBNF.cpp LRGraph.cpp - + LRTable.cpp + LRTableBuild.cpp + LINK_LIBS clangBasic clangLex diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp --- a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp +++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp @@ -163,6 +163,23 @@ return FollowSets; } +static llvm::ArrayRef getTerminalNames() { + static const std::vector *TerminalNames = []() { + static std::vector TerminalNames; + TerminalNames.reserve(NumTerminals); + for (unsigned I = 0; I < NumTerminals; ++I) { + tok::TokenKind K = static_cast(I); + if (const auto *Punc = tok::getPunctuatorSpelling(K)) + TerminalNames.push_back(Punc); + else + TerminalNames.push_back(llvm::StringRef(tok::getTokenName(K)).upper()); + } + return &TerminalNames; + }(); + return *TerminalNames; +} +GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {} + } // namespace pseudo } // namespace syntax } // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp --- a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp +++ b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp @@ -21,17 +21,6 @@ static const llvm::StringRef OptSuffix = "_opt"; static const llvm::StringRef StartSymbol = "_"; -void initTerminals(std::vector &Out) { - Out.clear(); - Out.reserve(NumTerminals); - for (unsigned I = 0; I < NumTerminals; ++I) { - tok::TokenKind K = static_cast(I); - if (const auto *Punc = tok::getPunctuatorSpelling(K)) - Out.push_back(Punc); - else - Out.push_back(llvm::StringRef(tok::getTokenName(K)).upper()); - } -} // Builds grammar from BNF files. class GrammarBuilder { public: @@ -53,7 +42,6 @@ "Optional symbols should be eliminated!"); auto T = std::make_unique(); - initTerminals(T->Terminals); // Assemble the name->ID and ID->nonterminal name maps. llvm::DenseSet UniqueNonterminals; diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp @@ -0,0 +1,124 @@ +//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace syntax { +namespace pseudo { + +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) { + switch (A.kind()) { + case LRTable::Action::Shift: + return OS << llvm::formatv("shift state {0}", A.getShiftState()); + case LRTable::Action::Reduce: + return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule()); + case LRTable::Action::GoTo: + return OS << llvm::formatv("go to state {0}", A.getGoToState()); + case LRTable::Action::Accept: + return OS << "acc"; + case LRTable::Action::Sentinel: + llvm_unreachable("unexpected Sentinel action kind!"); + } +} + +std::string LRTable::dumpStatistics() const { + StateID NumOfStates = 0; + for (StateID It : States) + NumOfStates = std::max(It, NumOfStates); + return llvm::formatv(R"( +Statistics of the LR parsing table: + number of states: {0} + number of actions: {1} + size of the table (bytes): {2} +)", + NumOfStates, Actions.size(), bytes()) + .str(); +} + +std::string LRTable::dumpForTests(const Grammar &G) const { + std::string Result; + llvm::raw_string_ostream OS(Result); + StateID MaxState = 0; + for (StateID It : States) + MaxState = std::max(MaxState, It); + OS << "LRTable:\n"; + for (StateID S = 0; S <= MaxState; ++S) { + OS << llvm::formatv("State {0}\n", S); + for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) { + SymbolID TokID = tokenSymbol(static_cast(Terminal)); + for (auto A : find(S, TokID)) { + if (A.kind() == LRTable::Action::Shift) + OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n", + G.symbolName(TokID), A.getShiftState()); + else if (A.kind() == LRTable::Action::Reduce) + OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n", + G.symbolName(TokID), A.getReduceRule(), + G.dumpRule(A.getReduceRule())); + else if (A.kind() == LRTable::Action::Accept) + OS.indent(4) << llvm::formatv("'{0}': accept\n", G.symbolName(TokID)); + } + } + for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size(); + ++NontermID) { + if (find(S, NontermID).empty()) + continue; + OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n", + G.symbolName(NontermID), + getGoToState(S, NontermID)); + } + } + return OS.str(); +} + +llvm::ArrayRef LRTable::getActions(StateID State, + SymbolID Terminal) const { + assert(pseudo::isToken(Terminal) && "expect terminal symbol!"); + return find(State, Terminal); +} + +LRTable::StateID LRTable::getGoToState(StateID State, + SymbolID Nonterminal) const { + assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!"); + auto Result = find(State, Nonterminal); + assert(Result.size() == 1 && Result.front().kind() == Action::GoTo); + return Result.front().getGoToState(); +} + +llvm::ArrayRef LRTable::find(StateID Src, SymbolID ID) const { + size_t Idx = isToken(ID) ? symbolToToken(ID) : ID; + assert(isToken(ID) ? Idx + 1 < TerminalOffset.size() + : Idx + 1 < NontermOffset.size()); + std::pair TargetStateRange = + isToken(ID) ? std::make_pair(TerminalOffset[Idx], TerminalOffset[Idx + 1]) + : std::make_pair(NontermOffset[Idx], NontermOffset[Idx + 1]); + auto TargetedStates = + llvm::makeArrayRef(States.data() + TargetStateRange.first, + States.data() + TargetStateRange.second); + + assert(llvm::is_sorted(TargetedStates) && + "subrange of the StateIdx should be sorted!"); + const LRTable::StateID *It = llvm::partition_point( + TargetedStates, [&Src](LRTable::StateID S) { return S < Src; }); + if (It == TargetedStates.end()) + return {}; + size_t Start = It - States.data(), End = Start; + while (End < States.size() && States[End] == Src) + ++End; + return llvm::makeArrayRef(&Actions[Start], &Actions[End]); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp @@ -0,0 +1,143 @@ +//===--- LRTableBuild.cpp - Build a LRTable from LRGraph ---------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/TokenKinds.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include + +namespace llvm { +template <> struct DenseMapInfo { + using Entry = clang::syntax::pseudo::LRTable::Entry; + static inline Entry getEmptyKey() { + static Entry E{static_cast(-1), 0, + clang::syntax::pseudo::LRTable::Action::sentinel()}; + return E; + } + static inline Entry getTombstoneKey() { + static Entry E{static_cast(-2), 0, + clang::syntax::pseudo::LRTable::Action::sentinel()}; + return E; + } + static unsigned getHashValue(const Entry &I) { + return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque()); + } + static bool isEqual(const Entry &LHS, const Entry &RHS) { + return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol && + LHS.Act == RHS.Act; + } +}; +} // namespace llvm + +namespace clang { +namespace syntax { +namespace pseudo { + +class LRTable::Builder { +public: + bool insert(Entry E) { return Entries.insert(std::move(E)).second; } + LRTable build(const GrammarTable >) && { + // E.g. given the following parsing table with 3 states and 3 terminals: + // + // a b c + // +-------+----+-------+-+ + // |state0 | | s0,r0 | | + // |state1 | acc| | | + // |state2 | | r1 | | + // +-------+----+-------+-+ + // + // The final LRTable: + // - TerminalOffset: [a] = 0, [b] = 1, [c] = 4, [d] = 4 (d is a sentinel) + // - States: [ 1, 0, 0, 2] + // Actions: [ acc, s0, r0, r1] + // ~~~ corresponding range for terminal a + // ~~~~~~~~~~ corresponding range for terminal b + // First step, we sort all entries by (Symbol, State, Action). + std::vector Sorted(Entries.begin(), Entries.end()); + llvm::sort(Sorted, [](const Entry &L, const Entry &R) { + return std::forward_as_tuple(L.Symbol, L.State, L.Act.opaque()) < + std::forward_as_tuple(R.Symbol, R.State, R.Act.opaque()); + }); + + LRTable Table; + Table.Actions.reserve(Sorted.size()); + Table.States.reserve(Sorted.size()); + // We are good to finalize the States and Actions. + for (const auto &E : Sorted) { + Table.Actions.push_back(E.Act); + Table.States.push_back(E.State); + } + // Initialize the terminal and nonterminal idx, all ranges are empty by + // default. + Table.TerminalOffset = std::vector(GT.Terminals.size() + 1, 0); + Table.NontermOffset = std::vector(GT.Nonterminals.size() + 1, 0); + size_t SortedIndex = 0; + for (SymbolID NonterminalID = 0; NonterminalID < Table.NontermOffset.size(); + ++NonterminalID) { + Table.NontermOffset[NonterminalID] = SortedIndex; + while (SortedIndex < Sorted.size() && + Sorted[SortedIndex].Symbol == NonterminalID) + ++SortedIndex; + } + for (size_t Terminal = 0; Terminal < Table.TerminalOffset.size(); + ++Terminal) { + Table.TerminalOffset[Terminal] = SortedIndex; + while (SortedIndex < Sorted.size() && + Sorted[SortedIndex].Symbol == + tokenSymbol(static_cast(Terminal))) + ++SortedIndex; + } + return Table; + } + +private: + llvm::DenseSet Entries; +}; + +LRTable LRTable::buildForTests(const GrammarTable >, + llvm::ArrayRef Entries) { + Builder Build; + for (const Entry &E : Entries) + Build.insert(E); + return std::move(Build).build(GT); +} + +LRTable LRTable::buildSLR(const Grammar &G) { + Builder Build; + auto Graph = LRGraph::buildLR0(G); + for (const auto &T : Graph.edges()) { + Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst); + Build.insert({T.Src, T.Label, Act}); + } + assert(Graph.states().size() <= (1 << StateBits) && + "Graph states execceds the maximum limit!"); + auto FollowSets = followSets(G); + for (StateID SID = 0; SID < Graph.states().size(); ++SID) { + for (const Item &I : Graph.states()[SID].Items) { + // If we've just parsed the start symbol, we can accept the input. + if (G.lookupRule(I.rule()).Target == G.startSymbol() && !I.hasNext()) { + Build.insert({SID, tokenSymbol(tok::eof), Action::accept(I.rule())}); + continue; + } + if (!I.hasNext()) { + // If we've reached the end of a rule A := ..., then we can reduce if + // the next token is in the follow set of A". + for (SymbolID Follow : FollowSets[G.lookupRule(I.rule()).Target]) { + assert(isToken(Follow)); + Build.insert({SID, Follow, Action::reduce(I.rule())}); + } + } + } + } + return std::move(Build).build(G.table()); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/test/Syntax/check-cxx-bnf.test b/clang/test/Syntax/check-cxx-bnf.test --- a/clang/test/Syntax/check-cxx-bnf.test +++ b/clang/test/Syntax/check-cxx-bnf.test @@ -1,2 +1,2 @@ // verify clang/lib/Tooling/Syntax/Pseudo/cxx.bnf -// RUN: clang-pseudo -check-grammar=%cxx-bnf-file +// RUN: clang-pseudo -grammar=%cxx-bnf-file diff --git a/clang/test/Syntax/lr-build-basic.test b/clang/test/Syntax/lr-build-basic.test new file mode 100644 --- /dev/null +++ b/clang/test/Syntax/lr-build-basic.test @@ -0,0 +1,24 @@ +_ := expr +expr := IDENTIFIER + +# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH +# GRAPH: States: +# GRPAH-NEXT: State 0 +# GRPAH-NEXT: _ := • expr +# GRPAH-NEXT: expr := • IDENTIFIER +# GRPAH-NEXT: State 1 +# GRPAH-NEXT: _ := expr • +# GRPAH-NEXT: State 2 +# GRPAH-NEXT: expr := IDENTIFIER • +# GRPAH-NEXT: 0 ->[expr] 1 +# GRPAH-NEXT: 0 ->[IDENTIFIER] 2 + +# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE +# TABLE: LRTable: +# TABLE-NEXT: State 0 +# TABLE-NEXT: 'IDENTIFIER': shift state 2 +# TABLE-NEXT: 'expr': go to state 1 +# TABLE-NEXT: State 1 +# TABLE-NEXT: 'EOF': accept +# TABLE-NEXT: State 2 +# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := IDENTIFIER' diff --git a/clang/test/Syntax/lr-build-conflicts.test b/clang/test/Syntax/lr-build-conflicts.test new file mode 100644 --- /dev/null +++ b/clang/test/Syntax/lr-build-conflicts.test @@ -0,0 +1,47 @@ +_ := expr +expr := expr - expr # S/R conflict at state 4 on '-' token +expr := IDENTIFIER + +# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH +# GRAPH: States +# GRAPH-NEXT: State 0 +# GRAPH-NEXT: _ := • expr +# GRAPH-NEXT: expr := • expr - expr +# GRAPH-NEXT: expr := • IDENTIFIER +# GRAPH-NEXT: State 1 +# GRAPH-NEXT: _ := expr • +# GRAPH-NEXT: expr := expr • - expr +# GRAPH-NEXT: State 2 +# GRAPH-NEXT: expr := IDENTIFIER • +# GRAPH-NEXT: State 3 +# GRAPH-NEXT: expr := • expr - expr +# GRAPH-NEXT: expr := expr - • expr +# GRAPH-NEXT: expr := • IDENTIFIER +# GRAPH-NEXT: State 4 +# GRAPH-NEXT: expr := expr - expr • +# GRAPH-NEXT: expr := expr • - expr +# GRAPH-NEXT: 0 ->[expr] 1 +# GRAPH-NEXT: 0 ->[IDENTIFIER] 2 +# GRAPH-NEXT: 1 ->[-] 3 +# GRAPH-NEXT: 3 ->[expr] 4 +# GRAPH-NEXT: 3 ->[IDENTIFIER] 2 +# GRAPH-NEXT: 4 ->[-] 3 + +# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE +# TABLE: LRTable: +# TABLE-NEXT: State 0 +# TABLE-NEXT: 'IDENTIFIER': shift state 2 +# TABLE-NEXT: 'expr': go to state 1 +# TABLE-NEXT: State 1 +# TABLE-NEXT: 'EOF': accept +# TABLE-NEXT: '-': shift state 3 +# TABLE-NEXT: State 2 +# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := IDENTIFIER' +# TABLE-NEXT: '-': reduce by rule 1 'expr := IDENTIFIER' +# TABLE-NEXT: State 3 +# TABLE-NEXT: 'IDENTIFIER': shift state 2 +# TABLE-NEXT: 'expr': go to state 4 +# TABLE-NEXT: State 4 +# TABLE-NEXT: 'EOF': reduce by rule 2 'expr := expr - expr' +# TABLE-NEXT: '-': shift state 3 +# TABLE-NEXT: '-': reduce by rule 2 'expr := expr - expr' diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp --- a/clang/tools/clang-pseudo/ClangPseudo.cpp +++ b/clang/tools/clang-pseudo/ClangPseudo.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" @@ -18,30 +20,45 @@ using llvm::cl::opt; static opt - CheckGrammar("check-grammar", desc("Parse and check a BNF grammar file."), - init("")); + Grammar("grammar", desc("Parse and check a BNF grammar file."), init("")); +static opt PrintGraph("print-graph", + desc("Print the LR graph for the grammar")); +static opt PrintTable("print-table", + desc("Print the LR table for the grammar")); + +static std::string readOrDie(llvm::StringRef Path) { + llvm::ErrorOr> Text = + llvm::MemoryBuffer::getFile(Path); + if (std::error_code EC = Text.getError()) { + llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message() + << "\n"; + ::exit(1); + } + return Text.get()->getBuffer().str(); +} int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); - if (CheckGrammar.getNumOccurrences()) { - llvm::ErrorOr> Text = - llvm::MemoryBuffer::getFile(CheckGrammar); - if (std::error_code EC = Text.getError()) { - llvm::errs() << "Error: can't read grammar file '" << CheckGrammar - << "': " << EC.message() << "\n"; - return 1; - } + if (Grammar.getNumOccurrences()) { + std::string Text = readOrDie(Grammar); std::vector Diags; - auto RSpecs = Grammar::parseBNF(Text.get()->getBuffer(), Diags); + auto G = Grammar::parseBNF(Text, Diags); if (!Diags.empty()) { llvm::errs() << llvm::join(Diags, "\n"); return 2; } - llvm::errs() << llvm::formatv("grammar file {0} is parsed successfully\n", - CheckGrammar); + llvm::outs() << llvm::formatv("grammar file {0} is parsed successfully\n", + Grammar); + if (PrintGraph) + llvm::outs() << clang::syntax::pseudo::LRGraph::buildLR0(*G).dumpForTests( + *G); + if (PrintTable) + llvm::outs() << clang::syntax::pseudo::LRTable::buildSLR(*G).dumpForTests( + *G); return 0; } + return 0; } diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -4,7 +4,7 @@ add_clang_unittest(ClangPseudoTests GrammarTest.cpp - LRGraphTest.cpp + LRTableTest.cpp ) clang_target_link_libraries(ClangPseudoTests diff --git a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp deleted file mode 100644 --- a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp +++ /dev/null @@ -1,84 +0,0 @@ -//===--- LRGraphTest.cpp - LRGraph tests -------------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include - -namespace clang { -namespace syntax { -namespace pseudo { -namespace { - -TEST(LRGraph, Build) { - struct TestCase { - llvm::StringRef BNF; - llvm::StringRef ExpectedStates; - }; - - TestCase Cases[] = {{ - R"bnf( -_ := expr -expr := IDENTIFIER - )bnf", - R"(States: -State 0 - _ := • expr - expr := • IDENTIFIER -State 1 - _ := expr • -State 2 - expr := IDENTIFIER • -0 ->[expr] 1 -0 ->[IDENTIFIER] 2 -)"}, - {// A grammar with a S/R conflict in SLR table: - // (id-id)-id, or id-(id-id). - R"bnf( -_ := expr -expr := expr - expr # S/R conflict at state 4 on '-' token -expr := IDENTIFIER - )bnf", - R"(States: -State 0 - _ := • expr - expr := • expr - expr - expr := • IDENTIFIER -State 1 - _ := expr • - expr := expr • - expr -State 2 - expr := IDENTIFIER • -State 3 - expr := • expr - expr - expr := expr - • expr - expr := • IDENTIFIER -State 4 - expr := expr - expr • - expr := expr • - expr -0 ->[expr] 1 -0 ->[IDENTIFIER] 2 -1 ->[-] 3 -3 ->[expr] 4 -3 ->[IDENTIFIER] 2 -4 ->[-] 3 -)"}}; - for (const auto &C : Cases) { - std::vector Diags; - auto G = Grammar::parseBNF(C.BNF, Diags); - ASSERT_THAT(Diags, testing::IsEmpty()); - auto LR0 = LRGraph::buildLR0(*G); - EXPECT_EQ(LR0.dumpForTests(*G), C.ExpectedStates); - } -} - -} // namespace -} // namespace pseudo -} // namespace syntax -} // namespace clang diff --git a/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp new file mode 100644 --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp @@ -0,0 +1,56 @@ +//===--- LRTableTest.cpp - ---------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +using testing::IsEmpty; +using testing::UnorderedElementsAre; +using Action = LRTable::Action; + +TEST(LRTable, Builder) { + GrammarTable GTable; + + // eof semi ... + // +-------+----+-------+--- + // |state0 | | s0,r0 |... + // |state1 | acc| |... + // |state2 | | r1 |... + // +-------+----+-------+--- + std::vector Entries = { + {/* State */ 0, tokenSymbol(tok::semi), Action::shift(0)}, + {/* State */ 0, tokenSymbol(tok::semi), Action::reduce(0)}, + {/* State */ 1, tokenSymbol(tok::eof), Action::accept(2)}, + {/* State */ 2, tokenSymbol(tok::semi), Action::reduce(1)}}; + GrammarTable GT; + LRTable T = LRTable::buildForTests(GT, Entries); + EXPECT_THAT(T.find(0, tokenSymbol(tok::eof)), IsEmpty()); + EXPECT_THAT(T.find(0, tokenSymbol(tok::semi)), + UnorderedElementsAre(Action::shift(0), Action::reduce(0))); + EXPECT_THAT(T.find(1, tokenSymbol(tok::eof)), + UnorderedElementsAre(Action::accept(2))); + EXPECT_THAT(T.find(1, tokenSymbol(tok::semi)), IsEmpty()); + EXPECT_THAT(T.find(2, tokenSymbol(tok::semi)), + UnorderedElementsAre(Action::reduce(1))); + // Verify the behaivor for other non-available-actions terminals. + EXPECT_THAT(T.find(2, tokenSymbol(tok::kw_int)), IsEmpty()); +} + +} // namespace +} // namespace pseudo +} // namespace syntax +} // namespace clang