diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h @@ -39,6 +39,7 @@ #include "clang-pseudo/grammar/Grammar.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Support/Capacity.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -64,65 +65,6 @@ using StateID = uint16_t; static constexpr unsigned StateBits = 13; - // Action represents the terminal and nonterminal actions, it combines the - // entry of the ACTION and GOTO tables from the LR literature. - // - // FIXME: as we move away from a homogeneous table structure shared between - // action types, this class becomes less useful. Remove it. - class Action { - public: - enum Kind : uint8_t { - Sentinel = 0, - // Terminal actions, corresponding to entries of ACTION table. - - // Shift to state n: move forward with the lookahead, and push state n - // onto the state stack. - // A shift is a forward transition, and the value n is the next state that - // the parser is to enter. - Shift, - - // NOTE: there are no typical accept actions in the LRtable, accept - // actions are handled specifically in the parser -- if the parser - // reaches to a target state (which is goto(StartState, StartSymbol)) at - // the EOF token after a reduce, this indicates the input has been parsed - // as the StartSymbol successfully. - - // Nonterminal actions, corresponding to entry of GOTO table. - - // Go to state n: push state n onto the state stack. - // Similar to Shift, but it is a nonterminal forward transition. - GoTo, - }; - - static Action goTo(StateID S) { return Action(GoTo, S); } - static Action shift(StateID S) { return Action(Shift, S); } - static Action sentinel() { return Action(Sentinel, 0); } - - StateID getShiftState() const { - assert(kind() == Shift); - return Value; - } - StateID getGoToState() const { - assert(kind() == GoTo); - return Value; - } - Kind kind() const { return static_cast(K); } - - bool operator==(const Action &L) const { return opaque() == L.opaque(); } - uint16_t opaque() const { return K << ValueBits | Value; }; - - private: - Action(Kind K1, unsigned Value) : K(K1), Value(Value) {} - static constexpr unsigned ValueBits = StateBits; - static constexpr unsigned KindBits = 3; - static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID"); - static_assert(KindBits + ValueBits <= 16, - "Must be able to store kind and value efficiently"); - uint16_t K : KindBits; - // Either StateID or RuleID, depending on the Kind. - uint16_t Value : ValueBits; - }; - // Returns the state after we reduce a nonterminal. // Expected to be called by LR parsers. // If the nonterminal is invalid here, returns None. @@ -184,20 +126,27 @@ // Build a SLR(1) parsing table. static LRTable buildSLR(const Grammar &G); - struct Builder; - // Represents an entry in the table, used for building the LRTable. - struct Entry { - StateID State; - SymbolID Symbol; - Action Act; - }; - struct ReduceEntry { - StateID State; - RuleID Rule; + // Helper for building a table with specified actions/states. + struct Builder { + Builder() = default; + Builder(const Grammar &G) { + NumNonterminals = G.table().Nonterminals.size(); + FollowSets = followSets(G); + } + + unsigned int NumNonterminals = 0; + // States representing `_ := . start` for various start symbols. + std::vector> StartStates; + // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`. + // Key is (initial state, D), value is final state. + llvm::DenseMap, StateID> Transition; + // Reductions available in a given state. + llvm::DenseMap> Reduce; + // FollowSets[NT] is the set of terminals that can follow the nonterminal. + std::vector> FollowSets; + + LRTable build() &&; }; - // Build a specifid table for testing purposes. - static LRTable buildForTests(const Grammar &G, llvm::ArrayRef, - llvm::ArrayRef); private: unsigned numStates() const { return ReduceOffset.size() - 1; } @@ -300,7 +249,6 @@ // as an index: Nonterminal * NUM_TOKENS + Token. llvm::BitVector FollowSets; }; -llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &); } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp @@ -11,25 +11,12 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" namespace clang { namespace pseudo { -llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) { - switch (A.kind()) { - case LRTable::Action::Shift: - return OS << llvm::formatv("shift state {0}", A.getShiftState()); - case LRTable::Action::GoTo: - return OS << llvm::formatv("go to state {0}", A.getGoToState()); - case LRTable::Action::Sentinel: - llvm_unreachable("unexpected Sentinel action kind!"); - } - llvm_unreachable("unexpected action kind!"); -} - std::string LRTable::dumpStatistics() const { return llvm::formatv(R"( Statistics of the LR parsing table: diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp --- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp @@ -13,109 +13,67 @@ #include "llvm/ADT/SmallSet.h" #include -namespace llvm { -template <> struct DenseMapInfo { - using Entry = clang::pseudo::LRTable::Entry; - static inline Entry getEmptyKey() { - static Entry E{static_cast(-1), 0, - clang::pseudo::LRTable::Action::sentinel()}; - return E; - } - static inline Entry getTombstoneKey() { - static Entry E{static_cast(-2), 0, - clang::pseudo::LRTable::Action::sentinel()}; - return E; - } - static unsigned getHashValue(const Entry &I) { - return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque()); - } - static bool isEqual(const Entry &LHS, const Entry &RHS) { - return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol && - LHS.Act == RHS.Act; - } -}; -} // namespace llvm - namespace clang { namespace pseudo { -struct LRTable::Builder { - std::vector> StartStates; - llvm::DenseSet Entries; - llvm::DenseMap> Reduces; - std::vector> FollowSets; +LRTable LRTable::Builder::build() && { + assert(NumNonterminals != 0 && "Set NumNonterminals or init with grammar"); + LRTable Table; - LRTable build(unsigned NumStates, unsigned NumNonterminals) && { - LRTable Table; - Table.StartStates = std::move(StartStates); + // Count number of states: every state has to be reachable somehow. + StateID MaxState = 0; + for (const auto &Entry : StartStates) + MaxState = std::max(MaxState, Entry.second); + for (const auto &Entry : Transition) + MaxState = std::max(MaxState, Entry.second); + unsigned NumStates = MaxState + 1; - // Compile the goto and shift actions into transition tables. - llvm::DenseMap Gotos; - llvm::DenseMap Shifts; - for (const auto &E : Entries) { - if (E.Act.kind() == Action::Shift) - Shifts.try_emplace(shiftIndex(E.State, E.Symbol, NumStates), - E.Act.getShiftState()); - else if (E.Act.kind() == Action::GoTo) - Gotos.try_emplace(gotoIndex(E.State, E.Symbol, NumStates), - E.Act.getGoToState()); - } - Table.Shifts = TransitionTable(Shifts, NumStates * NumTerminals); - Table.Gotos = TransitionTable(Gotos, NumStates * NumNonterminals); + Table.StartStates = std::move(StartStates); - // Compile the follow sets into a bitmap. - Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size()); - for (SymbolID NT = 0; NT < FollowSets.size(); ++NT) - for (SymbolID Follow : FollowSets[NT]) - Table.FollowSets.set(NT * tok::NUM_TOKENS + symbolToToken(Follow)); + // Compile the goto and shift actions into transition tables. + llvm::DenseMap Gotos; + llvm::DenseMap Shifts; + for (const auto &E : Transition) { + if (isToken(E.first.second)) + Shifts.try_emplace(shiftIndex(E.first.first, E.first.second, NumStates), + E.second); + else + Gotos.try_emplace(gotoIndex(E.first.first, E.first.second, NumStates), + E.second); + } + Table.Shifts = TransitionTable(Shifts, NumStates * NumTerminals); + Table.Gotos = TransitionTable(Gotos, NumStates * NumNonterminals); - // Store the reduce actions in a vector partitioned by state. - Table.ReduceOffset.reserve(NumStates + 1); - std::vector StateRules; - for (StateID S = 0; S < NumStates; ++S) { - Table.ReduceOffset.push_back(Table.Reduces.size()); - auto It = Reduces.find(S); - if (It == Reduces.end()) - continue; - Table.Reduces.insert(Table.Reduces.end(), It->second.begin(), - It->second.end()); - std::sort(Table.Reduces.begin() + Table.ReduceOffset.back(), - Table.Reduces.end()); - } - Table.ReduceOffset.push_back(Table.Reduces.size()); + // Compile the follow sets into a bitmap. + Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size()); + for (SymbolID NT = 0; NT < FollowSets.size(); ++NT) + for (SymbolID Follow : FollowSets[NT]) + Table.FollowSets.set(NT * tok::NUM_TOKENS + symbolToToken(Follow)); - return Table; + // Store the reduce actions in a vector partitioned by state. + Table.ReduceOffset.reserve(NumStates + 1); + std::vector StateRules; + for (StateID S = 0; S < NumStates; ++S) { + Table.ReduceOffset.push_back(Table.Reduces.size()); + auto It = Reduce.find(S); + if (It == Reduce.end()) + continue; + Table.Reduces.insert(Table.Reduces.end(), It->second.begin(), + It->second.end()); + std::sort(Table.Reduces.begin() + Table.ReduceOffset.back(), + Table.Reduces.end()); } -}; + Table.ReduceOffset.push_back(Table.Reduces.size()); -LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef Entries, - llvm::ArrayRef Reduces) { - StateID MaxState = 0; - for (const auto &Entry : Entries) { - MaxState = std::max(MaxState, Entry.State); - if (Entry.Act.kind() == LRTable::Action::Shift) - MaxState = std::max(MaxState, Entry.Act.getShiftState()); - if (Entry.Act.kind() == LRTable::Action::GoTo) - MaxState = std::max(MaxState, Entry.Act.getGoToState()); - } - Builder Build; - Build.Entries.insert(Entries.begin(), Entries.end()); - for (const ReduceEntry &E : Reduces) - Build.Reduces[E.State].insert(E.Rule); - Build.FollowSets = followSets(G); - return std::move(Build).build(/*NumStates=*/MaxState + 1, - G.table().Nonterminals.size()); + return Table; } LRTable LRTable::buildSLR(const Grammar &G) { auto Graph = LRGraph::buildLR0(G); - Builder Build; + Builder Build(G); Build.StartStates = Graph.startStates(); - for (const auto &T : Graph.edges()) { - Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst); - Build.Entries.insert({T.Src, T.Label, Act}); - } - Build.FollowSets = followSets(G); + for (const auto &T : Graph.edges()) + Build.Transition.try_emplace({T.Src, T.Label}, T.Dst); assert(Graph.states().size() <= (1 << StateBits) && "Graph states execceds the maximum limit!"); // Add reduce actions. @@ -129,11 +87,10 @@ if (!I.hasNext()) // If we've reached the end of a rule A := ..., then we can reduce if // the next token is in the follow set of A. - Build.Reduces[SID].insert(I.rule()); + Build.Reduce[SID].insert(I.rule()); } } - return std::move(Build).build(Graph.states().size(), - G.table().Nonterminals.size()); + return std::move(Build).build(); } } // namespace pseudo diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -30,7 +30,7 @@ namespace { -using Action = LRTable::Action; +using StateID = LRTable::StateID; using testing::AllOf; using testing::ElementsAre; using testing::UnorderedElementsAre; @@ -122,13 +122,11 @@ /*Parents=*/{GSSNode0}); buildGrammar({}, {}); // Create a fake empty grammar. - TestLang.Table = - LRTable::buildForTests(TestLang.G, /*Entries=*/{ - {1, tokenSymbol(tok::semi), Action::shift(4)}, - {2, tokenSymbol(tok::semi), Action::shift(4)}, - {3, tokenSymbol(tok::semi), Action::shift(5)}, - }, - {}); + LRTable::Builder B(TestLang.G); + B.Transition[{StateID{1}, tokenSymbol(tok::semi)}] = StateID{4}; + B.Transition[{StateID{2}, tokenSymbol(tok::semi)}] = StateID{4}; + B.Transition[{StateID{3}, tokenSymbol(tok::semi)}] = StateID{5}; + TestLang.Table = std::move(B).build(); ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0); std::vector NewHeads; @@ -152,17 +150,12 @@ // └--3(enum-name) // 3 is goto(0, enum-name) buildGrammar({"class-name", "enum-name"}, {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"}); - - TestLang.Table = LRTable::buildForTests( - TestLang.G, - { - {/*State=*/0, id("class-name"), Action::goTo(2)}, - {/*State=*/0, id("enum-name"), Action::goTo(3)}, - }, - { - {/*State=*/1, ruleFor("class-name")}, - {/*State=*/1, ruleFor("enum-name")}, - }); + LRTable::Builder B(TestLang.G); + B.Transition[{StateID{0}, id("class-name")}] = StateID{2}; + B.Transition[{StateID{0}, id("enum-name")}] = StateID{3}; + B.Reduce[StateID{1}].insert(ruleFor("class-name")); + B.Reduce[StateID{1}].insert(ruleFor("enum-name")); + TestLang.Table = std::move(B).build(); const auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); @@ -202,15 +195,12 @@ /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1), /*Parents=*/{GSSNode2, GSSNode3}); - TestLang.Table = LRTable::buildForTests( - TestLang.G, - { - {/*State=*/2, id("ptr-operator"), Action::goTo(/*NextState=*/5)}, - {/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)}, - }, - { - {/*State=*/4, ruleFor("ptr-operator")}, - }); + LRTable::Builder B(TestLang.G); + B.Transition[{StateID{2}, id("ptr-operator")}] = StateID{5}; + B.Transition[{StateID{3}, id("ptr-operator")}] = StateID{6}; + B.Reduce[StateID{4}].insert(ruleFor("ptr-operator")); + TestLang.Table = std::move(B).build(); + std::vector Heads = {GSSNode4}; glrReduce(Heads, tokenSymbol(tok::eof), {TestLang.G, TestLang.Table, Arena, GSStack}); @@ -256,16 +246,13 @@ /*Parents=*/{GSSNode2}); // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! - TestLang.Table = LRTable::buildForTests( - TestLang.G, - { - {/*State=*/1, id("type-name"), Action::goTo(/*NextState=*/5)}, - {/*State=*/2, id("type-name"), Action::goTo(/*NextState=*/5)}, - }, - { - {/*State=*/3, /* type-name := class-name */ 0}, - {/*State=*/4, /* type-name := enum-name */ 1}, - }); + LRTable::Builder B(TestLang.G); + B.Transition[{StateID{1}, id("type-name")}] = StateID{5}; + B.Transition[{StateID{2}, id("type-name")}] = StateID{5}; + B.Reduce[StateID{3}].insert(/* type-name := class-name */ RuleID{0}); + B.Reduce[StateID{4}].insert(/* type-name := enum-name */ RuleID{1}); + TestLang.Table = std::move(B).build(); + std::vector Heads = {GSSNode3, GSSNode4}; glrReduce(Heads, tokenSymbol(tok::eof), {TestLang.G, TestLang.Table, Arena, GSStack}); @@ -314,15 +301,12 @@ /*Parents=*/{GSSNode2}); // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! - TestLang.Table = - LRTable::buildForTests(TestLang.G, - { - {/*State=*/0, id("pointer"), Action::goTo(5)}, - }, - { - {3, /* pointer := class-name */ 0}, - {4, /* pointer := enum-name */ 1}, - }); + LRTable::Builder B(TestLang.G); + B.Transition[{StateID{0}, id("pointer")}] = StateID{5}; + B.Reduce[StateID{3}].insert(/* pointer := class-name */ RuleID{0}); + B.Reduce[StateID{4}].insert(/* pointer := enum-name */ RuleID{1}); + TestLang.Table = std::move(B).build(); + std::vector Heads = {GSSNode3, GSSNode4}; glrReduce(Heads, tokenSymbol(tok::eof), {TestLang.G, TestLang.Table, Arena, GSStack}); @@ -345,14 +329,10 @@ TEST_F(GLRTest, ReduceLookahead) { // A term can be followed by +, but not by -. buildGrammar({"sum", "term"}, {"expr := term + term", "term := IDENTIFIER"}); - TestLang.Table = - LRTable::buildForTests(TestLang.G, - { - {/*State=*/0, id("term"), Action::goTo(2)}, - }, - { - {/*State=*/1, 0}, - }); + LRTable::Builder B(TestLang.G); + B.Transition[{StateID{0}, id("term")}] = StateID{2}; + B.Reduce[StateID{1}].insert(RuleID{0}); + TestLang.Table = std::move(B).build(); auto *Identifier = &Arena.createTerminal(tok::identifier, /*Start=*/0); diff --git a/clang-tools-extra/pseudo/unittests/LRTableTest.cpp b/clang-tools-extra/pseudo/unittests/LRTableTest.cpp --- a/clang-tools-extra/pseudo/unittests/LRTableTest.cpp +++ b/clang-tools-extra/pseudo/unittests/LRTableTest.cpp @@ -20,7 +20,7 @@ using llvm::ValueIs; using testing::ElementsAre; -using Action = LRTable::Action; +using StateID = LRTable::StateID; TEST(LRTable, Builder) { std::vector GrammarDiags; @@ -38,22 +38,20 @@ SymbolID Identifier = tokenSymbol(tok::identifier); SymbolID Plus = tokenSymbol(tok::plus); + LRTable::Builder B(G); // eof IDENT term // +-------+----+-------+------+ // |state0 | | s0 | | // |state1 | | | g3 | // |state2 | | | | // +-------+----+-------+------+------- - std::vector Entries = { - {/* State */ 0, Identifier, Action::shift(0)}, - {/* State */ 1, Term, Action::goTo(3)}, - }; - std::vector ReduceEntries = { - {/*State=*/0, /*Rule=*/0}, - {/*State=*/1, /*Rule=*/2}, - {/*State=*/2, /*Rule=*/1}, - }; - LRTable T = LRTable::buildForTests(G, Entries, ReduceEntries); + B.Transition[{StateID{0}, Identifier}] = StateID{0}; + B.Transition[{StateID{1}, Term}] = StateID{3}; + B.Reduce[StateID{0}].insert(RuleID{0}); + B.Reduce[StateID{1}].insert(RuleID{2}); + B.Reduce[StateID{2}].insert(RuleID{1}); + LRTable T = std::move(B).build(); + EXPECT_EQ(T.getShiftState(0, Eof), llvm::None); EXPECT_THAT(T.getShiftState(0, Identifier), ValueIs(0)); EXPECT_THAT(T.getReduceRules(0), ElementsAre(0));