Index: llvm/trunk/include/llvm/Support/Automaton.h =================================================================== --- llvm/trunk/include/llvm/Support/Automaton.h +++ llvm/trunk/include/llvm/Support/Automaton.h @@ -0,0 +1,230 @@ +//===-- Automaton.h - Support for driving TableGen-produced DFAs ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements class that drive and introspect deterministic finite- +// state automata (DFAs) as generated by TableGen's -gen-automata backend. +// +// For a description of how to define an automaton, see +// include/llvm/TableGen/Automaton.td. +// +// One important detail is that these deterministic automata are created from +// (potentially) nondeterministic definitions. Therefore a unique sequence of +// input symbols will produce one path through the DFA but multiple paths +// through the original NFA. An automaton by default only returns "accepted" or +// "not accepted", but frequently we want to analyze what NFA path was taken. +// Finding a path through the NFA states that results in a DFA state can help +// answer *what* the solution to a problem was, not just that there exists a +// solution. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_AUTOMATON_H +#define LLVM_SUPPORT_AUTOMATON_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Allocator.h" +#include +#include +#include +#include +#include + +namespace llvm { + +using NfaPath = SmallVector; + +/// Forward define the pair type used by the automata transition info tables. +/// +/// Experimental results with large tables have shown a significant (multiple +/// orders of magnitude) parsing speedup by using a custom struct here with a +/// trivial constructor rather than std::pair. +struct NfaStatePair { + uint64_t FromDfaState, ToDfaState; + + bool operator<(const NfaStatePair &Other) const { + return std::make_tuple(FromDfaState, ToDfaState) < + std::make_tuple(Other.FromDfaState, Other.ToDfaState); + } +}; + +namespace internal { +/// The internal class that maintains all possible paths through an NFA based +/// on a path through the DFA. +class NfaTranscriber { +private: + /// Cached transition table. This is a table of NfaStatePairs that contains + /// zero-terminated sequences pointed to by DFA transitions. + ArrayRef TransitionInfo; + + /// A simple linked-list of traversed states that can have a shared tail. The + /// traversed path is stored in reverse order with the latest state as the + /// head. + struct PathSegment { + uint64_t State; + PathSegment *Tail; + }; + + /// We allocate segment objects frequently. Allocate them upfront and dispose + /// at the end of a traversal rather than hammering the system allocator. + SpecificBumpPtrAllocator Allocator; + + /// Heads of each tracked path. These are not ordered. + std::deque Heads; + + /// The returned paths. This is populated during getPaths. + SmallVector Paths; + + /// Create a new segment and return it. + PathSegment *makePathSegment(uint64_t State, PathSegment *Tail) { + PathSegment *P = Allocator.Allocate(); + *P = {State, Tail}; + return P; + } + + /// Pairs defines a sequence of possible NFA transitions for a single DFA + /// transition. + void transition(ArrayRef Pairs) { + // Iterate over all existing heads. We will mutate the Heads deque during + // iteration. + unsigned NumHeads = Heads.size(); + for (auto HeadI = Heads.begin(), HeadE = std::next(Heads.begin(), NumHeads); + HeadI != HeadE; ++HeadI) { + PathSegment *Head = *HeadI; + // The sequence of pairs is sorted. Select the set of pairs that + // transition from the current head state. + auto PI = lower_bound(Pairs, NfaStatePair{Head->State, 0ULL}); + auto PE = upper_bound(Pairs, NfaStatePair{Head->State, INT64_MAX}); + // For every transition from the current head state, add a new path + // segment. + for (; PI != PE; ++PI) + if (PI->FromDfaState == Head->State) + Heads.push_back(makePathSegment(PI->ToDfaState, Head)); + } + // Now we've iterated over all the initial heads and added new ones, + // dispose of the original heads. + Heads.erase(Heads.begin(), std::next(Heads.begin(), NumHeads)); + } + +public: + NfaTranscriber(ArrayRef TransitionInfo) + : TransitionInfo(TransitionInfo) { + reset(); + } + + void reset() { + Paths.clear(); + Heads.clear(); + Allocator.DestroyAll(); + // The initial NFA state is 0. + Heads.push_back(makePathSegment(0ULL, nullptr)); + } + + void transition(unsigned TransitionInfoIdx) { + unsigned EndIdx = TransitionInfoIdx; + while (TransitionInfo[EndIdx].ToDfaState != 0) + ++EndIdx; + ArrayRef Pairs(&TransitionInfo[TransitionInfoIdx], + EndIdx - TransitionInfoIdx); + transition(Pairs); + } + + ArrayRef getPaths() { + Paths.clear(); + for (auto *Head : Heads) { + NfaPath P; + while (Head->State != 0) { + P.push_back(Head->State); + Head = Head->Tail; + } + std::reverse(P.begin(), P.end()); + Paths.push_back(std::move(P)); + } + return Paths; + } +}; +} // namespace internal + +/// A deterministic finite-state automaton. The automaton is defined in +/// TableGen; this object drives an automaton defined by tblgen-emitted tables. +/// +/// An automaton accepts a sequence of input tokens ("actions"). This class is +/// templated on the type of these actions. +template class Automaton { + /// Map from {State, Action} to {NewState, TransitionInfoIdx}. + /// TransitionInfoIdx is used by the DfaTranscriber to analyze the transition. + /// FIXME: This uses a std::map because ActionT can be a pair type including + /// an enum. In particular DenseMapInfo must be defined to use + /// DenseMap here. + std::map, std::pair> M; + /// An optional transcription object. This uses much more state than simply + /// traversing the DFA for acceptance, so is heap allocated. + std::unique_ptr Transcriber; + /// The initial DFA state is 1. + uint64_t State = 1; + +public: + /// Create an automaton. + /// \param Transitions The Transitions table as created by TableGen. Note that + /// because the action type differs per automaton, the + /// table type is templated as ArrayRef. + /// \param TranscriptionTable The TransitionInfo table as created by TableGen. + /// + /// Providing the TranscriptionTable argument as non-empty will enable the + /// use of transcription, which analyzes the possible paths in the original + /// NFA taken by the DFA. NOTE: This is substantially more work than simply + /// driving the DFA, so unless you require the getPaths() method leave this + /// empty. + template + Automaton(ArrayRef Transitions, + ArrayRef TranscriptionTable = {}) { + if (!TranscriptionTable.empty()) + Transcriber = + std::make_unique(TranscriptionTable); + for (const auto &I : Transitions) + // Greedily read and cache the transition table. + M.emplace(std::make_pair(I.FromDfaState, I.Action), + std::make_pair(I.ToDfaState, I.InfoIdx)); + } + + /// Reset the automaton to its initial state. + void reset() { + State = 1; + if (Transcriber) + Transcriber->reset(); + } + + /// Transition the automaton based on input symbol A. Return true if the + /// automaton transitioned to a valid state, false if the automaton + /// transitioned to an invalid state. + /// + /// If this function returns false, all methods are undefined until reset() is + /// called. + bool add(const ActionT &A) { + auto I = M.find({State, A}); + if (I == M.end()) + return false; + if (Transcriber) + Transcriber->transition(I->second.second); + State = I->second.first; + return true; + } + + /// Obtain a set of possible paths through the input nondeterministic + /// automaton that could be obtained from the sequence of input actions + /// presented to this deterministic automaton. + ArrayRef getNfaPaths() { + assert(Transcriber && "Can only obtain NFA paths if transcribing!"); + return Transcriber->getPaths(); + } +}; + +} // namespace llvm + +#endif // LLVM_SUPPORT_AUTOMATON_H Index: llvm/trunk/include/llvm/TableGen/Automaton.td =================================================================== --- llvm/trunk/include/llvm/TableGen/Automaton.td +++ llvm/trunk/include/llvm/TableGen/Automaton.td @@ -0,0 +1,95 @@ +//===- Automaton.td ----------------------------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the key top-level classes needed to produce a reasonably +// generic finite-state automaton. +// +//===----------------------------------------------------------------------===// + +// Define a record inheriting from GenericAutomaton to generate a reasonably +// generic finite-state automaton over a set of actions and states. +// +// This automaton is defined by: +// 1) a state space (explicit, always bits<32>). +// 2) a set of input symbols (actions, explicit) and +// 3) a transition function from state + action -> state. +// +// A theoretical automaton is defined by : +// Q: A set of possible states. +// S: (sigma) The input alphabet. +// d: (delta) The transition function f(q in Q, s in S) -> q' in Q. +// F: The set of final (accepting) states. +// +// Because generating all possible states is tedious, we instead define the +// transition function only and crawl all reachable states starting from the +// initial state with all inputs under all transitions until termination. +// +// We define F = S, that is, all valid states are accepting. +// +// To ensure the generation of the automaton terminates, the state transitions +// are defined as a lattice (meaning every transitioned-to state is more +// specific than the transitioned-from state, for some definition of specificity). +// Concretely a transition may set one or more bits in the state that were +// previously zero to one. If any bit was not zero, the transition is invalid. +// +// Instead of defining all possible states (which would be cumbersome), the user +// provides a set of possible Transitions from state A, consuming an input +// symbol A to state B. The Transition object transforms state A to state B and +// acts as a predicate. This means the state space can be discovered by crawling +// all the possible transitions until none are valid. +// +// This automaton is considered to be nondeterministic, meaning that multiple +// transitions can occur from any (state, action) pair. The generated automaton +// is determinized, meaning that is executes in O(k) time where k is the input +// sequence length. +// +// In addition to a generated automaton that determines if a sequence of inputs +// is accepted or not, a table is emitted that allows determining a plausible +// sequence of states traversed to accept that input. +class GenericAutomaton { + // Name of a class that inherits from Transition. All records inheriting from + // this class will be considered when constructing the automaton. + string TransitionClass; + + // Names of fields within TransitionClass that define the action symbol. This + // defines the action as an N-tuple. + // + // Each symbol field can be of class, int, string or code type. + // If the type of a field is a class, the Record's name is used verbatim + // in C++ and the class name is used as the C++ type name. + // If the type of a field is a string, code or int, that is also used + // verbatim in C++. + // + // To override the C++ type name for field F, define a field called TypeOf_F. + // This should be a string that will be used verbatim in C++. + // + // As an example, to define a 2-tuple with an enum and a string, one might: + // def MyTransition : Transition { + // MyEnum S1; + // int S2; + // } + // def MyAutomaton : GenericAutomaton }{ + // let TransitionClass = "Transition"; + // let SymbolFields = ["S1", "S2"]; + // let TypeOf_S1 = "MyEnumInCxxKind"; + // } + list SymbolFields; +} + +// All transitions inherit from Transition. +class Transition { + // A transition S' = T(S) is valid if, for every set bit in NewState, the + // corresponding bit in S is clear. That is: + // def T(S): + // S' = S | NewState + // return S' if S' != S else Failure + // + // The automaton generator uses this property to crawl the set of possible + // transitions from a starting state of 0b0. + bits<32> NewState; +} Index: llvm/trunk/unittests/TableGen/Automata.td =================================================================== --- llvm/trunk/unittests/TableGen/Automata.td +++ llvm/trunk/unittests/TableGen/Automata.td @@ -0,0 +1,186 @@ +include "llvm/TableGen/Automaton.td" +include "llvm/TableGen/SearchableTable.td" + +// Define a set of input token symbols. +class SymKindTy; +def SK_a : SymKindTy; +def SK_b : SymKindTy; +def SK_c : SymKindTy; +def SK_d : SymKindTy; + +// Emit those as a C++ enum using SearchableTables. +def SymKind : GenericEnum { + let FilterClass = "SymKindTy"; +} + +// Define a transition implementation. +class SimpleTransition State, SymKindTy A> : Transition { + let NewState{1-0} = State; + SymKindTy ActionSym = A; +} + +// Token SK_a sets bit 0b01. +def : SimpleTransition<0b01, SK_a>; +// Token SK_b sets bits 0b10. +def : SimpleTransition<0b10, SK_b>; +// Token SK_c sets both bits 0b11. +def : SimpleTransition<0b11, SK_c>; + +def SimpleAutomaton : GenericAutomaton { + let TransitionClass = "SimpleTransition"; + let SymbolFields = ["ActionSym"]; + // Override the type of ActionSym from SymKindTy to the C++ type SymKind. + string TypeOf_ActionSym = "SymKind"; +} + +//===----------------------------------------------------------------------===// +// TupleActionAutomaton test implementation + +// Define a transition implementation. +class TupleTransition State, SymKindTy s1, SymKindTy s2, string s3> : Transition { + let NewState{1-0} = State; + SymKindTy S1 = s1; + SymKindTy S2 = s2; + string S3 = s3; +} + +def : TupleTransition<0b01, SK_a, SK_b, "yeet">; +def : TupleTransition<0b10, SK_b, SK_b, "foo">; +def : TupleTransition<0b10, SK_c, SK_a, "foo">; + +def TupleAutomaton : GenericAutomaton { + let TransitionClass = "TupleTransition"; + let SymbolFields = ["S1", "S2", "S3"]; + string TypeOf_S1 = "SymKind"; + string TypeOf_S2 = "SymKind"; +} + +//===----------------------------------------------------------------------===// +// NfaAutomaton test implementation + +class NfaTransition State, SymKindTy S> : Transition { + let NewState{1-0} = State; + SymKindTy A = S; +} + +// Symbols a and b can transition to 0b01 or 0b11 (sets bit 0). +def : NfaTransition<0b01, SK_a>; +def : NfaTransition<0b01, SK_b>; +// Symbols a and b can also transition to 0b10 or 0b11 (sets bit 1). +def : NfaTransition<0b10, SK_a>; +def : NfaTransition<0b10, SK_b>; + +def NfaAutomaton : GenericAutomaton { + let TransitionClass = "NfaTransition"; + let SymbolFields = ["A"]; + string TypeOf_A = "SymKind"; +} + +//===----------------------------------------------------------------------===// +// BinPacker test implementation +//===----------------------------------------------------------------------===// +// This test generates an automaton that can pack values into bins subject to +// constraints. There are 6 possible bins, and the input tokens are constraint +// types. Some input types span two bins. + +// The symbol type for a bin constraint. We use lists of ints as a tblgen hack +// to conditionally generate defs within multiclasses based on record +// information. A bin is nonempty (has a dummy one-element value) if enabled. +class BinRequirementKind { + list Bin0 = []; + list Bin1 = []; + list Bin2 = []; + list Bin3 = []; + list Bin4 = []; + list Bin5 = []; +} +// Can use bins {0-3} +def BRK_0_to_4 : BinRequirementKind { let Bin0 = [1]; let Bin1 = [1]; let Bin2 = [1]; let Bin3 = [1]; } +// Can use bins {0-3} but only evens (0 and 2). +def BRK_0_to_4_lo : BinRequirementKind { let Bin0 = [1]; let Bin2 = [1]; } +// Can use bins {0-3} but only odds (1 and 3). +def BRK_0_to_4_hi : BinRequirementKind { let Bin1 = [1]; let Bin3 = [1]; } +// Can use bins {0-3} but only even-odd pairs (0+1 or 1+2). +def BRK_0_to_4_dbl : BinRequirementKind { let Bin0 = [1]; let Bin2 = [1]; } +def BRK_0_to_6 : BinRequirementKind { let Bin0 = [1]; let Bin1 = [1]; let Bin2 = [1]; + let Bin3 = [1]; let Bin4 = [1]; let Bin5 = [1]; } +def BRK_0_to_6_lo : BinRequirementKind { let Bin0 = [1]; let Bin2 = [1]; let Bin4 = [1]; } +def BRK_0_to_6_hi : BinRequirementKind { let Bin1 = [1]; let Bin3 = [1]; let Bin5 = [1]; } +def BRK_0_to_6_dbl : BinRequirementKind { let Bin0 = [1]; let Bin2 = [1]; let Bin4 = [1]; } +def BRK_2_to_6 : BinRequirementKind { let Bin2 = [1]; + let Bin3 = [1]; let Bin4 = [1]; let Bin5 = [1]; } +def BRK_2_to_6_lo : BinRequirementKind { let Bin2 = [1]; let Bin4 = [1]; } +def BRK_2_to_6_hi : BinRequirementKind { let Bin3 = [1]; let Bin5 = [1];} +def BRK_2_to_6_dbl : BinRequirementKind { let Bin2 = [1]; let Bin4 = [1]; } +def BRK_2_to_4 : BinRequirementKind { let Bin2 = [1]; let Bin3 = [1]; } +def BRK_2_to_4_lo : BinRequirementKind { let Bin2 = [1]; } +def BRK_2_to_4_hi : BinRequirementKind { let Bin3 = [1]; } +def BRK_2_to_4_dbl : BinRequirementKind { let Bin2 = [1]; } + +def BinRequirementKindEnum : GenericEnum { + let FilterClass = "BinRequirementKind"; +} + +// The transition class is trivial; it just contains the constraint symbol. +class BinTransition : Transition { + BinRequirementKind Sym; +} + +// Mixin that occupies a single bin. +class Bin0 : BinTransition { let NewState{0} = 1; } +class Bin1 : BinTransition { let NewState{1} = 1; } +class Bin2 : BinTransition { let NewState{2} = 1;} +class Bin3 : BinTransition { let NewState{3} = 1; } +class Bin4 : BinTransition { let NewState{4} = 1;} +class Bin5 : BinTransition { let NewState{5} = 1; } +// Mixin that occupies a pair of bins (even-odd pairs). +class Bin01 : BinTransition { let NewState{0,1} = 0b11; } +class Bin23 : BinTransition { let NewState{2,3} = 0b11; } +class Bin45 : BinTransition { let NewState{4,5} = 0b11; } + +// Instantiate all possible bin assignments for E. +multiclass BinAssignments { + let Sym = E in { + // Note the tablegen hack to conditionally instantiate a def based on E. + foreach x = E.Bin0 in { def : Bin0; } + foreach x = E.Bin1 in { def : Bin1; } + foreach x = E.Bin2 in { def : Bin2; } + foreach x = E.Bin3 in { def : Bin3; } + foreach x = E.Bin4 in { def : Bin4; } + foreach x = E.Bin5 in { def : Bin5; } + } +} + +// Instantiate all possible bin assignments for E, which spans even-odd pairs. +multiclass DblBinAssignments { + let Sym = E in { + foreach x = E.Bin0 in { def : Bin01; } + foreach x = E.Bin2 in { def : Bin23; } + foreach x = E.Bin4 in { def : Bin45; } + } +} + +defm : BinAssignments; +defm : DblBinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : DblBinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : DblBinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : DblBinAssignments; +defm : BinAssignments; +defm : BinAssignments; + +def BinPackerAutomaton : GenericAutomaton { + let TransitionClass = "BinTransition"; + let SymbolFields = ["Sym"]; + string TypeOf_Sym = "BinRequirementKindEnum"; +} + + Index: llvm/trunk/unittests/TableGen/AutomataTest.cpp =================================================================== --- llvm/trunk/unittests/TableGen/AutomataTest.cpp +++ llvm/trunk/unittests/TableGen/AutomataTest.cpp @@ -0,0 +1,153 @@ +//===- unittest/TableGen/AutomataTest.cpp - DFA tests ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Automaton.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using namespace llvm; +using testing::ContainerEq; +using testing::UnorderedElementsAre; + +// Bring in the enums created by SearchableTables.td. +#define GET_SymKind_DECL +#define GET_BinRequirementKindEnum_DECL +#include "AutomataTables.inc" + +// And bring in the automata from Automata.td. +#define GET_SimpleAutomaton_DECL +#define GET_TupleAutomaton_DECL +#define GET_NfaAutomaton_DECL +#define GET_BinPackerAutomaton_DECL +#include "AutomataAutomata.inc" + +TEST(Automata, SimpleAutomatonAcceptsFromInitialState) { + Automaton A(makeArrayRef(SimpleAutomatonTransitions)); + EXPECT_TRUE(A.add(SK_a)); + A.reset(); + EXPECT_TRUE(A.add(SK_b)); + A.reset(); + EXPECT_TRUE(A.add(SK_c)); + A.reset(); + EXPECT_FALSE(A.add(SK_d)); +} + +TEST(Automata, SimpleAutomatonAcceptsSequences) { + Automaton A(makeArrayRef(SimpleAutomatonTransitions)); + // Test sequence + A.reset(); + EXPECT_TRUE(A.add(SK_a)); + EXPECT_TRUE(A.add(SK_b)); + + // Test sequence is rejected (c cannot get bit 0b10); + A.reset(); + EXPECT_TRUE(A.add(SK_a)); + EXPECT_FALSE(A.add(SK_c)); + + // Symmetric test: sequence is rejected. + A.reset(); + EXPECT_TRUE(A.add(SK_c)); + EXPECT_FALSE(A.add(SK_a)); +} + +TEST(Automata, TupleAutomatonAccepts) { + Automaton A(makeArrayRef(TupleAutomatonTransitions)); + A.reset(); + EXPECT_TRUE( + A.add({SK_a, SK_b, "yeet"})); + A.reset(); + EXPECT_FALSE( + A.add({SK_a, SK_a, "yeet"})); + A.reset(); + EXPECT_FALSE( + A.add({SK_a, SK_b, "feet"})); + A.reset(); + EXPECT_TRUE( + A.add({SK_b, SK_b, "foo"})); +} + +TEST(Automata, NfaAutomatonAccepts) { + Automaton A(makeArrayRef(NfaAutomatonTransitions)); + + // Test sequences , , , . All should be accepted. + A.reset(); + EXPECT_TRUE(A.add(SK_a)); + EXPECT_TRUE(A.add(SK_a)); + A.reset(); + EXPECT_TRUE(A.add(SK_a)); + EXPECT_TRUE(A.add(SK_b)); + A.reset(); + EXPECT_TRUE(A.add(SK_b)); + EXPECT_TRUE(A.add(SK_a)); + A.reset(); + EXPECT_TRUE(A.add(SK_b)); + EXPECT_TRUE(A.add(SK_b)); + + // Expect that is not accepted. + A.reset(); + EXPECT_TRUE(A.add(SK_b)); + EXPECT_TRUE(A.add(SK_b)); + EXPECT_FALSE(A.add(SK_b)); +} + +TEST(Automata, BinPackerAutomatonAccepts) { + Automaton A(makeArrayRef(BinPackerAutomatonTransitions)); + + // Expect that we can pack two double-bins in 0-4, then no more in 0-4. + A.reset(); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_FALSE(A.add(BRK_0_to_4)); + + // Expect that we can pack two double-bins in 0-4, two more in 0-6 then no + // more. + A.reset(); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_6)); + EXPECT_TRUE(A.add(BRK_0_to_6)); + EXPECT_FALSE(A.add(BRK_0_to_6)); + + // Expect that we can pack BRK_0_to_6 five times to occupy five bins, then + // cannot allocate any double-bins. + A.reset(); + for (unsigned I = 0; I < 5; ++I) + EXPECT_TRUE(A.add(BRK_0_to_6)); + EXPECT_FALSE(A.add(BRK_0_to_6_dbl)); +} + +// The state we defined in TableGen uses the least significant 6 bits to represent a bin state. +#define BINS(a, b, c, d, e, f) \ + ((a << 5) | (b << 4) | (c << 3) | (d << 2) | (e << 1) | (f << 0)) + +TEST(Automata, BinPackerAutomatonExplains) { + Automaton A(makeArrayRef(BinPackerAutomatonTransitions), + makeArrayRef(BinPackerAutomatonTransitionInfo)); + // Pack two double-bins in 0-4, then a single bin in 0-6. + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_6)); + EXPECT_THAT( + A.getNfaPaths(), + UnorderedElementsAre( + // Allocate {0,1} first, then 6. + ContainerEq(NfaPath{BINS(0, 0, 0, 0, 1, 1), BINS(0, 0, 1, 1, 1, 1), + BINS(1, 0, 1, 1, 1, 1)}), + // Allocate {0,1} first, then 5. + ContainerEq(NfaPath{BINS(0, 0, 0, 0, 1, 1), BINS(0, 0, 1, 1, 1, 1), + BINS(0, 1, 1, 1, 1, 1)}), + // Allocate {2,3} first, then 6. + ContainerEq(NfaPath{BINS(0, 0, 1, 1, 0, 0), BINS(0, 0, 1, 1, 1, 1), + BINS(1, 0, 1, 1, 1, 1)}), + // Allocate {2,3} first, then 5. + ContainerEq(NfaPath{BINS(0, 0, 1, 1, 0, 0), BINS(0, 0, 1, 1, 1, 1), + BINS(0, 1, 1, 1, 1, 1)}))); +} Index: llvm/trunk/unittests/TableGen/CMakeLists.txt =================================================================== --- llvm/trunk/unittests/TableGen/CMakeLists.txt +++ llvm/trunk/unittests/TableGen/CMakeLists.txt @@ -3,8 +3,15 @@ Support ) +set(LLVM_TARGET_DEFINITIONS Automata.td) + +tablegen(LLVM AutomataTables.inc -gen-searchable-tables) +tablegen(LLVM AutomataAutomata.inc -gen-automata) +add_public_tablegen_target(AutomataTestTableGen) + add_llvm_unittest(TableGenTests CodeExpanderTest.cpp + AutomataTest.cpp ) include_directories(${CMAKE_SOURCE_DIR}/utils/TableGen) target_link_libraries(TableGenTests PRIVATE LLVMTableGenGlobalISel LLVMTableGen) Index: llvm/trunk/utils/TableGen/CMakeLists.txt =================================================================== --- llvm/trunk/utils/TableGen/CMakeLists.txt +++ llvm/trunk/utils/TableGen/CMakeLists.txt @@ -21,6 +21,7 @@ DAGISelMatcherGen.cpp DAGISelMatcherOpt.cpp DAGISelMatcher.cpp + DFAEmitter.cpp DFAPacketizerEmitter.cpp DisassemblerEmitter.cpp ExegesisEmitter.cpp Index: llvm/trunk/utils/TableGen/DFAEmitter.h =================================================================== --- llvm/trunk/utils/TableGen/DFAEmitter.h +++ llvm/trunk/utils/TableGen/DFAEmitter.h @@ -0,0 +1,107 @@ +//===--------------------- DfaEmitter.h -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Defines a generic automaton builder. This takes a set of transitions and +// states that represent a nondeterministic finite state automaton (NFA) and +// emits a determinized DFA in a form that include/llvm/Support/Automaton.h can +// drive. +// +// See file llvm/TableGen/Automaton.td for the TableGen API definition. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UTILS_TABLEGEN_DFAEMITTER_H +#define LLVM_UTILS_TABLEGEN_DFAEMITTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Record.h" +#include +#include + +namespace llvm { + +class raw_ostream; +/// Construct a deterministic finite state automaton from possible +/// nondeterministic state and transition data. +/// +/// The state type is a 64-bit unsigned integer. The generated automaton is +/// invariant to the sparsity of the state representation - its size is only +/// a function of the cardinality of the set of states. +/// +/// The inputs to this emitter are considered to define a nondeterministic +/// finite state automaton (NFA). This is then converted to a DFA during +/// emission. The emitted tables can be used to by +/// include/llvm/Support/Automaton.h. +class DfaEmitter { +public: + // The type of an NFA state. The initial state is always zero. + using state_type = uint64_t; + // The type of an action. + using action_type = uint64_t; + + DfaEmitter() = default; + virtual ~DfaEmitter() = default; + + void addTransition(state_type From, state_type To, action_type A); + void emit(StringRef Name, raw_ostream &OS); + +protected: + /// Emit the C++ type of an action to OS. + virtual void printActionType(raw_ostream &OS); + /// Emit the C++ value of an action A to OS. + virtual void printActionValue(action_type A, raw_ostream &OS); + +private: + /// The state type of deterministic states. These are only used internally to + /// this class. This is an ID into the DfaStates UniqueVector. + using dfa_state_type = unsigned; + + /// The actual representation of a DFA state, which is a union of one or more + /// NFA states. + using DfaState = SmallVector; + + /// A DFA transition consists of a set of NFA states transitioning to a + /// new set of NFA states. The DfaTransitionInfo tracks, for every + /// transitioned-from NFA state, a set of valid transitioned-to states. + /// + /// Emission of this transition relation allows algorithmic determination of + /// the possible candidate NFA paths taken under a given input sequence to + /// reach a given DFA state. + using DfaTransitionInfo = SmallVector, 4>; + + /// The set of all possible actions. + std::set Actions; + + /// The set of nondeterministic transitions. A state-action pair can + /// transition to multiple target states. + std::map, std::vector> + NfaTransitions; + std::set NfaStates; + unsigned NumNfaTransitions = 0; + + /// The set of deterministic states. DfaStates.getId(DfaState) returns an ID, + /// which is dfa_state_type. Note that because UniqueVector reserves state + /// zero, the initial DFA state is always 1. + UniqueVector DfaStates; + /// The set of deterministic transitions. A state-action pair has only a + /// single target state. + std::map, + std::pair> + DfaTransitions; + + /// Visit all NFA states and construct the DFA. + void constructDfa(); + /// Visit a single DFA state and construct all possible transitions to new DFA + /// states. + void visitDfaState(DfaState DS); +}; + +} // namespace llvm + +#endif Index: llvm/trunk/utils/TableGen/DFAEmitter.cpp =================================================================== --- llvm/trunk/utils/TableGen/DFAEmitter.cpp +++ llvm/trunk/utils/TableGen/DFAEmitter.cpp @@ -0,0 +1,394 @@ +//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class can produce a generic deterministic finite state automaton (DFA), +// given a set of possible states and transitions. +// +// The input transitions can be nondeterministic - this class will produce the +// deterministic equivalent state machine. +// +// The generated code can run the DFA and produce an accepted / not accepted +// state and also produce, given a sequence of transitions that results in an +// accepted state, the sequence of intermediate states. This is useful if the +// initial automaton was nondeterministic - it allows mapping back from the DFA +// to the NFA. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "dfa-emitter" + +#include "DFAEmitter.h" +#include "CodeGenTarget.h" +#include "SequenceToOffsetTable.h" +#include "TableGenBackends.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Record.h" +#include "llvm/TableGen/TableGenBackend.h" +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// DfaEmitter implementation. This is independent of the GenAutomaton backend. +//===----------------------------------------------------------------------===// + +void DfaEmitter::addTransition(state_type From, state_type To, action_type A) { + Actions.insert(A); + NfaStates.insert(From); + NfaStates.insert(To); + NfaTransitions[{From, A}].push_back(To); + ++NumNfaTransitions; +} + +void DfaEmitter::visitDfaState(DfaState DS) { + // For every possible action... + auto FromId = DfaStates.idFor(DS); + for (action_type A : Actions) { + DfaState NewStates; + DfaTransitionInfo TI; + // For every represented state, word pair in the original NFA... + for (state_type &FromState : DS) { + // If this action is possible from this state add the transitioned-to + // states to NewStates. + auto I = NfaTransitions.find({FromState, A}); + if (I == NfaTransitions.end()) + continue; + for (state_type &ToState : I->second) { + NewStates.push_back(ToState); + TI.emplace_back(FromState, ToState); + } + } + if (NewStates.empty()) + continue; + // Sort and unique. + sort(NewStates); + NewStates.erase(std::unique(NewStates.begin(), NewStates.end()), + NewStates.end()); + sort(TI); + TI.erase(std::unique(TI.begin(), TI.end()), TI.end()); + unsigned ToId = DfaStates.insert(NewStates); + DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI)); + } +} + +void DfaEmitter::constructDfa() { + DfaState Initial(1, /*NFA initial state=*/0); + DfaStates.insert(Initial); + + // Note that UniqueVector starts indices at 1, not zero. + unsigned DfaStateId = 1; + while (DfaStateId <= DfaStates.size()) + visitDfaState(DfaStates[DfaStateId++]); +} + +void DfaEmitter::emit(StringRef Name, raw_ostream &OS) { + constructDfa(); + + OS << "// Input NFA has " << NfaStates.size() << " states with " + << NumNfaTransitions << " transitions.\n"; + OS << "// Generated DFA has " << DfaStates.size() << " states with " + << DfaTransitions.size() << " transitions.\n\n"; + + // Implementation note: We don't bake a simple std::pair<> here as it requires + // significantly more effort to parse. A simple test with a large array of + // struct-pairs (N=100000) took clang-10 6s to parse. The same array of + // std::pair took 242s. Instead we allow the user to + // define the pair type. + // + // FIXME: It may make sense to emit these as ULEB sequences instead of + // pairs of uint64_t. + OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n"; + OS << "// transition implies a set of NFA transitions. These are referred\n"; + OS << "// to by index in " << Name << "Transitions[].\n"; + + SequenceToOffsetTable Table; + std::map EmittedIndices; + for (auto &T : DfaTransitions) + Table.add(T.second.second); + Table.layout(); + OS << "std::array " << Name + << "TransitionInfo = {{\n"; + Table.emit( + OS, + [](raw_ostream &OS, std::pair P) { + OS << "{" << P.first << ", " << P.second << "}"; + }, + "{0ULL, 0ULL}"); + + OS << "}};\n\n"; + + OS << "// A transition in the generated " << Name << " DFA.\n"; + OS << "struct " << Name << "Transition {\n"; + OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n"; + OS << " "; + printActionType(OS); + OS << " Action; // The input symbol that causes this transition.\n"; + OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n"; + OS << " unsigned InfoIdx; // Start index into " << Name + << "TransitionInfo.\n"; + OS << "};\n\n"; + + OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n"; + OS << "// The initial state is 1, not zero.\n"; + OS << "std::array<" << Name << "Transition, " << DfaTransitions.size() << "> " + << Name << "Transitions = {{\n"; + for (auto &KV : DfaTransitions) { + dfa_state_type From = KV.first.first; + dfa_state_type To = KV.second.first; + action_type A = KV.first.second; + unsigned InfoIdx = Table.get(KV.second.second); + OS << " {" << From << ", "; + printActionValue(A, OS); + OS << ", " << To << ", " << InfoIdx << "},\n"; + } + OS << "\n}};\n\n"; +} + +void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; } + +void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; } + +//===----------------------------------------------------------------------===// +// AutomatonEmitter implementation +//===----------------------------------------------------------------------===// + +namespace { +// FIXME: This entire discriminated union could be removed with c++17: +// using Action = std::variant; +struct Action { + Record *R = nullptr; + unsigned I = 0; + std::string S = nullptr; + + Action() = default; + Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {} + + void print(raw_ostream &OS) const { + if (R) + OS << R->getName(); + else if (!S.empty()) + OS << '"' << S << '"'; + else + OS << I; + } + bool operator<(const Action &Other) const { + return std::make_tuple(R, I, S) < + std::make_tuple(Other.R, Other.I, Other.S); + } +}; + +using ActionTuple = std::vector; +class Automaton; + +class Transition { + uint64_t NewState; + // The tuple of actions that causes this transition. + ActionTuple Actions; + // The types of the actions; this is the same across all transitions. + SmallVector Types; + +public: + Transition(Record *R, Automaton *Parent); + const ActionTuple &getActions() { return Actions; } + SmallVector getTypes() { return Types; } + + bool canTransitionFrom(uint64_t State); + uint64_t transitionFrom(uint64_t State); +}; + +class Automaton { + RecordKeeper &Records; + Record *R; + std::vector Transitions; + /// All possible action tuples, uniqued. + UniqueVector Actions; + /// The fields within each Transition object to find the action symbols. + std::vector ActionSymbolFields; + +public: + Automaton(RecordKeeper &Records, Record *R); + void emit(raw_ostream &OS); + + ArrayRef getActionSymbolFields() { return ActionSymbolFields; } + /// If the type of action A has been overridden (there exists a field + /// "TypeOf_A") return that, otherwise return the empty string. + StringRef getActionSymbolType(StringRef A); +}; + +class AutomatonEmitter { + RecordKeeper &Records; + +public: + AutomatonEmitter(RecordKeeper &R) : Records(R) {} + void run(raw_ostream &OS); +}; + +/// A DfaEmitter implementation that can print our variant action type. +class CustomDfaEmitter : public DfaEmitter { + const UniqueVector &Actions; + std::string TypeName; + +public: + CustomDfaEmitter(const UniqueVector &Actions, StringRef TypeName) + : Actions(Actions), TypeName(TypeName) {} + + void printActionType(raw_ostream &OS) override; + void printActionValue(action_type A, raw_ostream &OS) override; +}; +} // namespace + +void AutomatonEmitter::run(raw_ostream &OS) { + for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) { + Automaton A(Records, R); + OS << "#ifdef GET_" << R->getName() << "_DECL\n"; + A.emit(OS); + OS << "#endif // GET_" << R->getName() << "_DECL\n"; + } +} + +Automaton::Automaton(RecordKeeper &Records, Record *R) + : Records(Records), R(R) { + LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n"); + ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields"); +} + +void Automaton::emit(raw_ostream &OS) { + StringRef TransitionClass = R->getValueAsString("TransitionClass"); + for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) { + assert(T->isSubClassOf("Transition")); + Transitions.emplace_back(T, this); + Actions.insert(Transitions.back().getActions()); + } + + LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size() + << "\n"); + LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size() + << " potential transitions.\n"); + + StringRef Name = R->getName(); + + CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action"); + // Starting from the initial state, build up a list of possible states and + // transitions. + std::deque Worklist(1, 0); + std::set SeenStates; + unsigned NumTransitions = 0; + SeenStates.insert(Worklist.front()); + while (!Worklist.empty()) { + uint64_t State = Worklist.front(); + Worklist.pop_front(); + for (Transition &T : Transitions) { + if (!T.canTransitionFrom(State)) + continue; + uint64_t NewState = T.transitionFrom(State); + if (SeenStates.emplace(NewState).second) + Worklist.emplace_back(NewState); + ++NumTransitions; + Emitter.addTransition(State, NewState, Actions.idFor(T.getActions())); + } + } + LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() + << " states with " << NumTransitions << " transitions.\n"); + + const auto &ActionTypes = Transitions.back().getTypes(); + OS << "// The type of an action in the " << Name << " automaton.\n"; + if (ActionTypes.size() == 1) { + OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n"; + } else { + OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ") + << ">;\n"; + } + OS << "\n"; + + Emitter.emit(Name, OS); +} + +StringRef Automaton::getActionSymbolType(StringRef A) { + Twine Ty = "TypeOf_" + A; + if (!R->getValue(Ty.str())) + return ""; + return R->getValueAsString(Ty.str()); +} + +Transition::Transition(Record *R, Automaton *Parent) { + BitsInit *NewStateInit = R->getValueAsBitsInit("NewState"); + NewState = 0; + assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 && + "State cannot be represented in 64 bits!"); + for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) { + if (auto *Bit = dyn_cast(NewStateInit->getBit(I))) { + if (Bit->getValue()) + NewState |= 1ULL << I; + } + } + + for (StringRef A : Parent->getActionSymbolFields()) { + RecordVal *SymbolV = R->getValue(A); + if (auto *Ty = dyn_cast(SymbolV->getType())) { + Actions.emplace_back(R->getValueAsDef(A), 0, ""); + Types.emplace_back(Ty->getAsString()); + } else if (isa(SymbolV->getType())) { + Actions.emplace_back(nullptr, R->getValueAsInt(A), ""); + Types.emplace_back("unsigned"); + } else if (isa(SymbolV->getType()) || + isa(SymbolV->getType())) { + Actions.emplace_back(nullptr, 0, R->getValueAsString(A)); + Types.emplace_back("std::string"); + } else { + report_fatal_error("Unhandled symbol type!"); + } + + StringRef TypeOverride = Parent->getActionSymbolType(A); + if (!TypeOverride.empty()) + Types.back() = TypeOverride; + } +} + +bool Transition::canTransitionFrom(uint64_t State) { + if ((State & NewState) == 0) + // The bits we want to set are not set; + return true; + return false; +} + +uint64_t Transition::transitionFrom(uint64_t State) { + return State | NewState; +} + +void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } + +void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { + const ActionTuple &AT = Actions[A]; + if (AT.size() > 1) + OS << "{"; + bool First = true; + for (const auto &SingleAction : AT) { + if (!First) + OS << ", "; + First = false; + SingleAction.print(OS); + } + if (AT.size() > 1) + OS << "}"; +} + +namespace llvm { + +void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) { + AutomatonEmitter(RK).run(OS); +} + +} // namespace llvm Index: llvm/trunk/utils/TableGen/TableGen.cpp =================================================================== --- llvm/trunk/utils/TableGen/TableGen.cpp +++ llvm/trunk/utils/TableGen/TableGen.cpp @@ -54,6 +54,7 @@ GenX86FoldTables, GenRegisterBank, GenExegesis, + GenAutomata, }; namespace llvm { @@ -124,7 +125,9 @@ clEnumValN(GenRegisterBank, "gen-register-bank", "Generate registers bank descriptions"), clEnumValN(GenExegesis, "gen-exegesis", - "Generate llvm-exegesis tables"))); + "Generate llvm-exegesis tables"), + clEnumValN(GenAutomata, "gen-automata", + "Generate generic automata"))); cl::OptionCategory PrintEnumsCat("Options for -print-enums"); cl::opt Class("class", cl::desc("Print Enum list for this class"), @@ -249,6 +252,9 @@ case GenExegesis: EmitExegesis(Records, OS); break; + case GenAutomata: + EmitAutomata(Records, OS); + break; } return false; Index: llvm/trunk/utils/TableGen/TableGenBackends.h =================================================================== --- llvm/trunk/utils/TableGen/TableGenBackends.h +++ llvm/trunk/utils/TableGen/TableGenBackends.h @@ -90,6 +90,7 @@ void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS); void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS); void EmitExegesis(RecordKeeper &RK, raw_ostream &OS); +void EmitAutomata(RecordKeeper &RK, raw_ostream &OS); } // End llvm namespace