Index: include/llvm/ADT/SuffixTree.h =================================================================== --- /dev/null +++ include/llvm/ADT/SuffixTree.h @@ -0,0 +1,443 @@ +//===-- SuffixTree.h -----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// For more information on the suffix tree data structure, please see +// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_SUFFIXTREE_H +#define LLVM_ADT_SUFFIXTREE_H + +#include "llvm/ADT/DenseMap.h" + +using namespace llvm; + +namespace { + +/// Represents an undefined index in the suffix tree. +const unsigned EmptyIdx = -1; + +/// A node in a suffix tree which represents a substring or suffix. +/// +/// Each node has either no children or at least two children, with the root +/// being a exception in the empty tree. +/// +/// Children are represented as a map between unsigned integers and nodes. If +/// a node N has a child M on unsigned integer k, then the mapping represented +/// by N is a proper prefix of the mapping represented by M. Note that this, +/// although similar to a trie is somewhat different: each node stores a full +/// substring of the full mapping rather than a single character state. +/// +/// Each internal node contains a pointer to the internal node representing +/// the same string, but with the first character chopped off. This is stored +/// in \p Link. Each leaf node stores the start index of its respective +/// suffix in \p SuffixIdx. +struct SuffixTreeNode { + + /// The children of this node. + /// + /// A child existing on an unsigned integer implies that from the mapping + /// represented by the current node, there is a way to reach another + /// mapping by tacking that character on the end of the current string. + DenseMap Children; + + /// A flag set to false if the node has been pruned from the tree. + bool IsInTree = true; + + /// The start index of this node's substring in the main string. + unsigned StartIdx = EmptyIdx; + + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + unsigned *EndIdx = nullptr; + + /// For leaves, the start index of the suffix represented by this node. + /// + /// For all other nodes, this is ignored. + unsigned SuffixIdx = EmptyIdx; + + /// For internal nodes, a pointer to the internal node representing + /// the same sequence with the first character chopped off. + /// + /// This acts as a shortcut in Ukkonen's algorithm. One of the things that + /// Ukkonen's algorithm does to achieve linear-time construction is + /// keep track of which node the next insert should be at. This makes each + /// insert O(1), and there are a total of O(N) inserts. The suffix link + /// helps with inserting children of internal nodes. + /// + /// Say we add a child to an internal node with associated mapping S. The + /// next insertion must be at the node representing S - its first character. + /// This is given by the way that we iteratively build the tree in Ukkonen's + /// algorithm. The main idea is to look at the suffixes of each prefix in the + /// string, starting with the longest suffix of the prefix, and ending with + /// the shortest. Therefore, if we keep pointers between such nodes, we can + /// move to the next insertion point in O(1) time. If we don't, then we'd + /// have to query from the root, which takes O(N) time. This would make the + /// construction algorithm O(N^2) rather than O(N). + SuffixTreeNode *Link = nullptr; + + /// The parent of this node. Every node except for the root has a parent. + SuffixTreeNode *Parent = nullptr; + + /// The number of times this node's string appears in the tree. + /// + /// This is equal to the number of leaf children of the string. It represents + /// the number of suffixes that the node's string is a prefix of. + unsigned OccurrenceCount = 0; + + /// The length of the string formed by concatenating the edge labels from the + /// root to this node. + unsigned ConcatLen = 0; + + /// Returns true if this node is a leaf. + bool isLeaf() const { return SuffixIdx != EmptyIdx; } + + /// Returns true if this node is the root of its owning \p SuffixTree. + bool isRoot() const { return StartIdx == EmptyIdx; } + + /// Return the number of elements in the substring associated with this node. + size_t size() const { + + // Is it the root? If so, it's the empty string so return 0. + if (isRoot()) + return 0; + + assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); + + // Size = the number of elements in the string. + // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. + return *EndIdx - StartIdx + 1; + } + + SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link, + SuffixTreeNode *Parent) + : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {} + + SuffixTreeNode() {} +}; + +/// A data structure for fast substring queries. +/// +/// Suffix trees represent the suffixes of their input strings in their leaves. +/// A suffix tree is a type of compressed trie structure where each node +/// represents an entire substring rather than a single character. Each leaf +/// of the tree is a suffix. +/// +/// A suffix tree can be seen as a type of state machine where each state is a +/// substring of the full string. The tree is structured so that, for a string +/// of length N, there are exactly N leaves in the tree. This structure allows +/// us to quickly find repeated substrings of the input string. +/// +/// In this implementation, a "string" is a vector of unsigned integers. +/// These integers may result from hashing some data type. A suffix tree can +/// contain 1 or many strings, which can then be queried as one large string. +/// +/// The suffix tree is implemented using Ukkonen's algorithm for linear-time +/// suffix tree construction. Ukkonen's algorithm is explained in more detail +/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The +/// paper is available at +/// +/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +class SuffixTree { +public: + /// Stores each leaf node in the tree. + /// + /// This is used for finding outlining candidates. + std::vector LeafVector; + + /// Each element is an integer representing an instruction in the module. + ArrayRef Str; + +private: + /// Maintains each node in the tree. + SpecificBumpPtrAllocator NodeAllocator; + + /// The root of the suffix tree. + /// + /// The root represents the empty string. It is maintained by the + /// \p NodeAllocator like every other node in the tree. + SuffixTreeNode *Root = nullptr; + + /// Maintains the end indices of the internal nodes in the tree. + /// + /// Each internal node is guaranteed to never have its end index change + /// during the construction algorithm; however, leaves must be updated at + /// every step. Therefore, we need to store leaf end indices by reference + /// to avoid updating O(N) leaves at every step of construction. Thus, + /// every internal node must be allocated its own end index. + BumpPtrAllocator InternalEndIdxAllocator; + + /// The end index of each leaf in the tree. + unsigned LeafEndIdx = -1; + + /// Helper struct which keeps track of the next insertion point in + /// Ukkonen's algorithm. + struct ActiveState { + /// The next node to insert at. + SuffixTreeNode *Node; + + /// The index of the first character in the substring currently being added. + unsigned Idx = EmptyIdx; + + /// The length of the substring we have to add at the current step. + unsigned Len = 0; + }; + + /// The point the next insertion will take place at in the + /// construction algorithm. + ActiveState Active; + + /// Allocate a leaf node and add it to the tree. + /// + /// \param Parent The parent of this node. + /// \param StartIdx The start index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated leaf node. + SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx, + unsigned Edge) { + + assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); + + SuffixTreeNode *N = new (NodeAllocator.Allocate()) + SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr, &Parent); + Parent.Children[Edge] = N; + + return N; + } + + /// Allocate an internal node and add it to the tree. + /// + /// \param Parent The parent of this node. Only null when allocating the root. + /// \param StartIdx The start index of this node's associated string. + /// \param EndIdx The end index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated internal node. + SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx, + unsigned EndIdx, unsigned Edge) { + + assert(StartIdx <= EndIdx && "String can't start after it ends!"); + assert(!(!Parent && StartIdx != EmptyIdx) && + "Non-root internal nodes must have parents!"); + + unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx); + SuffixTreeNode *N = new (NodeAllocator.Allocate()) + SuffixTreeNode(StartIdx, E, Root, Parent); + if (Parent) + Parent->Children[Edge] = N; + + return N; + } + + /// Set the suffix indices of the leaves to the start indices of their + /// respective suffixes. Also stores each leaf in \p LeafVector at its + /// respective suffix index. + /// + /// \param[in] CurrNode The node currently being visited. + /// \param CurrIdx The current index of the string being visited. + void setSuffixIndices(SuffixTreeNode &CurrNode, unsigned CurrIdx) { + + bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot(); + + // Store the length of the concatenation of all strings from the root to + // this node. + if (!CurrNode.isRoot()) { + if (CurrNode.ConcatLen == 0) + CurrNode.ConcatLen = CurrNode.size(); + + if (CurrNode.Parent) + CurrNode.ConcatLen += CurrNode.Parent->ConcatLen; + } + + // Traverse the tree depth-first. + for (auto &ChildPair : CurrNode.Children) { + assert(ChildPair.second && "Node had a null child!"); + setSuffixIndices(*ChildPair.second, CurrIdx + ChildPair.second->size()); + } + + // Is this node a leaf? + if (IsLeaf) { + // If yes, give it a suffix index and bump its parent's occurrence count. + CurrNode.SuffixIdx = Str.size() - CurrIdx; + assert(CurrNode.Parent && "CurrNode had no parent!"); + CurrNode.Parent->OccurrenceCount++; + + // Store the leaf in the leaf vector for pruning later. + LeafVector[CurrNode.SuffixIdx] = &CurrNode; + } + } + + /// Construct the suffix tree for the prefix of the input ending at + /// \p EndIdx. + /// + /// Used to construct the full suffix tree iteratively. At the end of each + /// step, the constructed suffix tree is either a valid suffix tree, or a + /// suffix tree with implicit suffixes. At the end of the final step, the + /// suffix tree is a valid tree. + /// + /// \param EndIdx The end index of the current prefix in the main string. + /// \param SuffixesToAdd The number of suffixes that must be added + /// to complete the suffix tree at the current phase. + /// + /// \returns The number of suffixes that have not been added at the end of + /// this step. + unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd) { + SuffixTreeNode *NeedsLink = nullptr; + + while (SuffixesToAdd > 0) { + + // Are we waiting to add anything other than just the last character? + if (Active.Len == 0) { + // If not, then say the active index is the end index. + Active.Idx = EndIdx; + } + + assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); + + // The first character in the current substring we're looking at. + unsigned FirstChar = Str[Active.Idx]; + + // Have we inserted anything starting with FirstChar at the current node? + if (Active.Node->Children.count(FirstChar) == 0) { + // If not, then we can just insert a leaf and move too the next step. + insertLeaf(*Active.Node, EndIdx, FirstChar); + + // The active node is an internal node, and we visited it, so it must + // need a link if it doesn't have one. + if (NeedsLink) { + NeedsLink->Link = Active.Node; + NeedsLink = nullptr; + } + } else { + // There's a match with FirstChar, so look for the point in the tree to + // insert a new node. + SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; + + unsigned SubstringLen = NextNode->size(); + + // Is the current suffix we're trying to insert longer than the size of + // the child we want to move to? + if (Active.Len >= SubstringLen) { + // If yes, then consume the characters we've seen and move to the next + // node. + Active.Idx += SubstringLen; + Active.Len -= SubstringLen; + Active.Node = NextNode; + continue; + } + + // Otherwise, the suffix we're trying to insert must be contained in the + // next node we want to move to. + unsigned LastChar = Str[EndIdx]; + + // Is the string we're trying to insert a substring of the next node? + if (Str[NextNode->StartIdx + Active.Len] == LastChar) { + // If yes, then we're done for this step. Remember our insertion point + // and move to the next end index. At this point, we have an implicit + // suffix tree. + if (NeedsLink && !Active.Node->isRoot()) { + NeedsLink->Link = Active.Node; + NeedsLink = nullptr; + } + + Active.Len++; + break; + } + + // The string we're trying to insert isn't a substring of the next node, + // but matches up to a point. Split the node. + // + // For example, say we ended our search at a node n and we're trying to + // insert ABD. Then we'll create a new node s for AB, reduce n to just + // representing C, and insert a new leaf node l to represent d. This + // allows us to ensure that if n was a leaf, it remains a leaf. + // + // | ABC ---split---> | AB + // n s + // C / \ D + // n l + + // The node s from the diagram + SuffixTreeNode *SplitNode = + insertInternalNode(Active.Node, NextNode->StartIdx, + NextNode->StartIdx + Active.Len - 1, FirstChar); + + // Insert the new node representing the new substring into the tree as + // a child of the split node. This is the node l from the diagram. + insertLeaf(*SplitNode, EndIdx, LastChar); + + // Make the old node a child of the split node and update its start + // index. This is the node n from the diagram. + NextNode->StartIdx += Active.Len; + NextNode->Parent = SplitNode; + SplitNode->Children[Str[NextNode->StartIdx]] = NextNode; + + // SplitNode is an internal node, update the suffix link. + if (NeedsLink) + NeedsLink->Link = SplitNode; + + NeedsLink = SplitNode; + } + + // We've added something new to the tree, so there's one less suffix to + // add. + SuffixesToAdd--; + + if (Active.Node->isRoot()) { + if (Active.Len > 0) { + Active.Len--; + Active.Idx = EndIdx - SuffixesToAdd + 1; + } + } else { + // Start the next phase at the next smallest suffix. + Active.Node = Active.Node->Link; + } + } + + return SuffixesToAdd; + } + +public: + /// Construct a suffix tree from a sequence of unsigned integers. + /// + /// \param Str The string to construct the suffix tree for. + SuffixTree(const std::vector &Str) : Str(Str) { + Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0); + Root->IsInTree = true; + Active.Node = Root; + LeafVector = std::vector(Str.size()); + + // Keep track of the number of suffixes we have to add of the current + // prefix. + unsigned SuffixesToAdd = 0; + Active.Node = Root; + + // Construct the suffix tree iteratively on each prefix of the string. + // PfxEndIdx is the end index of the current prefix. + // End is one past the last element in the string. + for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; + PfxEndIdx++) { + SuffixesToAdd++; + LeafEndIdx = PfxEndIdx; // Extend each of the leaves. + SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd); + } + + // Set the suffix indices of each leaf. + assert(Root && "Root node can't be nullptr!"); + setSuffixIndices(*Root, 0); + } +}; + +} // namespace + +#endif Index: lib/CodeGen/MachineOutliner.cpp =================================================================== --- lib/CodeGen/MachineOutliner.cpp +++ lib/CodeGen/MachineOutliner.cpp @@ -58,6 +58,7 @@ #include "llvm/CodeGen/MachineOutliner.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SuffixTree.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -101,425 +102,6 @@ namespace { -/// Represents an undefined index in the suffix tree. -const unsigned EmptyIdx = -1; - -/// A node in a suffix tree which represents a substring or suffix. -/// -/// Each node has either no children or at least two children, with the root -/// being a exception in the empty tree. -/// -/// Children are represented as a map between unsigned integers and nodes. If -/// a node N has a child M on unsigned integer k, then the mapping represented -/// by N is a proper prefix of the mapping represented by M. Note that this, -/// although similar to a trie is somewhat different: each node stores a full -/// substring of the full mapping rather than a single character state. -/// -/// Each internal node contains a pointer to the internal node representing -/// the same string, but with the first character chopped off. This is stored -/// in \p Link. Each leaf node stores the start index of its respective -/// suffix in \p SuffixIdx. -struct SuffixTreeNode { - - /// The children of this node. - /// - /// A child existing on an unsigned integer implies that from the mapping - /// represented by the current node, there is a way to reach another - /// mapping by tacking that character on the end of the current string. - DenseMap Children; - - /// A flag set to false if the node has been pruned from the tree. - bool IsInTree = true; - - /// The start index of this node's substring in the main string. - unsigned StartIdx = EmptyIdx; - - /// The end index of this node's substring in the main string. - /// - /// Every leaf node must have its \p EndIdx incremented at the end of every - /// step in the construction algorithm. To avoid having to update O(N) - /// nodes individually at the end of every step, the end index is stored - /// as a pointer. - unsigned *EndIdx = nullptr; - - /// For leaves, the start index of the suffix represented by this node. - /// - /// For all other nodes, this is ignored. - unsigned SuffixIdx = EmptyIdx; - - /// For internal nodes, a pointer to the internal node representing - /// the same sequence with the first character chopped off. - /// - /// This acts as a shortcut in Ukkonen's algorithm. One of the things that - /// Ukkonen's algorithm does to achieve linear-time construction is - /// keep track of which node the next insert should be at. This makes each - /// insert O(1), and there are a total of O(N) inserts. The suffix link - /// helps with inserting children of internal nodes. - /// - /// Say we add a child to an internal node with associated mapping S. The - /// next insertion must be at the node representing S - its first character. - /// This is given by the way that we iteratively build the tree in Ukkonen's - /// algorithm. The main idea is to look at the suffixes of each prefix in the - /// string, starting with the longest suffix of the prefix, and ending with - /// the shortest. Therefore, if we keep pointers between such nodes, we can - /// move to the next insertion point in O(1) time. If we don't, then we'd - /// have to query from the root, which takes O(N) time. This would make the - /// construction algorithm O(N^2) rather than O(N). - SuffixTreeNode *Link = nullptr; - - /// The parent of this node. Every node except for the root has a parent. - SuffixTreeNode *Parent = nullptr; - - /// The number of times this node's string appears in the tree. - /// - /// This is equal to the number of leaf children of the string. It represents - /// the number of suffixes that the node's string is a prefix of. - unsigned OccurrenceCount = 0; - - /// The length of the string formed by concatenating the edge labels from the - /// root to this node. - unsigned ConcatLen = 0; - - /// Returns true if this node is a leaf. - bool isLeaf() const { return SuffixIdx != EmptyIdx; } - - /// Returns true if this node is the root of its owning \p SuffixTree. - bool isRoot() const { return StartIdx == EmptyIdx; } - - /// Return the number of elements in the substring associated with this node. - size_t size() const { - - // Is it the root? If so, it's the empty string so return 0. - if (isRoot()) - return 0; - - assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); - - // Size = the number of elements in the string. - // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. - return *EndIdx - StartIdx + 1; - } - - SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link, - SuffixTreeNode *Parent) - : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {} - - SuffixTreeNode() {} -}; - -/// A data structure for fast substring queries. -/// -/// Suffix trees represent the suffixes of their input strings in their leaves. -/// A suffix tree is a type of compressed trie structure where each node -/// represents an entire substring rather than a single character. Each leaf -/// of the tree is a suffix. -/// -/// A suffix tree can be seen as a type of state machine where each state is a -/// substring of the full string. The tree is structured so that, for a string -/// of length N, there are exactly N leaves in the tree. This structure allows -/// us to quickly find repeated substrings of the input string. -/// -/// In this implementation, a "string" is a vector of unsigned integers. -/// These integers may result from hashing some data type. A suffix tree can -/// contain 1 or many strings, which can then be queried as one large string. -/// -/// The suffix tree is implemented using Ukkonen's algorithm for linear-time -/// suffix tree construction. Ukkonen's algorithm is explained in more detail -/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The -/// paper is available at -/// -/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf -class SuffixTree { -public: - /// Stores each leaf node in the tree. - /// - /// This is used for finding outlining candidates. - std::vector LeafVector; - - /// Each element is an integer representing an instruction in the module. - ArrayRef Str; - -private: - /// Maintains each node in the tree. - SpecificBumpPtrAllocator NodeAllocator; - - /// The root of the suffix tree. - /// - /// The root represents the empty string. It is maintained by the - /// \p NodeAllocator like every other node in the tree. - SuffixTreeNode *Root = nullptr; - - /// Maintains the end indices of the internal nodes in the tree. - /// - /// Each internal node is guaranteed to never have its end index change - /// during the construction algorithm; however, leaves must be updated at - /// every step. Therefore, we need to store leaf end indices by reference - /// to avoid updating O(N) leaves at every step of construction. Thus, - /// every internal node must be allocated its own end index. - BumpPtrAllocator InternalEndIdxAllocator; - - /// The end index of each leaf in the tree. - unsigned LeafEndIdx = -1; - - /// Helper struct which keeps track of the next insertion point in - /// Ukkonen's algorithm. - struct ActiveState { - /// The next node to insert at. - SuffixTreeNode *Node; - - /// The index of the first character in the substring currently being added. - unsigned Idx = EmptyIdx; - - /// The length of the substring we have to add at the current step. - unsigned Len = 0; - }; - - /// The point the next insertion will take place at in the - /// construction algorithm. - ActiveState Active; - - /// Allocate a leaf node and add it to the tree. - /// - /// \param Parent The parent of this node. - /// \param StartIdx The start index of this node's associated string. - /// \param Edge The label on the edge leaving \p Parent to this node. - /// - /// \returns A pointer to the allocated leaf node. - SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx, - unsigned Edge) { - - assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); - - SuffixTreeNode *N = new (NodeAllocator.Allocate()) - SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr, &Parent); - Parent.Children[Edge] = N; - - return N; - } - - /// Allocate an internal node and add it to the tree. - /// - /// \param Parent The parent of this node. Only null when allocating the root. - /// \param StartIdx The start index of this node's associated string. - /// \param EndIdx The end index of this node's associated string. - /// \param Edge The label on the edge leaving \p Parent to this node. - /// - /// \returns A pointer to the allocated internal node. - SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx, - unsigned EndIdx, unsigned Edge) { - - assert(StartIdx <= EndIdx && "String can't start after it ends!"); - assert(!(!Parent && StartIdx != EmptyIdx) && - "Non-root internal nodes must have parents!"); - - unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx); - SuffixTreeNode *N = new (NodeAllocator.Allocate()) - SuffixTreeNode(StartIdx, E, Root, Parent); - if (Parent) - Parent->Children[Edge] = N; - - return N; - } - - /// Set the suffix indices of the leaves to the start indices of their - /// respective suffixes. Also stores each leaf in \p LeafVector at its - /// respective suffix index. - /// - /// \param[in] CurrNode The node currently being visited. - /// \param CurrIdx The current index of the string being visited. - void setSuffixIndices(SuffixTreeNode &CurrNode, unsigned CurrIdx) { - - bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot(); - - // Store the length of the concatenation of all strings from the root to - // this node. - if (!CurrNode.isRoot()) { - if (CurrNode.ConcatLen == 0) - CurrNode.ConcatLen = CurrNode.size(); - - if (CurrNode.Parent) - CurrNode.ConcatLen += CurrNode.Parent->ConcatLen; - } - - // Traverse the tree depth-first. - for (auto &ChildPair : CurrNode.Children) { - assert(ChildPair.second && "Node had a null child!"); - setSuffixIndices(*ChildPair.second, CurrIdx + ChildPair.second->size()); - } - - // Is this node a leaf? - if (IsLeaf) { - // If yes, give it a suffix index and bump its parent's occurrence count. - CurrNode.SuffixIdx = Str.size() - CurrIdx; - assert(CurrNode.Parent && "CurrNode had no parent!"); - CurrNode.Parent->OccurrenceCount++; - - // Store the leaf in the leaf vector for pruning later. - LeafVector[CurrNode.SuffixIdx] = &CurrNode; - } - } - - /// Construct the suffix tree for the prefix of the input ending at - /// \p EndIdx. - /// - /// Used to construct the full suffix tree iteratively. At the end of each - /// step, the constructed suffix tree is either a valid suffix tree, or a - /// suffix tree with implicit suffixes. At the end of the final step, the - /// suffix tree is a valid tree. - /// - /// \param EndIdx The end index of the current prefix in the main string. - /// \param SuffixesToAdd The number of suffixes that must be added - /// to complete the suffix tree at the current phase. - /// - /// \returns The number of suffixes that have not been added at the end of - /// this step. - unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd) { - SuffixTreeNode *NeedsLink = nullptr; - - while (SuffixesToAdd > 0) { - - // Are we waiting to add anything other than just the last character? - if (Active.Len == 0) { - // If not, then say the active index is the end index. - Active.Idx = EndIdx; - } - - assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); - - // The first character in the current substring we're looking at. - unsigned FirstChar = Str[Active.Idx]; - - // Have we inserted anything starting with FirstChar at the current node? - if (Active.Node->Children.count(FirstChar) == 0) { - // If not, then we can just insert a leaf and move too the next step. - insertLeaf(*Active.Node, EndIdx, FirstChar); - - // The active node is an internal node, and we visited it, so it must - // need a link if it doesn't have one. - if (NeedsLink) { - NeedsLink->Link = Active.Node; - NeedsLink = nullptr; - } - } else { - // There's a match with FirstChar, so look for the point in the tree to - // insert a new node. - SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; - - unsigned SubstringLen = NextNode->size(); - - // Is the current suffix we're trying to insert longer than the size of - // the child we want to move to? - if (Active.Len >= SubstringLen) { - // If yes, then consume the characters we've seen and move to the next - // node. - Active.Idx += SubstringLen; - Active.Len -= SubstringLen; - Active.Node = NextNode; - continue; - } - - // Otherwise, the suffix we're trying to insert must be contained in the - // next node we want to move to. - unsigned LastChar = Str[EndIdx]; - - // Is the string we're trying to insert a substring of the next node? - if (Str[NextNode->StartIdx + Active.Len] == LastChar) { - // If yes, then we're done for this step. Remember our insertion point - // and move to the next end index. At this point, we have an implicit - // suffix tree. - if (NeedsLink && !Active.Node->isRoot()) { - NeedsLink->Link = Active.Node; - NeedsLink = nullptr; - } - - Active.Len++; - break; - } - - // The string we're trying to insert isn't a substring of the next node, - // but matches up to a point. Split the node. - // - // For example, say we ended our search at a node n and we're trying to - // insert ABD. Then we'll create a new node s for AB, reduce n to just - // representing C, and insert a new leaf node l to represent d. This - // allows us to ensure that if n was a leaf, it remains a leaf. - // - // | ABC ---split---> | AB - // n s - // C / \ D - // n l - - // The node s from the diagram - SuffixTreeNode *SplitNode = - insertInternalNode(Active.Node, NextNode->StartIdx, - NextNode->StartIdx + Active.Len - 1, FirstChar); - - // Insert the new node representing the new substring into the tree as - // a child of the split node. This is the node l from the diagram. - insertLeaf(*SplitNode, EndIdx, LastChar); - - // Make the old node a child of the split node and update its start - // index. This is the node n from the diagram. - NextNode->StartIdx += Active.Len; - NextNode->Parent = SplitNode; - SplitNode->Children[Str[NextNode->StartIdx]] = NextNode; - - // SplitNode is an internal node, update the suffix link. - if (NeedsLink) - NeedsLink->Link = SplitNode; - - NeedsLink = SplitNode; - } - - // We've added something new to the tree, so there's one less suffix to - // add. - SuffixesToAdd--; - - if (Active.Node->isRoot()) { - if (Active.Len > 0) { - Active.Len--; - Active.Idx = EndIdx - SuffixesToAdd + 1; - } - } else { - // Start the next phase at the next smallest suffix. - Active.Node = Active.Node->Link; - } - } - - return SuffixesToAdd; - } - -public: - /// Construct a suffix tree from a sequence of unsigned integers. - /// - /// \param Str The string to construct the suffix tree for. - SuffixTree(const std::vector &Str) : Str(Str) { - Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0); - Root->IsInTree = true; - Active.Node = Root; - LeafVector = std::vector(Str.size()); - - // Keep track of the number of suffixes we have to add of the current - // prefix. - unsigned SuffixesToAdd = 0; - Active.Node = Root; - - // Construct the suffix tree iteratively on each prefix of the string. - // PfxEndIdx is the end index of the current prefix. - // End is one past the last element in the string. - for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; - PfxEndIdx++) { - SuffixesToAdd++; - LeafEndIdx = PfxEndIdx; // Extend each of the leaves. - SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd); - } - - // Set the suffix indices of each leaf. - assert(Root && "Root node can't be nullptr!"); - setSuffixIndices(*Root, 0); - } -}; - /// Maps \p MachineInstrs to unsigned integers and stores the mappings. struct InstructionMapper {