1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the Suffix Tree class.
11 //===----------------------------------------------------------------------===//
13 #include "llvm/Support/SuffixTree.h"
14 #include "llvm/Support/Allocator.h"
15 #include "llvm/Support/Casting.h"
16 #include "llvm/Support/SuffixTreeNode.h"
/// Helper that measures the substring a suffix tree node stands for, from the
/// node's start and end indices. Asserts that \p N is non-null.
20 /// \returns the number of elements in the substring associated with \p N.
21 static size_t numElementsInSubstring(const SuffixTreeNode
*N
) {
22 assert(N
&& "Got a null node?");
// Internal nodes are checked for being the root before the index arithmetic.
// NOTE(review): the statement that handles the root case is not visible in
// this listing — confirm what the root yields.
23 if (auto *Internal
= dyn_cast
<SuffixTreeInternalNode
>(N
))
24 if (Internal
->isRoot())
// End minus start plus one, i.e. the end index is counted as part of the
// substring.
26 return N
->getEndIdx() - N
->getStartIdx() + 1;
/// Builds the suffix tree over \p Str by calling extend() once for every
/// prefix end index of the string.
///
/// \param Str the sequence of unsigned values the tree is built from; stored
/// in the member of the same name.
/// \param OutlinerLeafDescendants stored in the member of the same name and
/// tested at the end of construction, before the leaf-descendant bookkeeping.
29 SuffixTree::SuffixTree(const ArrayRef
<unsigned> &Str
,
30 bool OutlinerLeafDescendants
)
31 : Str(Str
), OutlinerLeafDescendants(OutlinerLeafDescendants
) {
35 // Keep track of the number of suffixes we have to add of the current prefix.
37 unsigned SuffixesToAdd
= 0;
39 // Construct the suffix tree iteratively on each prefix of the string.
40 // PfxEndIdx is the end index of the current prefix.
41 // End is one past the last element in the string.
42 for (unsigned PfxEndIdx
= 0, End
= Str
.size(); PfxEndIdx
< End
; PfxEndIdx
++) {
44 LeafEndIdx
= PfxEndIdx
; // Extend each of the leaves.
// extend() hands back the number of suffixes that are still pending; that
// count is carried into the next iteration.
45 SuffixesToAdd
= extend(PfxEndIdx
, SuffixesToAdd
);
48 // Set the suffix indices of each leaf.
49 assert(Root
&& "Root node can't be nullptr!");
52 // Collect all leaf nodes of the suffix tree. And for each internal node,
53 // record the range of leaf nodes that are descendants of it.
54 if (OutlinerLeafDescendants
)
/// Creates a leaf node and registers it as a child of \p Parent.
///
/// \param Parent internal node that receives the new leaf.
/// \param StartIdx start index of the leaf's substring; asserted to be no
/// greater than LeafEndIdx.
/// \param Edge key under which the leaf is stored in Parent.Children.
/// NOTE(review): the return statement is not visible in this listing;
/// presumably the new leaf is returned — confirm.
58 SuffixTreeNode
*SuffixTree::insertLeaf(SuffixTreeInternalNode
&Parent
,
59 unsigned StartIdx
, unsigned Edge
) {
60 assert(StartIdx
<= LeafEndIdx
&& "String can't start after it ends!");
// The leaf is placement-constructed in storage from LeafNodeAllocator and is
// given a pointer to LeafEndIdx rather than a copy of its value.
61 auto *N
= new (LeafNodeAllocator
.Allocate())
62 SuffixTreeLeafNode(StartIdx
, &LeafEndIdx
);
63 Parent
.Children
[Edge
] = N
;
/// Creates an internal node for the index range [\p StartIdx, \p EndIdx] and
/// stores it in Parent->Children under Edge.
///
/// \param Parent node that receives the new child; may be null only when
/// \p StartIdx is SuffixTreeNode::EmptyIdx (see the second assert).
/// \param StartIdx start index of the node's substring.
/// \param EndIdx end index of the node's substring.
/// NOTE(review): the declaration of the Edge parameter and the return
/// statement are not visible in this listing.
67 SuffixTreeInternalNode
*
68 SuffixTree::insertInternalNode(SuffixTreeInternalNode
*Parent
,
69 unsigned StartIdx
, unsigned EndIdx
,
71 assert(StartIdx
<= EndIdx
&& "String can't start after it ends!");
// A null Parent is accepted only together with StartIdx == EmptyIdx, which
// is the combination insertRoot() passes.
72 assert(!(!Parent
&& StartIdx
!= SuffixTreeNode::EmptyIdx
) &&
73 "Non-root internal nodes must have parents!");
// The node is placement-constructed in storage from InternalNodeAllocator.
// Root is passed as the third constructor argument — presumably the initial
// suffix link; confirm against SuffixTreeInternalNode.
74 auto *N
= new (InternalNodeAllocator
.Allocate())
75 SuffixTreeInternalNode(StartIdx
, EndIdx
, Root
);
// NOTE(review): insertRoot() calls this with a null Parent, so this store
// has to be guarded; the guard is not visible in this listing — confirm.
77 Parent
->Children
[Edge
] = N
;
/// Creates the root of the tree by delegating to insertInternalNode() with no
/// parent, SuffixTreeNode::EmptyIdx for both the start and end index, and
/// edge 0.
/// \returns whatever insertInternalNode() returns for that request.
81 SuffixTreeInternalNode
*SuffixTree::insertRoot() {
82 return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx
,
83 SuffixTreeNode::EmptyIdx
, /*Edge = */ 0);
/// Walks the tree from Root using an explicit stack of (node, length) pairs.
/// Every visited node gets its concatenated length recorded via
/// setConcatLen(), and every leaf gets its suffix index set to
/// Str.size() minus that length.
86 void SuffixTree::setSuffixIndices() {
87 // List of nodes we need to visit along with the current length of the string.
89 SmallVector
<std::pair
<SuffixTreeNode
*, unsigned>> ToVisit
;
91 // Current node being visited.
92 SuffixTreeNode
*CurrNode
= Root
;
94 // Sum of the lengths of the nodes down the path to the current one.
95 unsigned CurrNodeLen
= 0;
96 ToVisit
.push_back({CurrNode
, CurrNodeLen
});
97 while (!ToVisit
.empty()) {
// Unpack the entry on top of the stack into the two working variables.
98 std::tie(CurrNode
, CurrNodeLen
) = ToVisit
.back();
100 // Length of the current node from the root down to here.
101 CurrNode
->setConcatLen(CurrNodeLen
);
102 if (auto *InternalNode
= dyn_cast
<SuffixTreeInternalNode
>(CurrNode
))
103 for (auto &ChildPair
: InternalNode
->Children
) {
104 assert(ChildPair
.second
&& "Node had a null child!");
// The length associated with a child is the length so far plus the size of
// the child's own substring.
107 CurrNodeLen
+ numElementsInSubstring(ChildPair
.second
)});
109 // No children, so we are at the end of the string.
110 if (auto *LeafNode
= dyn_cast
<SuffixTreeLeafNode
>(CurrNode
))
111 LeafNode
->setSuffixIdx(Str
.size() - CurrNodeLen
);
/// Walks the tree in post-order with an explicit stack. Leaves are appended
/// to LeafNodes and get LeftLeafIdx == RightLeafIdx == LeafCounter; internal
/// nodes take their LeftLeafIdx from their last child and their RightLeafIdx
/// from their first child once the children have been processed.
/// NOTE(review): the declarator of the DenseMap (used below as ChildrenMap),
/// the else branches, and the update of LeafCounter are not visible in this
/// listing.
115 void SuffixTree::setLeafNodes() {
116 // A stack that keeps track of nodes to visit for post-order DFS traversal.
117 SmallVector
<SuffixTreeNode
*> ToVisit
;
118 ToVisit
.push_back(Root
);
120 // This keeps track of the index of the next leaf node to be added to
121 // the LeafNodes vector of the suffix tree.
122 unsigned LeafCounter
= 0;
124 // This keeps track of nodes whose children have been added to the stack.
125 // The value is a pair, representing a node's first and last children.
126 DenseMap
<SuffixTreeInternalNode
*,
127 std::pair
<SuffixTreeNode
*, SuffixTreeNode
*>>
130 // Traverse the tree in post-order.
131 while (!ToVisit
.empty()) {
132 SuffixTreeNode
*CurrNode
= ToVisit
.pop_back_val();
133 if (auto *CurrInternalNode
= dyn_cast
<SuffixTreeInternalNode
>(CurrNode
)) {
134 // The current node is an internal node.
// Presence in ChildrenMap is what distinguishes the second visit of an
// internal node from the first.
135 auto I
= ChildrenMap
.find(CurrInternalNode
);
136 if (I
== ChildrenMap
.end()) {
137 // This is the first time we visit this node.
138 // Its children have not been added to the stack yet.
139 // We add current node back, and add its children to the stack.
140 // We keep track of the first and last children of the current node.
141 auto J
= CurrInternalNode
->Children
.begin();
142 if (J
!= CurrInternalNode
->Children
.end()) {
143 ToVisit
.push_back(CurrNode
);
144 SuffixTreeNode
*FirstChild
= J
->second
;
145 SuffixTreeNode
*LastChild
= nullptr;
// After this loop LastChild is the final child in iteration order.
146 for (; J
!= CurrInternalNode
->Children
.end(); ++J
) {
147 LastChild
= J
->second
;
148 ToVisit
.push_back(LastChild
);
150 ChildrenMap
[CurrInternalNode
] = {FirstChild
, LastChild
};
153 // This is the second time we visit this node.
154 // All of its children have already been processed.
155 // Now, we can set its LeftLeafIdx and RightLeafIdx;
156 auto [FirstChild
, LastChild
] = I
->second
;
157 // Get the first child to use its RightLeafIdx.
158 // The first child is the first one added to the stack, so it is
159 // the last one to be processed. Hence, the leaf descendants
160 // of the first child are assigned the largest index numbers.
161 CurrNode
->setRightLeafIdx(FirstChild
->getRightLeafIdx());
162 // Get the last child to use its LeftLeafIdx.
163 CurrNode
->setLeftLeafIdx(LastChild
->getLeftLeafIdx());
164 assert(CurrNode
->getLeftLeafIdx() <= CurrNode
->getRightLeafIdx() &&
165 "LeftLeafIdx should not be larger than RightLeafIdx");
168 // The current node is a leaf node.
169 // We can simply set its LeftLeafIdx and RightLeafIdx.
170 CurrNode
->setLeftLeafIdx(LeafCounter
);
171 CurrNode
->setRightLeafIdx(LeafCounter
);
// Leaves are also recorded in LeafNodes in the order they are reached.
173 auto *CurrLeafNode
= cast
<SuffixTreeLeafNode
>(CurrNode
);
174 LeafNodes
.push_back(CurrLeafNode
);
/// Performs the extension step for the prefix that ends at \p EndIdx: while
/// there are suffixes left to add, it inserts leaves or splits an edge at the
/// active point (Active.Node, Active.Idx, Active.Len) and maintains suffix
/// links through NeedsLink.
///
/// \param EndIdx index into Str of the element being appended in this step.
/// \param SuffixesToAdd number of suffixes that still have to be inserted.
/// \returns the value of SuffixesToAdd when the loop is left.
/// NOTE(review): several statements (loop exits, the decrement of
/// SuffixesToAdd, some guards) are not visible in this listing.
179 unsigned SuffixTree::extend(unsigned EndIdx
, unsigned SuffixesToAdd
) {
// Internal node visited in this step that may still need its link set.
180 SuffixTreeInternalNode
*NeedsLink
= nullptr;
182 while (SuffixesToAdd
> 0) {
184 // Are we waiting to add anything other than just the last character?
185 if (Active
.Len
== 0) {
186 // If not, then say the active index is the end index.
190 assert(Active
.Idx
<= EndIdx
&& "Start index can't be after end index!");
192 // The first character in the current substring we're looking at.
193 unsigned FirstChar
= Str
[Active
.Idx
];
195 // Have we inserted anything starting with FirstChar at the current node?
196 if (Active
.Node
->Children
.count(FirstChar
) == 0) {
197 // If not, then we can just insert a leaf and move to the next step.
198 insertLeaf(*Active
.Node
, EndIdx
, FirstChar
);
200 // The active node is an internal node, and we visited it, so it must
201 // need a link if it doesn't have one.
203 NeedsLink
->setLink(Active
.Node
);
207 // There's a match with FirstChar, so look for the point in the tree to
208 // insert a new node.
209 SuffixTreeNode
*NextNode
= Active
.Node
->Children
[FirstChar
];
211 unsigned SubstringLen
= numElementsInSubstring(NextNode
);
213 // Is the current suffix we're trying to insert longer than the size of
214 // the child we want to move to?
215 if (Active
.Len
>= SubstringLen
) {
216 // If yes, then consume the characters we've seen and move to the next node.
218 assert(isa
<SuffixTreeInternalNode
>(NextNode
) &&
219 "Expected an internal node?");
// Skip over the whole child: advance the active index, shrink the active
// length by the same amount, and make the child the active node.
220 Active
.Idx
+= SubstringLen
;
221 Active
.Len
-= SubstringLen
;
222 Active
.Node
= cast
<SuffixTreeInternalNode
>(NextNode
);
226 // Otherwise, the suffix we're trying to insert must be contained in the
227 // next node we want to move to.
228 unsigned LastChar
= Str
[EndIdx
];
230 // Is the string we're trying to insert a substring of the next node?
231 if (Str
[NextNode
->getStartIdx() + Active
.Len
] == LastChar
) {
232 // If yes, then we're done for this step. Remember our insertion point
233 // and move to the next end index. At this point, we have an implicit suffix tree.
235 if (NeedsLink
&& !Active
.Node
->isRoot()) {
236 NeedsLink
->setLink(Active
.Node
);
244 // The string we're trying to insert isn't a substring of the next node,
245 // but matches up to a point. Split the node.
247 // For example, say we ended our search at a node n and we're trying to
248 // insert ABD. Then we'll create a new node s for AB, reduce n to just
249 // representing C, and insert a new leaf node l to represent D. This
250 // allows us to ensure that if n was a leaf, it remains a leaf.
252 // | ABC ---split---> | AB
257 // The node s from the diagram
// s covers the first Active.Len elements of NextNode's substring and takes
// NextNode's place under the active node (same FirstChar edge).
258 SuffixTreeInternalNode
*SplitNode
= insertInternalNode(
259 Active
.Node
, NextNode
->getStartIdx(),
260 NextNode
->getStartIdx() + Active
.Len
- 1, FirstChar
);
262 // Insert the new node representing the new substring into the tree as
263 // a child of the split node. This is the node l from the diagram.
264 insertLeaf(*SplitNode
, EndIdx
, LastChar
);
266 // Make the old node a child of the split node and update its start
267 // index. This is the node n from the diagram.
268 NextNode
->incrementStartIdx(Active
.Len
);
// The edge key is read after the start index was advanced, so it is the
// first element of what remains of n.
269 SplitNode
->Children
[Str
[NextNode
->getStartIdx()]] = NextNode
;
271 // SplitNode is an internal node, update the suffix link.
273 NeedsLink
->setLink(SplitNode
);
275 NeedsLink
= SplitNode
;
278 // We've added something new to the tree, so there's one less suffix to add.
282 if (Active
.Node
->isRoot()) {
283 if (Active
.Len
> 0) {
285 Active
.Idx
= EndIdx
- SuffixesToAdd
+ 1;
288 // Start the next phase at the next smallest suffix.
289 Active
.Node
= Active
.Node
->getLink();
293 return SuffixesToAdd
;
/// Moves the iterator on to the next repeated substring.
/// RS is reset first. Internal nodes are then popped from
/// InternalNodesToVisit, their internal children are queued, and suffix start
/// indices are gathered for the popped node; the gathered indices are copied
/// into RS.StartIndices.
/// NOTE(review): the statements that skip a node or leave the loop are not
/// visible in this listing.
296 void SuffixTree::RepeatedSubstringIterator::advance() {
297 // Clear the current state. If we're at the end of the range, then this
298 // is the state we want to be in.
299 RS
= RepeatedSubstring();
302 // Each leaf node represents a repeat of a string.
303 SmallVector
<unsigned> RepeatedSubstringStarts
;
305 // Continue visiting nodes until we find one which repeats more than once.
306 while (!InternalNodesToVisit
.empty()) {
// The scratch vector is reused across iterations, so empty it first.
307 RepeatedSubstringStarts
.clear();
308 auto *Curr
= InternalNodesToVisit
.back();
309 InternalNodesToVisit
.pop_back();
311 // Keep track of the length of the string associated with the node. If
312 // it's too short, we'll quit.
313 unsigned Length
= Curr
->getConcatLen();
315 // Iterate over each child, saving internal nodes for visiting.
316 // Internal nodes represent individual strings, which may repeat.
317 for (auto &ChildPair
: Curr
->Children
)
318 // Save all of this node's children for processing.
319 if (auto *InternalChild
=
320 dyn_cast
<SuffixTreeInternalNode
>(ChildPair
.second
))
321 InternalNodesToVisit
.push_back(InternalChild
);
323 // If length of repeated substring is below threshold, then skip it.
324 if (Length
< MinLength
)
327 // The root never represents a repeated substring. If we're looking at
328 // that, then skip it.
332 // Collect leaf children or leaf descendants by OutlinerLeafDescendants.
333 if (OutlinerLeafDescendants
) {
// Every leaf in the node's [LeftLeafIdx, RightLeafIdx] range of LeafNodes
// contributes its suffix index.
334 for (unsigned I
= Curr
->getLeftLeafIdx(); I
<= Curr
->getRightLeafIdx();
336 RepeatedSubstringStarts
.push_back(LeafNodes
[I
]->getSuffixIdx());
// Here only children that are themselves leaves contribute.
338 for (auto &ChildPair
: Curr
->Children
)
339 if (auto *Leaf
= dyn_cast
<SuffixTreeLeafNode
>(ChildPair
.second
))
340 RepeatedSubstringStarts
.push_back(Leaf
->getSuffixIdx());
343 // Do we have any repeated substrings?
344 if (RepeatedSubstringStarts
.size() < 2)
347 // Yes. Update the state to reflect this, and then bail out.
350 for (unsigned StartIdx
: RepeatedSubstringStarts
)
351 RS
.StartIndices
.push_back(StartIdx
);
354 // At this point, either RS is an empty RepeatedSubstring, or it was
355 // set in the above loop. Similarly, N is either nullptr, or the node
356 // associated with RS.