//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Suffix Tree class.
//
//===----------------------------------------------------------------------===//
13 #include "llvm/Support/SuffixTree.h"
14 #include "llvm/Support/Allocator.h"
19 SuffixTree::SuffixTree(const std::vector
<unsigned> &Str
) : Str(Str
) {
20 Root
= insertInternalNode(nullptr, EmptyIdx
, EmptyIdx
, 0);
23 // Keep track of the number of suffixes we have to add of the current
25 unsigned SuffixesToAdd
= 0;
27 // Construct the suffix tree iteratively on each prefix of the string.
28 // PfxEndIdx is the end index of the current prefix.
29 // End is one past the last element in the string.
30 for (unsigned PfxEndIdx
= 0, End
= Str
.size(); PfxEndIdx
< End
; PfxEndIdx
++) {
32 LeafEndIdx
= PfxEndIdx
; // Extend each of the leaves.
33 SuffixesToAdd
= extend(PfxEndIdx
, SuffixesToAdd
);
36 // Set the suffix indices of each leaf.
37 assert(Root
&& "Root node can't be nullptr!");
41 SuffixTreeNode
*SuffixTree::insertLeaf(SuffixTreeNode
&Parent
,
42 unsigned StartIdx
, unsigned Edge
) {
44 assert(StartIdx
<= LeafEndIdx
&& "String can't start after it ends!");
46 SuffixTreeNode
*N
= new (NodeAllocator
.Allocate())
47 SuffixTreeNode(StartIdx
, &LeafEndIdx
, nullptr);
48 Parent
.Children
[Edge
] = N
;
53 SuffixTreeNode
*SuffixTree::insertInternalNode(SuffixTreeNode
*Parent
,
55 unsigned EndIdx
, unsigned Edge
) {
57 assert(StartIdx
<= EndIdx
&& "String can't start after it ends!");
58 assert(!(!Parent
&& StartIdx
!= EmptyIdx
) &&
59 "Non-root internal nodes must have parents!");
61 unsigned *E
= new (InternalEndIdxAllocator
) unsigned(EndIdx
);
63 new (NodeAllocator
.Allocate()) SuffixTreeNode(StartIdx
, E
, Root
);
65 Parent
->Children
[Edge
] = N
;
70 void SuffixTree::setSuffixIndices() {
71 // List of nodes we need to visit along with the current length of the
73 std::vector
<std::pair
<SuffixTreeNode
*, unsigned>> ToVisit
;
75 // Current node being visited.
76 SuffixTreeNode
*CurrNode
= Root
;
78 // Sum of the lengths of the nodes down the path to the current one.
79 unsigned CurrNodeLen
= 0;
80 ToVisit
.push_back({CurrNode
, CurrNodeLen
});
81 while (!ToVisit
.empty()) {
82 std::tie(CurrNode
, CurrNodeLen
) = ToVisit
.back();
84 CurrNode
->ConcatLen
= CurrNodeLen
;
85 for (auto &ChildPair
: CurrNode
->Children
) {
86 assert(ChildPair
.second
&& "Node had a null child!");
88 {ChildPair
.second
, CurrNodeLen
+ ChildPair
.second
->size()});
91 // No children, so we are at the end of the string.
92 if (CurrNode
->Children
.size() == 0 && !CurrNode
->isRoot())
93 CurrNode
->SuffixIdx
= Str
.size() - CurrNodeLen
;
97 unsigned SuffixTree::extend(unsigned EndIdx
, unsigned SuffixesToAdd
) {
98 SuffixTreeNode
*NeedsLink
= nullptr;
100 while (SuffixesToAdd
> 0) {
102 // Are we waiting to add anything other than just the last character?
103 if (Active
.Len
== 0) {
104 // If not, then say the active index is the end index.
108 assert(Active
.Idx
<= EndIdx
&& "Start index can't be after end index!");
110 // The first character in the current substring we're looking at.
111 unsigned FirstChar
= Str
[Active
.Idx
];
113 // Have we inserted anything starting with FirstChar at the current node?
114 if (Active
.Node
->Children
.count(FirstChar
) == 0) {
115 // If not, then we can just insert a leaf and move to the next step.
116 insertLeaf(*Active
.Node
, EndIdx
, FirstChar
);
118 // The active node is an internal node, and we visited it, so it must
119 // need a link if it doesn't have one.
121 NeedsLink
->Link
= Active
.Node
;
125 // There's a match with FirstChar, so look for the point in the tree to
126 // insert a new node.
127 SuffixTreeNode
*NextNode
= Active
.Node
->Children
[FirstChar
];
129 unsigned SubstringLen
= NextNode
->size();
131 // Is the current suffix we're trying to insert longer than the size of
132 // the child we want to move to?
133 if (Active
.Len
>= SubstringLen
) {
134 // If yes, then consume the characters we've seen and move to the next
136 Active
.Idx
+= SubstringLen
;
137 Active
.Len
-= SubstringLen
;
138 Active
.Node
= NextNode
;
142 // Otherwise, the suffix we're trying to insert must be contained in the
143 // next node we want to move to.
144 unsigned LastChar
= Str
[EndIdx
];
146 // Is the string we're trying to insert a substring of the next node?
147 if (Str
[NextNode
->StartIdx
+ Active
.Len
] == LastChar
) {
148 // If yes, then we're done for this step. Remember our insertion point
149 // and move to the next end index. At this point, we have an implicit
151 if (NeedsLink
&& !Active
.Node
->isRoot()) {
152 NeedsLink
->Link
= Active
.Node
;
160 // The string we're trying to insert isn't a substring of the next node,
161 // but matches up to a point. Split the node.
163 // For example, say we ended our search at a node n and we're trying to
164 // insert ABD. Then we'll create a new node s for AB, reduce n to just
165 // representing C, and insert a new leaf node l to represent d. This
166 // allows us to ensure that if n was a leaf, it remains a leaf.
168 // | ABC ---split---> | AB
173 // The node s from the diagram
174 SuffixTreeNode
*SplitNode
=
175 insertInternalNode(Active
.Node
, NextNode
->StartIdx
,
176 NextNode
->StartIdx
+ Active
.Len
- 1, FirstChar
);
178 // Insert the new node representing the new substring into the tree as
179 // a child of the split node. This is the node l from the diagram.
180 insertLeaf(*SplitNode
, EndIdx
, LastChar
);
182 // Make the old node a child of the split node and update its start
183 // index. This is the node n from the diagram.
184 NextNode
->StartIdx
+= Active
.Len
;
185 SplitNode
->Children
[Str
[NextNode
->StartIdx
]] = NextNode
;
187 // SplitNode is an internal node, update the suffix link.
189 NeedsLink
->Link
= SplitNode
;
191 NeedsLink
= SplitNode
;
194 // We've added something new to the tree, so there's one less suffix to
198 if (Active
.Node
->isRoot()) {
199 if (Active
.Len
> 0) {
201 Active
.Idx
= EndIdx
- SuffixesToAdd
+ 1;
204 // Start the next phase at the next smallest suffix.
205 Active
.Node
= Active
.Node
->Link
;
209 return SuffixesToAdd
;