[RISCV] Fix mgather -> riscv.masked.strided.load combine not extending indices (...
[llvm-project.git] / llvm / lib / Support / SuffixTree.cpp
blobeaa653078e09009c833d9fa70387f2f0e97f3b82
1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Suffix Tree class.
11 //===----------------------------------------------------------------------===//
13 #include "llvm/Support/SuffixTree.h"
14 #include "llvm/Support/Allocator.h"
15 #include "llvm/Support/Casting.h"
16 #include "llvm/Support/SuffixTreeNode.h"
18 using namespace llvm;
20 /// \returns the number of elements in the substring associated with \p N.
21 static size_t numElementsInSubstring(const SuffixTreeNode *N) {
22 assert(N && "Got a null node?");
23 if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
24 if (Internal->isRoot())
25 return 0;
26 return N->getEndIdx() - N->getStartIdx() + 1;
29 SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str) : Str(Str) {
30 Root = insertRoot();
31 Active.Node = Root;
33 // Keep track of the number of suffixes we have to add of the current
34 // prefix.
35 unsigned SuffixesToAdd = 0;
37 // Construct the suffix tree iteratively on each prefix of the string.
38 // PfxEndIdx is the end index of the current prefix.
39 // End is one past the last element in the string.
40 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
41 SuffixesToAdd++;
42 LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
43 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
46 // Set the suffix indices of each leaf.
47 assert(Root && "Root node can't be nullptr!");
48 setSuffixIndices();
51 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
52 unsigned StartIdx, unsigned Edge) {
53 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
54 auto *N = new (LeafNodeAllocator.Allocate())
55 SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
56 Parent.Children[Edge] = N;
57 return N;
60 SuffixTreeInternalNode *
61 SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
62 unsigned StartIdx, unsigned EndIdx,
63 unsigned Edge) {
64 assert(StartIdx <= EndIdx && "String can't start after it ends!");
65 assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) &&
66 "Non-root internal nodes must have parents!");
67 auto *N = new (InternalNodeAllocator.Allocate())
68 SuffixTreeInternalNode(StartIdx, EndIdx, Root);
69 if (Parent)
70 Parent->Children[Edge] = N;
71 return N;
74 SuffixTreeInternalNode *SuffixTree::insertRoot() {
75 return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx,
76 SuffixTreeNode::EmptyIdx, /*Edge = */ 0);
79 void SuffixTree::setSuffixIndices() {
80 // List of nodes we need to visit along with the current length of the
81 // string.
82 SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
84 // Current node being visited.
85 SuffixTreeNode *CurrNode = Root;
87 // Sum of the lengths of the nodes down the path to the current one.
88 unsigned CurrNodeLen = 0;
89 ToVisit.push_back({CurrNode, CurrNodeLen});
90 while (!ToVisit.empty()) {
91 std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
92 ToVisit.pop_back();
93 // Length of the current node from the root down to here.
94 CurrNode->setConcatLen(CurrNodeLen);
95 if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
96 for (auto &ChildPair : InternalNode->Children) {
97 assert(ChildPair.second && "Node had a null child!");
98 ToVisit.push_back(
99 {ChildPair.second,
100 CurrNodeLen + numElementsInSubstring(ChildPair.second)});
102 // No children, so we are at the end of the string.
103 if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
104 LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
108 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
109 SuffixTreeInternalNode *NeedsLink = nullptr;
111 while (SuffixesToAdd > 0) {
113 // Are we waiting to add anything other than just the last character?
114 if (Active.Len == 0) {
115 // If not, then say the active index is the end index.
116 Active.Idx = EndIdx;
119 assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
121 // The first character in the current substring we're looking at.
122 unsigned FirstChar = Str[Active.Idx];
124 // Have we inserted anything starting with FirstChar at the current node?
125 if (Active.Node->Children.count(FirstChar) == 0) {
126 // If not, then we can just insert a leaf and move to the next step.
127 insertLeaf(*Active.Node, EndIdx, FirstChar);
129 // The active node is an internal node, and we visited it, so it must
130 // need a link if it doesn't have one.
131 if (NeedsLink) {
132 NeedsLink->setLink(Active.Node);
133 NeedsLink = nullptr;
135 } else {
136 // There's a match with FirstChar, so look for the point in the tree to
137 // insert a new node.
138 SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
140 unsigned SubstringLen = numElementsInSubstring(NextNode);
142 // Is the current suffix we're trying to insert longer than the size of
143 // the child we want to move to?
144 if (Active.Len >= SubstringLen) {
145 // If yes, then consume the characters we've seen and move to the next
146 // node.
147 assert(isa<SuffixTreeInternalNode>(NextNode) &&
148 "Expected an internal node?");
149 Active.Idx += SubstringLen;
150 Active.Len -= SubstringLen;
151 Active.Node = cast<SuffixTreeInternalNode>(NextNode);
152 continue;
155 // Otherwise, the suffix we're trying to insert must be contained in the
156 // next node we want to move to.
157 unsigned LastChar = Str[EndIdx];
159 // Is the string we're trying to insert a substring of the next node?
160 if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
161 // If yes, then we're done for this step. Remember our insertion point
162 // and move to the next end index. At this point, we have an implicit
163 // suffix tree.
164 if (NeedsLink && !Active.Node->isRoot()) {
165 NeedsLink->setLink(Active.Node);
166 NeedsLink = nullptr;
169 Active.Len++;
170 break;
173 // The string we're trying to insert isn't a substring of the next node,
174 // but matches up to a point. Split the node.
176 // For example, say we ended our search at a node n and we're trying to
177 // insert ABD. Then we'll create a new node s for AB, reduce n to just
178 // representing C, and insert a new leaf node l to represent d. This
179 // allows us to ensure that if n was a leaf, it remains a leaf.
181 // | ABC ---split---> | AB
182 // n s
183 // C / \ D
184 // n l
186 // The node s from the diagram
187 SuffixTreeInternalNode *SplitNode = insertInternalNode(
188 Active.Node, NextNode->getStartIdx(),
189 NextNode->getStartIdx() + Active.Len - 1, FirstChar);
191 // Insert the new node representing the new substring into the tree as
192 // a child of the split node. This is the node l from the diagram.
193 insertLeaf(*SplitNode, EndIdx, LastChar);
195 // Make the old node a child of the split node and update its start
196 // index. This is the node n from the diagram.
197 NextNode->incrementStartIdx(Active.Len);
198 SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
200 // SplitNode is an internal node, update the suffix link.
201 if (NeedsLink)
202 NeedsLink->setLink(SplitNode);
204 NeedsLink = SplitNode;
207 // We've added something new to the tree, so there's one less suffix to
208 // add.
209 SuffixesToAdd--;
211 if (Active.Node->isRoot()) {
212 if (Active.Len > 0) {
213 Active.Len--;
214 Active.Idx = EndIdx - SuffixesToAdd + 1;
216 } else {
217 // Start the next phase at the next smallest suffix.
218 Active.Node = Active.Node->getLink();
222 return SuffixesToAdd;
225 void SuffixTree::RepeatedSubstringIterator::advance() {
226 // Clear the current state. If we're at the end of the range, then this
227 // is the state we want to be in.
228 RS = RepeatedSubstring();
229 N = nullptr;
231 // Each leaf node represents a repeat of a string.
232 SmallVector<unsigned> RepeatedSubstringStarts;
234 // Continue visiting nodes until we find one which repeats more than once.
235 while (!InternalNodesToVisit.empty()) {
236 RepeatedSubstringStarts.clear();
237 auto *Curr = InternalNodesToVisit.back();
238 InternalNodesToVisit.pop_back();
240 // Keep track of the length of the string associated with the node. If
241 // it's too short, we'll quit.
242 unsigned Length = Curr->getConcatLen();
244 // Iterate over each child, saving internal nodes for visiting, and
245 // leaf nodes in LeafChildren. Internal nodes represent individual
246 // strings, which may repeat.
247 for (auto &ChildPair : Curr->Children) {
248 // Save all of this node's children for processing.
249 if (auto *InternalChild =
250 dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) {
251 InternalNodesToVisit.push_back(InternalChild);
252 continue;
255 if (Length < MinLength)
256 continue;
258 // Have an occurrence of a potentially repeated string. Save it.
259 auto *Leaf = cast<SuffixTreeLeafNode>(ChildPair.second);
260 RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
263 // The root never represents a repeated substring. If we're looking at
264 // that, then skip it.
265 if (Curr->isRoot())
266 continue;
268 // Do we have any repeated substrings?
269 if (RepeatedSubstringStarts.size() < 2)
270 continue;
272 // Yes. Update the state to reflect this, and then bail out.
273 N = Curr;
274 RS.Length = Length;
275 for (unsigned StartIdx : RepeatedSubstringStarts)
276 RS.StartIndices.push_back(StartIdx);
277 break;
279 // At this point, either NewRS is an empty RepeatedSubstring, or it was
280 // set in the above loop. Similarly, N is either nullptr, or the node
281 // associated with NewRS.