Bump version to 19.1.0-rc3
[llvm-project.git] / llvm / unittests / Support / SuffixTreeTest.cpp
blobe8393c0a555ef6966ab06fcb55c3ee6291a235e5
1 //===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "llvm/Support/SuffixTree.h"
10 #include "gtest/gtest.h"
11 #include <vector>
13 using namespace llvm;
15 namespace {
17 // Each example vector has a unique element at the end to represent the end of
18 // the string
20 // Tests that The SuffixTree finds a simple repetition of the substring {1, 2}
21 // {1, 2} twice in the provided string.
22 TEST(SuffixTreeTest, TestSingleRepetition) {
23 std::vector<unsigned> SimpleRepetitionData = {1, 2, 1, 2, 3};
24 SuffixTree ST(SimpleRepetitionData);
25 std::vector<SuffixTree::RepeatedSubstring> SubStrings;
26 for (auto It = ST.begin(); It != ST.end(); It++)
27 SubStrings.push_back(*It);
28 ASSERT_EQ(SubStrings.size(), 1u);
29 EXPECT_EQ(SubStrings[0].Length, 2u);
30 EXPECT_EQ(SubStrings[0].StartIndices.size(), 2u);
31 for (unsigned StartIdx : SubStrings[0].StartIndices) {
32 EXPECT_EQ(SimpleRepetitionData[StartIdx], 1u);
33 EXPECT_EQ(SimpleRepetitionData[StartIdx + 1], 2u);
37 // Tests that the SuffixTree is able to find the substrings {1, 2, 3} at
38 // at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4.
39 // This test also serves as a flag for improvements to the suffix tree.
41 // FIXME: Right now, the longest repeated substring from a specific index is
42 // returned, this could be improved to return the longest repeated substring, as
43 // well as those that are smaller.
44 TEST(SuffixTreeTest, TestLongerRepetition) {
45 std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3, 4};
46 SuffixTree ST(RepeatedRepetitionData);
47 std::vector<SuffixTree::RepeatedSubstring> SubStrings;
48 for (auto It = ST.begin(); It != ST.end(); It++)
49 SubStrings.push_back(*It);
50 EXPECT_EQ(SubStrings.size(), 2u);
51 unsigned Len;
52 for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
53 Len = RS.Length;
54 bool IsExpectedLen = (Len == 3u || Len == 2u);
55 bool IsExpectedIndex;
56 ASSERT_TRUE(IsExpectedLen);
58 if (Len == 3u) {
59 for (unsigned StartIdx : RS.StartIndices) {
60 IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
61 EXPECT_TRUE(IsExpectedIndex);
62 EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
63 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
64 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
66 } else {
67 for (unsigned StartIdx : RS.StartIndices) {
68 IsExpectedIndex = (StartIdx == 1u || StartIdx == 4u);
69 EXPECT_TRUE(IsExpectedIndex);
70 EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
71 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
77 // Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at
78 // indices 0 and 1.
80 // FIXME: Add support for detecting {1, 1} and {1, 1, 1}
81 // See Test TestSingleCharacterRepeatWithLeafDescendants for the fix
82 TEST(SuffixTreeTest, TestSingleCharacterRepeat) {
83 std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
84 std::vector<unsigned>::iterator RRDIt, RRDIt2;
85 SuffixTree ST(RepeatedRepetitionData);
86 std::vector<SuffixTree::RepeatedSubstring> SubStrings;
87 for (auto It = ST.begin(); It != ST.end(); It++)
88 SubStrings.push_back(*It);
89 EXPECT_EQ(SubStrings.size(), 1u);
90 for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
91 EXPECT_EQ(RS.StartIndices.size(),
92 RepeatedRepetitionData.size() - RS.Length);
93 for (unsigned StartIdx : SubStrings[0].StartIndices) {
94 RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
95 std::advance(RRDIt, StartIdx);
96 std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
97 ASSERT_TRUE(
98 all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
99 [](unsigned Elt) { return Elt == 1; }));
104 // Tests that the SuffixTree is able to find the following substrings:
105 // {1, 1} at indices 0, 1, 2, 3, and 4;
106 // {1, 1, 1} at indices 0, 1, 2, and 3;
107 // {1, 1, 1, 1} at indices 0, 1, and 2; and
108 // {1, 1, 1, 1, 1} at indices 0 and 1.
110 // This is the fix for TestSingleCharacterRepeat.
111 TEST(SuffixTreeTest, TestSingleCharacterRepeatWithLeafDescendants) {
112 std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
113 std::vector<unsigned>::iterator RRDIt, RRDIt2;
114 SuffixTree ST(RepeatedRepetitionData, /*OutlinerLeafDescendants=*/true);
115 std::vector<SuffixTree::RepeatedSubstring> SubStrings;
116 for (auto It = ST.begin(); It != ST.end(); It++)
117 SubStrings.push_back(*It);
118 EXPECT_EQ(SubStrings.size(), 4u);
119 for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
120 EXPECT_EQ(RS.StartIndices.size(),
121 RepeatedRepetitionData.size() - RS.Length);
122 for (unsigned StartIdx : SubStrings[0].StartIndices) {
123 RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
124 std::advance(RRDIt, StartIdx);
125 std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
126 ASSERT_TRUE(
127 all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
128 [](unsigned Elt) { return Elt == 1; }));
133 // The suffix tree cannot currently find repeated substrings in strings of the
134 // form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem
135 // repeats")
137 // FIXME: Teach the SuffixTree to recognize these cases.
138 TEST(SuffixTreeTest, TestTandemRepeat) {
139 std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3};
140 SuffixTree ST(RepeatedRepetitionData);
141 std::vector<SuffixTree::RepeatedSubstring> SubStrings;
142 for (auto It = ST.begin(); It != ST.end(); It++)
143 SubStrings.push_back(*It);
144 EXPECT_EQ(SubStrings.size(), 0u);
147 // Tests that the SuffixTree does not erroneously include values that are not
148 // in repeated substrings. That is, only finds {1, 1} at indices 0 and 3 and
149 // does not include 2 and 3.
150 TEST(SuffixTreeTest, TestExclusion) {
151 std::vector<unsigned> RepeatedRepetitionData = {1, 1, 2, 1, 1, 3};
152 std::vector<unsigned>::iterator RRDIt, RRDIt2;
153 SuffixTree ST(RepeatedRepetitionData);
154 std::vector<SuffixTree::RepeatedSubstring> SubStrings;
155 for (auto It = ST.begin(); It != ST.end(); It++)
156 SubStrings.push_back(*It);
157 EXPECT_EQ(SubStrings.size(), 1u);
158 bool IsExpectedIndex;
159 for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
160 for (unsigned StartIdx : RS.StartIndices) {
161 IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
162 EXPECT_TRUE(IsExpectedIndex);
163 RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
164 std::advance(RRDIt, StartIdx);
165 std::advance(RRDIt2, StartIdx + RS.Length);
166 EXPECT_TRUE(
167 all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
168 [](unsigned Elt) { return Elt == 1; }));
173 // Tests that the SuffixTree is able to find three substrings
174 // {1, 2, 3} at indices 6 and 10;
175 // {2, 3} at indices 7 and 11; and
176 // {1, 2} at indicies 0 and 3.
178 // FIXME: {1, 2} has indices 6 and 10 missing as it is a substring of {1, 2, 3}
179 // See Test TestSubstringRepeatsWithLeafDescendants for the fix
180 TEST(SuffixTreeTest, TestSubstringRepeats) {
181 std::vector<unsigned> RepeatedRepetitionData = {1, 2, 100, 1, 2, 101, 1,
182 2, 3, 103, 1, 2, 3, 104};
183 SuffixTree ST(RepeatedRepetitionData);
184 std::vector<SuffixTree::RepeatedSubstring> SubStrings;
185 for (auto It = ST.begin(); It != ST.end(); It++)
186 SubStrings.push_back(*It);
187 EXPECT_EQ(SubStrings.size(), 3u);
188 unsigned Len;
189 for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
190 Len = RS.Length;
191 bool IsExpectedLen = (Len == 3u || Len == 2u);
192 ASSERT_TRUE(IsExpectedLen);
193 bool IsExpectedIndex;
195 if (Len == 3u) { // {1, 2, 3}
196 EXPECT_EQ(RS.StartIndices.size(), 2u);
197 for (unsigned StartIdx : RS.StartIndices) {
198 IsExpectedIndex = (StartIdx == 6u || StartIdx == 10u);
199 EXPECT_TRUE(IsExpectedIndex);
200 EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
201 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
202 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
204 } else {
205 if (RepeatedRepetitionData[RS.StartIndices[0]] == 1u) { // {1, 2}
206 EXPECT_EQ(RS.StartIndices.size(), 2u);
207 for (unsigned StartIdx : RS.StartIndices) {
208 IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
209 EXPECT_TRUE(IsExpectedIndex);
210 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
212 } else { // {2, 3}
213 EXPECT_EQ(RS.StartIndices.size(), 2u);
214 for (unsigned StartIdx : RS.StartIndices) {
215 IsExpectedIndex = (StartIdx == 7u || StartIdx == 11u);
216 EXPECT_TRUE(IsExpectedIndex);
217 EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
218 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
225 // Tests that the SuffixTree is able to find three substrings
226 // {1, 2, 3} at indices 6 and 10;
227 // {2, 3} at indices 7 and 11; and
228 // {1, 2} at indicies 0, 3, 6, and 10.
230 // This is the fix for TestSubstringRepeats
231 TEST(SuffixTreeTest, TestSubstringRepeatsWithLeafDescendants) {
232 std::vector<unsigned> RepeatedRepetitionData = {1, 2, 100, 1, 2, 101, 1,
233 2, 3, 103, 1, 2, 3, 104};
234 SuffixTree ST(RepeatedRepetitionData, /*OutlinerLeafDescendants=*/true);
235 std::vector<SuffixTree::RepeatedSubstring> SubStrings;
236 for (auto It = ST.begin(); It != ST.end(); It++)
237 SubStrings.push_back(*It);
238 EXPECT_EQ(SubStrings.size(), 3u);
239 unsigned Len;
240 for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
241 Len = RS.Length;
242 bool IsExpectedLen = (Len == 3u || Len == 2u);
243 ASSERT_TRUE(IsExpectedLen);
244 bool IsExpectedIndex;
246 if (Len == 3u) { // {1, 2, 3}
247 EXPECT_EQ(RS.StartIndices.size(), 2u);
248 for (unsigned StartIdx : RS.StartIndices) {
249 IsExpectedIndex = (StartIdx == 6u || StartIdx == 10u);
250 EXPECT_TRUE(IsExpectedIndex);
251 EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
252 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
253 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
255 } else {
256 if (RepeatedRepetitionData[RS.StartIndices[0]] == 1u) { // {1, 2}
257 EXPECT_EQ(RS.StartIndices.size(), 4u);
258 for (unsigned StartIdx : RS.StartIndices) {
259 IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u ||
260 StartIdx == 6u || StartIdx == 10u);
261 EXPECT_TRUE(IsExpectedIndex);
262 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
264 } else { // {2, 3}
265 EXPECT_EQ(RS.StartIndices.size(), 2u);
266 for (unsigned StartIdx : RS.StartIndices) {
267 IsExpectedIndex = (StartIdx == 7u || StartIdx == 11u);
268 EXPECT_TRUE(IsExpectedIndex);
269 EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
270 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
277 } // namespace