1 //===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "llvm/Support/SuffixTree.h"
10 #include "gtest/gtest.h"
17 // Each example vector has a unique element at the end to represent the end of
20 // Tests that The SuffixTree finds a simple repetition of the substring {1, 2}
21 // {1, 2} twice in the provided string.
22 TEST(SuffixTreeTest
, TestSingleRepetition
) {
23 std::vector
<unsigned> SimpleRepetitionData
= {1, 2, 1, 2, 3};
24 SuffixTree
ST(SimpleRepetitionData
);
25 std::vector
<SuffixTree::RepeatedSubstring
> SubStrings
;
26 for (auto It
= ST
.begin(); It
!= ST
.end(); It
++)
27 SubStrings
.push_back(*It
);
28 ASSERT_EQ(SubStrings
.size(), 1u);
29 EXPECT_EQ(SubStrings
[0].Length
, 2u);
30 EXPECT_EQ(SubStrings
[0].StartIndices
.size(), 2u);
31 for (unsigned StartIdx
: SubStrings
[0].StartIndices
) {
32 EXPECT_EQ(SimpleRepetitionData
[StartIdx
], 1u);
33 EXPECT_EQ(SimpleRepetitionData
[StartIdx
+ 1], 2u);
37 // Tests that the SuffixTree is able to find the substrings {1, 2, 3} at
38 // at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4.
39 // This test also serves as a flag for improvements to the suffix tree.
41 // FIXME: Right now, the longest repeated substring from a specific index is
42 // returned, this could be improved to return the longest repeated substring, as
43 // well as those that are smaller.
44 TEST(SuffixTreeTest
, TestLongerRepetition
) {
45 std::vector
<unsigned> RepeatedRepetitionData
= {1, 2, 3, 1, 2, 3, 4};
46 SuffixTree
ST(RepeatedRepetitionData
);
47 std::vector
<SuffixTree::RepeatedSubstring
> SubStrings
;
48 for (auto It
= ST
.begin(); It
!= ST
.end(); It
++)
49 SubStrings
.push_back(*It
);
50 EXPECT_EQ(SubStrings
.size(), 2u);
52 for (SuffixTree::RepeatedSubstring
&RS
: SubStrings
) {
54 bool IsExpectedLen
= (Len
== 3u || Len
== 2u);
56 ASSERT_TRUE(IsExpectedLen
);
59 for (unsigned StartIdx
: RS
.StartIndices
) {
60 IsExpectedIndex
= (StartIdx
== 0u || StartIdx
== 3u);
61 EXPECT_TRUE(IsExpectedIndex
);
62 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
], 1u);
63 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 1], 2u);
64 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 2], 3u);
67 for (unsigned StartIdx
: RS
.StartIndices
) {
68 IsExpectedIndex
= (StartIdx
== 1u || StartIdx
== 4u);
69 EXPECT_TRUE(IsExpectedIndex
);
70 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
], 2u);
71 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 1], 3u);
77 // Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at
80 // FIXME: Add support for detecting {1, 1} and {1, 1, 1}
81 // See Test TestSingleCharacterRepeatWithLeafDescendants for the fix
82 TEST(SuffixTreeTest
, TestSingleCharacterRepeat
) {
83 std::vector
<unsigned> RepeatedRepetitionData
= {1, 1, 1, 1, 1, 1, 2};
84 std::vector
<unsigned>::iterator RRDIt
, RRDIt2
;
85 SuffixTree
ST(RepeatedRepetitionData
);
86 std::vector
<SuffixTree::RepeatedSubstring
> SubStrings
;
87 for (auto It
= ST
.begin(); It
!= ST
.end(); It
++)
88 SubStrings
.push_back(*It
);
89 EXPECT_EQ(SubStrings
.size(), 1u);
90 for (SuffixTree::RepeatedSubstring
&RS
: SubStrings
) {
91 EXPECT_EQ(RS
.StartIndices
.size(),
92 RepeatedRepetitionData
.size() - RS
.Length
);
93 for (unsigned StartIdx
: SubStrings
[0].StartIndices
) {
94 RRDIt
= RRDIt2
= RepeatedRepetitionData
.begin();
95 std::advance(RRDIt
, StartIdx
);
96 std::advance(RRDIt2
, StartIdx
+ SubStrings
[0].Length
);
98 all_of(make_range
<std::vector
<unsigned>::iterator
>(RRDIt
, RRDIt2
),
99 [](unsigned Elt
) { return Elt
== 1; }));
104 // Tests that the SuffixTree is able to find the following substrings:
105 // {1, 1} at indices 0, 1, 2, 3, and 4;
106 // {1, 1, 1} at indices 0, 1, 2, and 3;
107 // {1, 1, 1, 1} at indices 0, 1, and 2; and
108 // {1, 1, 1, 1, 1} at indices 0 and 1.
110 // This is the fix for TestSingleCharacterRepeat.
111 TEST(SuffixTreeTest
, TestSingleCharacterRepeatWithLeafDescendants
) {
112 std::vector
<unsigned> RepeatedRepetitionData
= {1, 1, 1, 1, 1, 1, 2};
113 std::vector
<unsigned>::iterator RRDIt
, RRDIt2
;
114 SuffixTree
ST(RepeatedRepetitionData
, /*OutlinerLeafDescendants=*/true);
115 std::vector
<SuffixTree::RepeatedSubstring
> SubStrings
;
116 for (auto It
= ST
.begin(); It
!= ST
.end(); It
++)
117 SubStrings
.push_back(*It
);
118 EXPECT_EQ(SubStrings
.size(), 4u);
119 for (SuffixTree::RepeatedSubstring
&RS
: SubStrings
) {
120 EXPECT_EQ(RS
.StartIndices
.size(),
121 RepeatedRepetitionData
.size() - RS
.Length
);
122 for (unsigned StartIdx
: SubStrings
[0].StartIndices
) {
123 RRDIt
= RRDIt2
= RepeatedRepetitionData
.begin();
124 std::advance(RRDIt
, StartIdx
);
125 std::advance(RRDIt2
, StartIdx
+ SubStrings
[0].Length
);
127 all_of(make_range
<std::vector
<unsigned>::iterator
>(RRDIt
, RRDIt2
),
128 [](unsigned Elt
) { return Elt
== 1; }));
133 // The suffix tree cannot currently find repeated substrings in strings of the
134 // form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem
137 // FIXME: Teach the SuffixTree to recognize these cases.
138 TEST(SuffixTreeTest
, TestTandemRepeat
) {
139 std::vector
<unsigned> RepeatedRepetitionData
= {1, 2, 3, 1, 2, 3};
140 SuffixTree
ST(RepeatedRepetitionData
);
141 std::vector
<SuffixTree::RepeatedSubstring
> SubStrings
;
142 for (auto It
= ST
.begin(); It
!= ST
.end(); It
++)
143 SubStrings
.push_back(*It
);
144 EXPECT_EQ(SubStrings
.size(), 0u);
147 // Tests that the SuffixTree does not erroneously include values that are not
148 // in repeated substrings. That is, only finds {1, 1} at indices 0 and 3 and
149 // does not include 2 and 3.
150 TEST(SuffixTreeTest
, TestExclusion
) {
151 std::vector
<unsigned> RepeatedRepetitionData
= {1, 1, 2, 1, 1, 3};
152 std::vector
<unsigned>::iterator RRDIt
, RRDIt2
;
153 SuffixTree
ST(RepeatedRepetitionData
);
154 std::vector
<SuffixTree::RepeatedSubstring
> SubStrings
;
155 for (auto It
= ST
.begin(); It
!= ST
.end(); It
++)
156 SubStrings
.push_back(*It
);
157 EXPECT_EQ(SubStrings
.size(), 1u);
158 bool IsExpectedIndex
;
159 for (SuffixTree::RepeatedSubstring
&RS
: SubStrings
) {
160 for (unsigned StartIdx
: RS
.StartIndices
) {
161 IsExpectedIndex
= (StartIdx
== 0u || StartIdx
== 3u);
162 EXPECT_TRUE(IsExpectedIndex
);
163 RRDIt
= RRDIt2
= RepeatedRepetitionData
.begin();
164 std::advance(RRDIt
, StartIdx
);
165 std::advance(RRDIt2
, StartIdx
+ RS
.Length
);
167 all_of(make_range
<std::vector
<unsigned>::iterator
>(RRDIt
, RRDIt2
),
168 [](unsigned Elt
) { return Elt
== 1; }));
173 // Tests that the SuffixTree is able to find three substrings
174 // {1, 2, 3} at indices 6 and 10;
175 // {2, 3} at indices 7 and 11; and
176 // {1, 2} at indicies 0 and 3.
178 // FIXME: {1, 2} has indices 6 and 10 missing as it is a substring of {1, 2, 3}
179 // See Test TestSubstringRepeatsWithLeafDescendants for the fix
180 TEST(SuffixTreeTest
, TestSubstringRepeats
) {
181 std::vector
<unsigned> RepeatedRepetitionData
= {1, 2, 100, 1, 2, 101, 1,
182 2, 3, 103, 1, 2, 3, 104};
183 SuffixTree
ST(RepeatedRepetitionData
);
184 std::vector
<SuffixTree::RepeatedSubstring
> SubStrings
;
185 for (auto It
= ST
.begin(); It
!= ST
.end(); It
++)
186 SubStrings
.push_back(*It
);
187 EXPECT_EQ(SubStrings
.size(), 3u);
189 for (SuffixTree::RepeatedSubstring
&RS
: SubStrings
) {
191 bool IsExpectedLen
= (Len
== 3u || Len
== 2u);
192 ASSERT_TRUE(IsExpectedLen
);
193 bool IsExpectedIndex
;
195 if (Len
== 3u) { // {1, 2, 3}
196 EXPECT_EQ(RS
.StartIndices
.size(), 2u);
197 for (unsigned StartIdx
: RS
.StartIndices
) {
198 IsExpectedIndex
= (StartIdx
== 6u || StartIdx
== 10u);
199 EXPECT_TRUE(IsExpectedIndex
);
200 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
], 1u);
201 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 1], 2u);
202 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 2], 3u);
205 if (RepeatedRepetitionData
[RS
.StartIndices
[0]] == 1u) { // {1, 2}
206 EXPECT_EQ(RS
.StartIndices
.size(), 2u);
207 for (unsigned StartIdx
: RS
.StartIndices
) {
208 IsExpectedIndex
= (StartIdx
== 0u || StartIdx
== 3u);
209 EXPECT_TRUE(IsExpectedIndex
);
210 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 1], 2u);
213 EXPECT_EQ(RS
.StartIndices
.size(), 2u);
214 for (unsigned StartIdx
: RS
.StartIndices
) {
215 IsExpectedIndex
= (StartIdx
== 7u || StartIdx
== 11u);
216 EXPECT_TRUE(IsExpectedIndex
);
217 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
], 2u);
218 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 1], 3u);
225 // Tests that the SuffixTree is able to find three substrings
226 // {1, 2, 3} at indices 6 and 10;
227 // {2, 3} at indices 7 and 11; and
228 // {1, 2} at indicies 0, 3, 6, and 10.
230 // This is the fix for TestSubstringRepeats
231 TEST(SuffixTreeTest
, TestSubstringRepeatsWithLeafDescendants
) {
232 std::vector
<unsigned> RepeatedRepetitionData
= {1, 2, 100, 1, 2, 101, 1,
233 2, 3, 103, 1, 2, 3, 104};
234 SuffixTree
ST(RepeatedRepetitionData
, /*OutlinerLeafDescendants=*/true);
235 std::vector
<SuffixTree::RepeatedSubstring
> SubStrings
;
236 for (auto It
= ST
.begin(); It
!= ST
.end(); It
++)
237 SubStrings
.push_back(*It
);
238 EXPECT_EQ(SubStrings
.size(), 3u);
240 for (SuffixTree::RepeatedSubstring
&RS
: SubStrings
) {
242 bool IsExpectedLen
= (Len
== 3u || Len
== 2u);
243 ASSERT_TRUE(IsExpectedLen
);
244 bool IsExpectedIndex
;
246 if (Len
== 3u) { // {1, 2, 3}
247 EXPECT_EQ(RS
.StartIndices
.size(), 2u);
248 for (unsigned StartIdx
: RS
.StartIndices
) {
249 IsExpectedIndex
= (StartIdx
== 6u || StartIdx
== 10u);
250 EXPECT_TRUE(IsExpectedIndex
);
251 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
], 1u);
252 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 1], 2u);
253 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 2], 3u);
256 if (RepeatedRepetitionData
[RS
.StartIndices
[0]] == 1u) { // {1, 2}
257 EXPECT_EQ(RS
.StartIndices
.size(), 4u);
258 for (unsigned StartIdx
: RS
.StartIndices
) {
259 IsExpectedIndex
= (StartIdx
== 0u || StartIdx
== 3u ||
260 StartIdx
== 6u || StartIdx
== 10u);
261 EXPECT_TRUE(IsExpectedIndex
);
262 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 1], 2u);
265 EXPECT_EQ(RS
.StartIndices
.size(), 2u);
266 for (unsigned StartIdx
: RS
.StartIndices
) {
267 IsExpectedIndex
= (StartIdx
== 7u || StartIdx
== 11u);
268 EXPECT_TRUE(IsExpectedIndex
);
269 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
], 2u);
270 EXPECT_EQ(RepeatedRepetitionData
[StartIdx
+ 1], 3u);