1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 #include "../src/stringsort-inl.h"
18 #include "../src/stringsort.h"
19 #include "../src/ternary_partition.h" // only for random numbers
20 #include "../src/inttypes.h"
31 using dcsbwt::StringsortSuffixes
;
32 using dcsbwt::ternary_partition::RandomNumberGenerator
;
34 void PermuteRandomly(uint32
* begin
, uint32
* end
, RandomNumberGenerator
* rng
) {
35 int size
= end
- begin
;
36 for (int i
= 0; i
< size
; ++i
) {
37 std::swap(begin
[i
], (begin
+i
)[rng
->Uniform(size
-i
)]);
41 class SortedGroupVerifier
{
43 SortedGroupVerifier(const unsigned char* text
, const unsigned char* text_end
,
44 uint32
* suffixes
, uint32
* suffixes_end
,
46 : text_(text
), text_length_(text_end
- text
),
47 prefix_length_(prefix_length
),
49 suffixes_end_(suffixes_end
),
50 previous_begin_(suffixes
), previous_end_(suffixes
),
51 report_count_(text_end
- text
+ 1, 0) {}
53 previous_begin_
= suffixes_
;
54 previous_end_
= suffixes_
;
55 std::fill(report_count_
.begin(), report_count_
.end(), 0);
58 assert(previous_end_
- suffixes_
== suffixes_end_
- suffixes_
);
59 for (const uint32
* it
= suffixes_
; it
< suffixes_end_
; ++it
) {
60 assert(report_count_
[*it
] == 1);
63 void operator() (const uint32
* begin
, const uint32
* end
) {
64 assert(begin
- suffixes_
== previous_end_
- suffixes_
);
65 assert(suffixes_end_
- suffixes_
>= end
- suffixes_
);
66 const int size
= end
- begin
;
68 assert(*begin
<= text_length_
);
69 for (const uint32
* it
= begin
; it
< end
; ++it
) {
72 // Check that all group suffixes share a prefix of length prefix_length_
74 int full_length_suffix
= text_length_
- prefix_length_
;
75 assert(*begin
<= full_length_suffix
);
76 for (const uint32
* it
= begin
+ 1; it
< end
; ++it
) {
77 assert(*it
<= full_length_suffix
);
78 for (int i
= 0; i
< prefix_length_
; ++i
) {
79 assert(text_
[*begin
+ i
] == text_
[*it
+ i
]);
83 // Check that this group's suffixes are larger than previous group's
84 // First check whether there is a previous group.
85 if (previous_begin_
< previous_end_
) {
86 // Compare the first elements of the groups.
87 int limit
= prefix_length_
;
88 limit
= std::min
<int>(limit
, text_length_
- *previous_begin_
);
89 limit
= std::min
<int>(limit
, text_length_
- *begin
);
91 for (i
= 0; i
< limit
; ++i
)
92 if (text_
[*previous_begin_
+ i
] != text_
[*begin
+ i
]) break;
93 assert(i
< text_length_
- *begin
);
95 assert(text_
[*previous_begin_
+ i
] < text_
[*begin
+ i
]);
96 } else if (i
< text_length_
- *previous_begin_
) {
97 // The two groups share a prefix of length prefix_length_.
98 // Now we must compare all-against-all all the way.
99 for (const uint32
* it1
= previous_begin_
; it1
< previous_end_
; ++it1
) {
100 for (const uint32
* it2
= begin
; it2
< end
; ++it2
) {
101 int length1
= text_length_
- *it1
;
102 int length2
= text_length_
- *it2
;
103 limit
= std::min
<int>(length1
, length2
);
104 for (i
= 0; i
< limit
; ++i
)
105 if (text_
[*it1
+ i
] != text_
[*it2
+ i
]) break;
108 assert(text_
[*it1
+ i
] < text_
[*it2
+ i
]);
114 previous_begin_
= begin
;
118 const unsigned char* const text_
;
119 const int text_length_
;
120 const int prefix_length_
;
121 const uint32
* const suffixes_
;
122 const uint32
* const suffixes_end_
;
123 const uint32
* previous_begin_
;
124 const uint32
* previous_end_
;
125 std::vector
<int> report_count_
;
128 void SortAndVerify(const std::string
& str
,
129 uint32 target_prefix_length
) {
130 int text_length
= str
.size();
131 std::string::const_iterator text
= str
.begin();
132 std::string::const_iterator text_end
= text
+ text_length
;
134 int num_suffixes
= text_length
+ 1;
135 std::vector
<uint32
> suffix_array(2 * num_suffixes
);
136 for (int i
= 0; i
< num_suffixes
; ++i
) suffix_array
[i
] = i
;
137 uint32
* suffix_area
= &suffix_array
[0];
138 uint32
* suffixes_end
= suffix_area
+ num_suffixes
;
139 uint32
* full_area_end
= suffix_area
+ suffix_array
.size();
140 uint32
* small_area_end
= suffixes_end
+ (full_area_end
- suffixes_end
)/4;
142 const unsigned char* text1
143 = reinterpret_cast<const unsigned char*>(str
.data());
144 const unsigned char* text_end1
= text1
+ text_length
;
145 SortedGroupVerifier
verifier(text1
, text_end1
, suffix_area
, suffixes_end
,
146 target_prefix_length
);
147 RandomNumberGenerator rng
;
149 PermuteRandomly(suffix_area
, suffixes_end
, &rng
);
152 StringsortSuffixes(text
, text_end
,
153 suffix_area
, suffix_area
, suffixes_end
,
154 0, target_prefix_length
,
156 assert(result
== suffixes_end
);
157 verifier
.FinalCheck();
159 PermuteRandomly(suffix_area
, suffixes_end
, &rng
);
161 uint32
* suffixes_in
=
162 std::copy_backward(suffix_area
, suffixes_end
, small_area_end
);
164 StringsortSuffixes(text
, text_end
,
165 suffix_area
, suffixes_in
, small_area_end
,
166 0, target_prefix_length
,
168 assert(result
== suffixes_end
);
169 verifier
.FinalCheck();
171 PermuteRandomly(suffix_area
, suffixes_end
, &rng
);
174 std::copy_backward(suffix_area
, suffixes_end
, full_area_end
);
176 StringsortSuffixes(text
, text_end
,
177 suffix_area
, suffixes_in
, full_area_end
,
178 0U, target_prefix_length
,
180 assert(result
== suffixes_end
);
181 verifier
.FinalCheck();
186 SortAndVerify(str
, 1);
187 std::cout
<< "TestEmpty passed" << std::endl
;
190 void TestLengthOne() {
191 std::string
str1("\0");
192 SortAndVerify(str1
, 1);
193 std::string str2
= "\xFF";
194 SortAndVerify(str2
, 1);
195 std::cout
<< "TestLengthOne passed" << std::endl
;
199 std::string
str("banana");
200 SortAndVerify(str
, 6);
201 SortAndVerify(str
, 3);
202 std::cout
<< "TestBanana passed" << std::endl
;
205 char const * const dna_512
=
206 "CACTGTGCCATCATCATCACCACCACTGTCATTATCACCACCACCATCATCACCAACACCACTG"
207 "CCATCGTCATCACCACCACTGTCATTATCACCACCACCATCACCAACATCACCACCACCATTAT"
208 "CACCACCATCAACACCACCACCCCCATCATCATCATCACTACTACCATCATTACCAGCACCACC"
209 "ACCACTATCACCACCACCACCACAATCACCATCACCACTATCATCAACATCATCACTACCACCA"
210 "TCACCAACACCACCATCATTATCACCACCACCACCATCACCAACATCACCACCATCATCATCAC"
211 "CACCATCACCAAGACCATCATCATCACCATCACCACCAACATCACCACCATCACCAACACCACC"
212 "ATCACCACCACCACCACCATCATCACCACCACCACCATCATCATCACCACCACCGCCATCATCA"
213 "TCGCCACCACCATGACCACCACCATCACAACCATCACCACCATCACAACCACCATCATCACTAT";
216 std::string
str(dna_512
);
217 SortAndVerify(str
, 512);
218 SortAndVerify(str
, 3);
219 std::cout
<< "TestDNA passed" << std::endl
;
222 char const * const aaa_256
=
223 "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
224 "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
225 "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
226 "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
229 std::string
str(aaa_256
);
230 SortAndVerify(str
, 256);
231 SortAndVerify(str
, 13);
232 std::cout
<< "TestAAAA passed" << std::endl
;
237 std::string
str("AAAAAAAAB");
238 SortAndVerify(str
, 9);
239 std::cout
<< "TestAAAB passed" << std::endl
;
242 void TestRandomLargeAlphabet() {
243 std::string
str(1<<14, '\0');
244 RandomNumberGenerator rng
;
245 for (std::string::iterator i
= str
.begin(); i
!= str
.end(); ++i
) {
246 *i
= rng
.Uniform(256);
248 SortAndVerify(str
, 1<<14);
249 std::cout
<< "TestRandomLargeAlphabet passed" << std::endl
;
252 void TestRandomSmallAlphabet() {
253 std::string
str(1<<14, '\0');
254 RandomNumberGenerator rng
;
255 for (std::string::iterator i
= str
.begin(); i
!= str
.end(); ++i
) {
256 *i
= 65 + rng
.Uniform(2);
258 SortAndVerify(str
, 1<<14);
259 std::cout
<< "TestRandomSmallAlphabet passed" << std::endl
;
262 void TestRandomLimited() {
263 std::string
str(1<<14, '\0');
264 RandomNumberGenerator rng
;
265 for (std::string::iterator i
= str
.begin(); i
!= str
.end(); ++i
) {
266 *i
= 65 + rng
.Uniform(2);
268 SortAndVerify(str
, 2);
269 std::cout
<< "TestRandomLimited passed" << std::endl
;
272 void TestNoSuffixes() {
273 std::string
str(1<<14, '\0');
274 RandomNumberGenerator rng
;
275 for (std::string::iterator i
= str
.begin(); i
!= str
.end(); ++i
) {
276 *i
= 65 + rng
.Uniform(2);
278 SortAndVerify(str
, 0);
279 std::cout
<< "TestNoSuffixes passed" << std::endl
;
284 int main(int argc
, char **argv
) {
292 TestRandomLargeAlphabet();
293 TestRandomSmallAlphabet();