1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // Implementation of suffix sorting using string sorting algorithms,
18 // i.e., algorithms that do not take advantage of the fact the input
19 // strings consists of suffixes of a single string.
21 // The main algorithm is string quicksort, a.k.a. multikey quicksort,
22 // with two different versions, one for the normal representation and
23 // one for the cached representation (see below). Small sets are
24 // sorted with insertion sort.
26 // The implementation consists of several functions that manipulate
27 // a single array containing the suffixes. On the initial call of
28 // StringsortSuffixes the array looks like this:
29 // ----------???????????????? (?=unsorted suffix -=empty space)
30 // On exit the array looks like this:
31 // ++++++++++++++++---------- (+=sorted suffix in its final place)
33 // During the execution the array looks like this:
34 // ++++++++----------????????
36 // ++++++++------?#?#?#?#???? (#=cached characters)
37 // Each function call sees only a part of the array.
38 // On entry to the function that part looks like this:
42 // When the function call exits, that part looks like this:
45 // Each function has essentially the same arguments as StringsortSuffixes
46 // and the explanations in stringsort.h are valid except for
47 // workarea, suffixes, and suffixes_end, which are pointers identifying
48 // the parts of the array on which the function operates, like this:
52 // workarea suffixes suffixes_end
54 // Besides these three, only common_prefix_length changes during
55 // the execution. It is the length of a prefix shared by all the
56 // suffixes on which the function operates.
57 // The return value is a pointer to the start of the empty space:
63 // As indicated in the figures, there are two representations of the suffixes.
64 // The normal representation '?' is an integer, which is the starting position
65 // of the suffix in the text. The second representation '?#' consists of two
66 // integers. The first one is the starting position, and the second one
67 // is a cache containing characters of the suffix starting from the position
68 // common_prefix_length packed into the integer. A comparison of the caches
69 // is equivalent to comparing sizeof(UnsignedInteger) characters at once.
70 // The functions, whose names include Cache or Caching, operate on the
71 // representation with the cache and other functions operate on the normal
74 // The purpose of caching is to reduce costly cache misses when accessing
75 // the text. The basic string quicksort algorithm accesses characters
76 // of a suffix one at a time and may take a long time between two
77 // accesses. This makes it likely that (the relevant part of) the suffix
78 // has been driven out of the system caches in the mean while.
79 // The caching algorithms, when they access the characters of a suffix,
80 // read multiple consecutive characters at a time and store them into
81 // the algorithmic cache next to the starting position.
83 #ifndef DCSBWT_STRINGSORT_INL_H__
84 #define DCSBWT_STRINGSORT_INL_H__
86 #include "stringsort.h"
87 #include "ternary_partition.h"
91 #include <functional> // for bind2nd, equal_to, greater_equal
92 #include <algorithm> // for copy, copy_backward, find_if, partition
96 namespace stringsort
{
98 // Switch to insertion sort for groups no larger than this.
99 static const int kMaxInsertionSortSize
= 20;
101 // Switch from normal representation to cached representation
102 // only for groups larger or equal to this.
103 static const int kMinCachingStartSize
= 4096;
105 // Switch from cached representation to normal representation
106 // for groups smaller than this (but only at cache refill time).
107 static const int kMinCacheRefillSize
= 400;
109 ////////////////////////////////////////////////////////////////////
110 // Sort a set of suffixes (without caches) using a new
111 // character position, i.e., a new value of common_prefix_length.
112 // This is called for the initial input and every time
113 // common_prefix_length is incremented (when not caching).
114 // It does two special checks:
115 // 1. Has common_prefix_length reached target_prefix_length?
116 // 2. Is there a suffix of length common_prefix_length, i.e.,
117 // a suffix for which the new position is past the end of the text?
118 ////////////////////////////////////////////////////////////////////
119 template <typename CharIterator
, typename UnsignedInteger
,
120 typename FinishedGroupReporter
>
121 UnsignedInteger
* StringsortNewPosition(
122 const CharIterator text
, const CharIterator text_end
,
123 UnsignedInteger
* workarea
,
124 UnsignedInteger
* suffixes
, UnsignedInteger
* suffixes_end
,
125 UnsignedInteger common_prefix_length
,
126 const UnsignedInteger target_prefix_length
,
127 FinishedGroupReporter
& report_finished_group
)
129 const int64 num_suffixes
= suffixes_end
- suffixes
;
130 assert(num_suffixes
> 0);
131 // Optimization for the case of a single suffix
132 if (1 == num_suffixes
) {
133 *workarea
= *suffixes
;
134 report_finished_group(workarea
, workarea
+ 1);
137 // Have we reached target_prefix_length?
138 assert(common_prefix_length
<= target_prefix_length
);
139 if (common_prefix_length
== target_prefix_length
) {
140 if (workarea
< suffixes
) {
141 suffixes_end
= std::copy(suffixes
, suffixes_end
, workarea
);
144 report_finished_group(suffixes
, suffixes_end
);
147 // Handle the suffix of length common_prefix_length if it is there.
148 UnsignedInteger finished_suffix
= (text_end
- text
) - common_prefix_length
;
149 UnsignedInteger
* finished_suffix_position
150 = std::find(suffixes
, suffixes_end
, finished_suffix
);
151 if (finished_suffix_position
!= suffixes_end
) {
152 *finished_suffix_position
= *suffixes
;
154 *workarea
= finished_suffix
;
155 report_finished_group(workarea
, workarea
+ 1);
158 // Let StringsortDispatch do its magic for the remaining suffixes.
159 return StringsortDispatch(text
, text_end
,
160 workarea
, suffixes
, suffixes_end
,
161 common_prefix_length
, target_prefix_length
,
162 report_finished_group
);
165 ///////////////////////////////////////////////////////////////////
166 // Decide which algorithm to use for sorting.
167 // The candidates are:
168 // 1. InsertionSort for small sets.
169 // 2. Caching algorithms if conditions are right.
170 // 3. StringQuicksort otherwise.
171 ///////////////////////////////////////////////////////////////////
172 template <typename CharIterator
, typename UnsignedInteger
,
173 typename FinishedGroupReporter
>
174 UnsignedInteger
* StringsortDispatch(
175 const CharIterator text
, const CharIterator text_end
,
176 UnsignedInteger
* workarea
,
177 UnsignedInteger
* suffixes
, UnsignedInteger
* suffixes_end
,
178 UnsignedInteger common_prefix_length
,
179 const UnsignedInteger target_prefix_length
,
180 FinishedGroupReporter
& report_finished_group
)
182 const int64 num_suffixes
= suffixes_end
- suffixes
;
183 assert(num_suffixes
> 0);
184 assert(common_prefix_length
< target_prefix_length
);
185 // Check that we don't go past the end of the text for any suffix.
186 assert(std::find_if(suffixes
, suffixes_end
,
187 std::bind2nd(std::greater_equal
<UnsignedInteger
>(),
188 (text_end
- text
) - common_prefix_length
))
190 // Optimization for case of a single suffix
191 if (1 == num_suffixes
) {
192 *workarea
= *suffixes
;
193 report_finished_group(workarea
, workarea
+ 1);
196 // Use insertion sort for small sets
197 if (num_suffixes
<= kMaxInsertionSortSize
) {
198 return InsertionSort(text
, text_end
,
199 workarea
, suffixes
, suffixes_end
,
200 common_prefix_length
, target_prefix_length
,
201 report_finished_group
);
203 // Use caching sort if the following conditions are met:
204 // 1. There are enough (kMinCachingStartSize) suffixes to make
205 // caching worthwhile.
206 // 2. There is enough workspace for the caches.
207 // 3. We are still at least one cacheful away from reaching
208 // target_prefix_length.
209 if (num_suffixes
>= kMinCachingStartSize
210 && suffixes
- workarea
>= num_suffixes
211 && target_prefix_length
- common_prefix_length
>= sizeof(UnsignedInteger
))
213 return StartCachingAndSort(text
, text_end
,
214 workarea
, suffixes
, suffixes_end
,
215 common_prefix_length
, target_prefix_length
,
216 report_finished_group
);
218 // Otherwise use the regular string quicksort
219 return StringQuicksort(text
, text_end
,
220 workarea
, suffixes
, suffixes_end
,
221 common_prefix_length
, target_prefix_length
,
222 report_finished_group
);
225 ///////////////////////////////////////////////////////////////////
226 // Functor for comparing two suffixes.
227 // Used by InsertionSort.
228 ///////////////////////////////////////////////////////////////////
229 template <typename CharIterator
, typename UnsignedInteger
>
230 class SuffixLess
: public std::binary_function
<bool,
234 SuffixLess(CharIterator text
, CharIterator text_end
,
235 UnsignedInteger common_prefix_length
,
236 UnsignedInteger target_prefix_length
)
237 : text_plus_prefix_(text
+common_prefix_length
), text_end_(text_end
),
238 distance_to_target_(target_prefix_length
- common_prefix_length
) {}
239 bool operator() (UnsignedInteger a
, UnsignedInteger b
) const {
240 CharIterator a_iter
= text_plus_prefix_
+ a
;
241 CharIterator b_iter
= text_plus_prefix_
+ b
;
242 assert(text_end_
- a_iter
>= 0);
243 assert(text_end_
- b_iter
>= 0);
244 const UnsignedInteger a_length
= text_end_
- a_iter
;
245 const UnsignedInteger b_length
= text_end_
- b_iter
;
246 // The comparison of a and b stops after limit characters
247 // if no mismatch has been found by then.
248 // This happens when the algorithm reaches the end of a,
249 // the end of b or target_prefix_length, whichever comes first.
250 UnsignedInteger limit
251 = std::min
<UnsignedInteger
>(distance_to_target_
, b_length
);
252 bool result_at_limit
= false;
253 if (a_length
< limit
) {
255 result_at_limit
= true;
257 for (; limit
; --limit
) {
258 unsigned char a_ch
= *a_iter
;
259 unsigned char b_ch
= *b_iter
;
260 if (a_ch
!= b_ch
) return (a_ch
< b_ch
);
263 return result_at_limit
;
266 const CharIterator text_plus_prefix_
;
267 const CharIterator text_end_
;
268 const UnsignedInteger distance_to_target_
;
271 /////////////////////////////////////////////////////////////////
272 // Sort a (small) set of suffixes using standard insertion sort.
273 /////////////////////////////////////////////////////////////////
274 template <typename CharIterator
, typename UnsignedInteger
,
275 typename FinishedGroupReporter
>
276 UnsignedInteger
* InsertionSort(
277 const CharIterator text
, const CharIterator text_end
,
278 UnsignedInteger
* workarea
,
279 UnsignedInteger
* suffixes
, UnsignedInteger
* suffixes_end
,
280 UnsignedInteger common_prefix_length
,
281 const UnsignedInteger target_prefix_length
,
282 FinishedGroupReporter
& report_finished_group
)
284 const int64 num_suffixes
= suffixes_end
- suffixes
;
285 assert(num_suffixes
> 0);
286 // Optimization for case of a single suffix
287 if (1 == num_suffixes
) {
288 *workarea
= *suffixes
;
289 report_finished_group(workarea
, workarea
+ 1);
292 // The actual insertion sort starts.
293 // The suffixes will be moved from the end of the workarea
294 // to the beginning of the workarea during sorting like this:
295 // --------???? => ++------?? => ++++--------
296 SuffixLess
<CharIterator
,UnsignedInteger
> suffix_less(
298 common_prefix_length
, target_prefix_length
);
299 UnsignedInteger
* begin
= workarea
;
300 *begin
= *suffixes
++;
301 UnsignedInteger
* end
= workarea
+ 1;
302 while (suffixes
!= suffixes_end
) {
303 UnsignedInteger suffix
= *suffixes
++;
304 UnsignedInteger
* insertion_point
= end
;
305 while (insertion_point
> begin
306 && suffix_less(suffix
, *(insertion_point
-1))) {
307 *insertion_point
= *(insertion_point
-1);
310 *insertion_point
= suffix
;
313 assert(end
- begin
== num_suffixes
);
314 // The insertion sort has finished.
315 // Now [begin,end) contains the suffixes in order.
316 // We still have to report the groups.
317 if (NullFinishedGroupReporter
<UnsignedInteger
>() != report_finished_group
) {
318 for (UnsignedInteger
* group_end
= begin
+ 1;
319 group_end
< end
; ++group_end
) {
320 if (suffix_less(*begin
, *group_end
)) {
321 report_finished_group(begin
, group_end
);
325 report_finished_group(begin
, end
);
330 ////////////////////////////////////////////////////////////////
331 // Functor for obtaining the character at position common_prefix_length
333 // Used by StringQuicksort.
334 ////////////////////////////////////////////////////////////////
335 template <typename CharIterator
, typename UnsignedInteger
>
336 class SuffixChar
: public std::unary_function
<UnsignedInteger
,unsigned char> {
338 SuffixChar(CharIterator text
, UnsignedInteger common_prefix_length
)
339 : text_plus_prefix_(text
+ common_prefix_length
) {}
340 unsigned char operator() (UnsignedInteger suffix
) const {
341 return text_plus_prefix_
[suffix
];
344 CharIterator text_plus_prefix_
;
347 /////////////////////////////////////////////////////////////////
348 // Sort suffixes using string quicksort, a.k.a. multikey quicksort.
349 // Performs a ternary partition of the set using the characters
350 // at position common_prefix_length, the first position after the
351 // shared prefix. Recurses on each of the three parts.
352 /////////////////////////////////////////////////////////////////
353 template <typename CharIterator
, typename UnsignedInteger
,
354 typename FinishedGroupReporter
>
355 UnsignedInteger
* StringQuicksort(
356 const CharIterator text
, const CharIterator text_end
,
357 UnsignedInteger
* workarea
,
358 UnsignedInteger
* suffixes
, UnsignedInteger
* suffixes_end
,
359 UnsignedInteger common_prefix_length
,
360 const UnsignedInteger target_prefix_length
,
361 FinishedGroupReporter
& report_finished_group
)
363 assert(suffixes_end
- suffixes
> 0);
364 SuffixChar
<CharIterator
,UnsignedInteger
>
365 suffix_key(text
, common_prefix_length
);
366 UnsignedInteger
* pivot
= ChoosePivot(suffixes
, suffixes_end
, suffix_key
);
367 std::pair
<UnsignedInteger
*,UnsignedInteger
*> equal_part
368 = TernaryPartition(suffixes
, suffixes_end
, pivot
, suffix_key
);
369 UnsignedInteger
* equal_part_begin
= equal_part
.first
;
370 UnsignedInteger
* equal_part_end
= equal_part
.second
;
371 if (suffixes
< equal_part_begin
) {
372 workarea
= StringsortDispatch(text
, text_end
,
373 workarea
, suffixes
, equal_part_begin
,
374 common_prefix_length
, target_prefix_length
,
375 report_finished_group
);
377 assert(equal_part_end
- equal_part_begin
> 0);
378 workarea
= StringsortNewPosition(text
, text_end
,
379 workarea
, equal_part_begin
, equal_part_end
,
380 common_prefix_length
+ 1,
381 target_prefix_length
,
382 report_finished_group
);
383 if (equal_part_end
< suffixes_end
) {
384 workarea
= StringsortDispatch(text
, text_end
,
385 workarea
, equal_part_end
, suffixes_end
,
386 common_prefix_length
, target_prefix_length
,
387 report_finished_group
);
392 ////////////////////////////////////////////////////////////
393 // The representation of suffixes when caching is used.
394 ////////////////////////////////////////////////////////////
395 template <typename UnsignedInteger
>
396 struct CachedSuffix
{
397 UnsignedInteger suffix
;
398 UnsignedInteger cache
;
399 struct KeyGetter
: public
400 std::unary_function
<CachedSuffix
<UnsignedInteger
>,UnsignedInteger
> {
401 UnsignedInteger
operator() (
402 CachedSuffix
<UnsignedInteger
> suffix
) const {
406 bool operator>= (const CachedSuffix
<UnsignedInteger
>& other
) const {
407 return this->suffix
>= other
.suffix
;
411 /////////////////////////////////////////////////////////////
412 // Fill the cache of all suffixes with new characters,
413 // i.e., with the characters in positions
414 // [common_prefix_length, common_prefix_length + sizeof(UnsignedInteger))
415 /////////////////////////////////////////////////////////////
416 template <typename CharIterator
, typename UnsignedInteger
>
418 const CharIterator text
, const CharIterator text_end
,
419 CachedSuffix
<UnsignedInteger
>* suffixes
,
420 CachedSuffix
<UnsignedInteger
>* suffixes_end
,
421 const UnsignedInteger common_prefix_length
)
423 unsigned char buffer
[sizeof(UnsignedInteger
)];
424 for (CachedSuffix
<UnsignedInteger
>* suffix
= suffixes
;
425 suffix
< suffixes_end
; ++suffix
) {
426 // Copy characters to a local buffer.
427 CharIterator begin
= text
+ suffix
->suffix
+ common_prefix_length
;
428 CharIterator end
= begin
+ sizeof(UnsignedInteger
);
429 if (text_end
< end
) {
431 memset(buffer
, 0, sizeof(UnsignedInteger
));
433 std::copy(begin
, end
, buffer
);
434 // Pack characters into a local integer in an endianness independent way.
435 UnsignedInteger cache
= buffer
[0];
436 for (int i
= 1; i
< sizeof(UnsignedInteger
); ++i
) {
437 cache
= (cache
<< 8) + buffer
[i
];
439 // Write cache to the array.
440 suffix
->cache
= cache
;
444 ////////////////////////////////////////////////////////////////
445 // Change from the standard representation (---------????)
446 // into the cached representation (-----?#?#?#?#)
448 ////////////////////////////////////////////////////////////////
449 template <typename CharIterator
, typename UnsignedInteger
,
450 typename FinishedGroupReporter
>
451 UnsignedInteger
* StartCachingAndSort(
452 const CharIterator text
, const CharIterator text_end
,
453 UnsignedInteger
* workarea
,
454 UnsignedInteger
* uncached_suffixes
, UnsignedInteger
* uncached_suffixes_end
,
455 UnsignedInteger common_prefix_length
,
456 const UnsignedInteger target_prefix_length
,
457 FinishedGroupReporter
& report_finished_group
)
459 const uint64 num_suffixes
= uncached_suffixes_end
- uncached_suffixes
;
460 assert(num_suffixes
> kMaxInsertionSortSize
);
461 assert(uncached_suffixes
- workarea
>= num_suffixes
);
462 CachedSuffix
<UnsignedInteger
>* cached_suffixes_end
463 = reinterpret_cast<CachedSuffix
<UnsignedInteger
>*>(
464 uncached_suffixes_end
);
465 CachedSuffix
<UnsignedInteger
>* cached_suffixes
466 = cached_suffixes_end
- num_suffixes
;
467 // Copy starting positions to their new places.
468 CachedSuffix
<UnsignedInteger
>* i
= cached_suffixes
;
469 UnsignedInteger
* j
= uncached_suffixes
;
470 while (j
< uncached_suffixes_end
) {
471 (i
++)->suffix
= *j
++;
473 assert(i
== cached_suffixes_end
);
475 FillCache(text
, text_end
,
476 cached_suffixes
, cached_suffixes_end
, common_prefix_length
);
477 // Sort with string quicksort.
478 return StringQuicksortWithCache(text
, text_end
,
480 cached_suffixes
, cached_suffixes_end
,
481 common_prefix_length
, target_prefix_length
,
482 report_finished_group
);
485 ////////////////////////////////////////////////////////////////
486 // Change from the cached representation (?#?#?#?#)
487 // into the standard representation (----????)
488 // Returns the range containing the uncached suffixes.
489 ////////////////////////////////////////////////////////////////
490 template <typename UnsignedInteger
>
491 std::pair
<UnsignedInteger
*,UnsignedInteger
*> RemoveCache(
492 CachedSuffix
<UnsignedInteger
>* cached_suffixes
,
493 CachedSuffix
<UnsignedInteger
>* cached_suffixes_end
)
495 assert(cached_suffixes_end
- cached_suffixes
> 0);
496 UnsignedInteger
* uncached_suffixes_end
497 = reinterpret_cast<UnsignedInteger
*>(cached_suffixes_end
);
498 UnsignedInteger
* uncached_suffixes
= uncached_suffixes_end
;
500 *--uncached_suffixes
= (--cached_suffixes_end
)->suffix
;
501 } while (cached_suffixes_end
> cached_suffixes
);
502 return std::make_pair(uncached_suffixes
, uncached_suffixes_end
);
505 ////////////////////////////////////////////////////////////////
506 // Refill caches if appropriate and sort.
507 // Before refilling, handle various special cases.
508 ////////////////////////////////////////////////////////////////
509 template <typename CharIterator
, typename UnsignedInteger
,
510 typename FinishedGroupReporter
>
511 UnsignedInteger
* RefillCacheAndSort(
512 const CharIterator text
, const CharIterator text_end
,
513 UnsignedInteger
* workarea
,
514 CachedSuffix
<UnsignedInteger
>* suffixes
,
515 CachedSuffix
<UnsignedInteger
>* suffixes_end
,
516 UnsignedInteger common_prefix_length
,
517 const UnsignedInteger target_prefix_length
,
518 FinishedGroupReporter
& report_finished_group
)
520 const uint64 num_suffixes
= suffixes_end
- suffixes
;
521 assert(num_suffixes
> 0);
522 assert(common_prefix_length
<= target_prefix_length
);
523 // optimization for the case of one suffix
524 if (num_suffixes
== 1) {
525 *workarea
= suffixes
->suffix
;
526 report_finished_group(workarea
, workarea
+ 1);
529 // Find suffixes, whose end we have reached, i.e., for who the position
530 // common_prefix_length is beyond the end of the text.
531 UnsignedInteger first_finished_suffix
532 = (text_end
- text
) - common_prefix_length
;
533 CachedSuffix
<UnsignedInteger
> finished_suffix
;
534 finished_suffix
.suffix
= first_finished_suffix
;
535 finished_suffix
.cache
= 0;
536 CachedSuffix
<UnsignedInteger
>* cached_finished_end
537 = std::partition(suffixes
, suffixes_end
,
538 std::bind2nd(std::greater_equal
<CachedSuffix
<UnsignedInteger
> >(),
540 // If we found finished suffixes, handle them separately
541 if (cached_finished_end
> suffixes
) {
542 CachedSuffix
<UnsignedInteger
>* cached_finished_begin
= suffixes
;
543 suffixes
= cached_finished_end
;
544 // Now [cached_finished_begin,cached_finished_end) contains
545 // the finished suffixes and [suffixes,suffixes_end) contains
546 // the other suffixes.
548 // Remove caches of the finished suffixes
549 std::pair
<UnsignedInteger
*,UnsignedInteger
*> uncached_finished
550 = RemoveCache(cached_finished_begin
, cached_finished_end
);
551 UnsignedInteger
* finished_begin
= uncached_finished
.first
;
552 UnsignedInteger
* finished_end
= uncached_finished
.second
;
553 // Sort in descending order of starting position.
554 // This is their correct order.
555 std::sort(finished_begin
, finished_end
,
556 std::greater
<UnsignedInteger
>());
557 // Move from the end of the workarea to the beginning of the workarea
558 finished_end
= std::copy(finished_begin
, finished_end
, workarea
);
559 finished_begin
= workarea
;
560 workarea
= finished_end
;
561 // Report the finished suffixes.
562 for (UnsignedInteger
* i
= finished_begin
; i
!= finished_end
; ++i
) {
563 report_finished_group(i
, i
+1);
565 // If there were no unfinished suffixes, we are done.
566 if (suffixes
== suffixes_end
) return workarea
;
568 // If we have reached target_prefix_length, the remaining suffixes
569 // are finished, too. If so, they form a single group.
570 if (common_prefix_length
>= target_prefix_length
) {
571 std::pair
<UnsignedInteger
*,UnsignedInteger
*> uncached
572 = RemoveCache(suffixes
, suffixes_end
);
573 UnsignedInteger
* finished_suffixes
= workarea
;
574 workarea
= std::copy(uncached
.first
, uncached
.second
, workarea
);
575 report_finished_group(finished_suffixes
, workarea
);
578 // Switch to non-caching algorithms if, either there are not
579 // enough suffixes to justify caching, or there are less than
580 // a cacheful of characters left before reaching target_prefix_length.
581 if (num_suffixes
< kMinCacheRefillSize
582 || target_prefix_length
- common_prefix_length
< sizeof(UnsignedInteger
))
584 std::pair
<UnsignedInteger
*,UnsignedInteger
*> uncached_suffixes
585 = RemoveCache(suffixes
, suffixes_end
);
586 if (suffixes_end
- suffixes
<= kMaxInsertionSortSize
) {
587 return InsertionSort(text
, text_end
,
589 uncached_suffixes
.first
, uncached_suffixes
.second
,
590 common_prefix_length
, target_prefix_length
,
591 report_finished_group
);
593 return StringQuicksort(text
, text_end
,
595 uncached_suffixes
.first
, uncached_suffixes
.second
,
596 common_prefix_length
, target_prefix_length
,
597 report_finished_group
);
600 // No more excuses left; refill the cache and sort.
601 FillCache(text
, text_end
,
602 suffixes
, suffixes_end
, common_prefix_length
);
603 return StringQuicksortWithCache(text
, text_end
,
604 workarea
, suffixes
, suffixes_end
,
605 common_prefix_length
, target_prefix_length
,
606 report_finished_group
);
609 ///////////////////////////////////////////////////////////////////
610 // Decide whether to continue cached sorting
611 // or to switch to non-caching insertion sort.
612 ///////////////////////////////////////////////////////////////////
613 template <typename CharIterator
, typename UnsignedInteger
,
614 typename FinishedGroupReporter
>
615 UnsignedInteger
* StringsortDispatchWithCache(
616 const CharIterator text
, const CharIterator text_end
,
617 UnsignedInteger
* workarea
,
618 CachedSuffix
<UnsignedInteger
>* suffixes
,
619 CachedSuffix
<UnsignedInteger
>* suffixes_end
,
620 UnsignedInteger common_prefix_length
,
621 const UnsignedInteger target_prefix_length
,
622 FinishedGroupReporter
& report_finished_group
)
624 const uint64 num_suffixes
= suffixes_end
- suffixes
;
625 assert(num_suffixes
> 0);
626 if (num_suffixes
> kMaxInsertionSortSize
) {
627 return StringQuicksortWithCache(text
, text_end
,
628 workarea
, suffixes
, suffixes_end
,
629 common_prefix_length
, target_prefix_length
,
630 report_finished_group
);
632 std::pair
<UnsignedInteger
*,UnsignedInteger
*> uncached_suffixes
633 = RemoveCache(suffixes
, suffixes_end
);
634 return InsertionSort(text
, text_end
,
636 uncached_suffixes
.first
, uncached_suffixes
.second
,
637 common_prefix_length
, target_prefix_length
,
638 report_finished_group
);
641 ///////////////////////////////////////////////////////////////
642 // String quicksort for the cached representation.
643 // This is the main cached sorting algorithm.
644 ///////////////////////////////////////////////////////////////
645 template <typename CharIterator
, typename UnsignedInteger
,
646 typename FinishedGroupReporter
>
647 UnsignedInteger
* StringQuicksortWithCache(
648 CharIterator text
, CharIterator text_end
,
649 UnsignedInteger
* workarea
,
650 CachedSuffix
<UnsignedInteger
>* suffixes
,
651 CachedSuffix
<UnsignedInteger
>* suffixes_end
,
652 UnsignedInteger common_prefix_length
, UnsignedInteger target_prefix_length
,
653 FinishedGroupReporter
& report_finished_group
)
655 const uint64 num_suffixes
= suffixes_end
- suffixes
;
656 assert(num_suffixes
> 0);
657 typename CachedSuffix
<UnsignedInteger
>::KeyGetter get_key
;
658 CachedSuffix
<UnsignedInteger
>* pivot
659 = ChoosePivot(suffixes
, suffixes_end
, get_key
);
660 std::pair
<CachedSuffix
<UnsignedInteger
>*,CachedSuffix
<UnsignedInteger
>*>
661 equal_part
= TernaryPartition(suffixes
, suffixes_end
, pivot
, get_key
);
662 CachedSuffix
<UnsignedInteger
>* equal_part_begin
= equal_part
.first
;
663 CachedSuffix
<UnsignedInteger
>* equal_part_end
= equal_part
.second
;
664 if (suffixes
< equal_part_begin
) {
665 workarea
= StringsortDispatchWithCache(text
, text_end
,
666 workarea
, suffixes
, equal_part_begin
,
667 common_prefix_length
, target_prefix_length
,
668 report_finished_group
);
670 assert(equal_part_end
- equal_part_begin
> 0);
671 workarea
= RefillCacheAndSort(text
, text_end
,
672 workarea
, equal_part_begin
, equal_part_end
,
673 common_prefix_length
+ sizeof(UnsignedInteger
),
674 target_prefix_length
,
675 report_finished_group
);
676 if (equal_part_end
< suffixes_end
) {
677 workarea
= StringsortDispatchWithCache(text
, text_end
,
678 workarea
, equal_part_end
, suffixes_end
,
679 common_prefix_length
, target_prefix_length
,
680 report_finished_group
);
685 } // namespace stringsort
687 /////////////////////////////////////////////////////////////////////
688 // The interface function, see stringsort.h
689 /////////////////////////////////////////////////////////////////////
690 template <typename CharIterator
, typename UnsignedInteger
,
691 typename FinishedGroupReporter
>
692 UnsignedInteger
* StringsortSuffixes(
693 const CharIterator text
, const CharIterator text_end
,
694 UnsignedInteger
* suffix_area
, UnsignedInteger
* suffixes_in
,
695 UnsignedInteger
* suffix_area_end
,
696 int64 common_prefix_length
,
697 int64 target_prefix_length
,
698 FinishedGroupReporter
& report_finished_group
)
700 assert(text_end
- text
>= 0);
701 assert(suffixes_in
- suffix_area
>= 0);
702 assert(suffix_area_end
- suffixes_in
>= 0);
703 assert(common_prefix_length
>= 0);
704 assert(target_prefix_length
>= common_prefix_length
);
705 // Check that target_prefix_length (and consequently common_prefix_length)
706 // is a value that can be represented by UnsignedInteger.
707 assert(static_cast<int64
>(static_cast<UnsignedInteger
>(
708 target_prefix_length
)) >= target_prefix_length
);
709 if (suffixes_in
== suffix_area_end
) {
712 UnsignedInteger
* suffixes_end
=
713 stringsort::StringsortNewPosition(
715 suffix_area
, suffixes_in
, suffix_area_end
,
716 static_cast<UnsignedInteger
>(common_prefix_length
),
717 static_cast<UnsignedInteger
>(target_prefix_length
),
718 report_finished_group
);
719 assert(suffixes_end
- suffix_area
== suffix_area_end
- suffixes_in
);
723 } // namespace dcsbwt
725 #endif // DCSBWT_STRINGSORT_INL_H__