modified: myjupyterlab.sh
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / stringsort-inl.h
blob47b4f45b6824ba8fb551dbf7dce979f252d367b6
1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // Implementation of suffix sorting using string sorting algorithms,
18 // i.e., algorithms that do not take advantage of the fact the input
19 // strings consists of suffixes of a single string.
21 // The main algorithm is string quicksort, a.k.a. multikey quicksort,
22 // with two different versions, one for the normal representation and
23 // one for the cached representation (see below). Small sets are
24 // sorted with insertion sort.
26 // The implementation consists of several functions that manipulate
27 // a single array containing the suffixes. On the initial call of
28 // StringsortSuffixes the array looks like this:
29 // ----------???????????????? (?=unsorted suffix -=empty space)
30 // On exit the array looks like this:
31 // ++++++++++++++++---------- (+=sorted suffix in its final place)
33 // During the execution the array looks like this:
34 // ++++++++----------????????
35 // or this:
36 // ++++++++------?#?#?#?#???? (#=cached characters)
37 // Each function call sees only a part of the array.
38 // On entry to the function that part looks like this:
39 // ----------????
40 // or this:
41 // ------?#?#?#?#
42 // When the function call exits, that part looks like this:
43 // ++++----------
45 // Each function has essentially the same arguments as StringsortSuffixes
46 // and the explanations in stringsort.h are valid except for
47 // workarea, suffixes, and suffixes_end, which are pointers identifying
48 // the parts of the array on which the function operates, like this:
49 // ----------????
50 // ^ ^ ^
51 // | | |
52 // workarea suffixes suffixes_end
54 // Besides these three, only common_prefix_length changes during
55 // the execution. It is the length of a prefix shared by all the
56 // suffixes on which the function operates.
57 // The return value is a pointer to the start of the empty space:
58 // ++++----------
59 // ^
60 // |
61 // return value
63 // As indicated in the figures, there are two representations of the suffixes.
64 // The normal representation '?' is an integer, which is the starting position
65 // of the suffix in the text. The second representation '?#' consists of two
66 // integers. The first one is the starting position, and the second one
67 // is a cache containing characters of the suffix starting from the position
68 // common_prefix_length packed into the integer. A comparison of the caches
69 // is equivalent to comparing sizeof(UnsignedInteger) characters at once.
70 // The functions, whose names include Cache or Caching, operate on the
71 // representation with the cache and other functions operate on the normal
72 // representation.
74 // The purpose of caching is to reduce costly cache misses when accessing
75 // the text. The basic string quicksort algorithm accesses characters
76 // of a suffix one at a time and may take a long time between two
77 // accesses. This makes it likely that (the relevant part of) the suffix
78 // has been driven out of the system caches in the mean while.
79 // The caching algorithms, when they access the characters of a suffix,
80 // read multiple consecutive characters at a time and store them into
81 // the algorithmic cache next to the starting position.
83 #ifndef DCSBWT_STRINGSORT_INL_H__
84 #define DCSBWT_STRINGSORT_INL_H__
86 #include "stringsort.h"
87 #include "ternary_partition.h"
89 #include "inttypes.h"
91 #include <functional> // for bind2nd, equal_to, greater_equal
92 #include <algorithm> // for copy, copy_backward, find_if, partition
93 #include <cassert>
95 namespace dcsbwt {
96 namespace stringsort {
98 // Switch to insertion sort for groups no larger than this.
99 static const int kMaxInsertionSortSize = 20;
101 // Switch from normal representation to cached representation
102 // only for groups larger or equal to this.
103 static const int kMinCachingStartSize = 4096;
105 // Switch from cached representation to normal representation
106 // for groups smaller than this (but only at cache refill time).
107 static const int kMinCacheRefillSize = 400;
109 ////////////////////////////////////////////////////////////////////
110 // Sort a set of suffixes (without caches) using a new
111 // character position, i.e., a new value of common_prefix_length.
112 // This is called for the initial input and every time
113 // common_prefix_length is incremented (when not caching).
114 // It does two special checks:
115 // 1. Has common_prefix_length reached target_prefix_length?
116 // 2. Is there a suffix of length common_prefix_length, i.e.,
117 // a suffix for which the new position is past the end of the text?
118 ////////////////////////////////////////////////////////////////////
119 template <typename CharIterator, typename UnsignedInteger,
120 typename FinishedGroupReporter>
121 UnsignedInteger* StringsortNewPosition(
122 const CharIterator text, const CharIterator text_end,
123 UnsignedInteger* workarea,
124 UnsignedInteger* suffixes, UnsignedInteger* suffixes_end,
125 UnsignedInteger common_prefix_length,
126 const UnsignedInteger target_prefix_length,
127 FinishedGroupReporter& report_finished_group)
129 const int64 num_suffixes = suffixes_end - suffixes;
130 assert(num_suffixes > 0);
131 // Optimization for the case of a single suffix
132 if (1 == num_suffixes) {
133 *workarea = *suffixes;
134 report_finished_group(workarea, workarea + 1);
135 return workarea + 1;
137 // Have we reached target_prefix_length?
138 assert(common_prefix_length <= target_prefix_length);
139 if (common_prefix_length == target_prefix_length) {
140 if (workarea < suffixes) {
141 suffixes_end = std::copy(suffixes, suffixes_end, workarea);
142 suffixes = workarea;
144 report_finished_group(suffixes, suffixes_end);
145 return suffixes_end;
147 // Handle the suffix of length common_prefix_length if it is there.
148 UnsignedInteger finished_suffix = (text_end - text) - common_prefix_length;
149 UnsignedInteger* finished_suffix_position
150 = std::find(suffixes, suffixes_end, finished_suffix);
151 if (finished_suffix_position != suffixes_end) {
152 *finished_suffix_position = *suffixes;
153 ++suffixes;
154 *workarea = finished_suffix;
155 report_finished_group(workarea, workarea + 1);
156 ++workarea;
158 // Let StringsortDispatch do its magic for the remaining suffixes.
159 return StringsortDispatch(text, text_end,
160 workarea, suffixes, suffixes_end,
161 common_prefix_length, target_prefix_length,
162 report_finished_group);
165 ///////////////////////////////////////////////////////////////////
166 // Decide which algorithm to use for sorting.
167 // The candidates are:
168 // 1. InsertionSort for small sets.
169 // 2. Caching algorithms if conditions are right.
170 // 3. StringQuicksort otherwise.
171 ///////////////////////////////////////////////////////////////////
172 template <typename CharIterator, typename UnsignedInteger,
173 typename FinishedGroupReporter>
174 UnsignedInteger* StringsortDispatch(
175 const CharIterator text, const CharIterator text_end,
176 UnsignedInteger* workarea,
177 UnsignedInteger* suffixes, UnsignedInteger* suffixes_end,
178 UnsignedInteger common_prefix_length,
179 const UnsignedInteger target_prefix_length,
180 FinishedGroupReporter& report_finished_group)
182 const int64 num_suffixes = suffixes_end - suffixes;
183 assert(num_suffixes > 0);
184 assert(common_prefix_length < target_prefix_length);
185 // Check that we don't go past the end of the text for any suffix.
186 assert(std::find_if(suffixes, suffixes_end,
187 std::bind2nd(std::greater_equal<UnsignedInteger>(),
188 (text_end - text) - common_prefix_length))
189 == suffixes_end);
190 // Optimization for case of a single suffix
191 if (1 == num_suffixes) {
192 *workarea = *suffixes;
193 report_finished_group(workarea, workarea + 1);
194 return workarea + 1;
196 // Use insertion sort for small sets
197 if (num_suffixes <= kMaxInsertionSortSize) {
198 return InsertionSort(text, text_end,
199 workarea, suffixes, suffixes_end,
200 common_prefix_length, target_prefix_length,
201 report_finished_group);
203 // Use caching sort if the following conditions are met:
204 // 1. There are enough (kMinCachingStartSize) suffixes to make
205 // caching worthwhile.
206 // 2. There is enough workspace for the caches.
207 // 3. We are still at least one cacheful away from reaching
208 // target_prefix_length.
209 if (num_suffixes >= kMinCachingStartSize
210 && suffixes - workarea >= num_suffixes
211 && target_prefix_length - common_prefix_length >= sizeof(UnsignedInteger))
213 return StartCachingAndSort(text, text_end,
214 workarea, suffixes, suffixes_end,
215 common_prefix_length, target_prefix_length,
216 report_finished_group);
218 // Otherwise use the regular string quicksort
219 return StringQuicksort(text, text_end,
220 workarea, suffixes, suffixes_end,
221 common_prefix_length, target_prefix_length,
222 report_finished_group);
225 ///////////////////////////////////////////////////////////////////
226 // Functor for comparing two suffixes.
227 // Used by InsertionSort.
228 ///////////////////////////////////////////////////////////////////
229 template <typename CharIterator, typename UnsignedInteger>
230 class SuffixLess : public std::binary_function<bool,
231 UnsignedInteger,
232 UnsignedInteger> {
233 public:
234 SuffixLess(CharIterator text, CharIterator text_end,
235 UnsignedInteger common_prefix_length,
236 UnsignedInteger target_prefix_length)
237 : text_plus_prefix_(text+common_prefix_length), text_end_(text_end),
238 distance_to_target_(target_prefix_length - common_prefix_length) {}
239 bool operator() (UnsignedInteger a, UnsignedInteger b) const {
240 CharIterator a_iter = text_plus_prefix_ + a;
241 CharIterator b_iter = text_plus_prefix_ + b;
242 assert(text_end_ - a_iter >= 0);
243 assert(text_end_ - b_iter >= 0);
244 const UnsignedInteger a_length = text_end_ - a_iter;
245 const UnsignedInteger b_length = text_end_ - b_iter;
246 // The comparison of a and b stops after limit characters
247 // if no mismatch has been found by then.
248 // This happens when the algorithm reaches the end of a,
249 // the end of b or target_prefix_length, whichever comes first.
250 UnsignedInteger limit
251 = std::min<UnsignedInteger>(distance_to_target_, b_length);
252 bool result_at_limit = false;
253 if (a_length < limit) {
254 limit = a_length;
255 result_at_limit = true;
257 for (; limit; --limit) {
258 unsigned char a_ch = *a_iter;
259 unsigned char b_ch = *b_iter;
260 if (a_ch != b_ch) return (a_ch < b_ch);
261 ++a_iter; ++b_iter;
263 return result_at_limit;
265 private:
266 const CharIterator text_plus_prefix_;
267 const CharIterator text_end_;
268 const UnsignedInteger distance_to_target_;
271 /////////////////////////////////////////////////////////////////
272 // Sort a (small) set of suffixes using standard insertion sort.
273 /////////////////////////////////////////////////////////////////
274 template <typename CharIterator, typename UnsignedInteger,
275 typename FinishedGroupReporter>
276 UnsignedInteger* InsertionSort(
277 const CharIterator text, const CharIterator text_end,
278 UnsignedInteger* workarea,
279 UnsignedInteger* suffixes, UnsignedInteger* suffixes_end,
280 UnsignedInteger common_prefix_length,
281 const UnsignedInteger target_prefix_length,
282 FinishedGroupReporter& report_finished_group)
284 const int64 num_suffixes = suffixes_end - suffixes;
285 assert(num_suffixes > 0);
286 // Optimization for case of a single suffix
287 if (1 == num_suffixes) {
288 *workarea = *suffixes;
289 report_finished_group(workarea, workarea + 1);
290 return workarea + 1;
292 // The actual insertion sort starts.
293 // The suffixes will be moved from the end of the workarea
294 // to the beginning of the workarea during sorting like this:
295 // --------???? => ++------?? => ++++--------
296 SuffixLess<CharIterator,UnsignedInteger> suffix_less(
297 text, text_end,
298 common_prefix_length, target_prefix_length);
299 UnsignedInteger* begin = workarea;
300 *begin = *suffixes++;
301 UnsignedInteger* end = workarea + 1;
302 while (suffixes != suffixes_end) {
303 UnsignedInteger suffix = *suffixes++;
304 UnsignedInteger* insertion_point = end;
305 while (insertion_point > begin
306 && suffix_less(suffix, *(insertion_point-1))) {
307 *insertion_point = *(insertion_point-1);
308 --insertion_point;
310 *insertion_point = suffix;
311 ++end;
313 assert(end - begin == num_suffixes);
314 // The insertion sort has finished.
315 // Now [begin,end) contains the suffixes in order.
316 // We still have to report the groups.
317 if (NullFinishedGroupReporter<UnsignedInteger>() != report_finished_group) {
318 for (UnsignedInteger* group_end = begin + 1;
319 group_end < end; ++group_end) {
320 if (suffix_less(*begin, *group_end)) {
321 report_finished_group(begin, group_end);
322 begin = group_end;
325 report_finished_group(begin, end);
327 return end;
330 ////////////////////////////////////////////////////////////////
331 // Functor for obtaining the character at position common_prefix_length
332 // for each suffix.
333 // Used by StringQuicksort.
334 ////////////////////////////////////////////////////////////////
335 template <typename CharIterator, typename UnsignedInteger>
336 class SuffixChar : public std::unary_function<UnsignedInteger,unsigned char> {
337 public:
338 SuffixChar(CharIterator text, UnsignedInteger common_prefix_length)
339 : text_plus_prefix_(text + common_prefix_length) {}
340 unsigned char operator() (UnsignedInteger suffix) const {
341 return text_plus_prefix_[suffix];
343 private:
344 CharIterator text_plus_prefix_;
347 /////////////////////////////////////////////////////////////////
348 // Sort suffixes using string quicksort, a.k.a. multikey quicksort.
349 // Performs a ternary partition of the set using the characters
350 // at position common_prefix_length, the first position after the
351 // shared prefix. Recurses on each of the three parts.
352 /////////////////////////////////////////////////////////////////
353 template <typename CharIterator, typename UnsignedInteger,
354 typename FinishedGroupReporter>
355 UnsignedInteger* StringQuicksort(
356 const CharIterator text, const CharIterator text_end,
357 UnsignedInteger* workarea,
358 UnsignedInteger* suffixes, UnsignedInteger* suffixes_end,
359 UnsignedInteger common_prefix_length,
360 const UnsignedInteger target_prefix_length,
361 FinishedGroupReporter& report_finished_group)
363 assert(suffixes_end - suffixes > 0);
364 SuffixChar<CharIterator,UnsignedInteger>
365 suffix_key(text, common_prefix_length);
366 UnsignedInteger* pivot = ChoosePivot(suffixes, suffixes_end, suffix_key);
367 std::pair<UnsignedInteger*,UnsignedInteger*> equal_part
368 = TernaryPartition(suffixes, suffixes_end, pivot, suffix_key);
369 UnsignedInteger* equal_part_begin = equal_part.first;
370 UnsignedInteger* equal_part_end = equal_part.second;
371 if (suffixes < equal_part_begin) {
372 workarea = StringsortDispatch(text, text_end,
373 workarea, suffixes, equal_part_begin,
374 common_prefix_length, target_prefix_length,
375 report_finished_group);
377 assert(equal_part_end - equal_part_begin > 0);
378 workarea = StringsortNewPosition(text, text_end,
379 workarea, equal_part_begin, equal_part_end,
380 common_prefix_length + 1,
381 target_prefix_length,
382 report_finished_group);
383 if (equal_part_end < suffixes_end) {
384 workarea = StringsortDispatch(text, text_end,
385 workarea, equal_part_end, suffixes_end,
386 common_prefix_length, target_prefix_length,
387 report_finished_group);
389 return workarea;
392 ////////////////////////////////////////////////////////////
393 // The representation of suffixes when caching is used.
394 ////////////////////////////////////////////////////////////
395 template <typename UnsignedInteger>
396 struct CachedSuffix {
397 UnsignedInteger suffix;
398 UnsignedInteger cache;
399 struct KeyGetter : public
400 std::unary_function<CachedSuffix<UnsignedInteger>,UnsignedInteger> {
401 UnsignedInteger operator() (
402 CachedSuffix<UnsignedInteger> suffix) const {
403 return suffix.cache;
406 bool operator>= (const CachedSuffix<UnsignedInteger>& other) const {
407 return this->suffix >= other.suffix;
411 /////////////////////////////////////////////////////////////
412 // Fill the cache of all suffixes with new characters,
413 // i.e., with the characters in positions
414 // [common_prefix_length, common_prefix_length + sizeof(UnsignedInteger))
415 /////////////////////////////////////////////////////////////
416 template <typename CharIterator, typename UnsignedInteger>
417 void FillCache(
418 const CharIterator text, const CharIterator text_end,
419 CachedSuffix<UnsignedInteger>* suffixes,
420 CachedSuffix<UnsignedInteger>* suffixes_end,
421 const UnsignedInteger common_prefix_length)
423 unsigned char buffer[sizeof(UnsignedInteger)];
424 for (CachedSuffix<UnsignedInteger>* suffix = suffixes;
425 suffix < suffixes_end; ++suffix) {
426 // Copy characters to a local buffer.
427 CharIterator begin = text + suffix->suffix + common_prefix_length;
428 CharIterator end = begin + sizeof(UnsignedInteger);
429 if (text_end < end) {
430 end = text_end;
431 memset(buffer, 0, sizeof(UnsignedInteger));
433 std::copy(begin, end, buffer);
434 // Pack characters into a local integer in an endianness independent way.
435 UnsignedInteger cache = buffer[0];
436 for (int i = 1; i < sizeof(UnsignedInteger); ++i) {
437 cache = (cache << 8) + buffer[i];
439 // Write cache to the array.
440 suffix->cache = cache;
444 ////////////////////////////////////////////////////////////////
445 // Change from the standard representation (---------????)
446 // into the cached representation (-----?#?#?#?#)
447 // and sort.
448 ////////////////////////////////////////////////////////////////
449 template <typename CharIterator, typename UnsignedInteger,
450 typename FinishedGroupReporter>
451 UnsignedInteger* StartCachingAndSort(
452 const CharIterator text, const CharIterator text_end,
453 UnsignedInteger* workarea,
454 UnsignedInteger* uncached_suffixes, UnsignedInteger* uncached_suffixes_end,
455 UnsignedInteger common_prefix_length,
456 const UnsignedInteger target_prefix_length,
457 FinishedGroupReporter& report_finished_group)
459 const uint64 num_suffixes = uncached_suffixes_end - uncached_suffixes;
460 assert(num_suffixes > kMaxInsertionSortSize);
461 assert(uncached_suffixes - workarea >= num_suffixes);
462 CachedSuffix<UnsignedInteger>* cached_suffixes_end
463 = reinterpret_cast<CachedSuffix<UnsignedInteger>*>(
464 uncached_suffixes_end);
465 CachedSuffix<UnsignedInteger>* cached_suffixes
466 = cached_suffixes_end - num_suffixes;
467 // Copy starting positions to their new places.
468 CachedSuffix<UnsignedInteger>* i = cached_suffixes;
469 UnsignedInteger* j = uncached_suffixes;
470 while (j < uncached_suffixes_end) {
471 (i++)->suffix = *j++;
473 assert(i == cached_suffixes_end);
474 // Fill the caches.
475 FillCache(text, text_end,
476 cached_suffixes, cached_suffixes_end, common_prefix_length);
477 // Sort with string quicksort.
478 return StringQuicksortWithCache(text, text_end,
479 workarea,
480 cached_suffixes, cached_suffixes_end,
481 common_prefix_length, target_prefix_length,
482 report_finished_group);
485 ////////////////////////////////////////////////////////////////
486 // Change from the cached representation (?#?#?#?#)
487 // into the standard representation (----????)
488 // Returns the range containing the uncached suffixes.
489 ////////////////////////////////////////////////////////////////
490 template <typename UnsignedInteger>
491 std::pair<UnsignedInteger*,UnsignedInteger*> RemoveCache(
492 CachedSuffix<UnsignedInteger>* cached_suffixes,
493 CachedSuffix<UnsignedInteger>* cached_suffixes_end)
495 assert(cached_suffixes_end - cached_suffixes > 0);
496 UnsignedInteger* uncached_suffixes_end
497 = reinterpret_cast<UnsignedInteger*>(cached_suffixes_end);
498 UnsignedInteger* uncached_suffixes = uncached_suffixes_end;
499 do {
500 *--uncached_suffixes = (--cached_suffixes_end)->suffix;
501 } while (cached_suffixes_end > cached_suffixes);
502 return std::make_pair(uncached_suffixes, uncached_suffixes_end);
505 ////////////////////////////////////////////////////////////////
506 // Refill caches if appropriate and sort.
507 // Before refilling, handle various special cases.
508 ////////////////////////////////////////////////////////////////
509 template <typename CharIterator, typename UnsignedInteger,
510 typename FinishedGroupReporter>
511 UnsignedInteger* RefillCacheAndSort(
512 const CharIterator text, const CharIterator text_end,
513 UnsignedInteger* workarea,
514 CachedSuffix<UnsignedInteger>* suffixes,
515 CachedSuffix<UnsignedInteger>* suffixes_end,
516 UnsignedInteger common_prefix_length,
517 const UnsignedInteger target_prefix_length,
518 FinishedGroupReporter& report_finished_group)
520 const uint64 num_suffixes = suffixes_end - suffixes;
521 assert(num_suffixes > 0);
522 assert(common_prefix_length <= target_prefix_length);
523 // optimization for the case of one suffix
524 if (num_suffixes == 1) {
525 *workarea = suffixes->suffix;
526 report_finished_group(workarea, workarea + 1);
527 return workarea + 1;
529 // Find suffixes, whose end we have reached, i.e., for who the position
530 // common_prefix_length is beyond the end of the text.
531 UnsignedInteger first_finished_suffix
532 = (text_end - text) - common_prefix_length;
533 CachedSuffix<UnsignedInteger> finished_suffix;
534 finished_suffix.suffix = first_finished_suffix;
535 finished_suffix.cache = 0;
536 CachedSuffix<UnsignedInteger>* cached_finished_end
537 = std::partition(suffixes, suffixes_end,
538 std::bind2nd(std::greater_equal<CachedSuffix<UnsignedInteger> >(),
539 finished_suffix));
540 // If we found finished suffixes, handle them separately
541 if (cached_finished_end > suffixes) {
542 CachedSuffix<UnsignedInteger>* cached_finished_begin = suffixes;
543 suffixes = cached_finished_end;
544 // Now [cached_finished_begin,cached_finished_end) contains
545 // the finished suffixes and [suffixes,suffixes_end) contains
546 // the other suffixes.
548 // Remove caches of the finished suffixes
549 std::pair<UnsignedInteger*,UnsignedInteger*> uncached_finished
550 = RemoveCache(cached_finished_begin, cached_finished_end);
551 UnsignedInteger* finished_begin = uncached_finished.first;
552 UnsignedInteger* finished_end = uncached_finished.second;
553 // Sort in descending order of starting position.
554 // This is their correct order.
555 std::sort(finished_begin, finished_end,
556 std::greater<UnsignedInteger>());
557 // Move from the end of the workarea to the beginning of the workarea
558 finished_end = std::copy(finished_begin, finished_end, workarea);
559 finished_begin = workarea;
560 workarea = finished_end;
561 // Report the finished suffixes.
562 for (UnsignedInteger* i = finished_begin; i != finished_end; ++i) {
563 report_finished_group(i, i+1);
565 // If there were no unfinished suffixes, we are done.
566 if (suffixes == suffixes_end) return workarea;
568 // If we have reached target_prefix_length, the remaining suffixes
569 // are finished, too. If so, they form a single group.
570 if (common_prefix_length >= target_prefix_length) {
571 std::pair<UnsignedInteger*,UnsignedInteger*> uncached
572 = RemoveCache(suffixes, suffixes_end);
573 UnsignedInteger* finished_suffixes = workarea;
574 workarea = std::copy(uncached.first, uncached.second, workarea);
575 report_finished_group(finished_suffixes, workarea);
576 return workarea;
578 // Switch to non-caching algorithms if, either there are not
579 // enough suffixes to justify caching, or there are less than
580 // a cacheful of characters left before reaching target_prefix_length.
581 if (num_suffixes < kMinCacheRefillSize
582 || target_prefix_length - common_prefix_length < sizeof(UnsignedInteger))
584 std::pair<UnsignedInteger*,UnsignedInteger*> uncached_suffixes
585 = RemoveCache(suffixes, suffixes_end);
586 if (suffixes_end - suffixes <= kMaxInsertionSortSize) {
587 return InsertionSort(text, text_end,
588 workarea,
589 uncached_suffixes.first, uncached_suffixes.second,
590 common_prefix_length, target_prefix_length,
591 report_finished_group);
592 } else {
593 return StringQuicksort(text, text_end,
594 workarea,
595 uncached_suffixes.first, uncached_suffixes.second,
596 common_prefix_length, target_prefix_length,
597 report_finished_group);
600 // No more excuses left; refill the cache and sort.
601 FillCache(text, text_end,
602 suffixes, suffixes_end, common_prefix_length);
603 return StringQuicksortWithCache(text, text_end,
604 workarea, suffixes, suffixes_end,
605 common_prefix_length, target_prefix_length,
606 report_finished_group);
609 ///////////////////////////////////////////////////////////////////
610 // Decide whether to continue cached sorting
611 // or to switch to non-caching insertion sort.
612 ///////////////////////////////////////////////////////////////////
613 template <typename CharIterator, typename UnsignedInteger,
614 typename FinishedGroupReporter>
615 UnsignedInteger* StringsortDispatchWithCache(
616 const CharIterator text, const CharIterator text_end,
617 UnsignedInteger* workarea,
618 CachedSuffix<UnsignedInteger>* suffixes,
619 CachedSuffix<UnsignedInteger>* suffixes_end,
620 UnsignedInteger common_prefix_length,
621 const UnsignedInteger target_prefix_length,
622 FinishedGroupReporter& report_finished_group)
624 const uint64 num_suffixes = suffixes_end - suffixes;
625 assert(num_suffixes > 0);
626 if (num_suffixes > kMaxInsertionSortSize) {
627 return StringQuicksortWithCache(text, text_end,
628 workarea, suffixes, suffixes_end,
629 common_prefix_length, target_prefix_length,
630 report_finished_group);
632 std::pair<UnsignedInteger*,UnsignedInteger*> uncached_suffixes
633 = RemoveCache(suffixes, suffixes_end);
634 return InsertionSort(text, text_end,
635 workarea,
636 uncached_suffixes.first, uncached_suffixes.second,
637 common_prefix_length, target_prefix_length,
638 report_finished_group);
641 ///////////////////////////////////////////////////////////////
642 // String quicksort for the cached representation.
643 // This is the main cached sorting algorithm.
644 ///////////////////////////////////////////////////////////////
645 template <typename CharIterator, typename UnsignedInteger,
646 typename FinishedGroupReporter>
647 UnsignedInteger* StringQuicksortWithCache(
648 CharIterator text, CharIterator text_end,
649 UnsignedInteger* workarea,
650 CachedSuffix<UnsignedInteger>* suffixes,
651 CachedSuffix<UnsignedInteger>* suffixes_end,
652 UnsignedInteger common_prefix_length, UnsignedInteger target_prefix_length,
653 FinishedGroupReporter& report_finished_group)
655 const uint64 num_suffixes = suffixes_end - suffixes;
656 assert(num_suffixes > 0);
657 typename CachedSuffix<UnsignedInteger>::KeyGetter get_key;
658 CachedSuffix<UnsignedInteger>* pivot
659 = ChoosePivot(suffixes, suffixes_end, get_key);
660 std::pair<CachedSuffix<UnsignedInteger>*,CachedSuffix<UnsignedInteger>*>
661 equal_part = TernaryPartition(suffixes, suffixes_end, pivot, get_key);
662 CachedSuffix<UnsignedInteger>* equal_part_begin = equal_part.first;
663 CachedSuffix<UnsignedInteger>* equal_part_end = equal_part.second;
664 if (suffixes < equal_part_begin) {
665 workarea = StringsortDispatchWithCache(text, text_end,
666 workarea, suffixes, equal_part_begin,
667 common_prefix_length, target_prefix_length,
668 report_finished_group);
670 assert(equal_part_end - equal_part_begin > 0);
671 workarea = RefillCacheAndSort(text, text_end,
672 workarea, equal_part_begin, equal_part_end,
673 common_prefix_length + sizeof(UnsignedInteger),
674 target_prefix_length,
675 report_finished_group);
676 if (equal_part_end < suffixes_end) {
677 workarea = StringsortDispatchWithCache(text, text_end,
678 workarea, equal_part_end, suffixes_end,
679 common_prefix_length, target_prefix_length,
680 report_finished_group);
682 return workarea;
685 } // namespace stringsort
687 /////////////////////////////////////////////////////////////////////
688 // The interface function, see stringsort.h
689 /////////////////////////////////////////////////////////////////////
690 template <typename CharIterator, typename UnsignedInteger,
691 typename FinishedGroupReporter>
692 UnsignedInteger* StringsortSuffixes(
693 const CharIterator text, const CharIterator text_end,
694 UnsignedInteger* suffix_area, UnsignedInteger* suffixes_in,
695 UnsignedInteger* suffix_area_end,
696 int64 common_prefix_length,
697 int64 target_prefix_length,
698 FinishedGroupReporter& report_finished_group)
700 assert(text_end - text >= 0);
701 assert(suffixes_in - suffix_area >= 0);
702 assert(suffix_area_end - suffixes_in >= 0);
703 assert(common_prefix_length >= 0);
704 assert(target_prefix_length >= common_prefix_length);
705 // Check that target_prefix_length (and consequently common_prefix_length)
706 // is a value that can be represented by UnsignedInteger.
707 assert(static_cast<int64>(static_cast<UnsignedInteger>(
708 target_prefix_length)) >= target_prefix_length);
709 if (suffixes_in == suffix_area_end) {
710 return suffix_area;
712 UnsignedInteger* suffixes_end =
713 stringsort::StringsortNewPosition(
714 text, text_end,
715 suffix_area, suffixes_in, suffix_area_end,
716 static_cast<UnsignedInteger>(common_prefix_length),
717 static_cast<UnsignedInteger>(target_prefix_length),
718 report_finished_group);
719 assert(suffixes_end - suffix_area == suffix_area_end - suffixes_in);
720 return suffixes_end;
723 } // namespace dcsbwt
725 #endif // DCSBWT_STRINGSORT_INL_H__