1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 #ifndef DCSBWT_STRINGSORT_H__
18 #define DCSBWT_STRINGSORT_H__
24 ////////////////////////////////////////////////////////////////
25 // Sort a set of suffixes of a text using string sorting algorithms,
26 // i.e., algorithms that do not take advantage of the fact the input
27 // strings are suffixes of a single string. The primary algorithm
28 // is multikey quicksort.
32 // [text,text_end) contains the text whose suffixes are being sorted
33 // CharIterator must be a random access iterator with a value type
34 // convertible to unsigned char.
36 // [suffix_area,suffix_area_end) contains the suffixes and possibly
37 // additional working space. A value i represents [text+i,text_end).
38 // UnsignedInteger must be a built-in unsigned integer type.
39 // * On entry, the suffixes to be sorted must be at the end of
40 // the suffix area in [suffixes_in,suffix_area_end).
41 // The contents of [suffix_area,suffixes_in) are not used.
42 // * On exit, the suffixes are at the beginning of the suffix area,
43 // i.e, in [suffix_area,suffixes_end), where
44 // suffixes_end = suffix_area + (suffix_area_end - suffixes_in)
45 // is the return value of the function.
46 // The contents of the rest of the area are arbitrary.
47 // The additional working space in the suffix area is optional,
48 // but the algorithm may be faster with a larger working space.
49 // The maximum speed-up is achieved when the additional working
50 // space equals the space for the suffixes.
52 // common_prefix_length is the length of a prefix that all the suffixes
53 // to be sorted share (and which can be skipped in comparisons).
55 // target_prefix_length: A group of suffixes sharing a prefix of this
56 // length do not have to be sorted further. On exit, such a group
57 // is in the correct position with respect to other suffixes, but the
58 // internal order of the group is arbitrary. If target_prefix_length
59 // equals text length, the suffixes are completely sorted.
60 // On the other hand, a smaller value keeps the worst case time
61 // and space requirements reasonable (see below).
63 // report_finished_group is a functor that is called for every group
64 // of suffixes that the algorithm is finished with (see
65 // target_prefix_length). When StringsortSuffixes makes the call
66 // report_finished_group(group_begin, group_end), it means that:
67 // 1. [group_begin,group_end) is a subrange of [suffix_area,suffixes_end)
68 // and contains a subset of the original suffixes.
69 // 2. Either the size of the group is one or all suffixes in the group
70 // share a prefix of length target_prefix_length.
71 // 3. The contents of [group_begin,group_end) are not changed or even
72 // read anymore during the rest of the execution. Thus
73 // report_finished_group may also change the contents.
74 // Every suffix belongs to exactly one of the reported groups.
75 // Use NullFinishedGroupReporter if no reporting is needed.
79 // The time complexity of the algorithm is O(n log n + D)
80 // where n is the number of suffixes and D is the sum of the lengths
81 // of the distinguishing prefixes of the suffixes, i.e., the minimum
82 // number of characters that needs to be inspected to complete the job.
83 // In the worst case, D can be as large as n * (text length - (n-1)/2).
84 // Typically it is much smaller though.
85 // In particular, D <= n*target_prefix_length
89 // The algorithm uses little memory in addition to the input and the stack.
90 // However, the size of the stack can grow large. The size of the stack
91 // is at most a few hundred bytes times the following the quantity:
92 // log(n) + min(target_prefix_length, F)
93 // F is the length of the longest prefix shared by more than
94 // kMinInsertionSortSize different suffixes.
95 // (kMinInsertionSortSize is defined in stringsort-inl.h)
96 ////////////////////////////////////////////////////////////////
97 template <typename CharIterator
, typename UnsignedInteger
,
98 typename FinishedGroupReporter
>
99 UnsignedInteger
* StringsortSuffixes(
100 const CharIterator text
, const CharIterator text_end
,
101 UnsignedInteger
* suffix_area
,
102 UnsignedInteger
* suffixes_in
, UnsignedInteger
* suffix_area_end
,
103 int64 common_prefix_length
,
104 int64 target_prefix_length
,
105 FinishedGroupReporter
& report_finished_group
);
107 // Use this as the FinishedGroupReporter if no reporting is needed.
108 // It is a bit faster than using your own null reporter.
109 template <typename UnsignedInteger
>
110 class NullFinishedGroupReporter
{
112 void operator() (UnsignedInteger
* begin
, UnsignedInteger
* end
) const {}
114 template <typename T
>
115 bool operator!=(const T
&) const { return true; }
116 template <typename T
>
117 bool operator!=(const NullFinishedGroupReporter
<T
>&) const { return false; }
120 } // namespace dcsbwt
122 #endif // DCSBWT_STRINGSORT_H__