modified: myjupyterlab.sh
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / prefix_doubling.h
blob083fc7d34bcc4a15e581287cf2c64daf810937e8
1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // Prefix doubling is a technique for sorting the suffixes
18 // of a string of length n in O(n log n) worst-case time.
19 // This file contains an implementation of a prefix doubling algorithm
20 // based on the Larsson-Sadakane algorithm described in
22 // J. Larsson, K. Sadakane: Faster Suffix Sorting.
23 // Theoretical Computer Science, in press 2007.
24 // http://eprints.csce.kyushu-u.ac.jp/33/
26 // The simple way to use the algorithm is the function
27 // SortSuffixesByPrefixDoubling().
28 // More flexibility and more chances for optimizations are offered
29 // by the class PrefixDoubler.
31 #ifndef DCSBWT_PREFIX_DOUBLING_H__
32 #define DCSBWT_PREFIX_DOUBLING_H__
34 #include <functional> // for unary_function
35 #include <cassert>
37 namespace dcsbwt {
39 // The function SortSuffixesByPrefixDoubling takes three arrays as
40 // an argument. The first array contains the input string, and the
41 // other two contain the output on exit:
42 // - The rank array contains a rank (an integer in the range [0..n])
43 // for each suffix such that the order of the ranks is the same as
44 // the lexicographic order of the suffixes.
45 // - The suffix array contains the suffixes in the lexicographic order
46 // with each suffix represented by its starting position.
47 // For example:
48 // input string: BANANA
49 // rank array: 4362510
50 // suffix array: 6531042
52 // The arrays are, in fact, STL random access iterator ranges.
53 // Here is a precise description of the input and the output:
54 // - The range [str,str_end) contains the input string; it will not be
55 // modified. The length of the range, i.e., the length of the string,
56 // (call it n) must be at most (2^32)-3.
57 // The type of the elements should be either char or unsigned char.
58 // (Other types are possible as long as they can be converted to
59 // unsigned char; this is checked in debug mode.) The characters
60 // are converted to unsigned char for purposes of comparison.
61 // (The standard string comparison operators and strcmp-like functions
62 // behave the same way.)
63 // - The range [rank_array,rank_array_end) contains the ranks on output;
64 // the contents on input are not used. The size of the range should
65 // be n+1 (one larger than the size of [str,str_end)). The element
66 // type should be an integral type capable of storing all values in
67 // the range [0..n] and supporting conversion to and from uint32.
68 // After execution, rank_array[i] contains the number of the suffixes
69 // of the string (including the empty suffix and the full string)
70 // that are smaller than the suffix [str+i,str_end).
71 // An alternative definition is that the range contains a permutation
72 // of the integers [0..n] satisfying for all i,j in [0..n]:
73 // rank_array[i] < rank_array[j] if and only if
74 // std::lexicographical_compare(str+i, str_end, str+j, str_end,
75 // std::less<unsigned char>())
76 // returns true.
77 // - The range [suffix_array,suffix_array_end) contains the suffix array
78 // on output; the contents on input are not used. The size of the range
79 // should be n+1 (the same as [rank_array,rank_array_end)). The element
80 // type should be an integral type capable of storing all values in
81 // the range [0..n+2] (the values n+1 and n+2 are needed by internal
82 // computation) and supporting conversion to and from uint32.
83 // After execution, the range contains a permutation of
84 // the integers [0..n], where a value i represents the suffix
85 // [str+i,str_end) and the order is the same as the lexicographical
86 // order of the suffixes. It satisfies, for all i in [0..n]:
87 // rank_array[suffix_array[i]] == suffix_array[rank_array[i]] == i.
88 // (If you wonder why both rank and suffix array are produced when
89 // one can so easily be constructed from the other, it is because both
90 // arrays are needed during the computation, so one can as well
91 // use them both for output.)
93 template <typename StringIterator,
94 typename RankIterator,
95 typename SuffixIterator>
96 void SortSuffixesByPrefixDoubling(
97 StringIterator str, StringIterator str_end,
98 RankIterator rank_array, RankIterator rank_array_end,
99 SuffixIterator suffix_array, SuffixIterator suffix_array_end);
101 // SortSuffixesByPrefixDoubling is not sufficiently flexible for some
102 // special situations (in particular, those arising in the suffix sorting
103 // algorithms based on difference covers). The class PrefixDoubler provides
104 // the core algorithm in its bare form together with some supporting
105 // components that can be used for building special applications.
106 // To understand PrefixDoubler you need to know more about the algorithm.
108 // An h-order of the suffixes is an order based on the first h characters
109 // (call it the h-prefix) of each suffix. If the suffix is shorter than h,
110 // the h-prefix is the suffix itself. An h-order is not a total order
111 // in general (but a weak order, see below) as multiple suffixes can have
112 // the same h-prefix. For example, under 0-order all suffixes are equal.
113 // However, for h >= n, the h-order is always the lexicographical total
114 // order of the suffixes.
116 // The algorithm is based on a doubling procedure that turns an h-order
117 // into a 2h-order. The outline of the algorithm is:
118 // 1. Obtain an initial h-order for some h > 0.
119 // 2. Apply the doubling procedure repeatedly until fully sorted.
120 // The second step is the core algorithm implemented by the member function
121 // PrefixDoubler::SortSuffixes(). It takes as an input the value of h (called
122 // order_depth in the code) and the two arrays, rank and suffix array,
123 // which must be consistent with the initial h-order on input.
124 // On output, the rank array is as with SortSuffixesByPrefixDoubling()
125 // but the suffix array is not (see below). The suffix array can be
126 // computed from the rank array using PrefixDoubler::ComputeFinalSuffixArray().
127 // Note that the original string is not an input to the main algorithm;
128 // it is only used for obtaining the initial h-order. There are functions
129 // that help in computing the initial h-order, but users can also write
130 // their own, which is the main source for the added flexibility.
132 // The rank array is the primary representation of the h-order at all stages
133 // of the algorithm. It must satisfy the following conditions:
134 // 1. If rank_array[i] < rank_array[j], then the suffix starting at
135 // position i is lexicographically smaller than the suffix starting at
136 // position j.
137 // 2. If rank_array[i] == rank_array[j], then suffixes i and j have
138 // the same h-prefix. (The reverse implication is not required;
139 // see the paragraph on weak orders below.)
140 // 3. rank_array[i] is equal to the number of other suffixes whose rank is
141 // smaller or equal to rank_array[i]. For example, rank_array[n]=0
142 // for all h > 0 (n is the empty suffix).
143 // When all suffixes have a different rank (e.g. for all h >= n), this
144 // definition coincides with the earlier definition of the output
145 // of SortSuffixesByPrefixDoubling().
147 // Example: input string: BANANA
148 // 1-order rank array: 4363630 <-- initial 1-order
149 // 2-order rank array: 4363610 <-- after first doubling
150 // 4-order rank array: 4362510 <-- after second doubling
151 // this is the total order
153 // The suffix array in its basic form contains the suffixes (i.e.,
154 // a permutation of [0..n]) in the order defined by the rank array,
155 // with equal suffixes in arbitrary order.
157 // Example: input string: BANANA
158 // 1-order rank array: 4363630
159 // 1-order suffix array: 6135024 <-- both of these would be valid
160 // 6531042 <-- inputs to SortSuffixes()
162 // An advanced form allows overwriting some entries with special markers
163 // as explained later. For basic use it is enough to know that:
164 // - On input to the main algorithm SortSuffixes(), the suffix array
165 // can but does not have to contain the markers.
166 // - On output, the suffix array contains markers to a large extent,
167 // and the funtion ComputeFinalSuffixArray() should be used for
168 // obtaining the final suffix array if needed.
170 // The class PrefixDoubler stores no information about the state of the
171 // algorithm except the length of the string; each function takes all
172 // necessary information as an argument. This design gives the user full
173 // control over the storage and representation of the arrays, which can
174 // be useful since these arrays can be really large. The reasons
175 // why this class exists at all (instead of separate functions) are:
176 // - collecting the related functions together
177 // - encapsulation of the special markers explained below
178 // - parametrization of the integral type
179 // The last point refers to the template paraneter type Integer, which is
180 // used in all internal integral computations. It should be able to represent
181 // all values in the range [0..n+2]. The value type of the rank and
182 // suffix arrays should support conversion to and from the type Integer
183 // for all values in the range [0..n+2].
185 // The rest of the documentation is not essential for basic use but
186 // may be useful for understanding the code and for some optimizations.
188 // A weak order over a set is an equivalence class partition of the set
189 // with a total order on the equivalence classes. Every h-order is
190 // a weak order over the set of suffixes. Weak orders are exactly
191 // those that fit the STL requirement of strict weak ordering (see 25.3
192 // in the C++ Standard). Two relations on weak orders are of interest to us:
193 // - Two weak orders W1 and W2 are *consistent* if there are no two
194 // elements x and y for which x<y according to W1 and y<x according
195 // to W2. All h-orders are consistent with each other.
196 // - A weak order W1 is a *refinement* of a weak order W2 if W1 and W2
197 // are consistent and the equivalence class partition of W1 is a
198 // refinement of the equivalence class partition of W2.
199 // A g-order is a refinement of an h-order if g > h.
200 // The rank array is able to represent any weak order on the suffixes
201 // A valid order for a specific value of h is any order that:
202 // - is consistent with the lexicographical total order and
203 // - is a refinement of the h-order.
204 // These are the conditions 1 and 2 in the earlier rank array definition.
205 // Any valid order for h > 0 is a legal input to the algorithm.
206 // A more refined order makes the algorithm faster in general.
207 // Note though that an additional refinement is much more effective if
208 // it allows increasing the value of h. Additional refinements have to
209 // be reflected in the suffix array, too.
211 // Example: input string: BANANA
212 // 1-order rank array: 4363630 <-- no additional refinement
213 // 4363610 <-- additional refinement
214 // 1-order suffix array: 6531042 <-- valid for both cases
215 // 6135024 <-- not valid for refined case
217 // A suffix that has been separated from other suffixes (i.e.,has
218 // a unique rank), is called finished: its rank and its position in the
219 // suffix array do not change anymore. The suffix array entry for
220 // a finished suffix is not needed anymore (the rank is enough)
221 // and may be overwritten by special a special marker,
222 // whose purpose is to quickly jump over the finished parts.
223 // Two kinds of markings are used:
224 // - The value FinishedSuffixMarker() marks a single finished suffix.
225 // - The value FinishedGroupMarker() marks the beginning of a group of
226 // more than one consecutive finished suffixes. The entry following
227 // the marker contains the next position after the end of the group.
228 // The remaining entries in the group are never accessed.
229 // Any combination of unmarked finished suffixes, single marked
230 // suffixes and marked groups are legal including consecutive marked
231 // suffixes and groups. Unmarked finished suffixes get marked and
232 // consecutive marked suffixes and groups get combined when encountered.
233 // Adding markers already during the computation of the initial
234 // h-order may speed up the algorithm. (Besides allowing jumping over
235 // groups, the markers avoid cache misses due to rank array accesses.)
237 template <typename Integer>
238 class PrefixDoubler {
239 public:
240 // Only the length of the string is set on initialization
241 explicit PrefixDoubler(Integer string_length)
242 : string_length_(string_length) {}
243 // no default constructor
244 // implicit destructor
245 // implicit copy constructor
246 // implicit copy assignment
248 // The main repeated doubling algorithm
249 template <typename RankIterator, typename SuffixIterator>
250 void SortSuffixes(
251 RankIterator rank_array, RankIterator rank_array_end,
252 SuffixIterator suffix_array, SuffixIterator suffix_array_end,
253 Integer order_depth) const;
255 // Compute an initial 1-order from a string.
256 // This works for any string whose characters can be compared.
257 // The ordering is based on std::less<value_type> for the
258 // value_type of the StringIterator.
259 // WARNING: If the value type is char and the string contains values
260 // outside [0..128), the order may be inconsistent with the standard
261 // string comparisons and strcmp-like functions, which convert to
262 // unsigned char for comparisons. Use ComputeOneOrderForCharString()
263 // instead.
264 // TODO: add support for user-defined character comparison operator
265 template <typename StringIterator, typename RankIterator,
266 typename SuffixIterator>
267 void ComputeOneOrder(
268 StringIterator str, StringIterator str_end,
269 RankIterator rank_array, RankIterator rank_array_end,
270 SuffixIterator suffix_array, SuffixIterator suffix_array_end) const;
272 // Compute an initial 1-order from a string of (unsigned) char.
273 // This works for strings whose characters are convertible to unsigned char.
274 // In debug mode, there is a check that the conversion loses no information.
275 // The ordering is based on std::less<unsigned char>.
276 // This is probably faster than ComputeOneOrder(), and produces an order
277 // consistent with the standard string comparisons and strcmp-like functions.
278 template <typename StringIterator, typename RankIterator,
279 typename SuffixIterator>
280 void ComputeOneOrderForCharString(
281 StringIterator str, StringIterator str_end,
282 RankIterator rank_array, RankIterator rank_array_end,
283 SuffixIterator suffix_array, SuffixIterator suffix_array_end) const;
285 // Compute the initial suffix_array including markers
286 // from the initial rank array.
287 template <typename RankIterator, typename SuffixIterator>
288 void ComputeInitialSuffixArray(
289 RankIterator rank_array, RankIterator rank_array_end,
290 SuffixIterator suffix_array, SuffixIterator suffix_array_end,
291 Integer order_depth) const;
293 // Compute the final suffix array (without markers) from the final
294 // rank array.
295 // This should only be called with the fully sorted rank array
296 // (checked in debug mode).
297 template <typename RankIterator, typename SuffixIterator>
298 void ComputeFinalSuffixArray(
299 RankIterator rank_array, RankIterator rank_array_end,
300 SuffixIterator suffix_array, SuffixIterator suffix_array_end) const;
302 // Special markers for finished suffixes
303 inline Integer FinishedSuffixMarker() const { return string_length_ + 1; }
304 inline Integer FinishedGroupMarker() const { return string_length_ + 2; }
305 inline bool IsFinishedSuffix(Integer i) const {
306 return FinishedSuffixMarker() == i;
308 inline bool IsFinishedGroup(Integer i) const {
309 return FinishedGroupMarker() == i;
311 inline bool IsFinishedSuffixOrGroup(Integer i) const {
312 assert(i <= string_length_ + 2);
313 return i > string_length_;
316 // Do one doubling step.
317 // Returns the new order_depth.
318 template <typename RankIterator, typename SuffixIterator>
319 Integer Double(
320 RankIterator rank_array, RankIterator rank_array_end,
321 SuffixIterator suffix_array, SuffixIterator suffix_array_end,
322 Integer order_depth) const;
324 private:
325 Integer string_length_;
327 // Functor that retrieves the rank at i+order_depth
328 template <typename RankIterator>
329 class RankAtDepth : public std::unary_function<Integer, Integer> {
330 public:
331 RankAtDepth(RankIterator rank_array, Integer order_depth)
332 : rank_plus_depth_(rank_array + order_depth) {}
333 Integer operator() (Integer i) const {
334 return rank_plus_depth_[i];
336 private:
337 RankIterator rank_plus_depth_;
340 // Do one doubling step for a group of suffixes.
341 template <typename RankIterator, typename SuffixIterator>
342 void RefineSubrange(
343 RankIterator rank_array, SuffixIterator suffix_array,
344 SuffixIterator begin, SuffixIterator end,
345 Integer common_prefix_length) const;
347 // Do one doubling step for a group of suffixes.
348 template <typename RankIterator, typename SuffixIterator>
349 void RefineSmallSubrange(
350 RankIterator rank_array, SuffixIterator suffix_array,
351 SuffixIterator begin, SuffixIterator end,
352 Integer common_prefix_length) const;
355 } // namespace dcsbwt
357 #endif // DCSBWT_PREFIX_DOUBLING_H__