modified: makefile
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / prefix_doubling-inl.h
blob3c628d8c98c21ac6c74938ff149758dc76af7e8e
1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 #ifndef DCSBWT_PREFIX_DOUBLING_INL_H__
18 #define DCSBWT_PREFIX_DOUBLING_INL_H__
20 #include <functional> // for binary_function, bind1st
21 #include <algorithm> // for copy, sort, find_if
22 #include <numeric> // for partial_sum
24 #include "prefix_doubling.h"
25 #include "ternary_partition.h"
27 namespace dcsbwt {
29 template <typename StringIterator,
30 typename RankIterator,
31 typename SuffixIterator>
32 void SortSuffixesByPrefixDoubling(
33 StringIterator str, StringIterator str_end,
34 RankIterator rank_array, RankIterator rank_array_end,
35 SuffixIterator suffix_array, SuffixIterator suffix_array_end)
37 uint32 const kMaxLength = 0XfffffffdU; // == (2^32) - 3
38 uint32 length = str_end - str;
39 assert(length <= kMaxLength);
40 assert(rank_array_end - rank_array == length + 1);
41 assert(suffix_array_end - suffix_array == length + 1);
42 PrefixDoubler<uint32> pd(length);
43 // Construct the initial 1-order
44 pd.ComputeOneOrderForCharString(str, str_end, rank_array, rank_array_end,
45 suffix_array, suffix_array_end);
46 // Main repeated doubling procedure
47 pd.SortSuffixes(rank_array, rank_array_end,
48 suffix_array, suffix_array_end, 1U);
49 // Compute the suffix array from the final rank array
50 pd.ComputeFinalSuffixArray(rank_array, rank_array_end,
51 suffix_array, suffix_array_end);
54 namespace {
56 // Order comparison functor that treats its operands as indices to an array
57 // and compares the array entries instead of the indices themselves.
58 template <typename Iterator>
59 class IndexLess : public std::binary_function<
60 typename std::iterator_traits<Iterator>::difference_type,
61 typename std::iterator_traits<Iterator>::difference_type, bool>
63 public:
64 typedef typename std::iterator_traits<Iterator>::difference_type Index;
65 explicit IndexLess(Iterator begin, Iterator end) : array_(begin) {}
66 // The end of the array is not currently stored but a later version
67 // could store it and use it for boundary checking (in debug mode).
68 bool operator() (Index i, Index j) const { return array_[i] < array_[j]; }
69 private:
70 Iterator array_;
73 } // unnamed namespace
75 template <typename Integer>
76 template <typename StringIterator, typename RankIterator,
77 typename SuffixIterator>
78 void PrefixDoubler<Integer>::ComputeOneOrder(
79 StringIterator str, StringIterator str_end,
80 RankIterator rank_array, RankIterator rank_array_end,
81 SuffixIterator suffix_array, SuffixIterator suffix_array_end) const
83 assert(str_end - str == string_length_);
84 assert(rank_array_end - rank_array == string_length_ + 1);
85 assert(suffix_array_end - suffix_array == string_length_ + 1);
86 // Fill with the sequence: string_length_, 0, 1, ..., string_length_-1
87 suffix_array[0] = string_length_; // the empty suffix
88 for (SuffixIterator it = suffix_array + 1; it != suffix_array_end; ++it) {
89 *it = it - suffix_array - 1;
91 // Sort the suffixes into 1-order
92 IndexLess<StringIterator> less_for_one_order(str, str_end);
93 std::sort(suffix_array + 1, suffix_array_end, less_for_one_order);
94 // Compute the rank array
95 rank_array[string_length_] = 0;
96 SuffixIterator begin, end;
97 for (begin = suffix_array + 1; begin != suffix_array_end; begin = end) {
98 end = std::find_if(begin, suffix_array_end,
99 std::bind1st(less_for_one_order, *begin));
100 // [begin,end) is now an equivalence class
101 Integer rank = end - suffix_array - 1;
102 for (SuffixIterator it = begin; it != end; ++it) {
103 rank_array[*it] = rank;
105 if (end - begin == 1) {
106 *begin = FinishedSuffixMarker();
108 // Guard against infinite loop
109 assert(begin - suffix_array < end - suffix_array);
113 template <typename Integer>
114 template <typename StringIterator, typename RankIterator,
115 typename SuffixIterator>
116 void PrefixDoubler<Integer>::ComputeOneOrderForCharString(
117 StringIterator str, StringIterator str_end,
118 RankIterator rank_array, RankIterator rank_array_end,
119 SuffixIterator suffix_array, SuffixIterator suffix_array_end) const
121 assert(str_end - str == string_length_);
122 assert(rank_array_end - rank_array == string_length_ + 1);
123 assert(suffix_array_end - suffix_array == string_length_ + 1);
124 #ifndef NDEBUG
125 // Check that no information is lost by conversion to unsigned char
126 typedef typename std::iterator_traits<StringIterator>::value_type CharType;
127 for (StringIterator it = str; it != str_end; ++it) {
128 assert(*it == static_cast<CharType>(static_cast<unsigned char>(*it)));
130 #endif
131 // Count the number of occurrences of each character
132 std::vector<Integer> char_count(256, 0);
133 for (StringIterator it = str; it != str_end; ++it) {
134 unsigned int ch = *it;
135 ++char_count[ch];
137 // Compute the rank for suffixes starting with each character
138 std::vector<Integer> rank(char_count);
139 std::partial_sum(char_count.begin(), char_count.end(),
140 rank.begin());
141 assert(rank[255] == string_length_);
142 // The rank is also the suffix array position of
143 // the last suffix in the equivalence class.
144 // A shift by one gives us the last position of the previous
145 // equivalence class.
146 std::vector<uint32> suffix_array_position(256);
147 suffix_array_position[0] = 0;
148 std::copy(rank.begin(), rank.end() - 1,
149 suffix_array_position.begin() + 1);
150 for (StringIterator it = str; it != str_end; ++it) {
151 unsigned int ch = *it;
152 Integer str_pos = it - str;
153 rank_array[str_pos] = rank[ch];
154 suffix_array[++suffix_array_position[ch]] = str_pos;
156 assert(rank == suffix_array_position); // !!
157 // the empty suffix
158 rank_array[string_length_] = 0;
159 suffix_array[0] = string_length_;
160 // Could add finished suffix markers here but is it worth the effort?
163 template <typename Integer>
164 template <typename RankIterator, typename SuffixIterator>
165 void PrefixDoubler<Integer>::ComputeInitialSuffixArray(
166 RankIterator rank_array, RankIterator rank_array_end,
167 SuffixIterator suffix_array, SuffixIterator suffix_array_end,
168 Integer order_depth) const
170 // Fill with the sequence: string_length_, 0, 1, ..., string_length_-1
171 suffix_array[0] = string_length_; // the empty suffix
172 for (SuffixIterator it = suffix_array + 1; it != suffix_array_end; ++it) {
173 *it = it - suffix_array - 1;
175 RefineSubrange(rank_array, suffix_array,
176 suffix_array + 1, suffix_array_end, 0);
179 template <typename Integer>
180 template <typename RankIterator, typename SuffixIterator>
181 void PrefixDoubler<Integer>::ComputeFinalSuffixArray(
182 RankIterator rank_array, RankIterator rank_array_end,
183 SuffixIterator suffix_array, SuffixIterator suffix_array_end) const
185 #ifndef NDEBUG
186 // markers identify uninitialized entries
187 std::fill(suffix_array, suffix_array_end, FinishedSuffixMarker());
188 #endif
189 for (RankIterator it = rank_array; it != rank_array_end; ++it) {
190 suffix_array[*it] = it - rank_array;
192 #ifndef NDEBUG
193 SuffixIterator uninitialized_entry =
194 std::find(suffix_array, suffix_array_end, FinishedSuffixMarker());
195 assert(suffix_array_end - uninitialized_entry == 0);
196 #endif
199 // Refine the order of a group of consecutive suffixes.
200 // This is where the actual modification of ranks takes place.
201 // Preconditions (situation on entry):
202 // 1. [begin,end) is a sub range of the suffix array.
203 // The suffixes in this subrange have a common prefix of length
204 // common_prefix_length. The common_prefix_length can be 0.
205 // 2. The subrange occupies its final place in the suffix array,
206 // that is, the final suffix array positions of the suffixes
207 // are in this subrange. More formally, the number of other suffixes
208 // that are smaller than all the suffixes in this subrange is
209 // begin-suffix_array, and the number of other suffixes
210 // that are larger than all the suffixes in this subrange is
211 // suffix_array_end-end.
212 // 3. The other suffixes that are smaller than the suffixes in this
213 // subrange have ranks smaller than begin-suffix_array.
214 // The other suffixes that are larger than the suffixes in this
215 // subrange have ranks larger than or equal to suffix_array_end-end.
216 // (Together the above conditions imply that the ranks of the suffixes
217 // in this subrange are larger than or equal to begin-suffix_array
218 // and smaller than or equal to the smallest rank of a larger suffix.)
219 // 4. The rank array represents some weak order consistent with
220 // the lexicographic order, and thus a refinement of some h-order.
221 // The value of h is not provided (or needed), but it is assumed
222 // that the caller knows such h.
223 // Often h==common_prefix_length but this is not required.
224 // All the preconditions hold on exit, too, as well as the following
225 // additional condition:
226 // * The ranks of the suffixes in the subrange are now consistent
227 // with the (common_prefix_length+h)-order and their order in the
228 // subrange is consistent with the ranks.
229 template <typename Integer>
230 template <typename RankIterator, typename SuffixIterator>
231 void PrefixDoubler<Integer>::RefineSubrange(
232 RankIterator rank_array, SuffixIterator suffix_array,
233 SuffixIterator begin, SuffixIterator end,
234 Integer common_prefix_length) const
236 static const int kSmallRangeLimit = 15;
237 assert(end - begin >= 0);
238 RankAtDepth<RankIterator> sortkey(rank_array, common_prefix_length);
239 while (end - begin > kSmallRangeLimit) {
240 // Divide the range into three subranges:
241 // smaller-than-pivot, equal-to-pivot, and larger-than-pivot
242 SuffixIterator pivot = ChoosePivot(begin, end, sortkey);
243 std::pair<SuffixIterator, SuffixIterator> equal_group =
244 TernaryPartition(begin, end, pivot, sortkey);
245 SuffixIterator equal_begin = equal_group.first;
246 SuffixIterator equal_end = equal_group.second;
247 // Partition the smaller-than-pivot range recursively.
248 RefineSubrange(rank_array, suffix_array, begin, equal_begin,
249 common_prefix_length);
250 // Assign a new rank for the equal-to-pivot group
251 Integer new_rank = equal_end - suffix_array - 1;
252 for (SuffixIterator it = equal_begin; it != equal_end; ++it) {
253 rank_array[*it] = new_rank;
255 if (equal_end - equal_begin == 1) {
256 *(equal_begin) = FinishedSuffixMarker();
258 // Loop (tail recurse) to partition the larger-than-pivot group.
259 begin = equal_end;
260 assert(end - begin >= 0);
262 if (begin == end) return;
263 if (end - begin == 1) {
264 rank_array[*begin] = begin - suffix_array;
265 *begin = FinishedSuffixMarker();
266 return;
268 RefineSmallSubrange(rank_array, suffix_array,
269 begin, end, common_prefix_length);
272 template <typename Integer>
273 template <typename RankIterator, typename SuffixIterator>
274 void PrefixDoubler<Integer>::RefineSmallSubrange(
275 RankIterator rank_array, SuffixIterator suffix_array,
276 SuffixIterator begin, SuffixIterator end,
277 Integer common_prefix_length) const
279 // Selection sort for a small range
280 assert(end - begin >= 0);
281 RankAtDepth<RankIterator> sortkey(rank_array, common_prefix_length);
282 SuffixIterator smallest = begin;
283 Integer smallest_key = sortkey(*smallest);
284 for (SuffixIterator it = begin + 1; it < end; ++it) {
285 if (sortkey(*it) < smallest_key) {
286 smallest_key = sortkey(*it);
287 smallest = it;
290 using std::swap;
291 swap(*smallest, *begin);
292 SuffixIterator previous = begin;
293 Integer previous_key = smallest_key;
294 for (SuffixIterator it = begin + 1; it < end; ++it) {
295 smallest = it;
296 smallest_key = sortkey(*it);
297 for (SuffixIterator it2 = it + 1; it2 < end; ++it2) {
298 if (sortkey(*it2) < smallest_key) {
299 smallest_key = sortkey(*it2);
300 smallest = it2;
303 swap(*smallest, *it);
304 if (smallest_key > previous_key) {
305 Integer rank = (it - 1) - suffix_array;
306 for (SuffixIterator it2 = previous; it2 < it; ++it2) {
307 rank_array[*it2] = rank;
309 if (it - previous == 1) {
310 *previous = FinishedSuffixMarker();
312 previous = it;
313 previous_key = smallest_key;
316 Integer rank = (end - 1) - suffix_array;
317 for (SuffixIterator it = previous; it < end; ++it) {
318 rank_array[*it] = rank;
320 if (end - previous == 1) {
321 *previous = FinishedSuffixMarker();
325 // Go from h-order to 2h-order for all suffixes.
326 // This is done by finding all equivalence classes that have not been
327 // marked finished and calling DoubleGroup() for them.
328 // Consecutive marked suffixes or groups are combined.
329 // (DoubleGroup() does the initial marking of finished suffixes.)
330 template <typename Integer>
331 template <typename RankIterator, typename SuffixIterator>
332 Integer PrefixDoubler<Integer>::Double(
333 RankIterator rank_array, RankIterator rank_array_end,
334 SuffixIterator suffix_array, SuffixIterator suffix_array_end,
335 Integer order_depth) const
337 assert(order_depth > 0);
338 Integer size = suffix_array_end - suffix_array;
339 assert(size == string_length_ + 1);
340 assert(rank_array_end - rank_array == size);
341 bool previous_is_finished = false;
342 Integer previous_finished_first = 0;
343 Integer first = 0; // first position in equivalence class or finished group
344 Integer next = size; // first position in the next class or group
345 for (first = 0; first < size; first = next) {
346 assert(first <= string_length_);
347 assert(first >= 0);
348 Integer suffix = suffix_array[first];
349 if (IsFinishedSuffixOrGroup(suffix)) {
350 if (IsFinishedGroup(suffix)) {
351 next = suffix_array[first + 1];
352 } else {
353 assert(IsFinishedSuffix(suffix));
354 next = first + 1;
356 if (previous_is_finished) {
357 // Combine with a preceding finished suffix or group
358 suffix_array[previous_finished_first] = FinishedGroupMarker();
359 suffix_array[previous_finished_first + 1] = next;
360 } else {
361 previous_is_finished = true;
362 previous_finished_first = first;
364 } else {
365 assert(suffix <= string_length_);
366 assert(suffix >= 0);
367 Integer rank = rank_array[suffix];
368 assert(rank <= string_length_);
369 assert(rank >= 0);
370 next = rank + 1; // rank is the position of the last suffix
371 // in the equivalence class
372 RefineSubrange(rank_array, suffix_array,
373 suffix_array + first, suffix_array + next, order_depth);
374 previous_is_finished = false;
376 // guard against infinite loop
377 assert(next > first);
379 return order_depth * 2;
382 // Sort all suffixes by repeated doubling.
383 template <typename Integer>
384 template <typename RankIterator, typename SuffixIterator>
385 void PrefixDoubler<Integer>::SortSuffixes(
386 RankIterator rank_array, RankIterator rank_array_end,
387 SuffixIterator suffix_array, SuffixIterator suffix_array_end,
388 Integer order_depth) const
390 assert(order_depth > 0);
391 while (order_depth < string_length_) {
392 order_depth = Double(rank_array, rank_array_end,
393 suffix_array, suffix_array_end, order_depth);
397 } // namespace dcsbwt
399 #endif // DCSBWT_PREFIX_DOUBLING_INL_H__