1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 #ifndef DCSBWT_PREFIX_DOUBLING_INL_H__
18 #define DCSBWT_PREFIX_DOUBLING_INL_H__
20 #include <functional> // for binary_function, bind1st
21 #include <algorithm> // for copy, sort, find_if
22 #include <numeric> // for partial_sum
24 #include "prefix_doubling.h"
25 #include "ternary_partition.h"
29 template <typename StringIterator
,
30 typename RankIterator
,
31 typename SuffixIterator
>
32 void SortSuffixesByPrefixDoubling(
33 StringIterator str
, StringIterator str_end
,
34 RankIterator rank_array
, RankIterator rank_array_end
,
35 SuffixIterator suffix_array
, SuffixIterator suffix_array_end
)
37 uint32
const kMaxLength
= 0XfffffffdU
; // == (2^32) - 3
38 uint32 length
= str_end
- str
;
39 assert(length
<= kMaxLength
);
40 assert(rank_array_end
- rank_array
== length
+ 1);
41 assert(suffix_array_end
- suffix_array
== length
+ 1);
42 PrefixDoubler
<uint32
> pd(length
);
43 // Construct the initial 1-order
44 pd
.ComputeOneOrderForCharString(str
, str_end
, rank_array
, rank_array_end
,
45 suffix_array
, suffix_array_end
);
46 // Main repeated doubling procedure
47 pd
.SortSuffixes(rank_array
, rank_array_end
,
48 suffix_array
, suffix_array_end
, 1U);
49 // Compute the suffix array from the final rank array
50 pd
.ComputeFinalSuffixArray(rank_array
, rank_array_end
,
51 suffix_array
, suffix_array_end
);
56 // Order comparison functor that treats its operands as indices to an array
57 // and compares the array entries instead of the indices themselves.
58 template <typename Iterator
>
59 class IndexLess
: public std::binary_function
<
60 typename
std::iterator_traits
<Iterator
>::difference_type
,
61 typename
std::iterator_traits
<Iterator
>::difference_type
, bool>
64 typedef typename
std::iterator_traits
<Iterator
>::difference_type Index
;
65 explicit IndexLess(Iterator begin
, Iterator end
) : array_(begin
) {}
66 // The end of the array is not currently stored but a later version
67 // could store it and use it for boundary checking (in debug mode).
68 bool operator() (Index i
, Index j
) const { return array_
[i
] < array_
[j
]; }
73 } // unnamed namespace
75 template <typename Integer
>
76 template <typename StringIterator
, typename RankIterator
,
77 typename SuffixIterator
>
78 void PrefixDoubler
<Integer
>::ComputeOneOrder(
79 StringIterator str
, StringIterator str_end
,
80 RankIterator rank_array
, RankIterator rank_array_end
,
81 SuffixIterator suffix_array
, SuffixIterator suffix_array_end
) const
83 assert(str_end
- str
== string_length_
);
84 assert(rank_array_end
- rank_array
== string_length_
+ 1);
85 assert(suffix_array_end
- suffix_array
== string_length_
+ 1);
86 // Fill with the sequence: string_length_, 0, 1, ..., string_length_-1
87 suffix_array
[0] = string_length_
; // the empty suffix
88 for (SuffixIterator it
= suffix_array
+ 1; it
!= suffix_array_end
; ++it
) {
89 *it
= it
- suffix_array
- 1;
91 // Sort the suffixes into 1-order
92 IndexLess
<StringIterator
> less_for_one_order(str
, str_end
);
93 std::sort(suffix_array
+ 1, suffix_array_end
, less_for_one_order
);
94 // Compute the rank array
95 rank_array
[string_length_
] = 0;
96 SuffixIterator begin
, end
;
97 for (begin
= suffix_array
+ 1; begin
!= suffix_array_end
; begin
= end
) {
98 end
= std::find_if(begin
, suffix_array_end
,
99 std::bind1st(less_for_one_order
, *begin
));
100 // [begin,end) is now an equivalence class
101 Integer rank
= end
- suffix_array
- 1;
102 for (SuffixIterator it
= begin
; it
!= end
; ++it
) {
103 rank_array
[*it
] = rank
;
105 if (end
- begin
== 1) {
106 *begin
= FinishedSuffixMarker();
108 // Guard against infinite loop
109 assert(begin
- suffix_array
< end
- suffix_array
);
113 template <typename Integer
>
114 template <typename StringIterator
, typename RankIterator
,
115 typename SuffixIterator
>
116 void PrefixDoubler
<Integer
>::ComputeOneOrderForCharString(
117 StringIterator str
, StringIterator str_end
,
118 RankIterator rank_array
, RankIterator rank_array_end
,
119 SuffixIterator suffix_array
, SuffixIterator suffix_array_end
) const
121 assert(str_end
- str
== string_length_
);
122 assert(rank_array_end
- rank_array
== string_length_
+ 1);
123 assert(suffix_array_end
- suffix_array
== string_length_
+ 1);
125 // Check that no information is lost by conversion to unsigned char
126 typedef typename
std::iterator_traits
<StringIterator
>::value_type CharType
;
127 for (StringIterator it
= str
; it
!= str_end
; ++it
) {
128 assert(*it
== static_cast<CharType
>(static_cast<unsigned char>(*it
)));
131 // Count the number of occurrences of each character
132 std::vector
<Integer
> char_count(256, 0);
133 for (StringIterator it
= str
; it
!= str_end
; ++it
) {
134 unsigned int ch
= *it
;
137 // Compute the rank for suffixes starting with each character
138 std::vector
<Integer
> rank(char_count
);
139 std::partial_sum(char_count
.begin(), char_count
.end(),
141 assert(rank
[255] == string_length_
);
142 // The rank is also the suffix array position of
143 // the last suffix in the equivalence class.
144 // A shift by one gives us the last position of the previous
145 // equivalence class.
146 std::vector
<uint32
> suffix_array_position(256);
147 suffix_array_position
[0] = 0;
148 std::copy(rank
.begin(), rank
.end() - 1,
149 suffix_array_position
.begin() + 1);
150 for (StringIterator it
= str
; it
!= str_end
; ++it
) {
151 unsigned int ch
= *it
;
152 Integer str_pos
= it
- str
;
153 rank_array
[str_pos
] = rank
[ch
];
154 suffix_array
[++suffix_array_position
[ch
]] = str_pos
;
156 assert(rank
== suffix_array_position
); // !!
158 rank_array
[string_length_
] = 0;
159 suffix_array
[0] = string_length_
;
160 // Could add finished suffix markers here but is it worth the effort?
163 template <typename Integer
>
164 template <typename RankIterator
, typename SuffixIterator
>
165 void PrefixDoubler
<Integer
>::ComputeInitialSuffixArray(
166 RankIterator rank_array
, RankIterator rank_array_end
,
167 SuffixIterator suffix_array
, SuffixIterator suffix_array_end
,
168 Integer order_depth
) const
170 // Fill with the sequence: string_length_, 0, 1, ..., string_length_-1
171 suffix_array
[0] = string_length_
; // the empty suffix
172 for (SuffixIterator it
= suffix_array
+ 1; it
!= suffix_array_end
; ++it
) {
173 *it
= it
- suffix_array
- 1;
175 RefineSubrange(rank_array
, suffix_array
,
176 suffix_array
+ 1, suffix_array_end
, 0);
179 template <typename Integer
>
180 template <typename RankIterator
, typename SuffixIterator
>
181 void PrefixDoubler
<Integer
>::ComputeFinalSuffixArray(
182 RankIterator rank_array
, RankIterator rank_array_end
,
183 SuffixIterator suffix_array
, SuffixIterator suffix_array_end
) const
186 // markers identify uninitialized entries
187 std::fill(suffix_array
, suffix_array_end
, FinishedSuffixMarker());
189 for (RankIterator it
= rank_array
; it
!= rank_array_end
; ++it
) {
190 suffix_array
[*it
] = it
- rank_array
;
193 SuffixIterator uninitialized_entry
=
194 std::find(suffix_array
, suffix_array_end
, FinishedSuffixMarker());
195 assert(suffix_array_end
- uninitialized_entry
== 0);
199 // Refine the order of a group of consecutive suffixes.
200 // This is where the actual modification of ranks takes place.
201 // Preconditions (situation on entry):
202 // 1. [begin,end) is a sub range of the suffix array.
203 // The suffixes in this subrange have a common prefix of length
204 // common_prefix_length. The common_prefix_length can be 0.
205 // 2. The subrange occupies its final place in the suffix array,
206 // that is, the final suffix array positions of the suffixes
207 // are in this subrange. More formally, the number of other suffixes
208 // that are smaller than all the suffixes in this subrange is
209 // begin-suffix_array, and the number of other suffixes
210 // that are larger than all the suffixes in this subrange is
211 // suffix_array_end-end.
212 // 3. The other suffixes that are smaller than the suffixes in this
213 // subrange have ranks smaller than begin-suffix_array.
214 // The other suffixes that are larger than the suffixes in this
215 // subrange have ranks larger than or equal to suffix_array_end-end.
216 // (Together the above conditions imply that the ranks of the suffixes
217 // in this subrange are larger than or equal to begin-suffix_array
218 // and smaller than or equal to the smallest rank of a larger suffix.)
219 // 4. The rank array represents some weak order consistent with
220 // the lexicographic order, and thus a refinement of some h-order.
221 // The value of h is not provided (or needed), but it is assumed
222 // that the caller knows such h.
223 // Often h==common_prefix_length but this is not required.
224 // All the preconditions hold on exit, too, as well as the following
225 // additional condition:
226 // * The ranks of the suffixes in the subrange are now consistent
227 // with the (common_prefix_length+h)-order and their order in the
228 // subrange is consistent with the ranks.
229 template <typename Integer
>
230 template <typename RankIterator
, typename SuffixIterator
>
231 void PrefixDoubler
<Integer
>::RefineSubrange(
232 RankIterator rank_array
, SuffixIterator suffix_array
,
233 SuffixIterator begin
, SuffixIterator end
,
234 Integer common_prefix_length
) const
236 static const int kSmallRangeLimit
= 15;
237 assert(end
- begin
>= 0);
238 RankAtDepth
<RankIterator
> sortkey(rank_array
, common_prefix_length
);
239 while (end
- begin
> kSmallRangeLimit
) {
240 // Divide the range into three subranges:
241 // smaller-than-pivot, equal-to-pivot, and larger-than-pivot
242 SuffixIterator pivot
= ChoosePivot(begin
, end
, sortkey
);
243 std::pair
<SuffixIterator
, SuffixIterator
> equal_group
=
244 TernaryPartition(begin
, end
, pivot
, sortkey
);
245 SuffixIterator equal_begin
= equal_group
.first
;
246 SuffixIterator equal_end
= equal_group
.second
;
247 // Partition the smaller-than-pivot range recursively.
248 RefineSubrange(rank_array
, suffix_array
, begin
, equal_begin
,
249 common_prefix_length
);
250 // Assign a new rank for the equal-to-pivot group
251 Integer new_rank
= equal_end
- suffix_array
- 1;
252 for (SuffixIterator it
= equal_begin
; it
!= equal_end
; ++it
) {
253 rank_array
[*it
] = new_rank
;
255 if (equal_end
- equal_begin
== 1) {
256 *(equal_begin
) = FinishedSuffixMarker();
258 // Loop (tail recurse) to partition the larger-than-pivot group.
260 assert(end
- begin
>= 0);
262 if (begin
== end
) return;
263 if (end
- begin
== 1) {
264 rank_array
[*begin
] = begin
- suffix_array
;
265 *begin
= FinishedSuffixMarker();
268 RefineSmallSubrange(rank_array
, suffix_array
,
269 begin
, end
, common_prefix_length
);
272 template <typename Integer
>
273 template <typename RankIterator
, typename SuffixIterator
>
274 void PrefixDoubler
<Integer
>::RefineSmallSubrange(
275 RankIterator rank_array
, SuffixIterator suffix_array
,
276 SuffixIterator begin
, SuffixIterator end
,
277 Integer common_prefix_length
) const
279 // Selection sort for a small range
280 assert(end
- begin
>= 0);
281 RankAtDepth
<RankIterator
> sortkey(rank_array
, common_prefix_length
);
282 SuffixIterator smallest
= begin
;
283 Integer smallest_key
= sortkey(*smallest
);
284 for (SuffixIterator it
= begin
+ 1; it
< end
; ++it
) {
285 if (sortkey(*it
) < smallest_key
) {
286 smallest_key
= sortkey(*it
);
291 swap(*smallest
, *begin
);
292 SuffixIterator previous
= begin
;
293 Integer previous_key
= smallest_key
;
294 for (SuffixIterator it
= begin
+ 1; it
< end
; ++it
) {
296 smallest_key
= sortkey(*it
);
297 for (SuffixIterator it2
= it
+ 1; it2
< end
; ++it2
) {
298 if (sortkey(*it2
) < smallest_key
) {
299 smallest_key
= sortkey(*it2
);
303 swap(*smallest
, *it
);
304 if (smallest_key
> previous_key
) {
305 Integer rank
= (it
- 1) - suffix_array
;
306 for (SuffixIterator it2
= previous
; it2
< it
; ++it2
) {
307 rank_array
[*it2
] = rank
;
309 if (it
- previous
== 1) {
310 *previous
= FinishedSuffixMarker();
313 previous_key
= smallest_key
;
316 Integer rank
= (end
- 1) - suffix_array
;
317 for (SuffixIterator it
= previous
; it
< end
; ++it
) {
318 rank_array
[*it
] = rank
;
320 if (end
- previous
== 1) {
321 *previous
= FinishedSuffixMarker();
325 // Go from h-order to 2h-order for all suffixes.
326 // This is done by finding all equivalence classes that have not been
327 // marked finished and calling DoubleGroup() for them.
328 // Consecutive marked suffixes or groups are combined.
329 // (DoubleGroup() does the initial marking of finished suffixes.)
330 template <typename Integer
>
331 template <typename RankIterator
, typename SuffixIterator
>
332 Integer PrefixDoubler
<Integer
>::Double(
333 RankIterator rank_array
, RankIterator rank_array_end
,
334 SuffixIterator suffix_array
, SuffixIterator suffix_array_end
,
335 Integer order_depth
) const
337 assert(order_depth
> 0);
338 Integer size
= suffix_array_end
- suffix_array
;
339 assert(size
== string_length_
+ 1);
340 assert(rank_array_end
- rank_array
== size
);
341 bool previous_is_finished
= false;
342 Integer previous_finished_first
= 0;
343 Integer first
= 0; // first position in equivalence class or finished group
344 Integer next
= size
; // first position in the next class or group
345 for (first
= 0; first
< size
; first
= next
) {
346 assert(first
<= string_length_
);
348 Integer suffix
= suffix_array
[first
];
349 if (IsFinishedSuffixOrGroup(suffix
)) {
350 if (IsFinishedGroup(suffix
)) {
351 next
= suffix_array
[first
+ 1];
353 assert(IsFinishedSuffix(suffix
));
356 if (previous_is_finished
) {
357 // Combine with a preceding finished suffix or group
358 suffix_array
[previous_finished_first
] = FinishedGroupMarker();
359 suffix_array
[previous_finished_first
+ 1] = next
;
361 previous_is_finished
= true;
362 previous_finished_first
= first
;
365 assert(suffix
<= string_length_
);
367 Integer rank
= rank_array
[suffix
];
368 assert(rank
<= string_length_
);
370 next
= rank
+ 1; // rank is the position of the last suffix
371 // in the equivalence class
372 RefineSubrange(rank_array
, suffix_array
,
373 suffix_array
+ first
, suffix_array
+ next
, order_depth
);
374 previous_is_finished
= false;
376 // guard against infinite loop
377 assert(next
> first
);
379 return order_depth
* 2;
382 // Sort all suffixes by repeated doubling.
383 template <typename Integer
>
384 template <typename RankIterator
, typename SuffixIterator
>
385 void PrefixDoubler
<Integer
>::SortSuffixes(
386 RankIterator rank_array
, RankIterator rank_array_end
,
387 SuffixIterator suffix_array
, SuffixIterator suffix_array_end
,
388 Integer order_depth
) const
390 assert(order_depth
> 0);
391 while (order_depth
< string_length_
) {
392 order_depth
= Double(rank_array
, rank_array_end
,
393 suffix_array
, suffix_array_end
, order_depth
);
397 } // namespace dcsbwt
399 #endif // DCSBWT_PREFIX_DOUBLING_INL_H__