1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // Three algorithms for computing the Burrows-Wheeler transform.
18 // 1. StringsortBWTransform: Uses a fast implementation of
19 // string quicksort (multikey quicksort) for sorting the suffixes.
20 // 2. PrefixDoublingBWTransform: Uses the Larsson-Sadakane prefix
21 // doubling algorithm.
22 // 3. DifferenceCoverBWTransform: Uses the Karkkainen-Burkhardt
23 // algorithm based on difference cover sampling.
24 // This is the primary (and default) algorithm.
26 // TODO: This code needs to be reorganized and divided into multiple files.
28 #include "difference_cover-inl.h"
29 #include "prefix_doubling-inl.h"
30 #include "stringsort-inl.h"
31 #include "difference_cover.h"
32 #include "prefix_doubling.h"
33 #include "stringsort.h"
34 #include "bw_transform.h"
41 #include <cstdlib> // for strtoll
42 #include <algorithm> // for reverse
50 template <typename Iterator
>
51 class IteratorGreater
: public std::binary_function
<Iterator
, Iterator
, bool> {
54 bool operator() (Iterator a
, Iterator b
) const {
59 //////////////////////////////////////////////////////////////////
60 // BwtFromSuffixArray computes the BWT from the suffix array and
61 // sends it to output.
62 //////////////////////////////////////////////////////////////////
63 int64
BwtFromSuffixArray(char* block
, size_t block_size
,
64 uint32
* suffix_array
, uint32
* suffix_array_end
,
66 unsigned char ch
= '\0';
67 uint32 eob_position
= 0;
68 OutStreamBuffer buffer
;
69 buffer
.Connect(output
);
70 for (uint32 rank
= 0; rank
< block_size
+ 1; ++rank
) {
71 uint32 suffix
= suffix_array
[rank
];
73 assert(suffix
<= block_size
);
75 ch
= block
[suffix
- 1];
77 // This is the end-of-block (eob) character, which cannot be
78 // directly written since it has no code.
79 // Instead, write a copy of the previous character here and
80 // store this position in order to indicate its location.
90 //////////////////////////////////////////////////////////////////
91 // Compute the suffix cover sample ranks and provide methods
93 //////////////////////////////////////////////////////////////////
94 class DifferenceCoverSuffixComparer
{
96 DifferenceCoverSuffixComparer(char* block
, uint32 block_size
,
98 uint32
* suffix_area
, uint32
* suffix_area_end
)
99 : dcsampler_(period
, block_size
),
100 dcranks_(dcsampler_
.size() + 1) {
101 uint32 num_samples
= dcsampler_
.size();
102 assert(suffix_area_end
- suffix_area
>= 2 * num_samples
+ 1);
103 uint32
* sample_suffixes_in
= suffix_area_end
- num_samples
;
104 uint32
* sample_suffixes_end
= suffix_area
+ 1 + num_samples
;
106 uint32
* end
= dcsampler_
.Fill(suffix_area
);
107 assert(end
== sample_suffixes_end
- 1);
109 // Compute the sizes of the buckets.
110 uint32 bucket_size
[0x10000];
111 memset(bucket_size
, 0, 0x10000 * sizeof(uint32
));
112 // The suffix of length one needs special treatment if it is in the sample.
113 // If it is not, this will remain 0x10000.
114 int bucket_of_length_one_suffix
= 0x10000;
115 for (uint32 i
= 0; i
< num_samples
; ++i
) {
116 uint32 suffix
= suffix_area
[i
];
117 assert(suffix
< block_size
);
118 int bucket
= static_cast<unsigned char>(block
[suffix
]) << 8;
119 if (suffix
+ 1 < block_size
)
120 bucket
+= static_cast<unsigned char>(block
[suffix
+ 1]);
122 // The suffix of length one is treated as if there was
123 // an extra '\0' at the end of the block.
124 bucket_of_length_one_suffix
= bucket
;
125 ++bucket_size
[bucket
];
128 // Compute the bucket positions.
129 uint32 bucket_begin
[0x10000];
131 for (int bucket
= 0; bucket
< 0x10000; ++bucket
) {
132 bucket_begin
[bucket
] = sum
;
133 sum
+= bucket_size
[bucket
];
135 assert(sum
== num_samples
);
137 // Distribute the suffixes to the buckets.
138 if (bucket_of_length_one_suffix
!= 0x10000) {
139 // The suffix of length one is always the first in its bucket.
140 sample_suffixes_in
[bucket_begin
[bucket_of_length_one_suffix
]++]
143 for (uint32 i
= 0; i
< num_samples
; ++i
) {
144 uint32 suffix
= suffix_area
[i
];
145 assert(suffix
< block_size
);
146 if (suffix
< block_size
- 1) {
147 int bucket
= static_cast<unsigned char>(block
[suffix
]) << 8;
148 bucket
+= static_cast<unsigned char>(block
[suffix
+ 1]);
149 sample_suffixes_in
[bucket_begin
[bucket
]++] = suffix
;
153 uint32
* dcranks
= &dcranks_
[0];
154 uint32
* dcranks_end
= dcranks
+ num_samples
+ 1;
155 // This is the empty suffix needed by prefix doubling.
156 dcranks
[num_samples
] = 0;
157 suffix_area
[0] = num_samples
;
159 PrefixDoubler
<uint32
> doubler(num_samples
);
160 RankRecorder
rank_recorder(&dcsampler_
, dcranks
, suffix_area
,
161 doubler
.FinishedSuffixMarker());
164 end
= suffix_area
+ 1; // Skip the empty suffix.
165 uint32 shift
= sample_suffixes_in
- end
;
166 for (int bucket
= 0; bucket
< 0x10000; ++bucket
) {
168 end
+= bucket_size
[bucket
];
169 if (bucket
== bucket_of_length_one_suffix
) {
170 // The suffix of length one is already in its place
171 // at the beginning of the bucket. Process it separately and
172 // then sort the rest of the bucket. We cannot sort it with
173 // others, because StringsortSuffixes assumes that all suffixes
174 // have at least 2 characters.
175 assert(*(begin
+ shift
) == block_size
- 1);
176 *begin
= block_size
- 1;
177 rank_recorder(begin
, begin
+ 1);
182 block
, block
+ block_size
,
183 begin
, begin
+ shift
, end
+ shift
,
186 assert(end
== bucket_end
);
189 // Final sorting by prefix doubling.
190 uint32 period_interval
= dcsampler_
.PeriodInterval();
191 doubler
.SortSuffixes(dcranks
, dcranks_end
,
192 suffix_area
, sample_suffixes_end
, period_interval
);
195 // This functor can be used for comparing two suffixes in constant time
196 // if they have a common prefix of length period - 1.
197 class Less
: public std::binary_function
<uint32
, uint32
, bool> {
199 Less(const DifferenceCoverSample
<uint32
>* dcsample
,
200 const uint32
* dcranks
)
201 : dcsample_(dcsample
), dcranks_(dcranks
) {}
202 bool operator() (uint32 a
, uint32 b
) const {
203 uint32 shift
= dcsample_
->Shift(a
, b
);
204 uint32 apos
= dcsample_
->Rank(a
+ shift
);
205 uint32 bpos
= dcsample_
->Rank(b
+ shift
);
206 return dcranks_
[apos
] < dcranks_
[bpos
];
209 const DifferenceCoverSample
<uint32
>* const dcsample_
;
210 const uint32
* const dcranks_
;
212 Less
GetLess() const { return Less(&dcsampler_
, &dcranks_
[0]); }
214 uint32
Size() const { return dcranks_
.size(); }
217 DifferenceCoverSample
<uint32
> dcsampler_
;
218 std::vector
<uint32
> dcranks_
;
222 RankRecorder(const DifferenceCoverSample
<uint32
>* dcsample
,
223 uint32
* rank_array
, uint32
* suffix_array
,
224 uint32 finished_suffix_marker
)
225 : dcsample_(dcsample
),
226 rank_array_(rank_array
), suffix_array_(suffix_array
),
227 finished_suffix_marker_(finished_suffix_marker
) {}
228 void operator() (uint32
* begin
, uint32
* end
) const {
229 uint32 rank
= end
- suffix_array_
- 1;
230 if (end
- begin
== 1) {
231 rank_array_
[dcsample_
->Rank(*begin
)] = rank
;
232 *begin
= finished_suffix_marker_
;
234 for (uint32
* it
= begin
; it
!= end
; ++it
) {
235 uint32 pos
= dcsample_
->Rank(*it
);
236 rank_array_
[pos
] = rank
;
242 const DifferenceCoverSample
<uint32
>* const dcsample_
;
243 uint32
* const rank_array_
;
244 const uint32
* const suffix_array_
;
245 uint32 finished_suffix_marker_
;
251 //////////////////////////////////////////////////////////////////
252 // Optimize the order of the alphabet.
253 //////////////////////////////////////////////////////////////////
255 class EdgeGreater
: std::binary_function
<int, int, bool> {
257 EdgeGreater(const uint32
* weights
) : weights_(weights
) {}
258 bool operator() (int a
, int b
) const {
259 return weights_
[a
] > weights_
[b
];
262 const uint32
* const weights_
;
265 uint32
OptimizeAlphabetOrder(const uint32
* bucket_size
, uint8
* char_of_rank
) {
269 Graph(const uint32
* weights
) : kRemovedEdgeWeight(0xF0FFFFFF) {
270 memcpy(weights_
, weights
, 0x10000 * sizeof(uint32
));
272 int Edge(int i
, int j
) const {
279 int Source(int edge
) const {
281 assert(edge
< 0x10000);
284 int Target(int edge
) const {
286 assert(edge
< 0x10000);
289 uint32
Weight(int edge
) const { return weights_
[edge
]; }
290 void ReduceWeight(int edge
, uint32 weight
) {
292 assert(weight
<= weights_
[edge
]);
293 weights_
[edge
] -= weight
;
295 void Remove(int edge
) { weights_
[edge
] = kRemovedEdgeWeight
; }
296 bool Exists(int edge
) const {
297 return weights_
[edge
] != kRemovedEdgeWeight
;
300 EdgeGreater
GetEdgeGreater() const { return EdgeGreater(weights_
); }
302 uint32 weights_
[0x10000];
303 const uint32 kRemovedEdgeWeight
;
306 Graph
graph(bucket_size
);
308 uint32 total_weight
= 0;
309 for (int i
= 0; i
< 256; ++i
) {
310 for (int j
= 0; j
< 256; ++j
) {
311 int edge
= graph
.Edge(i
, j
);
312 total_weight
+= graph
.Weight(edge
);
316 uint32 commited_weight
= 0;
317 uint32 avoided_weight
= 0;
319 // Remove self-loops.
320 uint32 one_cycle_weight
= 0;
321 for (int i
= 0; i
< 256; ++i
) {
322 int edge
= graph
.Edge(i
, i
);
323 one_cycle_weight
+= graph
.Weight(edge
);
326 avoided_weight
+= one_cycle_weight
;
328 // '\0' is always the first character.
329 // Remove edges adjacent to it.
330 for (int i
= 1; i
< 256; ++i
) {
331 int edge
= graph
.Edge(0, i
);
332 commited_weight
+= graph
.Weight(edge
);
334 edge
= graph
.Edge(i
, 0);
335 avoided_weight
+= graph
.Weight(edge
);
340 std::clog
<< " total_weight=" << total_weight
341 << " commited_weight=" << commited_weight
342 << " avoided_weight=" << avoided_weight
343 << " one_cycle_weight=" << one_cycle_weight
347 uint32 two_cycle_weight
= 0;
348 for (int i
= 1; i
< 255; ++i
) {
349 for (int j
= i
+ 1; j
< 256; ++j
) {
350 int edge
= graph
.Edge(i
, j
);
351 int reverse_edge
= graph
.Edge(j
, i
);
352 uint32 min_weight
= std::min(graph
.Weight(edge
),
353 graph
.Weight(reverse_edge
));
354 two_cycle_weight
+= min_weight
;
355 graph
.ReduceWeight(edge
, min_weight
);
356 graph
.ReduceWeight(reverse_edge
, min_weight
);
359 commited_weight
+= two_cycle_weight
;
360 avoided_weight
+= two_cycle_weight
;
363 std::clog
<< " commited_weight=" << commited_weight
364 << " avoided_weight=" << avoided_weight
365 << " two_cycle_weight=" << two_cycle_weight
369 std::vector
<int> positive_edges
;
370 for (int i
= 1; i
< 256; ++i
) {
371 for (int j
= 1; j
< 256; ++j
) {
372 int edge
= graph
.Edge(i
, j
);
373 if (graph
.Exists(edge
) && graph
.Weight(edge
) > 0)
374 positive_edges
.push_back(edge
);
377 std::sort(positive_edges
.begin(), positive_edges
.end(),
378 graph
.GetEdgeGreater());
380 uint32 selected_weight
= 0;
381 uint32 discarded_weight
= 0;
383 // Dynamic transitive closure
384 std::vector
<uint8
> predecessors
[256];
385 std::vector
<uint8
> successors
[256];
386 for (std::vector
<int>::iterator it
= positive_edges
.begin();
387 it
!= positive_edges
.end(); ++it
) {
388 int discarded_edge
= *it
;
389 if (!graph
.Exists(discarded_edge
)) continue;
390 int source
= graph
.Target(discarded_edge
);
391 int target
= graph
.Source(discarded_edge
);
392 std::vector
<uint8
> new_target_predecessors
;
393 new_target_predecessors
.push_back(source
);
394 std::vector
<uint8
> new_source_successors
;
395 new_source_successors
.push_back(target
);
396 std::vector
<uint8
>::iterator it1
, it2
;
397 for (it1
= predecessors
[source
].begin();
398 it1
!= predecessors
[source
].end(); ++it1
) {
400 int edge
= graph
.Edge(node
, target
);
401 if (graph
.Exists(edge
)) {
402 new_target_predecessors
.push_back(node
);
405 for (it2
= successors
[target
].begin();
406 it2
!= successors
[target
].end(); ++it2
) {
408 int edge
= graph
.Edge(source
, node
);
409 if (graph
.Exists(edge
)) {
410 new_source_successors
.push_back(node
);
413 for (it1
= new_target_predecessors
.begin();
414 it1
!= new_target_predecessors
.end(); ++it1
) {
416 for (it2
= new_source_successors
.begin();
417 it2
!= new_source_successors
.end(); ++it2
) {
419 int selected_edge
= graph
.Edge(node1
, node2
);
420 if (graph
.Exists(selected_edge
)) {
421 int discarded_edge
= graph
.Edge(node2
, node1
);
422 selected_weight
+= graph
.Weight(selected_edge
);
423 graph
.Remove(selected_edge
);
424 discarded_weight
+= graph
.Weight(discarded_edge
);
425 graph
.Remove(discarded_edge
);
426 predecessors
[node2
].push_back(node1
);
427 successors
[node1
].push_back(node2
);
434 std::clog
<< " selected_weight=" << commited_weight
435 << " discarded_weight=" << avoided_weight
439 // Recover the node order.
440 uint8 node_weight
[256];
441 std::vector
<uint8
*> node_ptr
;
442 for (int i
= 1; i
< 256; ++i
) {
443 node_weight
[i
] = successors
[i
].size();
444 node_ptr
.push_back(&node_weight
[i
]);
446 std::sort(node_ptr
.begin(), node_ptr
.end(),
447 IteratorGreater
<uint8
*>());
449 for (int i
= 1; i
< 256; ++i
) {
450 char_of_rank
[i
] = node_ptr
[i
-1] - node_weight
;
452 if (discarded_weight
< selected_weight
) {
453 std::reverse(char_of_rank
+ 1, char_of_rank
+ 256);
454 std::swap(discarded_weight
, selected_weight
);
456 commited_weight
+= selected_weight
;
457 avoided_weight
+= discarded_weight
;
460 std::clog
<< " total_weight=" << total_weight
461 << " commited_weight=" << commited_weight
462 << " avoided_weight=" << avoided_weight
;
464 assert(total_weight
== commited_weight
+ avoided_weight
);
466 return commited_weight
;
471 //////////////////////////////////////////////////////////////////
472 // DifferenceCoverBWTransformer
473 //////////////////////////////////////////////////////////////////
475 class DifferenceCoverBWTransformer
: public BWTransformer
{
477 class Options
: public BWTransformer::AlgorithmSpecificOptions
{
479 Options() { Set(std::string("6")); }
480 virtual ~Options() {}
482 virtual bool Set(const std::string
& options
) {
483 if (0 == options
.length()) return true;
484 const char* begin
= options
.c_str();
485 const char* end
= begin
+ options
.length();
487 int64 log_period
= strtoll(begin
, &next
, 0);
488 if (next
!= end
) return false;
489 if (log_period
< kMinLogPeriod
|| log_period
> kMaxLogPeriod
)
492 std::clog
<< "Setting DifferenceCoverBWTransform::Options"
493 << " period=" << (1 << log_period
) << std::endl
;
495 log_period_
= log_period
;
498 virtual std::string
Get() const {
499 assert(log_period_
>= 0);
500 assert(log_period_
<= kMaxLogPeriod
);
502 sprintf(buffer
, "%d", log_period_
);
503 std::string
options(buffer
);
507 virtual int64
MaxSizeInBytes(int64 block_size
) const {
508 uint64 period
= 1LL << log_period_
;
509 uint64 size_per_period
= sizeof(char) * period
510 + sizeof(uint32
) * (period
511 + DifferenceCover::CoverSizeForPeriod(period
));
512 return kMemoryOverhead
513 + ((block_size
* size_per_period
) / period
)
514 + DifferenceCover::SizeInBytesForPeriod(period
);
516 virtual int64
MaxBlockSize(int64 memory_budget
) const {
517 uint64 period
= 1LL << log_period_
;
518 uint64 size_per_period
= sizeof(char) * period
519 + sizeof(uint32
) * (period
520 + DifferenceCover::CoverSizeForPeriod(period
));
521 memory_budget
-= (kMemoryOverhead
522 + DifferenceCover::SizeInBytesForPeriod(period
));
523 return (period
* memory_budget
) / size_per_period
;
525 virtual int64
SuggestedBlockSize(int64 memory_budget
) const {
526 return MaxBlockSize(memory_budget
);
529 virtual BWTransformer
* GetTransformer(int64 memory_budget
) {
530 return new DifferenceCoverBWTransformer(1 << log_period_
);
533 static const int64 kMemoryOverhead
= (1 << 20);
534 static const int kMinLogPeriod
= 3;
535 static const int kMaxLogPeriod
= 15;
539 DifferenceCoverBWTransformer(uint32 period
) : period_(period
) {}
540 virtual ~DifferenceCoverBWTransformer() {}
545 virtual int64
DoTransform(char* block
, size_t block_size
,
550 DCSorter(const DifferenceCoverSuffixComparer::Less
& dcless
)
552 void operator() (uint32
* begin
, uint32
* end
) const {
553 std::sort(begin
, end
, dcless_
);
556 DifferenceCoverSuffixComparer::Less dcless_
;
559 DifferenceCoverBWTransformer(const DifferenceCoverBWTransformer
&);
560 DifferenceCoverBWTransformer
& operator=(const DifferenceCoverBWTransformer
&);
563 int64
DifferenceCoverBWTransformer::DoTransform(
564 char* block
, size_t block_size
,
567 assert(block_size
> 0);
568 assert(block_size
< 0xFFFFFFFF - period_
); // TODO: Check this
569 uint32 num_suffixes
= block_size
+ 1;
571 // Use basic string sorting for small blocks.
572 if (block_size
< 1024) {
574 std::clog
<< "DifferenceCoverBWTransformer::DoTransform"
575 << " block_size=" << block_size
576 << " (using plain string sorting)"
579 std::vector
<uint32
> suffix_array(2 * (num_suffixes
));
580 uint32
* begin
= &suffix_array
[0];
581 uint32
* end
= begin
+ suffix_array
.size();
582 uint32
* suffixes_in
= end
- num_suffixes
;
583 for (int i
= 0; i
< num_suffixes
; ++i
) suffixes_in
[i
] = i
;
584 NullFinishedGroupReporter
<uint32
> null_reporter
;
585 uint32
* suffixes_end
=
587 block
, block
+ block_size
,
588 begin
, suffixes_in
, end
,
591 assert(suffixes_end
== begin
+ num_suffixes
);
592 return BwtFromSuffixArray(block
, block_size
,
593 begin
, suffixes_end
, output
);
595 // For short blocks and long periods, we may have to reduce the period.
596 // The period is restored at the end.
597 uint32 saved_period
= period_
;
598 while (block_size
< period_
- 1) period_
>>= 1;
601 std::clog
<< "DifferenceCoverBWTransformer::DoTransform"
602 << " block_size=" << block_size
603 << " period=" << period_
606 assert(block_size
>= period_
- 1);
608 // The extra +4 might be needed by DifferenceCoverSuffixComparer
610 std::vector
<uint32
> suffix_array(num_suffixes
+ 4);
611 uint32
* suffix_area
= &suffix_array
[0];
612 uint32
* suffix_area_end
= suffix_area
+ suffix_array
.size();
614 // Construct the difference cover sample.
615 DifferenceCoverSuffixComparer
dcscomp(block
, block_size
, period_
,
616 suffix_area
, suffix_area_end
);
618 std::clog
<< "Using difference cover sample of size " << dcscomp
.Size()
622 // Compute the sizes of the buckets.
624 std::clog
<< "Computing bucket sizes" << std::endl
;
626 uint32 bucket_size
[0x10000];
627 memset(bucket_size
, 0, 0x10000*sizeof(uint32
));
628 int bucket
= static_cast<unsigned char>(block
[0]) << 8;
629 for (uint32 i
= 1; i
< block_size
; ++i
) {
630 bucket
+= static_cast<unsigned char>(block
[i
]);
631 ++bucket_size
[bucket
];
632 bucket
= (bucket
<< 8) & 0xFF00;
634 ++bucket_size
[bucket
]; // suffix of length one
635 ++bucket_size
[0]; // empty suffix
638 std::clog
<< "Optimizing alphabet order" << std::endl
;
640 unsigned char char_of_rank
[256];
641 uint32 num_suffixes_to_sort
642 = OptimizeAlphabetOrder(bucket_size
, char_of_rank
);
643 uint8 rank_of_char
[256];
644 for (int i
= 0; i
< 256; ++i
) {
645 rank_of_char
[char_of_rank
[i
]] = i
;
648 // Compute the positions of the selected buckets.
650 std::clog
<< "Computing selected bucket positions" << std::endl
;
652 uint32 selected_bucket_begin
[0x10000];
653 uint32 num_selected_suffixes
= 0;
654 for (int ch1
= 0; ch1
< 256; ++ch1
) {
655 for (int ch2
= 0; ch2
< 256; ++ch2
) {
656 if (rank_of_char
[ch1
] < rank_of_char
[ch2
]) {
657 int bucket
= (ch1
<< 8) + ch2
;
658 selected_bucket_begin
[bucket
] = num_selected_suffixes
;
659 num_selected_suffixes
+= bucket_size
[bucket
];
663 assert(num_selected_suffixes
== num_suffixes_to_sort
);
665 uint32
* suffixes_end
= suffix_area
+ num_suffixes
;
666 uint32
* selected_suffixes
= suffix_area_end
- num_selected_suffixes
;
668 // Distribute the suffixes to the selected buckets.
670 std::clog
<< "Distributing into selected buckets" << std::endl
;
672 int ch1
= static_cast<unsigned char>(block
[0]);
673 for (uint32 i
= 1; i
< block_size
; ++i
) {
674 int ch2
= static_cast<unsigned char>(block
[i
]);
675 if (rank_of_char
[ch1
] < rank_of_char
[ch2
]) {
676 bucket
= (ch1
<< 8) + ch2
;
677 selected_suffixes
[selected_bucket_begin
[bucket
]++] = i
- 1;
682 // Compute the positions of all buckets.
683 uint32 bucket_begin
[0x10001];
685 uint32 sum
= bucket_size
[0];
686 for (int bucket
= 1; bucket
< 0x10000; ++bucket
) {
687 bucket_begin
[bucket
] = sum
;
688 sum
+= bucket_size
[bucket
];
690 assert(sum
== num_suffixes
);
691 bucket_size
[0x10000] = sum
;
693 // Sort the selected buckets.
695 std::clog
<< "Stringsorting " << num_suffixes_to_sort
696 << " out of " << num_suffixes
<< " suffixes"
699 DCSorter
dcsorter(dcscomp
.GetLess());
700 uint32
* source_end
= selected_suffixes
;
701 for (int ch1
= 0; ch1
< 256; ++ch1
) {
702 for (int ch2
= 0; ch2
< 256; ++ch2
) {
703 if (rank_of_char
[ch1
] < rank_of_char
[ch2
]) {
704 int bucket
= (ch1
<< 8) + ch2
;
705 uint32
* source_begin
= source_end
;
706 source_end
+= bucket_size
[bucket
];
707 uint32
* target_begin
= suffix_area
+ bucket_begin
[bucket
];
708 assert(target_begin
<= source_begin
);
711 block
, block
+ block_size
,
712 target_begin
, source_begin
, source_end
,
715 assert(target_end
== target_begin
+ bucket_size
[bucket
]);
720 // Move other suffixes into their final positions.
722 std::clog
<< "Setting other suffixes" << std::endl
;
725 suffix_area
[bucket_begin
[0]] = block_size
;
726 for (int i
= 0; i
< 256; ++i
) {
727 int chi
= char_of_rank
[i
];
728 uint32 sub_bucket_begin
[256];
729 uint32 sub_bucket_end
[256];
730 for (int ch
= 0; ch
< 256; ++ch
) {
731 int bucket
= (ch
<< 8) + chi
;
732 sub_bucket_begin
[ch
] = bucket_begin
[bucket
];
733 sub_bucket_end
[ch
] = bucket_begin
[bucket
] + bucket_size
[bucket
];
735 if (chi
== 0) ++sub_bucket_begin
[0];
736 uint32 rank
= bucket_begin
[chi
<< 8];
737 while (rank
< sub_bucket_begin
[chi
]) {
738 uint32 suffix
= suffix_area
[rank
++];
740 int ch
= static_cast<unsigned char>(block
[suffix
- 1]);
741 if (rank_of_char
[ch
] >= rank_of_char
[chi
])
742 suffix_area
[sub_bucket_begin
[ch
]++] = suffix
- 1;
745 rank
= bucket_begin
[(chi
+ 1) << 8];
746 while (rank
> sub_bucket_end
[chi
]) {
747 uint32 suffix
= suffix_area
[--rank
];
749 int ch
= static_cast<unsigned char>(block
[suffix
- 1]);
750 if (rank_of_char
[ch
] >= rank_of_char
[chi
])
751 suffix_area
[--sub_bucket_end
[ch
]] = suffix
- 1;
756 period_
= saved_period
;
757 return BwtFromSuffixArray(block
, block_size
,
758 suffix_area
, suffixes_end
, output
);
764 //////////////////////////////////////////////////////////////////
765 // StringsortBWTransformer
766 //////////////////////////////////////////////////////////////////
768 class StringsortBWTransformer
: public BWTransformer
{
770 class Options
: public BWTransformer::AlgorithmSpecificOptions
{
773 virtual ~Options() {}
775 virtual bool Set(const std::string
& options
) {
776 return 0 == options
.length();
778 virtual std::string
Get() const { return std::string(); }
779 virtual int64
MaxSizeInBytes(int64 block_size
) const {
780 return 9 * block_size
;
782 virtual int64
MaxBlockSize(int64 memory_budget
) const {
783 return memory_budget
/ 9;
785 virtual int64
SuggestedBlockSize(int64 memory_budget
) const {
786 return memory_budget
/ 9;
789 virtual BWTransformer
* GetTransformer(int64 memory_budget
) {
790 return new StringsortBWTransformer
;
794 StringsortBWTransformer() {}
795 virtual ~StringsortBWTransformer() {}
798 virtual int64
DoTransform(char* block
, size_t block_size
,
801 StringsortBWTransformer(const StringsortBWTransformer
&);
802 StringsortBWTransformer
& operator=(const StringsortBWTransformer
&);
806 int64
StringsortBWTransformer::DoTransform(
807 char* block
, size_t block_size
,
810 assert(block_size
> 0);
811 uint32 num_suffixes
= block_size
+ 1;
813 // Compute the sizes of the buckets.
814 uint32 bucket_size
[0x10000];
815 memset(bucket_size
, 0, 0x10000*sizeof(uint32
));
816 int ch
= static_cast<unsigned char>(block
[0]);
817 int bucket
= (ch
<< 8);
818 for (uint32 i
= 1; i
< block_size
; ++i
) {
819 bucket
+= static_cast<unsigned char>(block
[i
]);
820 ++bucket_size
[bucket
];
821 bucket
= (bucket
<< 8) & 0xFFFF;
823 int bucket_of_length_one_suffix
= bucket
;
824 ++bucket_size
[bucket
]; // suffix of length one
825 ++bucket_size
[0]; // empty suffix
827 // Compute the bucket positions.
828 uint32 bucket_begin
[0x10000];
830 uint32 sum
= bucket_size
[0];
831 bucket_begin
[1] = sum
;
832 for (int bucket
= 2; bucket
< 0x10000; ++bucket
) {
833 sum
+= bucket_size
[bucket
-1];
834 bucket_begin
[bucket
] = sum
;
836 assert(bucket_begin
[0xFFFF] + bucket_size
[0xFFFF] == num_suffixes
);
838 std::vector
<uint32
> suffix_array(2 * num_suffixes
);
839 uint32
* suffix_area
= &suffix_array
[0];
840 uint32
* suffixes_end
= suffix_area
+ num_suffixes
;
841 uint32
* suffix_area_end
= suffix_area
+ suffix_array
.size();
842 uint32
* suffixes_in
= suffix_area_end
- num_suffixes
;
844 // Distribute the suffixes to the buckets.
845 suffixes_in
[bucket_begin
[0]++] = block_size
;
846 suffixes_in
[bucket_begin
[bucket_of_length_one_suffix
]++] = block_size
- 1;
847 ch
= static_cast<unsigned char>(block
[0]);
849 for (uint32 i
= 1; i
< block_size
; ++i
) {
850 bucket
+= static_cast<unsigned char>(block
[i
]);
851 suffixes_in
[bucket_begin
[bucket
]++] = i
- 1;
852 bucket
= (bucket
<< 8) & 0xFFFF;
856 NullFinishedGroupReporter
<uint32
> null_reporter
;
857 uint32 shift
= suffixes_in
- suffix_area
;
858 uint32
* begin
= suffix_area
;
859 uint32
* end
= suffix_area
+ 1;
862 for (int bucket
= 0; bucket
< 0x10000; ++bucket
) {
864 end
+= bucket_size
[bucket
];
865 if (bucket
== bucket_of_length_one_suffix
) {
866 *begin
= block_size
- 1;
871 block
, block
+ block_size
,
872 begin
, begin
+ shift
, end
+ shift
,
875 assert(end
== bucket_end
);
878 return BwtFromSuffixArray(block
, block_size
,
879 suffix_area
, suffixes_end
, output
);
883 //////////////////////////////////////////////////////////////////
884 // PrefixDoublingBWTransformer
885 //////////////////////////////////////////////////////////////////
887 class PrefixDoublingBWTransformer
: public BWTransformer
{
889 class Options
: public BWTransformer::AlgorithmSpecificOptions
{
891 Options() : stringsort_depth_(6) {}
892 virtual ~Options() {}
894 virtual bool Set(const std::string
& options
) {
895 const char* begin
= options
.c_str();
896 const char* end
= begin
+ options
.length();
897 int64 stringsort_depth
= stringsort_depth_
;
898 while (begin
!= end
) {
902 stringsort_depth
= strtoll(begin
, &next
, 0);
904 if (stringsort_depth
< 2) return false;
910 stringsort_depth_
= stringsort_depth
;
913 virtual std::string
Get() const {
915 std::string
options("d");
916 sprintf(buffer
, "%u", stringsort_depth_
);
920 virtual int64
MaxSizeInBytes(int64 block_size
) const {
921 int64 input_factor
= sizeof(char);
922 int64 rank_array_factor
= sizeof(uint32
);
923 int64 maximum_suffix_array_factor
= 2 * sizeof(uint32
);
924 int64 maximum_factor
= input_factor
+ rank_array_factor
925 + maximum_suffix_array_factor
; // maximum_factor = 9
926 return maximum_factor
* block_size
+ kMemoryOverhead
;
928 virtual int64
MaxBlockSize(int64 memory_budget
) const {
929 int64 input_factor
= sizeof(char);
930 int64 rank_array_factor
= sizeof(uint32
);
931 int64 minimum_suffix_array_factor
= sizeof(uint32
);
932 int64 minimum_factor
= input_factor
+ rank_array_factor
933 + minimum_suffix_array_factor
; // minimum_factor = 9
934 return (memory_budget
- kMemoryOverhead
) / minimum_factor
;
936 virtual int64
SuggestedBlockSize(int64 memory_budget
) const {
937 int64 input_factor
= sizeof(char);
938 int64 rank_array_factor
= sizeof(uint32
);
939 int64 suggested_suffix_array_factor
= (5 * sizeof(uint32
))/4;
940 int64 suggested_factor
= input_factor
+ rank_array_factor
941 + suggested_suffix_array_factor
; // suggested_factor = 10
942 return (memory_budget
- kMemoryOverhead
) / suggested_factor
;
945 virtual BWTransformer
* GetTransformer(int64 memory_budget
) {
946 int64 linear_memory_budget
=
947 memory_budget
- kMemoryOverhead
;
948 return new PrefixDoublingBWTransformer(stringsort_depth_
,
949 linear_memory_budget
);
952 static const int64 kMemoryOverhead
= (1 << 20);
953 uint32 stringsort_depth_
;
956 PrefixDoublingBWTransformer(uint32 stringsort_depth
,
957 int64 linear_memory_budget
)
958 : stringsort_depth_(stringsort_depth
),
959 linear_memory_budget_(linear_memory_budget
) {}
960 virtual ~PrefixDoublingBWTransformer() {}
963 uint32 stringsort_depth_
;
964 int64 linear_memory_budget_
;
966 virtual int64
DoTransform(char* block
, size_t block_size
,
971 RankRecorder(uint32
* rank_array
, uint32
* suffix_array
)
972 : rank_array_(rank_array
), suffix_array_(suffix_array
) {}
973 void operator() (uint32
* begin
, uint32
* end
) const {
974 uint32 rank
= end
- suffix_array_
- 1;
975 for (uint32
* it
= begin
; it
!= end
; ++it
) {
976 rank_array_
[*it
] = rank
;
980 uint32
* const rank_array_
;
981 const uint32
* const suffix_array_
;
984 PrefixDoublingBWTransformer(const PrefixDoublingBWTransformer
&);
985 PrefixDoublingBWTransformer
& operator=(const PrefixDoublingBWTransformer
&);
988 int64
PrefixDoublingBWTransformer::DoTransform(char* block
, size_t block_size
,
990 uint32 num_suffixes
= block_size
+ 1;
991 assert(num_suffixes
> block_size
);
992 int64 suffix_array_extra_space
993 = (linear_memory_budget_
- sizeof(char) * block_size
994 - 2 * sizeof(uint32
) * num_suffixes
) / sizeof(uint32
);
995 assert(suffix_array_extra_space
>= 0);
996 if (suffix_array_extra_space
> num_suffixes
)
997 suffix_array_extra_space
= num_suffixes
;
999 std::clog
<< "Prefix doubling BW transform"
1000 << " block_size=" << block_size
1001 << " stringsort_depth=" << stringsort_depth_
1002 << " suffix_array_extra_space="
1003 << suffix_array_extra_space
;
1005 // Phase 1: Bucketsort by first two characters.
1006 // Compute the sizes of the buckets.
1007 uint32 bucket_size
[0x10000];
1008 memset(bucket_size
, 0, 0x10000*sizeof(uint32
));
1009 int ch
= static_cast<unsigned char>(block
[0]);
1010 int bucket
= (ch
<< 8);
1011 for (uint32 i
= 1; i
< block_size
; ++i
) {
1012 bucket
+= static_cast<unsigned char>(block
[i
]);
1013 ++bucket_size
[bucket
];
1014 bucket
= (bucket
<< 8) & 0xFFFF;
1016 int bucket_of_length_one_suffix
= bucket
;
1017 ++bucket_size
[bucket
]; // suffix of length one
1018 ++bucket_size
[0]; // empty suffix
1020 // Compute the bucket positions.
1021 uint32 bucket_begin
[0x10000];
1022 bucket_begin
[0] = 0;
1023 uint32 sum
= bucket_size
[0];
1024 bucket_begin
[1] = sum
;
1025 for (int bucket
= 2; bucket
< 0x10000; ++bucket
) {
1026 sum
+= bucket_size
[bucket
-1];
1027 bucket_begin
[bucket
] = sum
;
1029 assert(bucket_begin
[0xFFFF] + bucket_size
[0xFFFF] == num_suffixes
);
1031 // Setup suffix array.
1032 std::vector
<uint32
> suffix_array(num_suffixes
+ suffix_array_extra_space
);
1033 uint32
* suffix_area
= &suffix_array
[0];
1034 uint32
* suffixes_end
= suffix_area
+ num_suffixes
;
1035 uint32
* suffix_area_end
= suffix_area
+ suffix_array
.size();
1036 uint32
* suffixes_in
= suffix_area_end
- num_suffixes
;
1038 // Distribute the suffixes to the buckets (at the end of suffix area).
1039 suffixes_in
[bucket_begin
[0]++] = block_size
;
1040 suffixes_in
[bucket_begin
[bucket_of_length_one_suffix
]++] = block_size
- 1;
1041 ch
= static_cast<unsigned char>(block
[0]);
1043 for (uint32 i
= 1; i
< block_size
; ++i
) {
1044 bucket
+= static_cast<unsigned char>(block
[i
]);
1045 suffixes_in
[bucket_begin
[bucket
]++] = i
- 1;
1046 bucket
= (bucket
<< 8) & 0xFFFF;
1049 // Phase 2: Stringsort by stringsort_depth_ first characters.
1050 // During sorting, set up the rank array using rank_recorder.
1051 // rank_recorder must be called for every bucket resulting
1052 // from Phase 2, including buckets of size one. For the buckets
1053 // with suffixes of length 0 and 1, it is done directly here.
1054 // For others, it is done in StringsortSuffixes. Therefore,
1055 // StringsortSuffixes must be called for every Phase 1 bucket,
1056 // even if it does no sorting (bucket_size==1 or stringsort_depth_==2).
1057 std::vector
<uint32
> rank_array(num_suffixes
);
1058 RankRecorder
rank_recorder(&rank_array
[0], suffix_area
);
1059 uint32 shift
= suffixes_in
- suffix_area
;
1060 uint32
* begin
= suffix_area
;
1061 uint32
* end
= suffix_area
+ 1;
1062 *begin
= block_size
;
1063 rank_recorder(begin
, begin
+ 1);
1065 for (int bucket
= 0; bucket
< 0x10000; ++bucket
) {
1067 end
+= bucket_size
[bucket
];
1068 if (bucket
== bucket_of_length_one_suffix
) {
1069 *begin
= block_size
- 1;
1070 rank_recorder(begin
, begin
+ 1);
1073 uint32
* bucket_end
=
1075 block
, block
+ block_size
,
1076 begin
, begin
+ shift
, end
+ shift
,
1077 2, stringsort_depth_
,
1079 assert(end
== bucket_end
);
1082 // Phase 3: Complete sorting by prefix doubling.
1083 PrefixDoubler
<uint32
> prefix_doubler(block_size
);
1084 prefix_doubler
.SortSuffixes(rank_array
.begin(), rank_array
.end(),
1085 suffix_area
, suffixes_end
,
1087 prefix_doubler
.ComputeFinalSuffixArray(
1088 rank_array
.begin(), rank_array
.end(),
1089 suffix_area
, suffixes_end
);
1090 return BwtFromSuffixArray(block
, block_size
,
1091 suffix_area
, suffixes_end
, output
);
1096 //////////////////////////////////////////////////////////////////
1098 //////////////////////////////////////////////////////////////////
1100 bool BWTransformer::Options::Set(const std::string
& options_string
) {
1101 if (options_string
.length() < 1) return false;
1102 char algorithm
= options_string
[0];
1103 AlgorithmSpecificOptions
* options
;
1104 switch (algorithm
) {
1106 options
= new DifferenceCoverBWTransformer::Options
;
1109 options
= new StringsortBWTransformer::Options
;
1112 options
= new PrefixDoublingBWTransformer::Options
;
1117 std::string
algorithm_specific_options_string(options_string
, 1);
1118 bool options_are_valid
= options
->Set(algorithm_specific_options_string
);
1119 if (options_are_valid
) {
1120 algorithm_id_
= algorithm
;
1121 if (algorithm_specific_options_
) delete algorithm_specific_options_
;
1122 algorithm_specific_options_
= options
;
1126 return options_are_valid
;
1130 int64
BWTransformer::Transform(char* block
, size_t block_size
,
1132 Options options
, int64 memory_budget
) {
1133 assert(block_size
<= options
.MaxBlockSize(memory_budget
));
1134 std::reverse(block
, block
+ block_size
);
1135 BWTransformer
* algorithm
= options
.GetAlgorithm(memory_budget
);
1136 int64 eob_position
= algorithm
->DoTransform(block
, block_size
,
1139 std::reverse(block
, block
+ block_size
);
1140 return eob_position
;
1143 } // namespace dcsbwt