modified: makefile
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / bw_transform.cc
blob2fd54a1300696578b92334a4fcdd9771791cdc68
1 // Copyright 2007 Google Inc.
2 //
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
7 //
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // Three algorithms for computing the Burrows-Wheeler transform.
18 // 1. StringsortBWTransform: Uses a fast implementation of
19 // string quicksort (multikey quicksort) for sorting the suffixes.
20 // 2. PrefixDoublingBWTransform: Uses the Larsson-Sadakane prefix
21 // doubling algorithm.
22 // 3. DifferenceCoverBWTransform: Uses the Karkkainen-Burkhardt
23 // algorithm based on difference cover sampling.
24 // This is the primary (and default) algorithm.
26 // TODO: This code needs to be reorganized and divided into multiple files.
28 #include "difference_cover-inl.h"
29 #include "prefix_doubling-inl.h"
30 #include "stringsort-inl.h"
31 #include "difference_cover.h"
32 #include "prefix_doubling.h"
33 #include "stringsort.h"
34 #include "bw_transform.h"
35 #include "stream.h"
36 #include "inttypes.h"
37 #include <string.h>
38 #include <stdlib.h>
40 #include <iostream>
41 #include <cstdlib> // for strtoll
42 #include <algorithm> // for reverse
44 namespace dcsbwt {
46 extern int verbosity;
48 namespace {
50 template <typename Iterator>
51 class IteratorGreater : public std::binary_function<Iterator, Iterator, bool> {
52 public:
53 IteratorGreater() {}
54 bool operator() (Iterator a, Iterator b) const {
55 return *a > *b;
59 //////////////////////////////////////////////////////////////////
60 // BwtFromSuffixArray computes the BWT from the suffix array and
61 // sends it to output.
62 //////////////////////////////////////////////////////////////////
63 int64 BwtFromSuffixArray(char* block, size_t block_size,
64 uint32* suffix_array, uint32* suffix_array_end,
65 OutStream* output) {
66 unsigned char ch = '\0';
67 uint32 eob_position = 0;
68 OutStreamBuffer buffer;
69 buffer.Connect(output);
70 for (uint32 rank = 0; rank < block_size + 1; ++rank) {
71 uint32 suffix = suffix_array[rank];
72 assert(suffix >= 0);
73 assert(suffix <= block_size);
74 if (suffix != 0) {
75 ch = block[suffix - 1];
76 } else {
77 // This is the end-of-block (eob) character, which cannot be
78 // directly written since it has no code.
79 // Instead, write a copy of the previous character here and
80 // store this position in order to indicate its location.
81 eob_position = rank;
83 buffer.WriteByte(ch);
85 buffer.Disconnect();
86 return eob_position;
90 //////////////////////////////////////////////////////////////////
91 // Compute the suffix cover sample ranks and provide methods
92 // for using it.
93 //////////////////////////////////////////////////////////////////
94 class DifferenceCoverSuffixComparer {
95 public:
96 DifferenceCoverSuffixComparer(char* block, uint32 block_size,
97 uint32 period,
98 uint32* suffix_area, uint32* suffix_area_end)
99 : dcsampler_(period, block_size),
100 dcranks_(dcsampler_.size() + 1) {
101 uint32 num_samples = dcsampler_.size();
102 assert(suffix_area_end - suffix_area >= 2 * num_samples + 1);
103 uint32* sample_suffixes_in = suffix_area_end - num_samples;
104 uint32* sample_suffixes_end = suffix_area + 1 + num_samples;
106 uint32* end = dcsampler_.Fill(suffix_area);
107 assert(end == sample_suffixes_end - 1);
109 // Compute the sizes of the buckets.
110 uint32 bucket_size[0x10000];
111 memset(bucket_size, 0, 0x10000 * sizeof(uint32));
112 // The suffix of length one needs special treatment if it is in the sample.
113 // If it is not, this will remain 0x10000.
114 int bucket_of_length_one_suffix = 0x10000;
115 for (uint32 i = 0; i < num_samples; ++i) {
116 uint32 suffix = suffix_area[i];
117 assert(suffix < block_size);
118 int bucket = static_cast<unsigned char>(block[suffix]) << 8;
119 if (suffix + 1 < block_size)
120 bucket += static_cast<unsigned char>(block[suffix + 1]);
121 else
122 // The suffix of length one is treated as if there was
123 // an extra '\0' at the end of the block.
124 bucket_of_length_one_suffix = bucket;
125 ++bucket_size[bucket];
128 // Compute the bucket positions.
129 uint32 bucket_begin[0x10000];
130 uint32 sum = 0;
131 for (int bucket = 0; bucket < 0x10000; ++bucket) {
132 bucket_begin[bucket] = sum;
133 sum += bucket_size[bucket];
135 assert(sum == num_samples);
137 // Distribute the suffixes to the buckets.
138 if (bucket_of_length_one_suffix != 0x10000) {
139 // The suffix of length one is always the first in its bucket.
140 sample_suffixes_in[bucket_begin[bucket_of_length_one_suffix]++]
141 = block_size - 1;
143 for (uint32 i = 0; i < num_samples; ++i) {
144 uint32 suffix = suffix_area[i];
145 assert(suffix < block_size);
146 if (suffix < block_size - 1) {
147 int bucket = static_cast<unsigned char>(block[suffix]) << 8;
148 bucket += static_cast<unsigned char>(block[suffix + 1]);
149 sample_suffixes_in[bucket_begin[bucket]++] = suffix;
153 uint32* dcranks = &dcranks_[0];
154 uint32* dcranks_end = dcranks + num_samples + 1;
155 // This is the empty suffix needed by prefix doubling.
156 dcranks[num_samples] = 0;
157 suffix_area[0] = num_samples;
159 PrefixDoubler<uint32> doubler(num_samples);
160 RankRecorder rank_recorder(&dcsampler_, dcranks, suffix_area,
161 doubler.FinishedSuffixMarker());
163 // Sort the buckets.
164 end = suffix_area + 1; // Skip the empty suffix.
165 uint32 shift = sample_suffixes_in - end;
166 for (int bucket = 0; bucket < 0x10000; ++bucket) {
167 uint32* begin = end;
168 end += bucket_size[bucket];
169 if (bucket == bucket_of_length_one_suffix) {
170 // The suffix of length one is already in its place
171 // at the beginning of the bucket. Process it separately and
172 // then sort the rest of the bucket. We cannot sort it with
173 // others, because StringsortSuffixes assumes that all suffixes
174 // have at least 2 characters.
175 assert(*(begin + shift) == block_size - 1);
176 *begin = block_size - 1;
177 rank_recorder(begin, begin + 1);
178 ++begin;
180 uint32* bucket_end =
181 StringsortSuffixes(
182 block, block + block_size,
183 begin, begin + shift, end + shift,
184 2, period,
185 rank_recorder);
186 assert(end == bucket_end);
189 // Final sorting by prefix doubling.
190 uint32 period_interval = dcsampler_.PeriodInterval();
191 doubler.SortSuffixes(dcranks, dcranks_end,
192 suffix_area, sample_suffixes_end, period_interval);
195 // This functor can be used for comparing two suffixes in constant time
196 // if they have a common prefix of length period - 1.
197 class Less : public std::binary_function<uint32, uint32, bool> {
198 public:
199 Less(const DifferenceCoverSample<uint32>* dcsample,
200 const uint32* dcranks)
201 : dcsample_(dcsample), dcranks_(dcranks) {}
202 bool operator() (uint32 a, uint32 b) const {
203 uint32 shift = dcsample_->Shift(a, b);
204 uint32 apos = dcsample_->Rank(a + shift);
205 uint32 bpos = dcsample_->Rank(b + shift);
206 return dcranks_[apos] < dcranks_[bpos];
208 private:
209 const DifferenceCoverSample<uint32>* const dcsample_;
210 const uint32* const dcranks_;
212 Less GetLess() const { return Less(&dcsampler_, &dcranks_[0]); }
214 uint32 Size() const { return dcranks_.size(); }
216 private:
217 DifferenceCoverSample<uint32> dcsampler_;
218 std::vector<uint32> dcranks_;
220 class RankRecorder {
221 public:
222 RankRecorder(const DifferenceCoverSample<uint32>* dcsample,
223 uint32* rank_array, uint32* suffix_array,
224 uint32 finished_suffix_marker)
225 : dcsample_(dcsample),
226 rank_array_(rank_array), suffix_array_(suffix_array),
227 finished_suffix_marker_(finished_suffix_marker) {}
228 void operator() (uint32* begin, uint32* end) const {
229 uint32 rank = end - suffix_array_ - 1;
230 if (end - begin == 1) {
231 rank_array_[dcsample_->Rank(*begin)] = rank;
232 *begin = finished_suffix_marker_;
233 } else {
234 for (uint32* it = begin; it != end; ++it) {
235 uint32 pos = dcsample_->Rank(*it);
236 rank_array_[pos] = rank;
237 *it = pos;
241 private:
242 const DifferenceCoverSample<uint32>* const dcsample_;
243 uint32* const rank_array_;
244 const uint32* const suffix_array_;
245 uint32 finished_suffix_marker_;
251 //////////////////////////////////////////////////////////////////
252 // Optimize the order of the alphabet.
253 //////////////////////////////////////////////////////////////////
255 class EdgeGreater : std::binary_function<int, int, bool> {
256 public:
257 EdgeGreater(const uint32* weights) : weights_(weights) {}
258 bool operator() (int a, int b) const {
259 return weights_[a] > weights_[b];
261 private:
262 const uint32* const weights_;
265 uint32 OptimizeAlphabetOrder(const uint32* bucket_size, uint8* char_of_rank) {
267 class Graph {
268 public:
269 Graph(const uint32* weights) : kRemovedEdgeWeight(0xF0FFFFFF) {
270 memcpy(weights_, weights, 0x10000 * sizeof(uint32));
272 int Edge(int i, int j) const {
273 assert(i >= 0);
274 assert(i < 256);
275 assert(j >= 0);
276 assert(i < 256);
277 return (i << 8) + j;
279 int Source(int edge) const {
280 assert(edge >= 0);
281 assert(edge < 0x10000);
282 return edge >> 8;
284 int Target(int edge) const {
285 assert(edge >= 0);
286 assert(edge < 0x10000);
287 return edge & 255;
289 uint32 Weight(int edge) const { return weights_[edge]; }
290 void ReduceWeight(int edge, uint32 weight) {
291 assert(weight >= 0);
292 assert(weight <= weights_[edge]);
293 weights_[edge] -= weight;
295 void Remove(int edge) { weights_[edge] = kRemovedEdgeWeight; }
296 bool Exists(int edge) const {
297 return weights_[edge] != kRemovedEdgeWeight;
300 EdgeGreater GetEdgeGreater() const { return EdgeGreater(weights_); }
301 private:
302 uint32 weights_[0x10000];
303 const uint32 kRemovedEdgeWeight;
306 Graph graph(bucket_size);
308 uint32 total_weight = 0;
309 for (int i = 0; i < 256; ++i) {
310 for (int j = 0; j < 256; ++j) {
311 int edge = graph.Edge(i, j);
312 total_weight += graph.Weight(edge);
316 uint32 commited_weight = 0;
317 uint32 avoided_weight = 0;
319 // Remove self-loops.
320 uint32 one_cycle_weight = 0;
321 for (int i = 0; i < 256; ++i) {
322 int edge = graph.Edge(i, i);
323 one_cycle_weight += graph.Weight(edge);
324 graph.Remove(edge);
326 avoided_weight += one_cycle_weight;
328 // '\0' is always the first character.
329 // Remove edges adjacent to it.
330 for (int i = 1; i < 256; ++i) {
331 int edge = graph.Edge(0, i);
332 commited_weight += graph.Weight(edge);
333 graph.Remove(edge);
334 edge = graph.Edge(i, 0);
335 avoided_weight += graph.Weight(edge);
336 graph.Remove(edge);
339 if (verbosity > 4) {
340 std::clog << " total_weight=" << total_weight
341 << " commited_weight=" << commited_weight
342 << " avoided_weight=" << avoided_weight
343 << " one_cycle_weight=" << one_cycle_weight
344 << std::endl;
347 uint32 two_cycle_weight = 0;
348 for (int i = 1; i < 255; ++i) {
349 for (int j = i + 1; j < 256; ++j) {
350 int edge = graph.Edge(i, j);
351 int reverse_edge = graph.Edge(j, i);
352 uint32 min_weight = std::min(graph.Weight(edge),
353 graph.Weight(reverse_edge));
354 two_cycle_weight += min_weight;
355 graph.ReduceWeight(edge, min_weight);
356 graph.ReduceWeight(reverse_edge, min_weight);
359 commited_weight += two_cycle_weight;
360 avoided_weight += two_cycle_weight;
362 if (verbosity > 4) {
363 std::clog << " commited_weight=" << commited_weight
364 << " avoided_weight=" << avoided_weight
365 << " two_cycle_weight=" << two_cycle_weight
366 << std::endl;
369 std::vector<int> positive_edges;
370 for (int i = 1; i < 256; ++i) {
371 for (int j = 1; j < 256; ++j) {
372 int edge = graph.Edge(i, j);
373 if (graph.Exists(edge) && graph.Weight(edge) > 0)
374 positive_edges.push_back(edge);
377 std::sort(positive_edges.begin(), positive_edges.end(),
378 graph.GetEdgeGreater());
380 uint32 selected_weight = 0;
381 uint32 discarded_weight = 0;
383 // Dynamic transitive closure
384 std::vector<uint8> predecessors[256];
385 std::vector<uint8> successors[256];
386 for (std::vector<int>::iterator it = positive_edges.begin();
387 it != positive_edges.end(); ++it) {
388 int discarded_edge = *it;
389 if (!graph.Exists(discarded_edge)) continue;
390 int source = graph.Target(discarded_edge);
391 int target = graph.Source(discarded_edge);
392 std::vector<uint8> new_target_predecessors;
393 new_target_predecessors.push_back(source);
394 std::vector<uint8> new_source_successors;
395 new_source_successors.push_back(target);
396 std::vector<uint8>::iterator it1, it2;
397 for (it1 = predecessors[source].begin();
398 it1 != predecessors[source].end(); ++it1) {
399 int node = *it1;
400 int edge = graph.Edge(node, target);
401 if (graph.Exists(edge)) {
402 new_target_predecessors.push_back(node);
405 for (it2 = successors[target].begin();
406 it2 != successors[target].end(); ++it2) {
407 int node = *it2;
408 int edge = graph.Edge(source, node);
409 if (graph.Exists(edge)) {
410 new_source_successors.push_back(node);
413 for (it1 = new_target_predecessors.begin();
414 it1 != new_target_predecessors.end(); ++it1) {
415 int node1 = *it1;
416 for (it2 = new_source_successors.begin();
417 it2 != new_source_successors.end(); ++it2) {
418 int node2 = *it2;
419 int selected_edge = graph.Edge(node1, node2);
420 if (graph.Exists(selected_edge)) {
421 int discarded_edge = graph.Edge(node2, node1);
422 selected_weight += graph.Weight(selected_edge);
423 graph.Remove(selected_edge);
424 discarded_weight += graph.Weight(discarded_edge);
425 graph.Remove(discarded_edge);
426 predecessors[node2].push_back(node1);
427 successors[node1].push_back(node2);
433 if (verbosity > 4) {
434 std::clog << " selected_weight=" << commited_weight
435 << " discarded_weight=" << avoided_weight
436 << std::endl;
439 // Recover the node order.
440 uint8 node_weight[256];
441 std::vector<uint8*> node_ptr;
442 for (int i = 1; i < 256; ++i) {
443 node_weight[i] = successors[i].size();
444 node_ptr.push_back(&node_weight[i]);
446 std::sort(node_ptr.begin(), node_ptr.end(),
447 IteratorGreater<uint8*>());
448 char_of_rank[0] = 0;
449 for (int i = 1; i < 256; ++i) {
450 char_of_rank[i] = node_ptr[i-1] - node_weight;
452 if (discarded_weight < selected_weight) {
453 std::reverse(char_of_rank + 1, char_of_rank + 256);
454 std::swap(discarded_weight, selected_weight);
456 commited_weight += selected_weight;
457 avoided_weight += discarded_weight;
459 if (verbosity > 4) {
460 std::clog << " total_weight=" << total_weight
461 << " commited_weight=" << commited_weight
462 << " avoided_weight=" << avoided_weight;
464 assert(total_weight == commited_weight + avoided_weight);
466 return commited_weight;
471 //////////////////////////////////////////////////////////////////
472 // DifferenceCoverBWTransformer
473 //////////////////////////////////////////////////////////////////
475 class DifferenceCoverBWTransformer : public BWTransformer {
476 public:
477 class Options : public BWTransformer::AlgorithmSpecificOptions {
478 public:
479 Options() { Set(std::string("6")); }
480 virtual ~Options() {}
482 virtual bool Set(const std::string& options) {
483 if (0 == options.length()) return true;
484 const char* begin = options.c_str();
485 const char* end = begin + options.length();
486 char* next;
487 int64 log_period = strtoll(begin, &next, 0);
488 if (next != end) return false;
489 if (log_period < kMinLogPeriod || log_period > kMaxLogPeriod)
490 return false;
491 if (verbosity > 4) {
492 std::clog << "Setting DifferenceCoverBWTransform::Options"
493 << " period=" << (1 << log_period) << std::endl;
495 log_period_ = log_period;
496 return true;
498 virtual std::string Get() const {
499 assert(log_period_ >= 0);
500 assert(log_period_ <= kMaxLogPeriod);
501 char buffer[4];
502 sprintf(buffer, "%d", log_period_);
503 std::string options(buffer);
504 return options;
507 virtual int64 MaxSizeInBytes(int64 block_size) const {
508 uint64 period = 1LL << log_period_;
509 uint64 size_per_period = sizeof(char) * period
510 + sizeof(uint32) * (period
511 + DifferenceCover::CoverSizeForPeriod(period));
512 return kMemoryOverhead
513 + ((block_size * size_per_period) / period)
514 + DifferenceCover::SizeInBytesForPeriod(period);
516 virtual int64 MaxBlockSize(int64 memory_budget) const {
517 uint64 period = 1LL << log_period_;
518 uint64 size_per_period = sizeof(char) * period
519 + sizeof(uint32) * (period
520 + DifferenceCover::CoverSizeForPeriod(period));
521 memory_budget -= (kMemoryOverhead
522 + DifferenceCover::SizeInBytesForPeriod(period));
523 return (period * memory_budget) / size_per_period;
525 virtual int64 SuggestedBlockSize(int64 memory_budget) const {
526 return MaxBlockSize(memory_budget);
529 virtual BWTransformer* GetTransformer(int64 memory_budget) {
530 return new DifferenceCoverBWTransformer(1 << log_period_);
532 private:
533 static const int64 kMemoryOverhead = (1 << 20);
534 static const int kMinLogPeriod = 3;
535 static const int kMaxLogPeriod = 15;
536 int log_period_;
539 DifferenceCoverBWTransformer(uint32 period) : period_(period) {}
540 virtual ~DifferenceCoverBWTransformer() {}
542 private:
543 uint32 period_;
545 virtual int64 DoTransform(char* block, size_t block_size,
546 OutStream* output);
548 class DCSorter {
549 public:
550 DCSorter(const DifferenceCoverSuffixComparer::Less& dcless)
551 : dcless_(dcless) {}
552 void operator() (uint32* begin, uint32* end) const {
553 std::sort(begin, end, dcless_);
555 private:
556 DifferenceCoverSuffixComparer::Less dcless_;
559 DifferenceCoverBWTransformer(const DifferenceCoverBWTransformer&);
560 DifferenceCoverBWTransformer& operator=(const DifferenceCoverBWTransformer&);
563 int64 DifferenceCoverBWTransformer::DoTransform(
564 char* block, size_t block_size,
565 OutStream* output)
567 assert(block_size > 0);
568 assert(block_size < 0xFFFFFFFF - period_); // TODO: Check this
569 uint32 num_suffixes = block_size + 1;
571 // Use basic string sorting for small blocks.
572 if (block_size < 1024) {
573 if (verbosity > 2) {
574 std::clog << "DifferenceCoverBWTransformer::DoTransform"
575 << " block_size=" << block_size
576 << " (using plain string sorting)"
577 << std::endl;
579 std::vector<uint32> suffix_array(2 * (num_suffixes));
580 uint32* begin = &suffix_array[0];
581 uint32* end = begin + suffix_array.size();
582 uint32* suffixes_in = end - num_suffixes;
583 for (int i = 0; i < num_suffixes; ++i) suffixes_in[i] = i;
584 NullFinishedGroupReporter<uint32> null_reporter;
585 uint32* suffixes_end =
586 StringsortSuffixes(
587 block, block + block_size,
588 begin, suffixes_in, end,
589 0, block_size,
590 null_reporter);
591 assert(suffixes_end == begin + num_suffixes);
592 return BwtFromSuffixArray(block, block_size,
593 begin, suffixes_end, output);
595 // For short blocks and long periods, we may have to reduce the period.
596 // The period is restored at the end.
597 uint32 saved_period = period_;
598 while (block_size < period_ - 1) period_ >>= 1;
600 if (verbosity > 2) {
601 std::clog << "DifferenceCoverBWTransformer::DoTransform"
602 << " block_size=" << block_size
603 << " period=" << period_
604 << std::endl;
606 assert(block_size >= period_ - 1);
608 // The extra +4 might be needed by DifferenceCoverSuffixComparer
609 // if period_==8.
610 std::vector<uint32> suffix_array(num_suffixes + 4);
611 uint32* suffix_area = &suffix_array[0];
612 uint32* suffix_area_end = suffix_area + suffix_array.size();
614 // Construct the difference cover sample.
615 DifferenceCoverSuffixComparer dcscomp(block, block_size, period_,
616 suffix_area, suffix_area_end);
617 if (verbosity > 3) {
618 std::clog << "Using difference cover sample of size " << dcscomp.Size()
619 << std::endl;
622 // Compute the sizes of the buckets.
623 if (verbosity > 4) {
624 std::clog << "Computing bucket sizes" << std::endl;
626 uint32 bucket_size[0x10000];
627 memset(bucket_size, 0, 0x10000*sizeof(uint32));
628 int bucket = static_cast<unsigned char>(block[0]) << 8;
629 for (uint32 i = 1; i < block_size; ++i) {
630 bucket += static_cast<unsigned char>(block[i]);
631 ++bucket_size[bucket];
632 bucket = (bucket << 8) & 0xFF00;
634 ++bucket_size[bucket]; // suffix of length one
635 ++bucket_size[0]; // empty suffix
637 if (verbosity > 4) {
638 std::clog << "Optimizing alphabet order" << std::endl;
640 unsigned char char_of_rank[256];
641 uint32 num_suffixes_to_sort
642 = OptimizeAlphabetOrder(bucket_size, char_of_rank);
643 uint8 rank_of_char[256];
644 for (int i = 0; i < 256; ++i) {
645 rank_of_char[char_of_rank[i]] = i;
648 // Compute the positions of the selected buckets.
649 if (verbosity > 4) {
650 std::clog << "Computing selected bucket positions" << std::endl;
652 uint32 selected_bucket_begin[0x10000];
653 uint32 num_selected_suffixes = 0;
654 for (int ch1 = 0; ch1 < 256; ++ch1) {
655 for (int ch2 = 0; ch2 < 256; ++ch2) {
656 if (rank_of_char[ch1] < rank_of_char[ch2]) {
657 int bucket = (ch1 << 8) + ch2;
658 selected_bucket_begin[bucket] = num_selected_suffixes;
659 num_selected_suffixes += bucket_size[bucket];
663 assert(num_selected_suffixes == num_suffixes_to_sort);
665 uint32* suffixes_end = suffix_area + num_suffixes;
666 uint32* selected_suffixes = suffix_area_end - num_selected_suffixes;
668 // Distribute the suffixes to the selected buckets.
669 if (verbosity > 4) {
670 std::clog << "Distributing into selected buckets" << std::endl;
672 int ch1 = static_cast<unsigned char>(block[0]);
673 for (uint32 i = 1; i < block_size; ++i) {
674 int ch2 = static_cast<unsigned char>(block[i]);
675 if (rank_of_char[ch1] < rank_of_char[ch2]) {
676 bucket = (ch1 << 8) + ch2;
677 selected_suffixes[selected_bucket_begin[bucket]++] = i - 1;
679 ch1 = ch2;
682 // Compute the positions of all buckets.
683 uint32 bucket_begin[0x10001];
684 bucket_begin[0] = 0;
685 uint32 sum = bucket_size[0];
686 for (int bucket = 1; bucket < 0x10000; ++bucket) {
687 bucket_begin[bucket] = sum;
688 sum += bucket_size[bucket];
690 assert(sum == num_suffixes);
691 bucket_size[0x10000] = sum;
693 // Sort the selected buckets.
694 if (verbosity > 3) {
695 std::clog << "Stringsorting " << num_suffixes_to_sort
696 << " out of " << num_suffixes << " suffixes"
697 << std::endl;
699 DCSorter dcsorter(dcscomp.GetLess());
700 uint32* source_end = selected_suffixes;
701 for (int ch1 = 0; ch1 < 256; ++ch1) {
702 for (int ch2 = 0; ch2 < 256; ++ch2) {
703 if (rank_of_char[ch1] < rank_of_char[ch2]) {
704 int bucket = (ch1 << 8) + ch2;
705 uint32* source_begin = source_end;
706 source_end += bucket_size[bucket];
707 uint32* target_begin = suffix_area + bucket_begin[bucket];
708 assert(target_begin <= source_begin);
709 uint32* target_end =
710 StringsortSuffixes(
711 block, block + block_size,
712 target_begin, source_begin, source_end,
713 2, period_ - 1,
714 dcsorter);
715 assert(target_end == target_begin + bucket_size[bucket]);
720 // Move other suffixes into their final positions.
721 if (verbosity > 4) {
722 std::clog << "Setting other suffixes" << std::endl;
725 suffix_area[bucket_begin[0]] = block_size;
726 for (int i = 0; i < 256; ++i) {
727 int chi = char_of_rank[i];
728 uint32 sub_bucket_begin[256];
729 uint32 sub_bucket_end[256];
730 for (int ch = 0; ch < 256; ++ch) {
731 int bucket = (ch << 8) + chi;
732 sub_bucket_begin[ch] = bucket_begin[bucket];
733 sub_bucket_end[ch] = bucket_begin[bucket] + bucket_size[bucket];
735 if (chi == 0) ++sub_bucket_begin[0];
736 uint32 rank = bucket_begin[chi << 8];
737 while (rank < sub_bucket_begin[chi]) {
738 uint32 suffix = suffix_area[rank++];
739 if (suffix > 0) {
740 int ch = static_cast<unsigned char>(block[suffix - 1]);
741 if (rank_of_char[ch] >= rank_of_char[chi])
742 suffix_area[sub_bucket_begin[ch]++] = suffix - 1;
745 rank = bucket_begin[(chi + 1) << 8];
746 while (rank > sub_bucket_end[chi]) {
747 uint32 suffix = suffix_area[--rank];
748 if (suffix > 0) {
749 int ch = static_cast<unsigned char>(block[suffix - 1]);
750 if (rank_of_char[ch] >= rank_of_char[chi])
751 suffix_area[--sub_bucket_end[ch]] = suffix - 1;
756 period_ = saved_period;
757 return BwtFromSuffixArray(block, block_size,
758 suffix_area, suffixes_end, output);
764 //////////////////////////////////////////////////////////////////
765 // StringsortBWTransformer
766 //////////////////////////////////////////////////////////////////
768 class StringsortBWTransformer : public BWTransformer {
769 public:
770 class Options : public BWTransformer::AlgorithmSpecificOptions {
771 public:
772 Options() {}
773 virtual ~Options() {}
775 virtual bool Set(const std::string& options) {
776 return 0 == options.length();
778 virtual std::string Get() const { return std::string(); }
779 virtual int64 MaxSizeInBytes(int64 block_size) const {
780 return 9 * block_size;
782 virtual int64 MaxBlockSize(int64 memory_budget) const {
783 return memory_budget / 9;
785 virtual int64 SuggestedBlockSize(int64 memory_budget) const {
786 return memory_budget / 9;
789 virtual BWTransformer* GetTransformer(int64 memory_budget) {
790 return new StringsortBWTransformer;
794 StringsortBWTransformer() {}
795 virtual ~StringsortBWTransformer() {}
797 private:
798 virtual int64 DoTransform(char* block, size_t block_size,
799 OutStream* output);
801 StringsortBWTransformer(const StringsortBWTransformer&);
802 StringsortBWTransformer& operator=(const StringsortBWTransformer&);
806 int64 StringsortBWTransformer::DoTransform(
807 char* block, size_t block_size,
808 OutStream* output)
810 assert(block_size > 0);
811 uint32 num_suffixes = block_size + 1;
813 // Compute the sizes of the buckets.
814 uint32 bucket_size[0x10000];
815 memset(bucket_size, 0, 0x10000*sizeof(uint32));
816 int ch = static_cast<unsigned char>(block[0]);
817 int bucket = (ch << 8);
818 for (uint32 i = 1; i < block_size; ++i) {
819 bucket += static_cast<unsigned char>(block[i]);
820 ++bucket_size[bucket];
821 bucket = (bucket << 8) & 0xFFFF;
823 int bucket_of_length_one_suffix = bucket;
824 ++bucket_size[bucket]; // suffix of length one
825 ++bucket_size[0]; // empty suffix
827 // Compute the bucket positions.
828 uint32 bucket_begin[0x10000];
829 bucket_begin[0] = 0;
830 uint32 sum = bucket_size[0];
831 bucket_begin[1] = sum;
832 for (int bucket = 2; bucket < 0x10000; ++bucket) {
833 sum += bucket_size[bucket-1];
834 bucket_begin[bucket] = sum;
836 assert(bucket_begin[0xFFFF] + bucket_size[0xFFFF] == num_suffixes);
838 std::vector<uint32> suffix_array(2 * num_suffixes);
839 uint32* suffix_area = &suffix_array[0];
840 uint32* suffixes_end = suffix_area + num_suffixes;
841 uint32* suffix_area_end = suffix_area + suffix_array.size();
842 uint32* suffixes_in = suffix_area_end - num_suffixes;
844 // Distribute the suffixes to the buckets.
845 suffixes_in[bucket_begin[0]++] = block_size;
846 suffixes_in[bucket_begin[bucket_of_length_one_suffix]++] = block_size - 1;
847 ch = static_cast<unsigned char>(block[0]);
848 bucket = (ch << 8);
849 for (uint32 i = 1; i < block_size; ++i) {
850 bucket += static_cast<unsigned char>(block[i]);
851 suffixes_in[bucket_begin[bucket]++] = i - 1;
852 bucket = (bucket << 8) & 0xFFFF;
855 // Sort the buckets.
856 NullFinishedGroupReporter<uint32> null_reporter;
857 uint32 shift = suffixes_in - suffix_area;
858 uint32* begin = suffix_area;
859 uint32* end = suffix_area + 1;
860 *begin = block_size;
861 --bucket_size[0];
862 for (int bucket = 0; bucket < 0x10000; ++bucket) {
863 begin = end;
864 end += bucket_size[bucket];
865 if (bucket == bucket_of_length_one_suffix) {
866 *begin = block_size - 1;
867 ++begin;
869 uint32* bucket_end =
870 StringsortSuffixes(
871 block, block + block_size,
872 begin, begin + shift, end + shift,
873 2, block_size,
874 null_reporter);
875 assert(end == bucket_end);
878 return BwtFromSuffixArray(block, block_size,
879 suffix_area, suffixes_end, output);
883 //////////////////////////////////////////////////////////////////
884 // PrefixDoublingBWTransformer
885 //////////////////////////////////////////////////////////////////
887 class PrefixDoublingBWTransformer : public BWTransformer {
888 public:
889 class Options : public BWTransformer::AlgorithmSpecificOptions {
890 public:
891 Options() : stringsort_depth_(6) {}
892 virtual ~Options() {}
894 virtual bool Set(const std::string& options) {
895 const char* begin = options.c_str();
896 const char* end = begin + options.length();
897 int64 stringsort_depth = stringsort_depth_;
898 while (begin != end) {
899 switch (*begin++) {
900 case 'd':
901 char* next;
902 stringsort_depth = strtoll(begin, &next, 0);
903 begin = next;
904 if (stringsort_depth < 2) return false;
905 break;
906 default:
907 return false;
910 stringsort_depth_ = stringsort_depth;
911 return true;
913 virtual std::string Get() const {
914 char buffer[12];
915 std::string options("d");
916 sprintf(buffer, "%u", stringsort_depth_);
917 options += buffer;
918 return options;
920 virtual int64 MaxSizeInBytes(int64 block_size) const {
921 int64 input_factor = sizeof(char);
922 int64 rank_array_factor = sizeof(uint32);
923 int64 maximum_suffix_array_factor = 2 * sizeof(uint32);
924 int64 maximum_factor = input_factor + rank_array_factor
925 + maximum_suffix_array_factor; // maximum_factor = 9
926 return maximum_factor * block_size + kMemoryOverhead;
928 virtual int64 MaxBlockSize(int64 memory_budget) const {
929 int64 input_factor = sizeof(char);
930 int64 rank_array_factor = sizeof(uint32);
931 int64 minimum_suffix_array_factor = sizeof(uint32);
932 int64 minimum_factor = input_factor + rank_array_factor
933 + minimum_suffix_array_factor; // minimum_factor = 9
934 return (memory_budget - kMemoryOverhead) / minimum_factor;
936 virtual int64 SuggestedBlockSize(int64 memory_budget) const {
937 int64 input_factor = sizeof(char);
938 int64 rank_array_factor = sizeof(uint32);
939 int64 suggested_suffix_array_factor = (5 * sizeof(uint32))/4;
940 int64 suggested_factor = input_factor + rank_array_factor
941 + suggested_suffix_array_factor; // suggested_factor = 10
942 return (memory_budget - kMemoryOverhead) / suggested_factor;
945 virtual BWTransformer* GetTransformer(int64 memory_budget) {
946 int64 linear_memory_budget =
947 memory_budget - kMemoryOverhead;
948 return new PrefixDoublingBWTransformer(stringsort_depth_,
949 linear_memory_budget);
951 private:
952 static const int64 kMemoryOverhead = (1 << 20);
953 uint32 stringsort_depth_;
956 PrefixDoublingBWTransformer(uint32 stringsort_depth,
957 int64 linear_memory_budget)
958 : stringsort_depth_(stringsort_depth),
959 linear_memory_budget_(linear_memory_budget) {}
960 virtual ~PrefixDoublingBWTransformer() {}
962 private:
963 uint32 stringsort_depth_;
964 int64 linear_memory_budget_;
966 virtual int64 DoTransform(char* block, size_t block_size,
967 OutStream* output);
969 class RankRecorder {
970 public:
971 RankRecorder(uint32* rank_array, uint32* suffix_array)
972 : rank_array_(rank_array), suffix_array_(suffix_array) {}
973 void operator() (uint32* begin, uint32* end) const {
974 uint32 rank = end - suffix_array_ - 1;
975 for (uint32* it = begin; it != end; ++it) {
976 rank_array_[*it] = rank;
979 private:
980 uint32* const rank_array_;
981 const uint32* const suffix_array_;
984 PrefixDoublingBWTransformer(const PrefixDoublingBWTransformer&);
985 PrefixDoublingBWTransformer& operator=(const PrefixDoublingBWTransformer&);
988 int64 PrefixDoublingBWTransformer::DoTransform(char* block, size_t block_size,
989 OutStream* output) {
990 uint32 num_suffixes = block_size + 1;
991 assert(num_suffixes > block_size);
992 int64 suffix_array_extra_space
993 = (linear_memory_budget_ - sizeof(char) * block_size
994 - 2 * sizeof(uint32) * num_suffixes) / sizeof(uint32);
995 assert(suffix_array_extra_space >= 0);
996 if (suffix_array_extra_space > num_suffixes)
997 suffix_array_extra_space = num_suffixes;
998 if (verbosity > 1) {
999 std::clog << "Prefix doubling BW transform"
1000 << " block_size=" << block_size
1001 << " stringsort_depth=" << stringsort_depth_
1002 << " suffix_array_extra_space="
1003 << suffix_array_extra_space;
1005 // Phase 1: Bucketsort by first two characters.
1006 // Compute the sizes of the buckets.
1007 uint32 bucket_size[0x10000];
1008 memset(bucket_size, 0, 0x10000*sizeof(uint32));
1009 int ch = static_cast<unsigned char>(block[0]);
1010 int bucket = (ch << 8);
1011 for (uint32 i = 1; i < block_size; ++i) {
1012 bucket += static_cast<unsigned char>(block[i]);
1013 ++bucket_size[bucket];
1014 bucket = (bucket << 8) & 0xFFFF;
1016 int bucket_of_length_one_suffix = bucket;
1017 ++bucket_size[bucket]; // suffix of length one
1018 ++bucket_size[0]; // empty suffix
1020 // Compute the bucket positions.
1021 uint32 bucket_begin[0x10000];
1022 bucket_begin[0] = 0;
1023 uint32 sum = bucket_size[0];
1024 bucket_begin[1] = sum;
1025 for (int bucket = 2; bucket < 0x10000; ++bucket) {
1026 sum += bucket_size[bucket-1];
1027 bucket_begin[bucket] = sum;
1029 assert(bucket_begin[0xFFFF] + bucket_size[0xFFFF] == num_suffixes);
1031 // Setup suffix array.
1032 std::vector<uint32> suffix_array(num_suffixes + suffix_array_extra_space);
1033 uint32* suffix_area = &suffix_array[0];
1034 uint32* suffixes_end = suffix_area + num_suffixes;
1035 uint32* suffix_area_end = suffix_area + suffix_array.size();
1036 uint32* suffixes_in = suffix_area_end - num_suffixes;
1038 // Distribute the suffixes to the buckets (at the end of suffix area).
1039 suffixes_in[bucket_begin[0]++] = block_size;
1040 suffixes_in[bucket_begin[bucket_of_length_one_suffix]++] = block_size - 1;
1041 ch = static_cast<unsigned char>(block[0]);
1042 bucket = (ch << 8);
1043 for (uint32 i = 1; i < block_size; ++i) {
1044 bucket += static_cast<unsigned char>(block[i]);
1045 suffixes_in[bucket_begin[bucket]++] = i - 1;
1046 bucket = (bucket << 8) & 0xFFFF;
1049 // Phase 2: Stringsort by stringsort_depth_ first characters.
1050 // During sorting, set up the rank array using rank_recorder.
1051 // rank_recorder must be called for every bucket resulting
1052 // from Phase 2, including buckets of size one. For the buckets
1053 // with suffixes of length 0 and 1, it is done directly here.
1054 // For others, it is done in StringsortSuffixes. Therefore,
1055 // StringsortSuffixes must be called for every Phase 1 bucket,
1056 // even if it does no sorting (bucket_size==1 or stringsort_depth_==2).
1057 std::vector<uint32> rank_array(num_suffixes);
1058 RankRecorder rank_recorder(&rank_array[0], suffix_area);
1059 uint32 shift = suffixes_in - suffix_area;
1060 uint32* begin = suffix_area;
1061 uint32* end = suffix_area + 1;
1062 *begin = block_size;
1063 rank_recorder(begin, begin + 1);
1064 --bucket_size[0];
1065 for (int bucket = 0; bucket < 0x10000; ++bucket) {
1066 begin = end;
1067 end += bucket_size[bucket];
1068 if (bucket == bucket_of_length_one_suffix) {
1069 *begin = block_size - 1;
1070 rank_recorder(begin, begin + 1);
1071 ++begin;
1073 uint32* bucket_end =
1074 StringsortSuffixes(
1075 block, block + block_size,
1076 begin, begin + shift, end + shift,
1077 2, stringsort_depth_,
1078 rank_recorder);
1079 assert(end == bucket_end);
1082 // Phase 3: Complete sorting by prefix doubling.
1083 PrefixDoubler<uint32> prefix_doubler(block_size);
1084 prefix_doubler.SortSuffixes(rank_array.begin(), rank_array.end(),
1085 suffix_area, suffixes_end,
1086 stringsort_depth_);
1087 prefix_doubler.ComputeFinalSuffixArray(
1088 rank_array.begin(), rank_array.end(),
1089 suffix_area, suffixes_end);
1090 return BwtFromSuffixArray(block, block_size,
1091 suffix_area, suffixes_end, output);
1094 } // namespace
1096 //////////////////////////////////////////////////////////////////
1097 // BWTransformer
1098 //////////////////////////////////////////////////////////////////
1100 bool BWTransformer::Options::Set(const std::string& options_string) {
1101 if (options_string.length() < 1) return false;
1102 char algorithm = options_string[0];
1103 AlgorithmSpecificOptions* options;
1104 switch (algorithm) {
1105 case 'd':
1106 options = new DifferenceCoverBWTransformer::Options;
1107 break;
1108 case 's':
1109 options = new StringsortBWTransformer::Options;
1110 break;
1111 case 'p':
1112 options = new PrefixDoublingBWTransformer::Options;
1113 break;
1114 default:
1115 return false;
1117 std::string algorithm_specific_options_string(options_string, 1);
1118 bool options_are_valid = options->Set(algorithm_specific_options_string);
1119 if (options_are_valid) {
1120 algorithm_id_ = algorithm;
1121 if (algorithm_specific_options_) delete algorithm_specific_options_;
1122 algorithm_specific_options_ = options;
1123 } else {
1124 delete options;
1126 return options_are_valid;
1130 int64 BWTransformer::Transform(char* block, size_t block_size,
1131 OutStream* output,
1132 Options options, int64 memory_budget) {
1133 assert(block_size <= options.MaxBlockSize(memory_budget));
1134 std::reverse(block, block + block_size);
1135 BWTransformer* algorithm = options.GetAlgorithm(memory_budget);
1136 int64 eob_position = algorithm->DoTransform(block, block_size,
1137 output);
1138 delete algorithm;
1139 std::reverse(block, block + block_size);
1140 return eob_position;
1143 } // namespace dcsbwt