modified: makefile
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / rl_compress.cc
blob2493f6f0705254fc90ef498be2f7ebd353670727
1 // Copyright 2007 Google Inc.
2 //
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
7 //
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 #include "rl_compress.h"
18 #include "stream.h"
19 #include "inttypes.h"
21 #include <iomanip>
22 #include <iostream>
23 #include <string>
25 namespace dcsbwt {
27 extern int statistics;
29 //////////////////////////////////////////////////////////////////
30 // BitEncoder and BitDecoder maintain a range [low_,high_] (of uint32's).
31 // The next four bytes of the compressed output/input represent
32 // some value in this range.
33 // For each bit (en/de)coded, the range is split into two parts
34 // with the sizes proportional to the probabilities of 1-bit and 0-bit.
35 // BitEncoder chooses one the two subranges as the new range [low_,high_].
36 // based on the actual value of the bit.
37 // BitDecoder determines the bit by finding out, which subrange
38 // contains the value represented by the next four input bytes.
39 // Whenever low_ and high_ share the most significant byte,
40 // the range is expanded by the factor 256 shifting the shared byte out.
41 // BitEncoder emits the shared byte. BitDecoder shifts the shared byte
42 // out of the next four input bytes that it is holding and reads in
43 // a new input byte.
44 //////////////////////////////////////////////////////////////////
46 namespace {
47 // Split a range [low,high] into [low,split] and [split+1,high]
48 // proportional to the probability and its complement.
49 uint32 Split(uint32 low, uint32 high, uint32 probability) {
50 assert(probability <= kProbabilityScale);
51 assert(low < high);
52 // range_size is high-low-1 rather than high-low+1 to ensure that
53 // neither subrange is empty, even when probability is 0 and 1.
54 uint32 range_size = high - low - 1;
55 // split = low + round(range_size * p)
56 // = low + floor(range_size * p + .5)
57 // where p is the probability as a real value.
58 uint32 high_bits = range_size >> kLogProbabilityScale;
59 uint32 low_bits = range_size & (kProbabilityScale - 1);
60 uint32 half = kProbabilityScale >> 1;
61 uint32 split = low + high_bits * probability
62 + ((low_bits * probability + half) >> kLogProbabilityScale);
63 assert(split >= low);
64 assert(split < high);
65 return split;
67 } // namespace
69 void BitEncoder::Encode(bool bit, Probability probability_of_one) {
70 if (verbosity > 7) {
71 std::clog << "Encoding bit " << int(bit) << " with probability "
72 << (bit ? probability_of_one :
73 kProbabilityScale - probability_of_one)
74 << std::endl;
76 uint32 split = Split(low_, high_, probability_of_one);
77 // Choose the subrange.
78 if (bit) high_ = split; else low_ = split + 1;
79 while (((low_ ^ high_) & 0xFF000000) == 0) {
80 EmitByte(low_ >> 24);
81 low_ <<= 8;
82 high_ = (high_ << 8) + 255;
84 assert(low_ < high_);
86 void BitEncoder::Finish() {
87 // Emit 4 bytes representing any value in [low_,high_]
88 // These are needed for BitDecoder lookahead.
89 EmitByte(low_ >> 24);
90 EmitByte(255);
91 EmitByte(255);
92 EmitByte(255);
93 // Prepare to encode another sequence.
94 low_ = 0;
95 high_ = 0xFFFFFFFF;
98 void BitDecoder::Start() {
99 low_ = 0;
100 high_ = 0xFFFFFFFF;
101 next_ = ReadByte();
102 next_ = (next_ << 8) + ReadByte();
103 next_ = (next_ << 8) + ReadByte();
104 next_ = (next_ << 8) + ReadByte();
106 bool BitDecoder::Decode(Probability probability_of_one) {
107 uint32 split = Split(low_, high_, probability_of_one);
108 bool bit = (next_ <= split);
109 if (bit) high_ = split; else low_ = split + 1;
110 while (((low_ ^ high_) & 0xFF000000) == 0) {
111 low_ <<= 8;
112 high_ = (high_ << 8) + 255;
113 next_ = (next_ << 8) + ReadByte();
115 assert(low_ < high_);
116 assert(next_ >= low_);
117 assert(next_ <= high_);
118 if (verbosity > 7) {
119 std::clog << "Decoded bit " << int(bit) << " with probability "
120 << (bit ? probability_of_one :
121 kProbabilityScale - probability_of_one)
122 << std::endl;
124 return bit;
127 //////////////////////////////////////////////////////////////////
128 // BitPredictor
130 // Note: The restriction min_probability < 2^update_delay
131 // comes from the requirement that the expressions (in Update)
132 // kProbabilityScale - *probability + rounding_correction_
133 // *probability + rounding_correction_
134 // need to be non-negative. This is because the result of
135 // the shift (>> update_delay_) for negative values
136 // is implementation-defined (5.8/3).
137 //////////////////////////////////////////////////////////////////
138 void BitPredictor::SetParameters(int update_delay,
139 Probability min_probability) {
140 assert(update_delay >= 0);
141 assert(update_delay <= kLogProbabilityScale);
142 assert(min_probability >= 0);
143 assert(min_probability < (1 << update_delay));
144 update_delay_ = update_delay;
145 rounding_correction_ = (1 << update_delay) - 1 - min_probability;
146 assert(rounding_correction_ >= 0);
147 assert(rounding_correction_ < 1 << update_delay);
149 void BitPredictor::Update(Probability* probability, bool bit) const {
150 assert(*probability >= 0);
151 assert(*probability <= kProbabilityScale);
152 if (bit)
153 *probability += ((kProbabilityScale - *probability + rounding_correction_)
154 >> update_delay_);
155 else
156 *probability -= ((*probability + rounding_correction_) >> update_delay_);
157 assert(*probability >= 0);
158 assert(*probability <= kProbabilityScale);
162 //////////////////////////////////////////////////////////////////
163 // BitHistory
164 //////////////////////////////////////////////////////////////////
165 namespace {
166 // BitHistory state transitions.
167 // Only 1-transitions of 0-dominant states are given.
168 // 0-transitions of 0-dominant states always go to the previous state.
169 // The transitions of 1-dominant states are symmetric.
170 const BitHistory::State bit_history_transition_1[1] = {1};
171 const BitHistory::State bit_history_transition_2[2] = {2, 2};
172 const BitHistory::State bit_history_transition_3[4] = {3, 4, 4, 4};
173 const BitHistory::State bit_history_transition_4[8] = {6, 6, 6, 7, 7, 7, 8, 8};
174 const BitHistory::State bit_history_transition_5[16] = {
175 11, 12, 12, 12, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16
177 const BitHistory::State* bit_history_transitions[6] = {
178 NULL,
179 bit_history_transition_1,
180 bit_history_transition_2,
181 bit_history_transition_3,
182 bit_history_transition_4,
183 bit_history_transition_5
186 // BitHistory initial probabilities.
187 const Probability bit_history_probability_0[1] = {2048};
188 const Probability bit_history_probability_1[1] = {1358};
189 const Probability bit_history_probability_2[2] = {768, 1408};
190 const Probability bit_history_probability_3[4] = {636, 864, 1196, 1675};
191 const Probability bit_history_probability_4[8] = {
192 391, 469, 572, 704, 876, 1100, 1391, 1768
194 const Probability bit_history_probability_5[16] = {
195 214, 233, 256, 284, 318, 360, 411, 474,
196 550, 642, 755, 893, 1061, 1266, 1516, 1821
198 const Probability* bit_history_probabilities[6] = {
199 bit_history_probability_0,
200 bit_history_probability_1,
201 bit_history_probability_2,
202 bit_history_probability_3,
203 bit_history_probability_4,
204 bit_history_probability_5
208 void BitHistory::SetHistoryLength(int history_length) {
209 assert(history_length >= 0);
210 assert(history_length <= 5);
211 history_length_ = history_length;
212 int num_states = (1 << history_length);
213 initial_probabilities_ = bit_history_probabilities[history_length];
214 const State* transitions = bit_history_transitions[history_length];
215 for (int i = 0; i < (num_states >> 1); ++i) {
216 zero_transition_[i] = i - 1;
217 one_transition_[i] = transitions[i];
218 zero_transition_[num_states - i - 1] = num_states - transitions[i] - 1;
219 one_transition_[num_states - i - 1] = num_states - i;
221 zero_transition_[0] = 0;
222 one_transition_[num_states - 1] = num_states - 1;
223 for (int i = 0; i < num_states; ++i) {
224 assert(int(zero_transition_[i]) < num_states);
225 assert(int(one_transition_[i]) < num_states);
228 void BitHistory::UpdateState(State* state, bool bit) const {
229 assert(int(*state) < (1 << history_length_));
230 if (bit) *state = one_transition_[*state];
231 else *state = zero_transition_[*state];
232 assert(int(*state) < (1 << history_length_));
235 void BitHistory::SetPredictorParameters(int update_delay,
236 Probability min_probability) {
237 predictor_.SetParameters(update_delay, min_probability);
238 int neighbor_update_delay = std::min(kLogProbabilityScale, update_delay + 1);
239 neighbor_predictor_.SetParameters(neighbor_update_delay, min_probability);
241 void BitHistory::SetInitialProbabilities(Probability* probability_vector,
242 int adjustment) const {
243 int num_states = (1 << history_length_);
244 for (int state = 0; state < num_states; ++ state) {
245 if (state < (num_states >> 1)) {
246 probability_vector[state] = initial_probabilities_[state];
247 } else {
248 probability_vector[state]
249 = initial_probabilities_[num_states - state - 1];
251 while (adjustment > 0) {
252 predictor_.Update(&probability_vector[state], true);
253 --adjustment;
255 while (adjustment < 0) {
256 predictor_.Update(&probability_vector[state], false);
257 ++adjustment;
261 void BitHistory::UpdateProbability(Probability* probability_vector,
262 State state, bool bit) const {
263 int num_states = (1 << history_length_);
264 //assert(int(state) >= 0);
265 assert(int(state) < num_states);
266 predictor_.Update(&probability_vector[state], bit);
267 if (history_length_ > 2) {
268 if (state > 0)
269 neighbor_predictor_.Update(&probability_vector[state - 1], bit);
270 if (state < num_states - 1)
271 neighbor_predictor_.Update(&probability_vector[state + 1], bit);
275 //////////////////////////////////////////////////////////////////
276 // BitSequencePredictorSet
277 //////////////////////////////////////////////////////////////////
278 BitSequencePredictorSet::BitSequencePredictorSet(
279 int num_sequences, int history_length,
280 int update_delay, Probability min_probability)
281 : num_sequences_(num_sequences),
282 history_(history_length, update_delay, min_probability),
283 history_states_(num_sequences),
284 probabilities_(num_sequences << history_length) {
285 for (int sequence = 0; sequence < num_sequences; ++sequence) {
286 history_states_[sequence] = history_.InitialState();
287 history_.SetInitialProbabilities(
288 &probabilities_[sequence << history_length]);
291 uint32 BitSequencePredictorSet::ProbabilityOfOne(int sequence) const {
292 assert(sequence >= 0);
293 assert(sequence <= num_sequences_);
294 BitHistory::State state = history_states_[sequence];
295 int history_length = history_.HistoryLength();
296 int position = (sequence << history_length) + state;
297 assert(position >= 0);
298 assert(position < probabilities_.size());
299 return probabilities_[position];
301 void BitSequencePredictorSet::Update(int sequence, bool bit) {
302 assert(sequence >= 0);
303 assert(sequence < num_sequences_);
304 BitHistory::State* state = &history_states_[sequence];
305 int history_length = history_.HistoryLength();
306 history_.UpdateProbability(&probabilities_[sequence << history_length],
307 *state, bit);
308 history_.UpdateState(state, bit);
312 //////////////////////////////////////////////////////////////////
313 // ByteCompressor and ByteDecompressor
314 //////////////////////////////////////////////////////////////////
315 namespace {
316 // Each context is turned into a unique number by prepending a 1-bit.
317 // For example 001 -> 1001=9 and 01 -> 101=5
318 // These functions extract the bit and the context
319 // given a byte and a bit position.
320 // They are used outside ByteCompressor and ByteDecompressor, too,
321 // in CharCoder, for example.
322 int GetBit(int byte, int bit_position) {
323 return (byte >> bit_position) & 1;
325 int BitContext(int byte, int bit_position) {
326 return (byte | 256) >> (bit_position + 1);
329 // Transform a string representation of BitHistory options
330 // into an array of integers.
331 bool ParseBitHistoryOptions(const std::string& options,
332 int* parameters) {
333 if (options.length() != 3) return false;
334 int history_length = options[0] - '0';
335 int update_delay = options[1] - '0';
336 int log_min_probability = options[2] - '0';
337 if (verbosity > 4) {
338 std::clog << "Parsing BitHistory options"
339 << " history_length=" << history_length
340 << " update_delay=" << update_delay
341 << " log_min_probability=" << log_min_probability
342 << std::endl;
344 if (history_length < 0 || history_length > 9) return false;
345 if (update_delay < 0 || update_delay > 9) return false;
346 if (log_min_probability < 0 || log_min_probability > update_delay)
347 return false;
348 parameters[0] = history_length;
349 parameters[1] = update_delay;
350 parameters[2] = log_min_probability;
351 return true;
355 bool ByteCompressor::Options::Set(const std::string& options) {
356 std::string opts = options;
357 if (opts.length() == 0) opts = "344";
358 return ParseBitHistoryOptions(opts, parameters_);
360 std::string ByteCompressor::Options::Get() const {
361 std::string options;
362 for (int i = 0; i < 3; ++i) {
363 assert(parameters_[i] >= 0);
364 assert(parameters_[i] <= 9);
365 options += static_cast<char>('0' + parameters_[i]);
367 return options;
369 void ByteCompressor::EncodeByte(unsigned char byte) {
370 if (verbosity > 6) {
371 std::clog << "ByteCompressor: encoding byte " << int(byte) << std::endl;
373 for (int i = 7; i >= 0; --i) {
374 int bit = GetBit(byte, i);
375 int context = BitContext(byte, i);
376 encoder_.Encode(bit, predictor_.ProbabilityOfOne(context));
377 predictor_.Update(context, bit);
381 StreamDecompressor* ByteDecompressor::Create(const std::string& options) {
382 int parameters[3];
383 bool ok = ParseBitHistoryOptions(options, parameters);
384 if (!ok) return NULL;
385 return new ByteDecompressor(parameters);
387 unsigned char ByteDecompressor::DecodeByte() {
388 int context = 1;
389 for (int i = 7; i >= 0; --i) {
390 bool bit = decoder_.Decode(predictor_.ProbabilityOfOne(context));
391 predictor_.Update(context, bit);
392 context = (context << 1) + bit;
394 unsigned char byte = context & 255;
395 if (verbosity > 6) {
396 std::clog << "ByteDecompressor: decoded byte " << int(byte) << std::endl;
398 return byte;
401 //////////////////////////////////////////////////////////////////
402 // CharCoder
403 //////////////////////////////////////////////////////////////////
404 namespace {
405 // Get a bit if in given context. The return value is one of:
406 // 0: same context, bit is 0
407 // 1: same context, bit is 1
408 // 2: different context
409 static int ByteContext(int byte, int bit_position, int other_context) {
410 int context = BitContext(byte, bit_position);
411 return context == other_context ? GetBit(byte, bit_position) : 2;
415 CharCoder::CharCoder(const int* parameters)
416 : byte_context_length_(parameters[0]),
417 history_(parameters[1], parameters[2],
418 (1 << parameters[3]) - 1),
419 history_states_(256),
420 probabilities_(256 << (history_.HistoryLength()
421 + (2 * byte_context_length_))),
422 encoder_(NULL), decoder_(NULL) {
423 if (verbosity > 3) {
424 std::clog << "CharCoder byte_context_length=" << byte_context_length_
425 << " history_length=" << history_.HistoryLength()
426 << " update_delay=" << parameters[2]
427 << " min_probability=" << ((1 << parameters[3]) - 1)
428 << std::endl;
430 for (int i = 0; i < 3; ++i) { recent_bytes_[i] = '\0'; }
431 for (int bit_context = 0; bit_context < 256; ++bit_context) {
432 history_states_[bit_context] = history_.InitialState();
433 int byte_context_begin = bit_context << (2 * byte_context_length_);
434 int byte_context_end = (bit_context + 1) << (2 * byte_context_length_);
435 for (int byte_context = byte_context_begin;
436 byte_context < byte_context_end; ++byte_context) {
437 history_.SetInitialProbabilities(
438 &probabilities_[byte_context << history_.HistoryLength()]);
443 void CharCoder::Code(unsigned char* byte) {
444 int history_length = history_.HistoryLength();
445 int bit_context = 1;
446 for (int i = 7; i >= 0; --i) {
447 int byte_context = 0;
448 for (int j = 0; j < byte_context_length_; ++j) {
449 byte_context <<= 2;
450 byte_context += ByteContext(recent_bytes_[j], i, bit_context);
452 int full_context
453 = (bit_context << (byte_context_length_ << 1)) + byte_context;
454 BitHistory::State state = history_states_[bit_context];
455 int position = (full_context << history_length) + state;
456 Probability probability = probabilities_[position];
457 bool bit;
458 if (encoder_ != NULL) {
459 bit = GetBit(*byte, i);
460 encoder_->Encode(bit, probability);
461 } else {
462 bit = decoder_->Decode(probability);
464 history_.UpdateProbability(
465 &probabilities_[full_context << history_length], state, bit);
466 history_.UpdateState(&history_states_[BitContext(recent_bytes_[0], i)],
467 GetBit(recent_bytes_[0], i));
468 bit_context = (bit_context << 1) + bit;
470 *byte = (bit_context & 255);
471 // Update the last three distinct bytes.
472 // This relies on the post-run-length-encoding property
473 // that consecutive bytes are always different.
474 // Thus we always have the last two bytes but the third byte
475 // might be more distant.
476 if (*byte != recent_bytes_[1]) {
477 recent_bytes_[2] = recent_bytes_[1];
479 recent_bytes_[1] = recent_bytes_[0];
480 recent_bytes_[0] = *byte;
483 //////////////////////////////////////////////////////////////////
484 // RunLengthCoder
485 //////////////////////////////////////////////////////////////////
486 namespace {
487 // Return the number of context when the nunber of bits
488 // is smaller or equal to num_bits.
489 // This is simply 1+2+3+...+num_bits
490 int SignificantBitContexts(int num_bits) {
491 return (num_bits * (num_bits + 1)) >> 1;
494 RunLengthCoder::RunLengthCoder(int log_max_run_length, const int* parameters)
495 : max_run_length_((1LL << (log_max_run_length + 1)) - 1),
496 first_bit_predictor_(256, parameters[0], parameters[1],
497 (1 << parameters[2]) - 1),
498 unary_part_predictor_(log_max_run_length, parameters[3], parameters[4],
499 (1 << parameters[5]) - 1),
500 significant_bits_predictor_(
501 SignificantBitContexts(log_max_run_length),
502 parameters[6], parameters[7], (1 << parameters[8]) - 1),
503 encoder_(NULL), decoder_(NULL) {
504 assert(log_max_run_length <= 62);
505 if (verbosity > 3) {
506 std::clog << "RunlengthCoder max_run_length=" << max_run_length_
507 << std::endl;
508 std::clog << "RunlengthCoder first_bit_predictor:"
509 << " history_length=" << parameters[0]
510 << " update_delay=" << parameters[1]
511 << " min_probability=" << ((1 << parameters[2]) - 1)
512 << std::endl;
513 std::clog << "RunlengthCoder unary_part_predictor:"
514 << " history_length=" << parameters[3]
515 << " update_delay=" << parameters[4]
516 << " min_probability=" << ((1 << parameters[5]) - 1)
517 << std::endl;
518 std::clog << "RunlengthCoder significant_bits_predictor:"
519 << " history_length=" << parameters[6]
520 << " update_delay=" << parameters[7]
521 << " min_probability=" << ((1 << parameters[8]) - 1)
522 << std::endl;
526 void RunLengthCoder::Code(unsigned char byte, int64* run_length) {
527 assert(*run_length <= max_run_length_);
528 bool bit;
530 // Code the first bit
531 Probability probability = first_bit_predictor_.ProbabilityOfOne(byte);
532 if (encoder_ != NULL) {
533 bit = (1 < *run_length);
534 encoder_->Encode(bit, probability);
535 } else {
536 assert(decoder_ != NULL);
537 bit = decoder_->Decode(probability);
539 first_bit_predictor_.Update(byte, bit);
541 // Code the rest of the unary part
542 int bits_after_leading_one = 0;
543 while(bit) {
544 ++bits_after_leading_one;
545 probability
546 = unary_part_predictor_.ProbabilityOfOne(bits_after_leading_one);
547 if (encoder_ != NULL) {
548 bit = ((*run_length >> bits_after_leading_one) > 1);
549 encoder_->Encode(bit, probability);
550 } else {
551 assert(decoder_ != NULL);
552 bit = decoder_->Decode(probability);
554 unary_part_predictor_.Update(bits_after_leading_one, bit);
557 // Code the significant bits
558 int context_begin = SignificantBitContexts(bits_after_leading_one - 1);
559 int64 length = 1;
560 for (int i = bits_after_leading_one - 1; i >= 0; --i) {
561 int context = context_begin + i;
562 probability = significant_bits_predictor_.ProbabilityOfOne(context);
563 if (encoder_ != NULL) {
564 bit = (*run_length >> i) & 1;
565 encoder_->Encode(bit, probability);
566 } else {
567 assert(decoder_ != NULL);
568 bit = decoder_->Decode(probability);
570 significant_bits_predictor_.Update(context, bit);
571 length = (length << 1) + bit;
573 *run_length = length;
576 //////////////////////////////////////////////////////////////////
577 // RunLengthCompressor
578 //////////////////////////////////////////////////////////////////
579 namespace{
580 bool ParseRunLengthCompressorOptions(const std::string& options,
581 int* parameters) {
582 if (options.length() != 13) return false;
583 int params[13];
584 // CharCoder byte context length
585 int byte_context_length = options[0] - '0';
586 if (verbosity > 4) {
587 std::clog << "Parsing CharCoder options"
588 << " byte_context_length=" << byte_context_length
589 << std::endl;
591 if (byte_context_length < 0 || byte_context_length > 9) return false;
592 params[0] = byte_context_length;
593 bool ok = true;
594 // CharCoder BitHistory option
595 ok &= ParseBitHistoryOptions(std::string(options, 1, 3), params + 1);
596 // RunLengthCoder BitHistory options
597 if (verbosity > 4) {
598 std::clog << "Parsing RunlengthCoder options" << std::endl;
600 ok &= ParseBitHistoryOptions(std::string(options, 4, 3), params + 4);
601 ok &= ParseBitHistoryOptions(std::string(options, 7, 3), params + 7);
602 ok &= ParseBitHistoryOptions(std::string(options, 10, 3), params + 10);
603 if (ok) memcpy(parameters, params, 13 * sizeof(int));
604 return ok;
608 bool RunLengthCompressor::Options::Set(const std::string& options) {
609 std::string opts = options;
610 if (options.length() == 0) opts = "2344255466366";
611 if (verbosity > 4) {
612 std::clog << "Setting RunLengthCompressor::Options=" << opts << std::endl;
614 return ParseRunLengthCompressorOptions(opts, parameters_);
617 std::string RunLengthCompressor::Options::Get() const {
618 std::string options;
619 for (int i = 0; i < kNumParameters; ++i) {
620 assert(parameters_[i] >= 0);
621 assert(parameters_[i] <= 9);
622 options += static_cast<char>('0' + parameters_[i]);
624 return options;
627 void RunLengthCompressor::Write(const char* bytes, size_t n) {
628 if (n == 0) return;
629 int64 run_length = current_run_length_;
630 unsigned char ch = current_char_;
631 if (run_length == 0) {
632 ch = *bytes++;
633 --n;
634 run_length = 1;
636 for (; n; --n) {
637 unsigned char next = *bytes++;
638 if (next != ch) {
639 EncodeRun(ch, run_length);
640 ch = next;
641 run_length = 1;
642 } else {
643 ++run_length;
646 current_run_length_ = run_length;
647 current_char_ = ch;
649 void RunLengthCompressor::EncodeRun(unsigned char byte, int64 run_length) {
650 if (verbosity > 6) {
651 std::clog << "Encoding: "
652 <<"char=" << int(byte) << " run_length=" << run_length
653 << std::endl;
655 if (statistics > 0) statistics_collector.RecordRun(byte, run_length);
657 char_coder_.Encode(byte);
658 run_length_coder_.Encode(byte, run_length);
661 //////////////////////////////////////////////////////////////////
662 // RunLengthDecompressor
663 //////////////////////////////////////////////////////////////////
664 StreamDecompressor* RunLengthDecompressor::Create(const std::string& options) {
665 if (options.length() != kNumParameters) return NULL;
666 int parameters[kNumParameters];
667 if (verbosity > 2) {
668 std::clog << "RunLengthDecompressor options=" << options << std::endl;
670 bool ok = ParseRunLengthCompressorOptions(options, parameters);
671 if (!ok) return NULL;
672 return new RunLengthDecompressor(parameters);
674 void RunLengthDecompressor::Read(char* bytes, size_t n) {
675 if (n == 0) return;
676 int64 run_length = current_run_length_;
677 unsigned char ch = current_char_;
678 for (; n; --n) {
679 if (run_length == 0) {
680 DecodeRun(&ch, &run_length);
681 assert(run_length > 0);
683 *bytes++ = ch;
684 --run_length;
686 current_run_length_ = run_length;
687 current_char_ = ch;
689 void RunLengthDecompressor::DecodeRun(unsigned char* byte, int64* run_length) {
690 *byte = char_coder_.Decode();
691 *run_length = run_length_coder_.Decode(*byte);
692 if (verbosity > 6) {
693 std::clog << "Decoded: "
694 <<"char=" << int(*byte) << " run_length=" << *run_length
695 << std::endl;
699 //////////////////////////////////////////////////////////////////
700 // RunLengthCompressor::StatisticsCollector
701 //////////////////////////////////////////////////////////////////
702 void RunLengthCompressor::StatisticsCollector::StartEncoding() {
703 if (statistics <= 0) return;
704 memset(num_chars_, 0, 256 * sizeof(int64));
705 memset(num_runs_by_char_, 0, 256 * sizeof(int64));
706 memset(num_short_runs_by_length_, 0, kLongRunLength * sizeof(int64));
707 memset(num_long_runs_by_log_length_, 0,
708 (64 - kLogLongRunLength) * sizeof(int64));
710 void RunLengthCompressor::StatisticsCollector::EndEncoding() {
711 if (statistics <= 0) return;
712 if (statistics > 1) std::clog << "FULL BWT STATISTICS" << std::endl;
714 int num_distinct_chars = 0;
715 int64 num_runs = 0;
716 int64 total_length = 0;
717 for (int i = 0; i < 256; ++i) {
718 total_length += num_chars_[i];
719 num_runs += num_runs_by_char_[i];
720 if (num_runs > 0) ++num_distinct_chars;
723 for (int i = 0; i < 256; ++i) {
724 if (num_chars_[i] > 0) {
725 double average_length = (1.0 * num_chars_[i]) / num_runs_by_char_[i];
726 double percentage = (100.0 * num_chars_[i]) / total_length;
727 if (statistics > 1)
728 std::clog << "Char " << i << ": "
729 << num_runs_by_char_[i]
730 << " runs of total length " << num_chars_[i]
731 << std::setprecision(3)
732 << " (" << percentage << "%)"
733 << " and average length " << average_length
734 << std::endl;
738 int64 num_short_runs = 0;
739 int64 total_length_of_short_runs = 0;
740 int64 num_short_runs_by_log_length[kLogLongRunLength];
741 memset(num_short_runs_by_log_length, 0, kLogLongRunLength * sizeof(int64));
742 for (int length = 1; length < kLongRunLength; ++length) {
743 int64 count = num_short_runs_by_length_[length];
744 if (count > 0) {
745 double percentage = (length * count * 100.0) / total_length;
746 if (statistics > 1)
747 std::clog << count << " runs of length " << length
748 << std::setprecision(3)
749 << " (" << percentage << "% of total length)"
750 << std::endl;
751 num_short_runs += count;
752 total_length_of_short_runs += count * length;
753 int log_length = 0;
754 int len = length;
755 while (len >>= 1) ++log_length;
756 num_short_runs_by_log_length[log_length] += count;
760 for (int log_length = 0; log_length < kLogLongRunLength; ++log_length) {
761 int64 count = num_short_runs_by_log_length[log_length];
762 if (count > 0) {
763 if (statistics > 1)
764 std::clog << count << " runs with length in ["
765 << (1 << log_length) << "," << (1 << (log_length + 1)) - 1
766 << "]" << std::endl;
770 for (int log_length = kLogLongRunLength; log_length < 64; ++log_length) {
771 int64 count = num_long_runs_by_log_length_[log_length - kLogLongRunLength];
772 if (count > 0) {
773 if (statistics > 1)
774 std::clog << count << " runs with length in ["
775 << (1 << log_length) << "," << (1 << (log_length + 1)) - 1
776 << "]" << std::endl;
777 int log_length = kLogLongRunLength;
778 count >>= kLogLongRunLength;
779 while (count >>= 1) ++log_length;
780 ++num_short_runs_by_log_length[log_length];
784 std::clog << "SUMMARY BWT STATISTICS" << std::endl;
785 std::clog << num_distinct_chars << " distinct characters" << std::endl;
786 double average_length = (1.0 * total_length) / num_runs;
787 std::clog << num_runs << " runs"
788 << " of average length " << average_length << std::endl;
789 double average_length_of_short_runs =
790 (1.0 * total_length_of_short_runs) / num_short_runs;
791 std::clog << num_short_runs << " runs shorter than " << kLongRunLength
792 << " of total length " << total_length_of_short_runs
793 << " and average length " << average_length_of_short_runs
794 << std::endl;
795 int64 num_long_runs = num_runs - num_short_runs;
796 int64 total_length_of_long_runs = total_length - total_length_of_short_runs;
797 double average_length_of_long_runs =
798 (1.0 * total_length_of_long_runs) / num_long_runs;
799 std::clog << num_long_runs << " longer runs"
800 << " of total length " << total_length_of_long_runs
801 << " and average length " << average_length_of_long_runs
802 << std::endl;
804 void RunLengthCompressor::StatisticsCollector::StartBlock() {
805 // TODO: Record the statistics at the start of a block.
807 void RunLengthCompressor::StatisticsCollector::EndBlock() {
808 // TODO: Compute block statistics by subtracting the statistics
809 // at the block start from the current statistics and
810 // print them.
812 void RunLengthCompressor::StatisticsCollector::RecordRun(
813 unsigned char byte, int64 run_length) {
814 if (statistics <= 0) return;
815 num_chars_[byte] += run_length;
816 ++num_runs_by_char_[byte];
817 if (run_length < kLongRunLength) {
818 ++num_short_runs_by_length_[run_length];
819 } else {
820 int log_run_length = kLogLongRunLength;
821 run_length >>= kLogLongRunLength;
822 while (run_length >>= 1) ++log_run_length;
823 ++num_long_runs_by_log_length_[log_run_length - kLogLongRunLength];
827 } // namespace dcsbwt