1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 #include "rl_compress.h"
27 extern int statistics
;
29 //////////////////////////////////////////////////////////////////
30 // BitEncoder and BitDecoder maintain a range [low_,high_] (of uint32's).
31 // The next four bytes of the compressed output/input represent
32 // some value in this range.
33 // For each bit (en/de)coded, the range is split into two parts
34 // with the sizes proportional to the probabilities of 1-bit and 0-bit.
35 // BitEncoder chooses one the two subranges as the new range [low_,high_].
36 // based on the actual value of the bit.
37 // BitDecoder determines the bit by finding out, which subrange
38 // contains the value represented by the next four input bytes.
39 // Whenever low_ and high_ share the most significant byte,
40 // the range is expanded by the factor 256 shifting the shared byte out.
41 // BitEncoder emits the shared byte. BitDecoder shifts the shared byte
42 // out of the next four input bytes that it is holding and reads in
44 //////////////////////////////////////////////////////////////////
47 // Split a range [low,high] into [low,split] and [split+1,high]
48 // proportional to the probability and its complement.
49 uint32
Split(uint32 low
, uint32 high
, uint32 probability
) {
50 assert(probability
<= kProbabilityScale
);
52 // range_size is high-low-1 rather than high-low+1 to ensure that
53 // neither subrange is empty, even when probability is 0 and 1.
54 uint32 range_size
= high
- low
- 1;
55 // split = low + round(range_size * p)
56 // = low + floor(range_size * p + .5)
57 // where p is the probability as a real value.
58 uint32 high_bits
= range_size
>> kLogProbabilityScale
;
59 uint32 low_bits
= range_size
& (kProbabilityScale
- 1);
60 uint32 half
= kProbabilityScale
>> 1;
61 uint32 split
= low
+ high_bits
* probability
62 + ((low_bits
* probability
+ half
) >> kLogProbabilityScale
);
69 void BitEncoder::Encode(bool bit
, Probability probability_of_one
) {
71 std::clog
<< "Encoding bit " << int(bit
) << " with probability "
72 << (bit
? probability_of_one
:
73 kProbabilityScale
- probability_of_one
)
76 uint32 split
= Split(low_
, high_
, probability_of_one
);
77 // Choose the subrange.
78 if (bit
) high_
= split
; else low_
= split
+ 1;
79 while (((low_
^ high_
) & 0xFF000000) == 0) {
82 high_
= (high_
<< 8) + 255;
86 void BitEncoder::Finish() {
87 // Emit 4 bytes representing any value in [low_,high_]
88 // These are needed for BitDecoder lookahead.
93 // Prepare to encode another sequence.
98 void BitDecoder::Start() {
102 next_
= (next_
<< 8) + ReadByte();
103 next_
= (next_
<< 8) + ReadByte();
104 next_
= (next_
<< 8) + ReadByte();
106 bool BitDecoder::Decode(Probability probability_of_one
) {
107 uint32 split
= Split(low_
, high_
, probability_of_one
);
108 bool bit
= (next_
<= split
);
109 if (bit
) high_
= split
; else low_
= split
+ 1;
110 while (((low_
^ high_
) & 0xFF000000) == 0) {
112 high_
= (high_
<< 8) + 255;
113 next_
= (next_
<< 8) + ReadByte();
115 assert(low_
< high_
);
116 assert(next_
>= low_
);
117 assert(next_
<= high_
);
119 std::clog
<< "Decoded bit " << int(bit
) << " with probability "
120 << (bit
? probability_of_one
:
121 kProbabilityScale
- probability_of_one
)
127 //////////////////////////////////////////////////////////////////
130 // Note: The restriction min_probability < 2^update_delay
131 // comes from the requirement that the expressions (in Update)
132 // kProbabilityScale - *probability + rounding_correction_
133 // *probability + rounding_correction_
134 // need to be non-negative. This is because the result of
135 // the shift (>> update_delay_) for negative values
136 // is implementation-defined (5.8/3).
137 //////////////////////////////////////////////////////////////////
138 void BitPredictor::SetParameters(int update_delay
,
139 Probability min_probability
) {
140 assert(update_delay
>= 0);
141 assert(update_delay
<= kLogProbabilityScale
);
142 assert(min_probability
>= 0);
143 assert(min_probability
< (1 << update_delay
));
144 update_delay_
= update_delay
;
145 rounding_correction_
= (1 << update_delay
) - 1 - min_probability
;
146 assert(rounding_correction_
>= 0);
147 assert(rounding_correction_
< 1 << update_delay
);
149 void BitPredictor::Update(Probability
* probability
, bool bit
) const {
150 assert(*probability
>= 0);
151 assert(*probability
<= kProbabilityScale
);
153 *probability
+= ((kProbabilityScale
- *probability
+ rounding_correction_
)
156 *probability
-= ((*probability
+ rounding_correction_
) >> update_delay_
);
157 assert(*probability
>= 0);
158 assert(*probability
<= kProbabilityScale
);
162 //////////////////////////////////////////////////////////////////
164 //////////////////////////////////////////////////////////////////
166 // BitHistory state transitions.
167 // Only 1-transitions of 0-dominant states are given.
168 // 0-transitions of 0-dominant states always go to the previous state.
169 // The transitions of 1-dominant states are symmetric.
170 const BitHistory::State bit_history_transition_1
[1] = {1};
171 const BitHistory::State bit_history_transition_2
[2] = {2, 2};
172 const BitHistory::State bit_history_transition_3
[4] = {3, 4, 4, 4};
173 const BitHistory::State bit_history_transition_4
[8] = {6, 6, 6, 7, 7, 7, 8, 8};
174 const BitHistory::State bit_history_transition_5
[16] = {
175 11, 12, 12, 12, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16
177 const BitHistory::State
* bit_history_transitions
[6] = {
179 bit_history_transition_1
,
180 bit_history_transition_2
,
181 bit_history_transition_3
,
182 bit_history_transition_4
,
183 bit_history_transition_5
186 // BitHistory initial probabilities.
187 const Probability bit_history_probability_0
[1] = {2048};
188 const Probability bit_history_probability_1
[1] = {1358};
189 const Probability bit_history_probability_2
[2] = {768, 1408};
190 const Probability bit_history_probability_3
[4] = {636, 864, 1196, 1675};
191 const Probability bit_history_probability_4
[8] = {
192 391, 469, 572, 704, 876, 1100, 1391, 1768
194 const Probability bit_history_probability_5
[16] = {
195 214, 233, 256, 284, 318, 360, 411, 474,
196 550, 642, 755, 893, 1061, 1266, 1516, 1821
198 const Probability
* bit_history_probabilities
[6] = {
199 bit_history_probability_0
,
200 bit_history_probability_1
,
201 bit_history_probability_2
,
202 bit_history_probability_3
,
203 bit_history_probability_4
,
204 bit_history_probability_5
208 void BitHistory::SetHistoryLength(int history_length
) {
209 assert(history_length
>= 0);
210 assert(history_length
<= 5);
211 history_length_
= history_length
;
212 int num_states
= (1 << history_length
);
213 initial_probabilities_
= bit_history_probabilities
[history_length
];
214 const State
* transitions
= bit_history_transitions
[history_length
];
215 for (int i
= 0; i
< (num_states
>> 1); ++i
) {
216 zero_transition_
[i
] = i
- 1;
217 one_transition_
[i
] = transitions
[i
];
218 zero_transition_
[num_states
- i
- 1] = num_states
- transitions
[i
] - 1;
219 one_transition_
[num_states
- i
- 1] = num_states
- i
;
221 zero_transition_
[0] = 0;
222 one_transition_
[num_states
- 1] = num_states
- 1;
223 for (int i
= 0; i
< num_states
; ++i
) {
224 assert(int(zero_transition_
[i
]) < num_states
);
225 assert(int(one_transition_
[i
]) < num_states
);
228 void BitHistory::UpdateState(State
* state
, bool bit
) const {
229 assert(int(*state
) < (1 << history_length_
));
230 if (bit
) *state
= one_transition_
[*state
];
231 else *state
= zero_transition_
[*state
];
232 assert(int(*state
) < (1 << history_length_
));
235 void BitHistory::SetPredictorParameters(int update_delay
,
236 Probability min_probability
) {
237 predictor_
.SetParameters(update_delay
, min_probability
);
238 int neighbor_update_delay
= std::min(kLogProbabilityScale
, update_delay
+ 1);
239 neighbor_predictor_
.SetParameters(neighbor_update_delay
, min_probability
);
241 void BitHistory::SetInitialProbabilities(Probability
* probability_vector
,
242 int adjustment
) const {
243 int num_states
= (1 << history_length_
);
244 for (int state
= 0; state
< num_states
; ++ state
) {
245 if (state
< (num_states
>> 1)) {
246 probability_vector
[state
] = initial_probabilities_
[state
];
248 probability_vector
[state
]
249 = initial_probabilities_
[num_states
- state
- 1];
251 while (adjustment
> 0) {
252 predictor_
.Update(&probability_vector
[state
], true);
255 while (adjustment
< 0) {
256 predictor_
.Update(&probability_vector
[state
], false);
261 void BitHistory::UpdateProbability(Probability
* probability_vector
,
262 State state
, bool bit
) const {
263 int num_states
= (1 << history_length_
);
264 //assert(int(state) >= 0);
265 assert(int(state
) < num_states
);
266 predictor_
.Update(&probability_vector
[state
], bit
);
267 if (history_length_
> 2) {
269 neighbor_predictor_
.Update(&probability_vector
[state
- 1], bit
);
270 if (state
< num_states
- 1)
271 neighbor_predictor_
.Update(&probability_vector
[state
+ 1], bit
);
275 //////////////////////////////////////////////////////////////////
276 // BitSequencePredictorSet
277 //////////////////////////////////////////////////////////////////
278 BitSequencePredictorSet::BitSequencePredictorSet(
279 int num_sequences
, int history_length
,
280 int update_delay
, Probability min_probability
)
281 : num_sequences_(num_sequences
),
282 history_(history_length
, update_delay
, min_probability
),
283 history_states_(num_sequences
),
284 probabilities_(num_sequences
<< history_length
) {
285 for (int sequence
= 0; sequence
< num_sequences
; ++sequence
) {
286 history_states_
[sequence
] = history_
.InitialState();
287 history_
.SetInitialProbabilities(
288 &probabilities_
[sequence
<< history_length
]);
291 uint32
BitSequencePredictorSet::ProbabilityOfOne(int sequence
) const {
292 assert(sequence
>= 0);
293 assert(sequence
<= num_sequences_
);
294 BitHistory::State state
= history_states_
[sequence
];
295 int history_length
= history_
.HistoryLength();
296 int position
= (sequence
<< history_length
) + state
;
297 assert(position
>= 0);
298 assert(position
< probabilities_
.size());
299 return probabilities_
[position
];
301 void BitSequencePredictorSet::Update(int sequence
, bool bit
) {
302 assert(sequence
>= 0);
303 assert(sequence
< num_sequences_
);
304 BitHistory::State
* state
= &history_states_
[sequence
];
305 int history_length
= history_
.HistoryLength();
306 history_
.UpdateProbability(&probabilities_
[sequence
<< history_length
],
308 history_
.UpdateState(state
, bit
);
312 //////////////////////////////////////////////////////////////////
313 // ByteCompressor and ByteDecompressor
314 //////////////////////////////////////////////////////////////////
316 // Each context is turned into a unique number by prepending a 1-bit.
317 // For example 001 -> 1001=9 and 01 -> 101=5
318 // These functions extract the bit and the context
319 // given a byte and a bit position.
320 // They are used outside ByteCompressor and ByteDecompressor, too,
321 // in CharCoder, for example.
322 int GetBit(int byte
, int bit_position
) {
323 return (byte
>> bit_position
) & 1;
325 int BitContext(int byte
, int bit_position
) {
326 return (byte
| 256) >> (bit_position
+ 1);
329 // Transform a string representation of BitHistory options
330 // into an array of integers.
331 bool ParseBitHistoryOptions(const std::string
& options
,
333 if (options
.length() != 3) return false;
334 int history_length
= options
[0] - '0';
335 int update_delay
= options
[1] - '0';
336 int log_min_probability
= options
[2] - '0';
338 std::clog
<< "Parsing BitHistory options"
339 << " history_length=" << history_length
340 << " update_delay=" << update_delay
341 << " log_min_probability=" << log_min_probability
344 if (history_length
< 0 || history_length
> 9) return false;
345 if (update_delay
< 0 || update_delay
> 9) return false;
346 if (log_min_probability
< 0 || log_min_probability
> update_delay
)
348 parameters
[0] = history_length
;
349 parameters
[1] = update_delay
;
350 parameters
[2] = log_min_probability
;
355 bool ByteCompressor::Options::Set(const std::string
& options
) {
356 std::string opts
= options
;
357 if (opts
.length() == 0) opts
= "344";
358 return ParseBitHistoryOptions(opts
, parameters_
);
360 std::string
ByteCompressor::Options::Get() const {
362 for (int i
= 0; i
< 3; ++i
) {
363 assert(parameters_
[i
] >= 0);
364 assert(parameters_
[i
] <= 9);
365 options
+= static_cast<char>('0' + parameters_
[i
]);
369 void ByteCompressor::EncodeByte(unsigned char byte
) {
371 std::clog
<< "ByteCompressor: encoding byte " << int(byte
) << std::endl
;
373 for (int i
= 7; i
>= 0; --i
) {
374 int bit
= GetBit(byte
, i
);
375 int context
= BitContext(byte
, i
);
376 encoder_
.Encode(bit
, predictor_
.ProbabilityOfOne(context
));
377 predictor_
.Update(context
, bit
);
381 StreamDecompressor
* ByteDecompressor::Create(const std::string
& options
) {
383 bool ok
= ParseBitHistoryOptions(options
, parameters
);
384 if (!ok
) return NULL
;
385 return new ByteDecompressor(parameters
);
387 unsigned char ByteDecompressor::DecodeByte() {
389 for (int i
= 7; i
>= 0; --i
) {
390 bool bit
= decoder_
.Decode(predictor_
.ProbabilityOfOne(context
));
391 predictor_
.Update(context
, bit
);
392 context
= (context
<< 1) + bit
;
394 unsigned char byte
= context
& 255;
396 std::clog
<< "ByteDecompressor: decoded byte " << int(byte
) << std::endl
;
401 //////////////////////////////////////////////////////////////////
403 //////////////////////////////////////////////////////////////////
405 // Get a bit if in given context. The return value is one of:
406 // 0: same context, bit is 0
407 // 1: same context, bit is 1
408 // 2: different context
409 static int ByteContext(int byte
, int bit_position
, int other_context
) {
410 int context
= BitContext(byte
, bit_position
);
411 return context
== other_context
? GetBit(byte
, bit_position
) : 2;
415 CharCoder::CharCoder(const int* parameters
)
416 : byte_context_length_(parameters
[0]),
417 history_(parameters
[1], parameters
[2],
418 (1 << parameters
[3]) - 1),
419 history_states_(256),
420 probabilities_(256 << (history_
.HistoryLength()
421 + (2 * byte_context_length_
))),
422 encoder_(NULL
), decoder_(NULL
) {
424 std::clog
<< "CharCoder byte_context_length=" << byte_context_length_
425 << " history_length=" << history_
.HistoryLength()
426 << " update_delay=" << parameters
[2]
427 << " min_probability=" << ((1 << parameters
[3]) - 1)
430 for (int i
= 0; i
< 3; ++i
) { recent_bytes_
[i
] = '\0'; }
431 for (int bit_context
= 0; bit_context
< 256; ++bit_context
) {
432 history_states_
[bit_context
] = history_
.InitialState();
433 int byte_context_begin
= bit_context
<< (2 * byte_context_length_
);
434 int byte_context_end
= (bit_context
+ 1) << (2 * byte_context_length_
);
435 for (int byte_context
= byte_context_begin
;
436 byte_context
< byte_context_end
; ++byte_context
) {
437 history_
.SetInitialProbabilities(
438 &probabilities_
[byte_context
<< history_
.HistoryLength()]);
443 void CharCoder::Code(unsigned char* byte
) {
444 int history_length
= history_
.HistoryLength();
446 for (int i
= 7; i
>= 0; --i
) {
447 int byte_context
= 0;
448 for (int j
= 0; j
< byte_context_length_
; ++j
) {
450 byte_context
+= ByteContext(recent_bytes_
[j
], i
, bit_context
);
453 = (bit_context
<< (byte_context_length_
<< 1)) + byte_context
;
454 BitHistory::State state
= history_states_
[bit_context
];
455 int position
= (full_context
<< history_length
) + state
;
456 Probability probability
= probabilities_
[position
];
458 if (encoder_
!= NULL
) {
459 bit
= GetBit(*byte
, i
);
460 encoder_
->Encode(bit
, probability
);
462 bit
= decoder_
->Decode(probability
);
464 history_
.UpdateProbability(
465 &probabilities_
[full_context
<< history_length
], state
, bit
);
466 history_
.UpdateState(&history_states_
[BitContext(recent_bytes_
[0], i
)],
467 GetBit(recent_bytes_
[0], i
));
468 bit_context
= (bit_context
<< 1) + bit
;
470 *byte
= (bit_context
& 255);
471 // Update the last three distinct bytes.
472 // This relies on the post-run-length-encoding property
473 // that consecutive bytes are always different.
474 // Thus we always have the last two bytes but the third byte
475 // might be more distant.
476 if (*byte
!= recent_bytes_
[1]) {
477 recent_bytes_
[2] = recent_bytes_
[1];
479 recent_bytes_
[1] = recent_bytes_
[0];
480 recent_bytes_
[0] = *byte
;
483 //////////////////////////////////////////////////////////////////
485 //////////////////////////////////////////////////////////////////
487 // Return the number of context when the nunber of bits
488 // is smaller or equal to num_bits.
489 // This is simply 1+2+3+...+num_bits
490 int SignificantBitContexts(int num_bits
) {
491 return (num_bits
* (num_bits
+ 1)) >> 1;
494 RunLengthCoder::RunLengthCoder(int log_max_run_length
, const int* parameters
)
495 : max_run_length_((1LL << (log_max_run_length
+ 1)) - 1),
496 first_bit_predictor_(256, parameters
[0], parameters
[1],
497 (1 << parameters
[2]) - 1),
498 unary_part_predictor_(log_max_run_length
, parameters
[3], parameters
[4],
499 (1 << parameters
[5]) - 1),
500 significant_bits_predictor_(
501 SignificantBitContexts(log_max_run_length
),
502 parameters
[6], parameters
[7], (1 << parameters
[8]) - 1),
503 encoder_(NULL
), decoder_(NULL
) {
504 assert(log_max_run_length
<= 62);
506 std::clog
<< "RunlengthCoder max_run_length=" << max_run_length_
508 std::clog
<< "RunlengthCoder first_bit_predictor:"
509 << " history_length=" << parameters
[0]
510 << " update_delay=" << parameters
[1]
511 << " min_probability=" << ((1 << parameters
[2]) - 1)
513 std::clog
<< "RunlengthCoder unary_part_predictor:"
514 << " history_length=" << parameters
[3]
515 << " update_delay=" << parameters
[4]
516 << " min_probability=" << ((1 << parameters
[5]) - 1)
518 std::clog
<< "RunlengthCoder significant_bits_predictor:"
519 << " history_length=" << parameters
[6]
520 << " update_delay=" << parameters
[7]
521 << " min_probability=" << ((1 << parameters
[8]) - 1)
526 void RunLengthCoder::Code(unsigned char byte
, int64
* run_length
) {
527 assert(*run_length
<= max_run_length_
);
530 // Code the first bit
531 Probability probability
= first_bit_predictor_
.ProbabilityOfOne(byte
);
532 if (encoder_
!= NULL
) {
533 bit
= (1 < *run_length
);
534 encoder_
->Encode(bit
, probability
);
536 assert(decoder_
!= NULL
);
537 bit
= decoder_
->Decode(probability
);
539 first_bit_predictor_
.Update(byte
, bit
);
541 // Code the rest of the unary part
542 int bits_after_leading_one
= 0;
544 ++bits_after_leading_one
;
546 = unary_part_predictor_
.ProbabilityOfOne(bits_after_leading_one
);
547 if (encoder_
!= NULL
) {
548 bit
= ((*run_length
>> bits_after_leading_one
) > 1);
549 encoder_
->Encode(bit
, probability
);
551 assert(decoder_
!= NULL
);
552 bit
= decoder_
->Decode(probability
);
554 unary_part_predictor_
.Update(bits_after_leading_one
, bit
);
557 // Code the significant bits
558 int context_begin
= SignificantBitContexts(bits_after_leading_one
- 1);
560 for (int i
= bits_after_leading_one
- 1; i
>= 0; --i
) {
561 int context
= context_begin
+ i
;
562 probability
= significant_bits_predictor_
.ProbabilityOfOne(context
);
563 if (encoder_
!= NULL
) {
564 bit
= (*run_length
>> i
) & 1;
565 encoder_
->Encode(bit
, probability
);
567 assert(decoder_
!= NULL
);
568 bit
= decoder_
->Decode(probability
);
570 significant_bits_predictor_
.Update(context
, bit
);
571 length
= (length
<< 1) + bit
;
573 *run_length
= length
;
576 //////////////////////////////////////////////////////////////////
577 // RunLengthCompressor
578 //////////////////////////////////////////////////////////////////
580 bool ParseRunLengthCompressorOptions(const std::string
& options
,
582 if (options
.length() != 13) return false;
584 // CharCoder byte context length
585 int byte_context_length
= options
[0] - '0';
587 std::clog
<< "Parsing CharCoder options"
588 << " byte_context_length=" << byte_context_length
591 if (byte_context_length
< 0 || byte_context_length
> 9) return false;
592 params
[0] = byte_context_length
;
594 // CharCoder BitHistory option
595 ok
&= ParseBitHistoryOptions(std::string(options
, 1, 3), params
+ 1);
596 // RunLengthCoder BitHistory options
598 std::clog
<< "Parsing RunlengthCoder options" << std::endl
;
600 ok
&= ParseBitHistoryOptions(std::string(options
, 4, 3), params
+ 4);
601 ok
&= ParseBitHistoryOptions(std::string(options
, 7, 3), params
+ 7);
602 ok
&= ParseBitHistoryOptions(std::string(options
, 10, 3), params
+ 10);
603 if (ok
) memcpy(parameters
, params
, 13 * sizeof(int));
608 bool RunLengthCompressor::Options::Set(const std::string
& options
) {
609 std::string opts
= options
;
610 if (options
.length() == 0) opts
= "2344255466366";
612 std::clog
<< "Setting RunLengthCompressor::Options=" << opts
<< std::endl
;
614 return ParseRunLengthCompressorOptions(opts
, parameters_
);
617 std::string
RunLengthCompressor::Options::Get() const {
619 for (int i
= 0; i
< kNumParameters
; ++i
) {
620 assert(parameters_
[i
] >= 0);
621 assert(parameters_
[i
] <= 9);
622 options
+= static_cast<char>('0' + parameters_
[i
]);
627 void RunLengthCompressor::Write(const char* bytes
, size_t n
) {
629 int64 run_length
= current_run_length_
;
630 unsigned char ch
= current_char_
;
631 if (run_length
== 0) {
637 unsigned char next
= *bytes
++;
639 EncodeRun(ch
, run_length
);
646 current_run_length_
= run_length
;
649 void RunLengthCompressor::EncodeRun(unsigned char byte
, int64 run_length
) {
651 std::clog
<< "Encoding: "
652 <<"char=" << int(byte
) << " run_length=" << run_length
655 if (statistics
> 0) statistics_collector
.RecordRun(byte
, run_length
);
657 char_coder_
.Encode(byte
);
658 run_length_coder_
.Encode(byte
, run_length
);
661 //////////////////////////////////////////////////////////////////
662 // RunLengthDecompressor
663 //////////////////////////////////////////////////////////////////
664 StreamDecompressor
* RunLengthDecompressor::Create(const std::string
& options
) {
665 if (options
.length() != kNumParameters
) return NULL
;
666 int parameters
[kNumParameters
];
668 std::clog
<< "RunLengthDecompressor options=" << options
<< std::endl
;
670 bool ok
= ParseRunLengthCompressorOptions(options
, parameters
);
671 if (!ok
) return NULL
;
672 return new RunLengthDecompressor(parameters
);
674 void RunLengthDecompressor::Read(char* bytes
, size_t n
) {
676 int64 run_length
= current_run_length_
;
677 unsigned char ch
= current_char_
;
679 if (run_length
== 0) {
680 DecodeRun(&ch
, &run_length
);
681 assert(run_length
> 0);
686 current_run_length_
= run_length
;
689 void RunLengthDecompressor::DecodeRun(unsigned char* byte
, int64
* run_length
) {
690 *byte
= char_coder_
.Decode();
691 *run_length
= run_length_coder_
.Decode(*byte
);
693 std::clog
<< "Decoded: "
694 <<"char=" << int(*byte
) << " run_length=" << *run_length
699 //////////////////////////////////////////////////////////////////
700 // RunLengthCompressor::StatisticsCollector
701 //////////////////////////////////////////////////////////////////
702 void RunLengthCompressor::StatisticsCollector::StartEncoding() {
703 if (statistics
<= 0) return;
704 memset(num_chars_
, 0, 256 * sizeof(int64
));
705 memset(num_runs_by_char_
, 0, 256 * sizeof(int64
));
706 memset(num_short_runs_by_length_
, 0, kLongRunLength
* sizeof(int64
));
707 memset(num_long_runs_by_log_length_
, 0,
708 (64 - kLogLongRunLength
) * sizeof(int64
));
710 void RunLengthCompressor::StatisticsCollector::EndEncoding() {
711 if (statistics
<= 0) return;
712 if (statistics
> 1) std::clog
<< "FULL BWT STATISTICS" << std::endl
;
714 int num_distinct_chars
= 0;
716 int64 total_length
= 0;
717 for (int i
= 0; i
< 256; ++i
) {
718 total_length
+= num_chars_
[i
];
719 num_runs
+= num_runs_by_char_
[i
];
720 if (num_runs
> 0) ++num_distinct_chars
;
723 for (int i
= 0; i
< 256; ++i
) {
724 if (num_chars_
[i
] > 0) {
725 double average_length
= (1.0 * num_chars_
[i
]) / num_runs_by_char_
[i
];
726 double percentage
= (100.0 * num_chars_
[i
]) / total_length
;
728 std::clog
<< "Char " << i
<< ": "
729 << num_runs_by_char_
[i
]
730 << " runs of total length " << num_chars_
[i
]
731 << std::setprecision(3)
732 << " (" << percentage
<< "%)"
733 << " and average length " << average_length
738 int64 num_short_runs
= 0;
739 int64 total_length_of_short_runs
= 0;
740 int64 num_short_runs_by_log_length
[kLogLongRunLength
];
741 memset(num_short_runs_by_log_length
, 0, kLogLongRunLength
* sizeof(int64
));
742 for (int length
= 1; length
< kLongRunLength
; ++length
) {
743 int64 count
= num_short_runs_by_length_
[length
];
745 double percentage
= (length
* count
* 100.0) / total_length
;
747 std::clog
<< count
<< " runs of length " << length
748 << std::setprecision(3)
749 << " (" << percentage
<< "% of total length)"
751 num_short_runs
+= count
;
752 total_length_of_short_runs
+= count
* length
;
755 while (len
>>= 1) ++log_length
;
756 num_short_runs_by_log_length
[log_length
] += count
;
760 for (int log_length
= 0; log_length
< kLogLongRunLength
; ++log_length
) {
761 int64 count
= num_short_runs_by_log_length
[log_length
];
764 std::clog
<< count
<< " runs with length in ["
765 << (1 << log_length
) << "," << (1 << (log_length
+ 1)) - 1
770 for (int log_length
= kLogLongRunLength
; log_length
< 64; ++log_length
) {
771 int64 count
= num_long_runs_by_log_length_
[log_length
- kLogLongRunLength
];
774 std::clog
<< count
<< " runs with length in ["
775 << (1 << log_length
) << "," << (1 << (log_length
+ 1)) - 1
777 int log_length
= kLogLongRunLength
;
778 count
>>= kLogLongRunLength
;
779 while (count
>>= 1) ++log_length
;
780 ++num_short_runs_by_log_length
[log_length
];
784 std::clog
<< "SUMMARY BWT STATISTICS" << std::endl
;
785 std::clog
<< num_distinct_chars
<< " distinct characters" << std::endl
;
786 double average_length
= (1.0 * total_length
) / num_runs
;
787 std::clog
<< num_runs
<< " runs"
788 << " of average length " << average_length
<< std::endl
;
789 double average_length_of_short_runs
=
790 (1.0 * total_length_of_short_runs
) / num_short_runs
;
791 std::clog
<< num_short_runs
<< " runs shorter than " << kLongRunLength
792 << " of total length " << total_length_of_short_runs
793 << " and average length " << average_length_of_short_runs
795 int64 num_long_runs
= num_runs
- num_short_runs
;
796 int64 total_length_of_long_runs
= total_length
- total_length_of_short_runs
;
797 double average_length_of_long_runs
=
798 (1.0 * total_length_of_long_runs
) / num_long_runs
;
799 std::clog
<< num_long_runs
<< " longer runs"
800 << " of total length " << total_length_of_long_runs
801 << " and average length " << average_length_of_long_runs
804 void RunLengthCompressor::StatisticsCollector::StartBlock() {
805 // TODO: Record the statistics at the start of a block.
807 void RunLengthCompressor::StatisticsCollector::EndBlock() {
808 // TODO: Compute block statistics by subtracting the statistics
809 // at the block start from the current statistics and
812 void RunLengthCompressor::StatisticsCollector::RecordRun(
813 unsigned char byte
, int64 run_length
) {
814 if (statistics
<= 0) return;
815 num_chars_
[byte
] += run_length
;
816 ++num_runs_by_char_
[byte
];
817 if (run_length
< kLongRunLength
) {
818 ++num_short_runs_by_length_
[run_length
];
820 int log_run_length
= kLogLongRunLength
;
821 run_length
>>= kLogLongRunLength
;
822 while (run_length
>>= 1) ++log_run_length
;
823 ++num_long_runs_by_log_length_
[log_run_length
- kLogLongRunLength
];
827 } // namespace dcsbwt