modified: makefile
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / rl_compress.h
blob17c8c6ae2bd111ad4b1dcf9a6f99b9322c8bccda
1 // Copyright 2007 Google Inc.
2 //
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
7 //
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // Byte stream compressor designed for compressing
18 // Burrows-Wheeler transforms.
20 // The main compression steps are the following:
21 // 1. Run length encoding: Replace a run of identical bytes bbb...bb
22 // with the pair (b, run_length).
23 // 2. Turn each pair into a bit sequence: 8 bits of the byte followed
24 // by the Elias gamma coding of the run length.
25 // 3. Encode each bit using range coding: Adaptive models are used
26 // for predicting the probabilities of bits.
28 #ifndef DCSBWT_RL_COMPRESS_H__
29 #define DCSBWT_RL_COMPRESS_H__
31 #include "stream_compressor.h"
32 #include "stream.h"
33 #include "inttypes.h"
35 #include <string>
36 #include <vector>
37 #include <algorithm>
38 #include <iostream>
39 #include <cassert>
41 namespace dcsbwt {
43 extern int verbosity;
45 // Probabilities are encoded as integers in [0,kProbabilityScale]
46 // with p representing the probability p/kProbabilityScale.
47 typedef int16 Probability;
48 static const int kLogProbabilityScale = 12;
49 static const Probability kProbabilityScale = (1 << kLogProbabilityScale);
51 ///////////////////////////////////////////////////////////////////
52 // Entropy compressor for a sequence of bits.
53 // Given a probility distribution for each bit in the sequence,
54 // ouputs a compressed representation of the sequence.
55 // Uses range encoding (arithmetic encoding).
56 // The expected length of the compressed representation is
57 // close to the information theoretic lower bound.
58 ///////////////////////////////////////////////////////////////////
59 class BitEncoder {
60 public:
61 BitEncoder() : low_(0), high_(0xFFFFFFFF) {}
63 // Output is written to an OutStreamBuffer.
64 void Connect(OutStreamBuffer* out) { output_.Connect(out); }
65 OutStreamBuffer* Disconnect() { return output_.Disconnect(); }
67 // Append a bit to the sequence.
68 // Even input cases bit==1(true), probability_of_one==0 and
69 // bit==0(false), probability_of_one==kProbabilityScale are legal.
70 // In these cases, up to four bytes of output is generated
71 // from the single bit.
72 void Encode(bool bit, Probability probability_of_one);
74 // Must be called to finish the encoding of a sequence.
75 // After the call, BitEncoder is ready to start encoding a new sequence.
76 void Finish();
78 private:
79 uint32 low_;
80 uint32 high_;
81 StreamMaster<OutStreamBuffer> output_;
83 void EmitByte(unsigned char byte) { output_->WriteByte(byte); }
86 /////////////////////////////////////////////////////////////
87 // Decompressor for a bit sequence compressed by BitEncoder.
88 /////////////////////////////////////////////////////////////
89 class BitDecoder {
90 public:
91 BitDecoder() : low_(0), high_(0xFFFFFFFF) {}
93 // The compressed data is read from an InStreamBuffer.
94 void Connect(InStreamBuffer* out) { input_.Connect(out); }
95 InStreamBuffer* Disconnect() { return input_.Disconnect(); }
97 // Start() must be called to start the decoding of a sequence.
98 // Nothing needs to be called to finish the decoding, but Start()
99 // must be called again when starting to decode a new sequence.
100 void Start();
102 // Get the next bit of the uncompressed sequence.
103 // The probability distribution must be the same as the one used
104 // when encoding the same bit.
105 bool Decode(Probability probability_of_one);
107 private:
108 uint32 low_;
109 uint32 high_;
110 uint32 next_;
111 StreamMaster<InStreamBuffer> input_;
113 unsigned char ReadByte() { return input_->ReadByte(); }
116 //////////////////////////////////////////////////////////////////
117 // BitPredictor maintains a probability distribution of a bit.
119 // The method Update(&probability, bit) adjusts the probability
120 // towards the given bit. This can be used for learning the probability.
122 // The amount of adjustment is controlled by two parameters:
123 // 1. update_delay: Higher value means a smaller adjustment.
124 // REQUIRE: 0 <= update_delay <= kLogProbabilityScale
125 // update_delay == 0 sets the probability to 0 or kProbabilityScale.
126 // update_delay == kLogProbabilityScale means that the probability
127 // changes by one (unless already at min or max).
128 // 2. min_probability limits the probability to the range
129 // [min_probability, kProbabilityScale - min_probability]
130 // Close to the extremes, adjustments are smaller.
131 // REQUIRE: 0 <= min_probability < (2^update_delay)
132 //////////////////////////////////////////////////////////////////
133 class BitPredictor {
134 public:
135 BitPredictor(int update_delay, Probability min_probability) {
136 SetParameters(update_delay, min_probability);
138 void SetParameters(int update_delay, Probability min_probability);
139 void Update(Probability* probability_of_one, bool bit) const;
140 private:
141 int update_delay_;
142 int rounding_correction_;
145 //////////////////////////////////////////////////////////////////
146 // BitHistory keeps an account of the distribution of
147 // the last few bits in a bit sequence.
149 // Bit history is essentially an automaton with a small number of states
150 // and two transitions (0-bit and 1-bit) out of each state.
151 // Half of the states represent situations where 0-bits are dominant
152 // in the recent history and the other half represent 1-dominant histories.
153 // There is no neutral state (except for history_length==0).
154 // Let's look at the 0-dominant states which we denote with
155 // z1, z2, z3, ..., zk. 1-dominant states behave symmetrically.
156 // zi represents the bit sequence ...010101000...0,
157 // where the last run of zeros has length i.
158 // zk represents all such sequences with i >= k.
159 // The 0-transition from zi goes to z(i+1) (and from zk to itself).
160 // The 1-transition from zi goes either to zj for some j < i (for larger i)
161 // or to the first 1-dominant state (for smaller i).
162 // Here is an example:
164 // ----| ---------------|
165 // | v | | v
166 // |--z4<--z3<--z2<--z1-->o1
167 // | ^
168 // ---------------|
170 // The total number of states is 2^history_length.
171 // (REQUIRE: 0 <= history_length <= 5)
172 // The states are numbered from 0 to 2^history_length-1 in the
173 // order zk, ..., z1, o1, ..., ok.
174 // history_length==0 is a special case with only one state.
176 // A BitHistory object does not store a current state but supports
177 // computing the transitions using the method UpdateState().
178 // Thus the same BitHistory objet can be used with many bit sequences,
179 // each with its own current state.
181 // Each state can be associated with a probability of the next bit beeing 1.
182 // The probabilities can be updated according to observations using
183 // the method UpdateProbabilities(). This will update not only the probability
184 // of the given state but also those of its neighboring states
185 // (by a smaller amount). The updates are done using BitPredictors.
186 // Neighbor updates are not used if history_length <= 1.
188 // The state transitions and the probability updates are independent
189 // of each other, and there can be multiple independent probability
190 // vectors.
191 //////////////////////////////////////////////////////////////////
192 class BitHistory {
193 public:
194 typedef uint8 State;
195 BitHistory(int history_length, int update_delay,
196 Probability min_probability)
197 : predictor_(update_delay, min_probability),
198 neighbor_predictor_(update_delay + 1, min_probability) {
199 SetHistoryLength(history_length);
202 void SetHistoryLength(int history_length);
203 int HistoryLength() const { return history_length_; }
204 int NumberOfStates() const { return 1 << history_length_; }
205 State InitialState() const { return ((1 << history_length_) - 1) >> 1; }
206 void UpdateState(State* state, bool bit) const;
208 void SetPredictorParameters(int update_delay, Probability min_probability);
209 // probability_vector must be a pointer to an array of size NumberOfStates()
210 void UpdateProbability(Probability* probability_vector,
211 State state, bool bit) const;
212 // adjustment changes initial probabilities from their normal values.
213 // This is done by updating the probabilities abs(adjustment) times
214 // towards zero if adjustment<0 or towards one if adjustment>0.
215 void SetInitialProbabilities(Probability* probability_vector,
216 int adjustment = 0) const;
217 private:
218 State zero_transition_[32];
219 State one_transition_[32];
220 int history_length_;
221 const Probability* initial_probabilities_;
222 BitPredictor predictor_;
223 BitPredictor neighbor_predictor_;
226 //////////////////////////////////////////////////////////////////
227 // BitSequencePredictorSet represents a set of adaptive predictors
228 // for bit sequences. The predictors are based on BitHistory.
229 // Each predictor is independent from others except that all are
230 // updated with the same BitHistory object, i.e.
231 // they have the same learning parameters.
232 //////////////////////////////////////////////////////////////////
233 class BitSequencePredictorSet {
234 public:
235 BitSequencePredictorSet(int num_sequences, int history_length,
236 int update_delay, Probability min_probability);
238 // Returns the probability of the next bit being one.
239 uint32 ProbabilityOfOne(int sequence) const;
240 // Updates the prediction based on the actual next bit.
241 void Update(int sequence, bool bit);
243 static int64 SizeInBytes(int num_sequences, int history_length) {
244 return 128 + num_sequences * sizeof(BitHistory::State)
245 + (num_sequences << history_length) * sizeof(Probability);
247 int64 Size() const {
248 return SizeInBytes(history_states_.size(), history_.HistoryLength());
250 private:
251 int num_sequences_;
252 int history_length_;
253 BitHistory history_;
254 std::vector<BitHistory::State> history_states_;
255 std::vector<Probability> probabilities_;
257 BitSequencePredictorSet(const BitSequencePredictorSet&);
258 BitSequencePredictorSet& operator=(const BitSequencePredictorSet&);
261 //////////////////////////////////////////////////////////////////
262 // ByteCompressor is a simple byte stream compressor.
264 // Each byte is divided into bits and each bit has a context:
265 // the preceding bits in the byte. For example, if byte is
266 // 54=0011110, the 4th bit (a 1) has the context 001.
267 // Note that 001 is different from 1, 01, or 0001.
268 // There are 255 different contexts.
270 // The probability of each bit is predicted based on the previous
271 // bits in the same context. BitSequencePredictorSet is used
272 // for making the predictions for all contexts.
274 // The compressor has three parameters, which are the three
275 // parameters of BitHistory: history_length, update_delay,
276 // min_probability. The options string has one character,
277 // an ascii digit, for each parameter. The numeric value
278 // of the digit gives directly history_length, and update_delay,
279 // while the min_probability is computed using the formula
280 // min_probability = (2^digit)-1. For example,
281 // "344" means history_length=3, update_delay=4, min_probability=15.
283 // Legal parameter settings are all three digit combinations
284 // ("000".."999"), where the last digit is smaller or equal
285 // to the middle digit. For example "344" is legal but "345" is illegal.
286 // (The restriction comes from BitPredictor).
287 // An empty options string sets the parameters to default
288 // values (see ParseBitHistoryOptions in rl_compress.cc).
289 //////////////////////////////////////////////////////////////////
290 class ByteCompressor : public StreamCompressor {
291 public:
292 class Options : public StreamCompressor::OptionsBase {
293 public:
294 Options() { Set(""); }
295 virtual bool Set(const std::string& options);
296 virtual std::string Get() const;
297 virtual int64 SizeInBytes() const {
298 return 256 + BitSequencePredictorSet::SizeInBytes(256, parameters_[0]);
300 virtual StreamCompressor* GetCompressor() {
301 return new ByteCompressor(parameters_);
303 private:
304 int parameters_[3];
307 explicit ByteCompressor(const int* parameters)
308 : predictor_(256, parameters[0], parameters[1],
309 (1 << parameters[2]) - 1) {
310 if (verbosity > 2) {
311 std::clog << "ByteCompressor"
312 << " history_length=" << parameters[0]
313 << " update_delay=" << parameters[1]
314 << " min_probability=" << (1 << parameters[2]) - 1
315 << std::endl;
318 virtual ~ByteCompressor() {}
320 virtual void ConnectPrivate(OutStreamBuffer* out) { encoder_.Connect(out); }
321 virtual void DisconnectPrivate() { encoder_.Disconnect();}
323 virtual void WriteBegin() {}
324 virtual void WriteEndPrivate() { encoder_.Finish(); }
325 virtual void Write(const char* bytes, size_t n) {
326 for (; n; --n) { EncodeByte(*bytes++); }
329 private:
330 BitSequencePredictorSet predictor_;
331 BitEncoder encoder_;
333 void EncodeByte(unsigned char byte);
335 ByteCompressor(const ByteCompressor&);
336 ByteCompressor& operator=(const ByteCompressor&);
339 //////////////////////////////////////////////////////////////////
340 // ByteDecompressor decompresses data compressed by ByteCompressor
341 //////////////////////////////////////////////////////////////////
342 class ByteDecompressor : public StreamDecompressor {
343 public:
344 static StreamDecompressor* Create(const std::string& options);
345 explicit ByteDecompressor(const int* parameters)
346 : predictor_(256, parameters[0], parameters[1],
347 (1 << parameters[2]) - 1) {
348 if (verbosity > 2) {
349 std::clog << "ByteDecompressor"
350 << " history_length=" << parameters[0]
351 << " update_delay=" << parameters[1]
352 << " min_probability=" << (1 << parameters[2]) - 1
353 << std::endl;
356 virtual ~ByteDecompressor() {}
358 virtual void ConnectPrivate(InStreamBuffer* out) { decoder_.Connect(out); }
359 virtual void DisconnectPrivate() { decoder_.Disconnect(); }
361 virtual void ReadBegin() { decoder_.Start(); }
362 virtual void ReadEnd() {}
363 virtual void Read(char* bytes, size_t n) {
364 for (; n; --n) *bytes++ = DecodeByte();
367 virtual int64 SizeInBytes() const {
368 return predictor_.Size();
371 private:
372 BitSequencePredictorSet predictor_;
373 BitDecoder decoder_;
375 unsigned char DecodeByte();
377 ByteDecompressor(const ByteDecompressor&);
378 ByteDecompressor& operator=(const ByteDecompressor&);
382 //////////////////////////////////////////////////////////////////
383 // CharCoder compresses and decompresses byte streams.
385 // Differences to ByteCompressor/ByteDecompressor:
386 // - CharCoder is not a selfstanding StreamCompressor but
387 // is used from within RunLengthCompressor and RunLengthDecompressor.
388 // - CharCoder is particularly designed for compressing
389 // byte streams resulting from run length encoding.
390 // - CharCoder uses a more complex predictor model.
391 // - CharCoder does both compressing and decompressing
392 // (in the same private function) to ensure consistency
393 // under the complex predictor.
395 // Like ByteCompressor, the prediction model looks each bit in
396 // the context of preceding bits in the byte (the bit context)
397 // and uses a separate BitHistory state for each bit context.
398 // The differences are:
399 // - BitHistory state updates are delayed by one byte.
400 // The rational is that after run length encoding
401 // consecutive bytes are always different. Thus,
402 // the preceding byte is used for updating the BitHistory states
403 // only after the update can no more affect the encoding
404 // of the current byte.
405 // - In addition to bit context, there is a byte context,
406 // which depends on the preceding 0-3 _distinct_ bytes.
407 // For each preceding bytes, the model takes into account,
408 // whether it has the same bit context, and if it has
409 // what the bit in the current bit position is.
410 // Thus there are 1, 3, 9 or 27 different byte contexts.
412 // The prediction is derived as follows:
413 // - Use the combination of the bit context and the byte context
414 // to choose a BitHistory probability vector P.
415 // - Use the bit context alone to choose a BitHistory state s.
416 // - The prediction P[s].
417 // Updating gos as follows:
418 // - P[s] (and its neighbors P[s-1] and P[s+1]) are updated
419 // immediately after getting the prediction.
420 // - The BitHistory state is updated in the delayed manner
421 // explained above.
423 // The parameters are given as an array of 4 integers. They are:
424 // 0: byte_context_length, the number of bytes used for byte context
425 // 1: history_length
426 // 2: update_delay
427 // 3: log_2(min_probability + 1)
428 // The last three are the same as the parameters of ByteCompressor
430 // TODO: Allow a varying length encoding of bytes, such as
431 // Huffmann encoding, instead of the even 8-bit encoding.
432 // The encoding must be static; the prediction mechanism assumes
433 // that the same byte is always encoded in the same way.
434 // The main purpose of this change would be to speed up coding
435 // by reducing the number bits to code. It might improve compression, too.
436 //////////////////////////////////////////////////////////////////
437 class CharCoder {
438 public:
439 explicit CharCoder(const int* parameters);
440 void SetEncoder(BitEncoder* encoder) {
441 assert(decoder_ == NULL);
442 encoder_ = encoder;
444 void SetDecoder(BitDecoder* decoder) {
445 assert(encoder_ == NULL);
446 decoder_ = decoder;
448 void Encode(unsigned char byte) {
449 assert(encoder_ != NULL);
450 Code(&byte);
452 unsigned char Decode() {
453 assert(decoder_ != NULL);
454 unsigned char byte = 0;
455 Code(&byte);
456 return byte;
459 static int64 SizeInBytes(const int* parameters) {
460 int byte_context_length = parameters[0];
461 int history_length = parameters[1];
462 int num_probabilities = 256 << (history_length + 2 * byte_context_length);
463 return 128 + 256 * sizeof(BitHistory::State)
464 + num_probabilities * sizeof(Probability);
466 int64 Size() const {
467 int parameters[2];
468 parameters[0] = byte_context_length_;
469 parameters[1] = history_.HistoryLength();
470 return SizeInBytes(parameters);
472 private:
473 int byte_context_length_;
474 unsigned char recent_bytes_[3];
475 BitHistory history_;
476 std::vector<BitHistory::State> history_states_;
477 std::vector<Probability> probabilities_;
478 BitEncoder* encoder_;
479 BitDecoder* decoder_;
481 void Code(unsigned char* byte);
484 //////////////////////////////////////////////////////////////////
485 // RunLengthCoder compresses and decompresses run lengths in
486 // run length encoding.
488 // The encoding has two steps:
489 // 1. Turn the run length into a bit sequence by Elias gamma code.
490 // 2. Entropy code each bit using a prediction model based on BitHistory
491 // (and using BitSequencePredictorSet to represent the models).
493 // Elias gamma code has two parts:
494 // 1. Unary encoding of the number of significant bits (to be precise,
495 // the number of bits after the most significant 1-bit).
496 // 2. The significant bits (without the most significant 1-bit).
497 // For example:
498 // 1 = ...1 -> 0 (no bits after the most sigificant 1-bit)
499 // 2 = ...10 -> 10 0
500 // 3 = ...11 -> 10 1
501 // 4 = ...100 -> 110 00
502 // ...
503 // 13 = ...1101 -> 1110 101
504 // (Side note: Common description writes the unary part with
505 // the roles of 0s and 1s reversed. Then the last 1-bit of the unary
506 // part can also be seen as the omitted most significant 1-bit.
507 // On the other hand, under the present encoding, the bit sequences
508 // have the correct lexicographic order.)
510 // There are three kinds of prediction models:
511 // 1. The first bit (which tells whether the run length is one or more)
512 // is predicted in the context of the associated byte.
513 // I.e., there is a separate predictor for each byte.
514 // 2. The rest of the bytes are predicted without context.
515 // I.e., there is a separate predictor for each position in the unary part.
516 // 3. The significant bits are predicted in the context of
517 // the number of significant bits. I.e., there is a predictor
518 // for each combination of the number of significant bits and
519 // the bit positions in the significant bits.
521 // Each of the three prediction model kinds has its own
522 // BitSequencePredictorSet object and thus can have its
523 // own learning parameters. These are provided in an array
524 // of 9 integers, three for each kind (in the above order).
525 // The three parameters for one kind are:
526 // 1: history_length
527 // 2: update_delay
528 // 3: log_2(min_probability + 1)
529 //////////////////////////////////////////////////////////////////
530 class RunLengthCoder {
531 public:
532 RunLengthCoder(int log_max_run_length, const int* parameters);
533 void SetEncoder(BitEncoder* encoder) {
534 assert(decoder_ == NULL);
535 encoder_ = encoder;
537 void SetDecoder(BitDecoder* decoder) {
538 assert(encoder_ == NULL);
539 decoder_ = decoder;
542 void Encode(unsigned char byte, int64 run_length) {
543 assert(encoder_ != NULL);
544 assert(run_length >= 1);
545 Code(byte, &run_length);
547 int64 Decode(unsigned char byte) {
548 assert(decoder_ != NULL);
549 int64 run_length = 0;
550 Code(byte, &run_length);
551 assert(run_length >= 1);
552 return run_length;
555 static int64 SizeInBytes(int log_max_run_length, const int* parameters) {
556 int num_contexts = (log_max_run_length * (log_max_run_length + 1)) >> 1;
557 return 32
558 + BitSequencePredictorSet::SizeInBytes(256, parameters[0])
559 + BitSequencePredictorSet::SizeInBytes(log_max_run_length,
560 parameters[3])
561 + BitSequencePredictorSet::SizeInBytes(num_contexts,
562 parameters[6]);
564 int64 Size() const {
565 return 32 + first_bit_predictor_.Size()
566 + unary_part_predictor_.Size() + significant_bits_predictor_.Size();
568 private:
569 int64 max_run_length_;
570 BitSequencePredictorSet first_bit_predictor_;
571 BitSequencePredictorSet unary_part_predictor_;
572 BitSequencePredictorSet significant_bits_predictor_;
573 BitEncoder* encoder_;
574 BitDecoder* decoder_;
576 void Code(unsigned char byte, int64* run_length);
579 //////////////////////////////////////////////////////////////////
580 // RunLengthCompressor is a byte stream compressor
581 // based on run length encoding followed by entropy encoding.
583 // The character of each run is encoded with CharCoder
584 // and the run length with RunLengthCoder.
585 //////////////////////////////////////////////////////////////////
586 class RunLengthCompressor : public StreamCompressor {
587 public:
589 class Options : public StreamCompressor::OptionsBase {
590 public:
591 static const int kNumParameters = 13;
592 static const int kLogMaxRunLength = 31; // max run length = 2^32 - 1
593 Options() { Set(""); }
594 virtual bool Set(const std::string& options);
595 virtual std::string Get() const;
596 virtual int64 SizeInBytes() const {
597 return 1024 + CharCoder::SizeInBytes(parameters_)
598 + RunLengthCoder::SizeInBytes(kLogMaxRunLength, parameters_ + 4)
599 + 4800; // statistics
601 virtual StreamCompressor* GetCompressor() {
602 if (verbosity > 1) {
603 std::clog << "Using RunLengthCompressor with options=" << Get()
604 << std::endl;
606 return new RunLengthCompressor(parameters_);
608 private:
609 int parameters_[kNumParameters];
612 explicit RunLengthCompressor(const int *parameters)
613 : char_coder_(parameters),
614 run_length_coder_(Options::kLogMaxRunLength, parameters + 4) {}
615 virtual ~RunLengthCompressor() {}
617 virtual void ConnectPrivate(OutStreamBuffer* out) {
618 statistics_collector.StartEncoding();
619 encoder_.Connect(out);
620 char_coder_.SetEncoder(&encoder_);
621 run_length_coder_.SetEncoder(&encoder_);
623 virtual void DisconnectPrivate() {
624 run_length_coder_.SetEncoder(NULL);
625 char_coder_.SetEncoder(NULL);
626 encoder_.Disconnect();
627 statistics_collector.EndEncoding();
630 virtual void WriteBegin() {
631 current_run_length_ = 0;
632 statistics_collector.StartBlock();
634 virtual void WriteEndPrivate() {
635 EncodeRun(current_char_, current_run_length_);
636 current_run_length_ = 0;
637 encoder_.Finish();
638 statistics_collector.EndBlock();
640 virtual void Write(const char* bytes, size_t n);
642 private:
643 CharCoder char_coder_;
644 RunLengthCoder run_length_coder_;
645 BitEncoder encoder_;
646 unsigned char current_char_;
647 int64 current_run_length_;
649 void EncodeRun(unsigned char byte, int64 run_length);
651 class StatisticsCollector {
652 public:
653 void StartEncoding();
654 void EndEncoding();
655 void StartBlock();
656 void EndBlock();
657 void RecordRun(unsigned char byte, int64 run_length);
658 private:
659 static const int kLogLongRunLength = 6;
660 static const int kLongRunLength = 1 << kLogLongRunLength;
661 int64 num_chars_[256];
662 int64 num_runs_by_char_[256];
663 int64 num_short_runs_by_length_[kLongRunLength];
664 int64 num_long_runs_by_log_length_[64 - kLogLongRunLength];
666 StatisticsCollector statistics_collector;
668 RunLengthCompressor(const RunLengthCompressor&);
669 RunLengthCompressor& operator=(const RunLengthCompressor&);
672 //////////////////////////////////////////////////////////////////
673 // RunLengthDecompressor decompresses data compressed with
674 // RunLengthCompressor
675 //////////////////////////////////////////////////////////////////
676 class RunLengthDecompressor : public StreamDecompressor {
677 public:
678 static const int kNumParameters = 13;
679 static const int kLogMaxRunLength = 31; // max run length = 2^32 - 1
680 static StreamDecompressor* Create(const std::string& options);
681 explicit RunLengthDecompressor(const int* parameters)
682 : char_coder_(parameters),
683 run_length_coder_(kLogMaxRunLength, parameters + 4) {}
684 virtual ~RunLengthDecompressor() {}
686 virtual void ConnectPrivate(InStreamBuffer* out) {
687 decoder_.Connect(out);
688 char_coder_.SetDecoder(&decoder_);
689 run_length_coder_.SetDecoder(&decoder_);
691 virtual void DisconnectPrivate() {
692 run_length_coder_.SetDecoder(NULL);
693 char_coder_.SetDecoder(NULL);
694 decoder_.Disconnect();
697 virtual void ReadBegin() {
698 decoder_.Start();
699 current_run_length_ = 0;
701 virtual void ReadEnd() { current_run_length_ = 0; }
702 virtual void Read(char* bytes, size_t n);
704 virtual int64 SizeInBytes() const {
705 return 1024 + char_coder_.Size() + run_length_coder_.Size();
708 private:
709 CharCoder char_coder_;
710 RunLengthCoder run_length_coder_;
711 BitDecoder decoder_;
712 unsigned char current_char_;
713 int64 current_run_length_;
715 void DecodeRun(unsigned char* byte, int64* run_length);
717 RunLengthDecompressor(const RunLengthDecompressor&);
718 RunLengthDecompressor& operator=(const RunLengthDecompressor&);
721 } // namespace dcsbwt
723 #endif // DCSBWT_RL_COMPRESS_H__