modified: makefile
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / bbb_compress.h
blob6b114641c7feb69043f403ad5a0983977f884a41
1 // (C) 2006, Matt Mahoney, mmahoney (at) cs.fit.edu
2 // Copyright 2007 Google Inc.
3 //
4 // This program is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU General Public License
6 // as published by the Free Software Foundation; either version 2
7 // of the License, or (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; if not, write to the Free Software
16 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // This is an algorithm for entropy compression of the Burrows-Wheeler
18 // transform of a text. It was originally written by Matt Mahoney as
19 // bbb.cpp - big block BWT compressor version 1, Aug. 31, 2006.
20 // http://cs.fit.edu/~mmahoney/compression/bbb.cpp
22 // Only the entropy compression of bbb.cpp is included here.
23 // Other parts, including the computation of the transform, are not included.
24 // The entropy compressor has been fitted with an interface that
25 // ties it with the rest of the dcs-bwt-compressor.
27 // The class Predictor has seen the following modifications:
28 // - static variables in member function p() have been changed into
29 // data members of Predictor.
30 // - The APM chain has been modified a bit.
31 // Here is the original documentation of the entropy coding with
32 // the changes to bbb.cpp added in square brackets.
34 // ENTROPY CODING
36 // BWT data is best coded with an order 0 model. The transformed text tends
37 // to have long runs of identical bytes (e.g. "nnbaaa"). The BWT data is
38 // modeled with a modified PAQ with just one context (no mixing) followed
39 // by a 5 stage SSE (APM) and bitwise arithmetic coding. Modeling typically
40 // takes about as much time as sorting and unsorting in slow mode.
41 // The model uses about 5 MB memory.
42 // [ Now reduced to about 256KB of memory. ]
44 // The order 0 model consists of a mapping:
46 // order 1, 2, 3 contexts ----------+
47 // V
48 // order 0 context -> bit history -> p -> APM chain -> arithmetic coder
49 // t1 sm
51 // Bits are coded one at a time. The arithmetic coder maintains a range
52 // [lo, hi), initially [0, 1) and repeatedly subdivides the range in proportion
53 // to p(0), p(1), the next bit probabilities predicted by the model. The final
54 // output is the shortest base 256 number x such that lo <= x < hi. As the
55 // leading bytes of x become known, they are output. To decompress, the model
56 // predictions are repeated as during compression, then the actual bit is
57 // determined by which half of the subrange contains x.
59 // The model inputs a bytewise order 0 context consisting of the last 0 to 7
60 // bits of the current byte, plus the number of bits. There are a total of
61 // 255 possible bitwise contexts. For each context, a table (t1) maintains
62 // an 8 bit state representing the history of 0 and 1 bits previously seen.
63 // This history is mapped by another table (a StateMap sm) to a probability,
64 // p, that the next bit will be 1. This table is adaptive: after each
65 // prediction, the mapping (state -> p) is adjusted to improve the last
66 // prediction.
68 // The output of the StateMap is passed through a series of 6 more adaptive
69 // tables, (Adaptive Probability Maps, or APM) each of which maps a context
70 // and the input probability to an output probability. The input probability
71 // is interpolated between 33 bins on a nonlinear scale with smaller bins
72 // near 0 and 1. After each prediction, the corresponding table entries
73 // on both sides of p are adjusted to improve the last prediction.
74 // The APM chain is like this:
76 // + A11 ->+ +--->---+ +--->---+
77 // | | | | | |
78 // p ->+ +-> A2 -> A3 +-> A4 -+-+-> A5 -+-> Encoder
79 // | |
80 // + A12 ->+
82 // [ The APM chain has been modified into:
84 // +--->---+ +--->---+
85 // | | | |
86 // p --> A2 -> A3 +-> A5 -+-+-> A4 -+-> Encoder
88 // ]
90 // A11 and A12 both take c0 (the preceding bits of the current byte) as
91 // additional context, but one is fast adapting and the other is slow
92 // adapting. Their outputs are averaged.
94 // A2 is an order 1 context (previous byte and current partial byte).
95 // [ A2 has been modified so that it uses only two bits of information
96 // from the previous byte: what is the bit in the current bit position
97 // and whether the preceding bits are same or different from c0. ]
99 // A3 takes the previous (but not current) byte as context, plus 2 bits
100 // that depend on the current run length (0, 1, 2-3, or 4+), the number
101 // of times the last byte was repeated.
102 // [ A3 now only takes the two bits on run length. ]
104 // A4 takes the current byte and the low 5 bits of the second byte back.
105 // The output is averaged with 3/4 weight to the A3 output with 1/4 weight.
106 // [ A4 has been moved after A5, it takes only the current byte (not the
107 // 5 additional bits), and the averaging weights are 1/2 and 1/2. ]
109 // A5 takes a 14 bit hash of an order 3 context (last 3 bytes plus
110 // current partial byte) and is averaged with 1/2 weight to the A4 output.
111 // [ A5 takes now 11 bit hash of an order 4 context. ]
113 // The StateMap, state table, APM, Encoder, and associated code (Array,
114 // squash(), stretch()) are taken from PAQ8 with minor non-functional
115 // changes (e.g. removing global context).
118 #ifndef DCSBWT_BBB_COMPRESS_H__
119 #define DCSBWT_BBB_COMPRESS_H__
#include "stream_compressor.h"
#include "inttypes.h"

#include <stdint.h>   // uintptr_t

#include <algorithm>  // std::min
#include <cassert>
#include <cstdio>     // fprintf
#include <cstdlib>    // calloc, free, exit
#include <cstring>    // memcpy
#include <string>
127 namespace dcsbwt {
129 namespace bbb {
131 //////////////////////////////////////////////////////////////////
132 // Array<T, ALIGN> a(n); creates n elements of T initialized to 0 bits.
133 // Constructors for T are not called.
134 // Indexing is bounds checked if assertions are on.
135 // a.size() returns n.
136 // a.resize(n) changes size to n, padding with 0 bits or truncating.
137 // Copy and assignment are not supported.
138 // Memory is aligned on a ALIGN byte boundary (power of 2), default is none.
139 //////////////////////////////////////////////////////////////////
140 template <class T, int ALIGN=0> class Array {
141 public:
142 explicit Array(int i=0) { create(i);}
143 ~Array();
144 T& operator[](int i) {
145 assert(i >= 0);
146 assert(i < n);
147 return data[i];
149 const T& operator[](int i) const {
150 assert(i >= 0);
151 assert(i < n);
152 return data[i];
154 int size() const { return n; }
155 void resize(int i); // change size to i
156 private:
157 int n; // user size
158 int reserved; // actual size
159 char *ptr; // allocated memory, zeroed
160 T* data; // start of n elements of aligned data
162 void create(int i); // create with size i
164 Array(const Array&); // no copy or assignment
165 Array& operator=(const Array&);
168 template<class T, int ALIGN> void Array<T, ALIGN>::resize(int i) {
169 if (i<=reserved) {
170 n=i;
171 return;
173 char *saveptr=ptr;
174 T *savedata=data;
175 int saven=n;
176 create(i);
177 if (savedata && saveptr) {
178 memcpy(data, savedata, sizeof(T)*std::min(i, saven));
179 free(saveptr);
183 template<class T, int ALIGN> void Array<T, ALIGN>::create(int i) {
184 n=reserved=i;
185 if (i<=0) {
186 data=0;
187 ptr=0;
188 return;
190 const int sz=ALIGN+n*sizeof(T);
191 ptr = (char*)calloc(sz, 1);
192 if (!ptr) fprintf(stderr, "Out of memory\n"), exit(1);
193 data = (ALIGN ? (T*)(ptr+ALIGN-(((long)ptr)&(ALIGN-1))) : (T*)ptr);
194 assert((char*)data >= ptr);
195 assert((char*)data <= ptr+ALIGN);
198 template<class T, int ALIGN> Array<T, ALIGN>::~Array() {
199 free(ptr);
202 //////////////////////////////////////////////////////////////////
203 // A StateMap maps a nonstationary counter state to a probability.
204 // After each mapping, the mapping is adjusted to improve future
205 // predictions. Methods:
207 // sm.p(y, cx) converts state cx (0-255) to a probability (0-4095),
208 // and trains by updating the previous prediction with y (0-1).
210 // Counter state -> probability * 256
211 //////////////////////////////////////////////////////////////////
212 class StateMap {
213 public:
214 StateMap();
215 int p(int y, int cx) {
216 assert(cx>=0 && cx<t.size());
217 t[cxt]+=((y<<16)-t[cxt]+128) >> 8;
218 return t[cxt=cx] >> 4;
220 private:
221 int cxt; // context
222 Array<uint16> t; // 256 states -> probability * 64K
226 //////////////////////////////////////////////////////////////////
227 // APM maps a probability and a context into a new probability
228 // that bit y will next be 1. After each guess it updates
229 // its state to improve future guesses. Methods:
231 // APM a(N) creates with N contexts, uses 66*N bytes memory.
232 // a.p(y, pr, cx, rate=8) returned adjusted probability in context cx (0 to
233 // N-1). rate determines the learning rate (smaller = faster, default 8).
234 // Probabilities are scaled 12 bits (0-4095). Update on last bit y (0-1).
235 //////////////////////////////////////////////////////////////////
236 class APM {
237 public:
238 explicit APM(int n);
239 int p(int y, int pr=2048, int cxt=0, int rate=8);
241 private:
242 int index; // last p, context
243 const int N; // number of contexts
244 Array<uint16> t; // [N][33]: p, context -> p
247 //////////////////////////// Predictor //////////////////////////
248 class Predictor {
249 public:
250 Predictor(): pr(2048), c0(1), c4(0), bpos(0), t1(256), cp(&t1[0]),
251 run(0), runcxt(0),
252 // a11(256), a12(256), a2(65536), a3(1024), a4(8192), a5(16384)
253 a2(1024), a3(4), a5(2048), a4(256) {}
255 int p() const {return pr;}
256 void update(int y);
258 private:
259 int pr; // next return value of p() (0-4095)
260 int c0; // bitwise context: last 0-7 bits with a leading 1 (1-255)
261 uint32 c4; // last 4 whole bytes, last is in low 8 bits
262 int bpos; // number of bits in c0 (0-7)
263 Array<uint8> t1; // context -> state
264 StateMap sm; // state -> pr
265 uint8* cp; // context pointer
266 int run; // count of consecutive identical bytes (0-65535)
267 int runcxt; // (0-3) if run is 0, 1, 2-3, 4+
268 //APM a11, a12, a2, a3, a4, a5;
269 APM a2, a3, a5, a4;
272 } // namespace bbb
274 class BbbCompressor : public StreamCompressor {
275 public:
276 class Options : public StreamCompressor::OptionsBase {
277 public:
278 virtual bool Set(const std::string& options) {
279 return 0 == options.length();
281 virtual std::string Get() const { return std::string(); }
282 int64 SizeInBytes() const { return 256 << 10; }
283 virtual StreamCompressor* GetCompressor() { return new BbbCompressor; }
286 BbbCompressor() {}
287 virtual ~BbbCompressor() { }
289 virtual void WriteBegin();
290 virtual void WriteEndPrivate();
291 virtual void Write(const char* bytes, size_t n) {
292 for (; n; --n) { CompressByte(*bytes++); }
295 private:
296 bbb::Predictor predictor_;
297 uint32 x1_;
298 uint32 x2_;
301 void CompressByte(unsigned char byte) {
302 for (int i=7; i>=0; --i) { CompressBit((byte>>i)&1); }
304 void CompressBit(int bit);
308 class BbbDecompressor : public StreamDecompressor {
309 public:
310 static StreamDecompressor* Create(const std::string&) {
311 return new BbbDecompressor;
313 BbbDecompressor() {}
314 virtual ~BbbDecompressor() {}
316 virtual void ReadBegin();
317 virtual void ReadEnd() {}
318 virtual void Read(char* bytes, size_t n) {
319 for (; n; --n) *bytes++ = GetUncompressedByte();
322 virtual int64 SizeInBytes() const { return 6 << 20; }
324 private:
325 bbb::Predictor predictor_;
326 uint32 x1_;
327 uint32 x2_;
328 uint32 x_;
330 unsigned char GetUncompressedByte() {
331 int byte = GetUncompressedBit();
332 for (int i = 7; i; --i) byte = (byte<<1) + GetUncompressedBit();
333 return byte;
335 int GetUncompressedBit();
338 } // namespace dcsbwt
340 #endif // DCSBWT_BBB_COMPRESS_H__