modified: makefile
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / bbb_compress.h
blob6b114641c7feb69043f403ad5a0983977f884a41
1 // (C) 2006, Matt Mahoney, mmahoney (at) cs.fit.edu
2 // Copyright 2007 Google Inc.
3 //
4 // This program is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU General Public License
6 // as published by the Free Software Foundation; either version 2
7 // of the License, or (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; if not, write to the Free Software
16 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // This is an algorithm for entropy compression of the Burrows-Wheeler
18 // transform of a text. It was originally written by Matt Mahoney as
19 // bbb.cpp - big block BWT compressor version 1, Aug. 31, 2006.
20 // http://cs.fit.edu/~mmahoney/compression/bbb.cpp
22 // Only the entropy compression of bbb.cpp is included here.
23 // Other parts, including the computation of the transform, are not included.
24 // The entropy compressor has been fitted with an interface that
25 // ties it with the rest of the dcs-bwt-compressor.
27 // The class Predictor has seen the following modifications:
28 // - static variables in member function p() have been changed into
29 // data members of Predictor.
30 // - The APM chain has been modified a bit.
31 // Here is the original documentation of the entropy coding with
32 // the changes to bbb.cpp added in square brackets.
34 // ENTROPY CODING
36 // BWT data is best coded with an order 0 model. The transformed text tends
37 // to have long runs of identical bytes (e.g. "nnbaaa"). The BWT data is
38 // modeled with a modified PAQ with just one context (no mixing) followed
39 // by a 5 stage SSE (APM) and bitwise arithmetic coding. Modeling typically
40 // takes about as much time as sorting and unsorting in slow mode.
41 // The model uses about 5 MB memory.
42 // [ Now reduced to about 256KB of memory. ]
44 // The order 0 model consists of a mapping:
46 // order 1, 2, 3 contexts ----------+
47 // V
48 // order 0 context -> bit history -> p -> APM chain -> arithmetic coder
49 // t1 sm
51 // Bits are coded one at a time. The arithmetic coder maintains a range
52 // [lo, hi), initially [0, 1) and repeatedly subdivides the range in proportion
53 // to p(0), p(1), the next bit probabilities predicted by the model. The final
54 // output is the shortest base 256 number x such that lo <= x < hi. As the
55 // leading bytes of x become known, they are output. To decompress, the model
56 // predictions are repeated as during compression, then the actual bit is
57 // determined by which half of the subrange contains x.
59 // The model inputs a bytewise order 0 context consisting of the last 0 to 7
60 // bits of the current byte, plus the number of bits. There are a total of
61 // 255 possible bitwise contexts. For each context, a table (t1) maintains
62 // an 8 bit state representing the history of 0 and 1 bits previously seen.
63 // This history is mapped by another table (a StateMap sm) to a probability,
64 // p, that the next bit will be 1. This table is adaptive: after each
65 // prediction, the mapping (state -> p) is adjusted to improve the last
66 // prediction.
68 // The output of the StateMap is passed through a series of 6 more adaptive
69 // tables, (Adaptive Probability Maps, or APM) each of which maps a context
70 // and the input probability to an output probability. The input probability
71 // is interpolated between 33 bins on a nonlinear scale with smaller bins
72 // near 0 and 1. After each prediction, the corresponding table entries
73 // on both sides of p are adjusted to improve the last prediction.
74 // The APM chain is like this:
76 // + A11 ->+ +--->---+ +--->---+
77 // | | | | | |
78 // p ->+ +-> A2 -> A3 +-> A4 -+-+-> A5 -+-> Encoder
79 // | |
80 // + A12 ->+
82 // [ The APM chain has been modified into:
84 // +--->---+ +--->---+
85 // | | | |
86 // p --> A2 -> A3 +-> A5 -+-+-> A4 -+-> Encoder
88 // ]
90 // A11 and A12 both take c0 (the preceding bits of the current byte) as
91 // additional context, but one is fast adapting and the other is slow
92 // adapting. Their outputs are averaged.
94 // A2 is an order 1 context (previous byte and current partial byte).
95 // [ A2 has been modified so that it uses only two bits of information
96 // from the previous byte: what is the bit in the current bit position
97 // and whether the preceding bits are same or different from c0. ]
99 // A3 takes the previous (but not current) byte as context, plus 2 bits
100 // that depend on the current run length (0, 1, 2-3, or 4+), the number
101 // of times the last byte was repeated.
102 // [ A3 now only takes the two bits on run length. ]
104 // A4 takes the current byte and the low 5 bits of the second byte back.
105 // The output is averaged with 3/4 weight to the A3 output with 1/4 weight.
106 // [ A4 has been moved after A5, it takes only the current byte (not the
107 // 5 additional bits), and the averaging weights are 1/2 and 1/2. ]
109 // A5 takes a 14 bit hash of an order 3 context (last 3 bytes plus
110 // current partial byte) and is averaged with 1/2 weight to the A4 output.
111 // [ A5 takes now 11 bit hash of an order 4 context. ]
113 // The StateMap, state table, APM, Encoder, and associated code (Array,
114 // squash(), stretch()) are taken from PAQ8 with minor non-functional
115 // changes (e.g. removing global context).
118 #ifndef DCSBWT_BBB_COMPRESS_H__
119 #define DCSBWT_BBB_COMPRESS_H__
#include "stream_compressor.h"
#include "inttypes.h"

#include <stdint.h>   // uintptr_t

#include <algorithm>  // std::min
#include <cassert>
#include <cstdio>     // fprintf
#include <cstdlib>    // calloc, free, exit
#include <cstring>    // memcpy
#include <string>
127 namespace dcsbwt {
129 namespace bbb {
131 //////////////////////////////////////////////////////////////////
132 // Array<T, ALIGN> a(n); creates n elements of T initialized to 0 bits.
133 // Constructors for T are not called.
134 // Indexing is bounds checked if assertions are on.
135 // a.size() returns n.
136 // a.resize(n) changes size to n, padding with 0 bits or truncating.
137 // Copy and assignment are not supported.
138 // Memory is aligned on a ALIGN byte boundary (power of 2), default is none.
139 //////////////////////////////////////////////////////////////////
140 template <class T, int ALIGN=0> class Array {
141 public:
142 explicit Array(int i=0) { create(i);}
143 ~Array();
144 T& operator[](int i) {
145 assert(i >= 0);
146 assert(i < n);
147 return data[i];
149 const T& operator[](int i) const {
150 assert(i >= 0);
151 assert(i < n);
152 return data[i];
154 int size() const { return n; }
155 void resize(int i); // change size to i
156 private:
157 int n; // user size
158 int reserved; // actual size
159 char *ptr; // allocated memory, zeroed
160 T* data; // start of n elements of aligned data
162 void create(int i); // create with size i
164 Array(const Array&); // no copy or assignment
165 Array& operator=(const Array&);
168 template<class T, int ALIGN> void Array<T, ALIGN>::resize(int i) {
169 if (i<=reserved) {
170 n=i;
171 return;
173 char *saveptr=ptr;
174 T *savedata=data;
175 int saven=n;
176 create(i);
177 if (savedata && saveptr) {
178 memcpy(data, savedata, sizeof(T)*std::min(i, saven));
179 free(saveptr);
183 template<class T, int ALIGN> void Array<T, ALIGN>::create(int i) {
184 n=reserved=i;
185 if (i<=0) {
186 data=0;
187 ptr=0;
188 return;
190 const int sz=ALIGN+n*sizeof(T);
191 ptr = (char*)calloc(sz, 1);
192 if (!ptr) fprintf(stderr, "Out of memory\n"), exit(1);
193 data = (ALIGN ? (T*)(ptr+ALIGN-(((long)ptr)&(ALIGN-1))) : (T*)ptr);
194 assert((char*)data >= ptr);
195 assert((char*)data <= ptr+ALIGN);
198 template<class T, int ALIGN> Array<T, ALIGN>::~Array() {
199 free(ptr);
202 //////////////////////////////////////////////////////////////////
203 // A StateMap maps a nonstationary counter state to a probability.
204 // After each mapping, the mapping is adjusted to improve future
205 // predictions. Methods:
207 // sm.p(y, cx) converts state cx (0-255) to a probability (0-4095),
208 // and trains by updating the previous prediction with y (0-1).
210 // Counter state -> probability * 256
211 //////////////////////////////////////////////////////////////////
212 class StateMap {
213 public:
214 StateMap();
215 int p(int y, int cx) {
216 assert(cx>=0 && cx<t.size());
217 t[cxt]+=((y<<16)-t[cxt]+128) >> 8;
218 return t[cxt=cx] >> 4;
220 private:
221 int cxt; // context
222 Array<uint16> t; // 256 states -> probability * 64K
226 //////////////////////////////////////////////////////////////////
227 // APM maps a probability and a context into a new probability
228 // that bit y will next be 1. After each guess it updates
229 // its state to improve future guesses. Methods:
231 // APM a(N) creates with N contexts, uses 66*N bytes memory.
232 // a.p(y, pr, cx, rate=8) returned adjusted probability in context cx (0 to
233 // N-1). rate determines the learning rate (smaller = faster, default 8).
234 // Probabilities are scaled 12 bits (0-4095). Update on last bit y (0-1).
235 //////////////////////////////////////////////////////////////////
236 class APM {
237 public:
238 explicit APM(int n);
239 int p(int y, int pr=2048, int cxt=0, int rate=8);
241 private:
242 int index; // last p, context
243 const int N; // number of contexts
244 Array<uint16> t; // [N][33]: p, context -> p
247 //////////////////////////// Predictor //////////////////////////
248 class Predictor {
249 public:
250 Predictor(): pr(2048), c0(1), c4(0), bpos(0), t1(256), cp(&t1[0]),
251 run(0), runcxt(0),
252 // a11(256), a12(256), a2(65536), a3(1024), a4(8192), a5(16384)
253 a2(1024), a3(4), a5(2048), a4(256) {}
255 int p() const {return pr;}
256 void update(int y);
258 private:
259 int pr; // next return value of p() (0-4095)
260 int c0; // bitwise context: last 0-7 bits with a leading 1 (1-255)
261 uint32 c4; // last 4 whole bytes, last is in low 8 bits
262 int bpos; // number of bits in c0 (0-7)
263 Array<uint8> t1; // context -> state
264 StateMap sm; // state -> pr
265 uint8* cp; // context pointer
266 int run; // count of consecutive identical bytes (0-65535)
267 int runcxt; // (0-3) if run is 0, 1, 2-3, 4+
268 //APM a11, a12, a2, a3, a4, a5;
269 APM a2, a3, a5, a4;
272 } // namespace bbb
274 class BbbCompressor : public StreamCompressor {
275 public:
276 class Options : public StreamCompressor::OptionsBase {
277 public:
278 virtual bool Set(const std::string& options) {
279 return 0 == options.length();
281 virtual std::string Get() const { return std::string(); }
282 int64 SizeInBytes() const { return 256 << 10; }
283 virtual StreamCompressor* GetCompressor() { return new BbbCompressor; }
286 BbbCompressor() {}
287 virtual ~BbbCompressor() { }
289 virtual void WriteBegin();
290 virtual void WriteEndPrivate();
291 virtual void Write(const char* bytes, size_t n) {
292 for (; n; --n) { CompressByte(*bytes++); }
295 private:
296 bbb::Predictor predictor_;
297 uint32 x1_;
298 uint32 x2_;
301 void CompressByte(unsigned char byte) {
302 for (int i=7; i>=0; --i) { CompressBit((byte>>i)&1); }
304 void CompressBit(int bit);
308 class BbbDecompressor : public StreamDecompressor {
309 public:
310 static StreamDecompressor* Create(const std::string&) {
311 return new BbbDecompressor;
313 BbbDecompressor() {}
314 virtual ~BbbDecompressor() {}
316 virtual void ReadBegin();
317 virtual void ReadEnd() {}
318 virtual void Read(char* bytes, size_t n) {
319 for (; n; --n) *bytes++ = GetUncompressedByte();
322 virtual int64 SizeInBytes() const { return 6 << 20; }
324 private:
325 bbb::Predictor predictor_;
326 uint32 x1_;
327 uint32 x2_;
328 uint32 x_;
330 unsigned char GetUncompressedByte() {
331 int byte = GetUncompressedBit();
332 for (int i = 7; i; --i) byte = (byte<<1) + GetUncompressedBit();
333 return byte;
335 int GetUncompressedBit();
338 } // namespace dcsbwt
340 #endif // DCSBWT_BBB_COMPRESS_H__