drivers/block/drbd/drbd_vli.h

   1 /*
   2 -*- linux-c -*-
   3    drbd_vli.h
   4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10    drbd is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    drbd is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with drbd; see the file COPYING.  If not, write to
  22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25 #ifndef _DRBD_VLI_H
  26 #define _DRBD_VLI_H
  27
  28 /*
  29  * At a granularity of 4KiB storage represented per bit,
  30  * and storage sizes of several TiB,
  31  * and possibly small-bandwidth replication,
  32  * the bitmap transfer time can take much too long,
  33  * if transmitted in plain text.
  34  *
  35  * We try to reduce the transferred bitmap information
  36  * by encoding runlengths of bit polarity.
  37  *
  38  * We never actually need to encode a "zero" (runlengths are positive).
  39  * But then we have to store the value of the first bit.
  40  * The first bit of information thus shall encode if the first runlength
  41  * gives the number of set or unset bits.
  42  *
  43  * We assume that large areas are either completely set or unset,
  44  * which gives good compression with any runlength method,
  45  * even when encoding the runlength as fixed size 32bit/64bit integers.
  46  *
  47  * Still, there may be areas where the polarity flips every few bits,
  48  * and encoding the runlength sequence of those areas with fix size
  49  * integers would be much worse than plaintext.
  50  *
  51  * We want to encode small runlength values with minimum code length,
  52  * while still being able to encode a Huge run of all zeros.
  53  *
  54  * Thus we need a Variable Length Integer encoding, VLI.
  55  *
  56  * For some cases, we produce more code bits than plaintext input.
  57  * We need to send incompressible chunks as plaintext, skip over them
  58  * and then see if the next chunk compresses better.
  59  *
  60  * We don't care too much about "excellent" compression ratio for large
  61  * runlengths (all set/all clear): whether we achieve a factor of 100
  62  * or 1000 is not that much of an issue.
  63  * We do not want to waste too much on short runlengths in the "noisy"
  64  * parts of the bitmap, though.
  65  *
  66  * There are endless variants of VLI, we experimented with:
  67  *  * simple byte-based
  68  *  * various bit based with different code word length.
  69  *
  70  * To avoid yet another configuration parameter (choice of bitmap compression
  71  * algorithm) which was difficult to explain and tune, we just chose the one
  72  * variant that turned out best in all test cases.
  73  * Based on real world usage patterns, with device sizes ranging from a few GiB
  74  * to several TiB, file server/mailserver/webserver/mysql/postgresql,
  75  * mostly idle to really busy, the all time winner (though sometimes only
  76  * marginally better) is:
  77  */
  78
  79 /*
  80  * encoding is "visualised" as
  81  * __little endian__ bitstream, least significant bit first (left most)
  82  *
  83  * this particular encoding is chosen so that the prefix code
  84  * starts as unary encoding the level, then modified so that
  85  * 10 levels can be described in 8bit, with minimal overhead
  86  * for the smaller levels.
  87  *
  88  * Number of data bits follow fibonacci sequence, with the exception of the
  89  * last level (+1 data bit, so it makes 64bit total).  The only case where the
  90  * code is longer than the plain input is a runlength of 1: 1 plain bit => 2 code bits.
  91 prefix    data bits                                    max val  Nº data bits
  92 0 x                                                         0x2            1
  93 10 x                                                        0x4            1
  94 110 xx                                                      0x8            2
  95 1110 xxx                                                   0x10            3
  96 11110 xxx xx                                               0x30            5
  97 111110 xx xxxxxx                                          0x130            8
  98 11111100  xxxxxxxx xxxxx                                 0x2130           13
  99 11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
 100 11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
 101 11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
 102  * maximum encodable value: 0x100000400202130 == 2**56 + some */
 103
 104 /* compression "table":
 105  transmitted   x                                0.29
 106  as plaintext x                                  ........................
 107              x                                   ........................
 108             x                                    ........................
 109            x    0.59                         0.21........................
 110           x      ........................................................
 111          x       .. c ...................................................
 112         x    0.44.. o ...................................................
 113        x .......... d ...................................................
 114       x  .......... e ...................................................
 115      X.............   ...................................................
 116     x.............. b ...................................................
 117 2.0x............... i ...................................................
 118  #X................ t ...................................................
 119  #................. s ...........................  plain bits  ..........
 120 -+-----------------------------------------------------------------------
 121  1             16              32                              64
 122 */
 123
 124 /* LEVEL: (total bits, prefix bits, prefix value),
 125  * sorted ascending by number of total bits.
 126  * The rest of the code table is calculated at compiletime from this. */
 127
 128 /* fibonacci data 1, 1, ... */
 129 #define VLI_L_1_1() do { \
 130         LEVEL( 2, 1, 0x00); \
 131         LEVEL( 3, 2, 0x01); \
 132         LEVEL( 5, 3, 0x03); \
 133         LEVEL( 7, 4, 0x07); \
 134         LEVEL(10, 5, 0x0f); \
 135         LEVEL(14, 6, 0x1f); \
 136         LEVEL(21, 8, 0x3f); \
 137         LEVEL(29, 8, 0x7f); \
 138         LEVEL(42, 8, 0xbf); \
 139         LEVEL(64, 8, 0xff); \
 140         } while (0)
 141
 142 /* finds a suitable level to decode the least significant part of in.
 143  * returns number of bits consumed.
 144  *
 145  * BUG() for bad input, as that would mean a buggy code table. */
 146 static inline int vli_decode_bits(u64 *out, const u64 in)
 147 {
 148         u64 adj = 1;
 149
 150 #define LEVEL(t,b,v)                                    \
 151         do {                                            \
 152                 if ((in & ((1 << b) -1)) == v) {        \
 153                         *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
 154                         return t;                       \
 155                 }                                       \
 156                 adj += 1ULL << (t - b);                 \
 157         } while (0)
 158
 159         VLI_L_1_1();
 160
 161         /* NOT REACHED, if VLI_LEVELS code table is defined properly */
 162         BUG();
 163 #undef LEVEL
 164 }
 165
 166 /* return number of code bits needed,
 167  * or negative error number */
 168 static inline int __vli_encode_bits(u64 *out, const u64 in)
 169 {
 170         u64 max = 0;
 171         u64 adj = 1;
 172
 173         if (in == 0)
 174                 return -EINVAL;
 175
 176 #define LEVEL(t,b,v) do {               \
 177                 max += 1ULL << (t - b); \
 178                 if (in <= max) {        \
 179                         if (out)        \
 180                                 *out = ((in - adj) << b) | v;   \
 181                         return t;       \
 182                 }                       \
 183                 adj = max + 1;          \
 184         } while (0)
 185
 186         VLI_L_1_1();
 187
 188         return -EOVERFLOW;
 189 #undef LEVEL
 190 }
 191
 192 #undef VLI_L_1_1
 193
 194 /* code from here down is independent of the actually used bit code */
 195
 196 /*
 197  * Code length is determined by some unique (e.g. unary) prefix.
 198  * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
 199  * not a byte stream.
 200  */
 201
 202 /* for the bitstream, we need a cursor */
 203 struct bitstream_cursor {
 204         /* the current byte */
 205         u8 *b;
 206         /* the current bit within *b, nomalized: 0..7 */
 207         unsigned int bit;
 208 };
 209
 210 /* initialize cursor to point to first bit of stream */
 211 static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
 212 {
 213         cur->b = s;
 214         cur->bit = 0;
 215 }
 216
 217 /* advance cursor by that many bits; maximum expected input value: 64,
 218  * but depending on VLI implementation, it may be more. */
 219 static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
 220 {
 221         bits += cur->bit;
 222         cur->b = cur->b + (bits >> 3);
 223         cur->bit = bits & 7;
 224 }
 225
 226 /* the bitstream itself knows its length */
 227 struct bitstream {
 228         struct bitstream_cursor cur;
 229         unsigned char *buf;
 230         size_t buf_len;         /* in bytes */
 231
 232         /* for input stream:
 233          * number of trailing 0 bits for padding
 234          * total number of valid bits in stream: buf_len * 8 - pad_bits */
 235         unsigned int pad_bits;
 236 };
 237
 238 static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
 239 {
 240         bs->buf = s;
 241         bs->buf_len = len;
 242         bs->pad_bits = pad_bits;
 243         bitstream_cursor_reset(&bs->cur, bs->buf);
 244 }
 245
 246 static inline void bitstream_rewind(struct bitstream *bs)
 247 {
 248         bitstream_cursor_reset(&bs->cur, bs->buf);
 249         memset(bs->buf, 0, bs->buf_len);
 250 }
 251
 252 /* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 253  * Ignores "pad_bits".
 254  * Returns zero if bits == 0 (nothing to do).
 255  * Returns number of bits used if successful.
 256  *
 257  * If there is not enough room left in bitstream,
 258  * leaves bitstream unchanged and returns -ENOBUFS.
 259  */
 260 static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
 261 {
 262         unsigned char *b = bs->cur.b;
 263         unsigned int tmp;
 264
 265         if (bits == 0)
 266                 return 0;
 267
 268         if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
 269                 return -ENOBUFS;
 270
 271         /* paranoia: strip off hi bits; they should not be set anyways. */
 272         if (bits < 64)
 273                 val &= ~0ULL >> (64 - bits);
 274
 275         *b++ |= (val & 0xff) << bs->cur.bit;
 276
 277         for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
 278                 *b++ |= (val >> tmp) & 0xff;
 279
 280         bitstream_cursor_advance(&bs->cur, bits);
 281         return bits;
 282 }
 283
 284 /* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 285  *
 286  * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
 287  *
 288  * If there are less than the requested number of valid bits left in the
 289  * bitstream, still fetches all available bits.
 290  *
 291  * Returns number of actually fetched bits.
 292  */
 293 static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
 294 {
 295         u64 val;
 296         unsigned int n;
 297
 298         if (bits > 64)
 299                 return -EINVAL;
 300
 301         if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
 302                 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
 303                         - bs->cur.bit - bs->pad_bits;
 304
 305         if (bits == 0) {
 306                 *out = 0;
 307                 return 0;
 308         }
 309
 310         /* get the high bits */
 311         val = 0;
 312         n = (bs->cur.bit + bits + 7) >> 3;
 313         /* n may be at most 9, if cur.bit + bits > 64 */
 314         /* which means this copies at most 8 byte */
 315         if (n) {
 316                 memcpy(&val, bs->cur.b+1, n - 1);
 317                 val = le64_to_cpu(val) << (8 - bs->cur.bit);
 318         }
 319
 320         /* we still need the low bits */
 321         val |= bs->cur.b[0] >> bs->cur.bit;
 322
 323         /* and mask out bits we don't want */
 324         val &= ~0ULL >> (64 - bits);
 325
 326         bitstream_cursor_advance(&bs->cur, bits);
 327         *out = val;
 328
 329         return bits;
 330 }
 331
 332 /* encodes @in as vli into @bs;
 333
 334  * return values
 335  *  > 0: number of bits successfully stored in bitstream
 336  * -ENOBUFS @bs is full
 337  * -EINVAL input zero (invalid)
 338  * -EOVERFLOW input too large for this vli code (invalid)
 339  */
 340 static inline int vli_encode_bits(struct bitstream *bs, u64 in)
 341 {
 342         u64 code = code;
 343         int bits = __vli_encode_bits(&code, in);
 344
 345         if (bits <= 0)
 346                 return bits;
 347
 348         return bitstream_put_bits(bs, code, bits);
 349 }
 350
 351 #endif