Factor out function to decode 2 hex digits
[xapian.git] / xapian-core / common / bitstream.cc
blobd95ec9cadc1ed5e2bcdc6d1c967ca9e30579f47e
1 /** @file
2 * @brief Classes to encode/decode a bitstream.
3 */
4 /* Copyright (C) 2004,2005,2006,2008,2013,2014,2016,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "bitstream.h"
26 #include <xapian/types.h>
28 #include "omassert.h"
29 #include "pack.h"
31 #include <cmath>
32 #include <vector>
34 using namespace std;
36 // Highly optimised fls() implementation.
37 template<typename T>
38 static inline int
39 highest_order_bit(T mask)
41 #ifdef HAVE_DO_CLZ
42 return mask ? sizeof(T) * 8 - do_clz(mask) : 0;
43 #else
44 static const unsigned char flstab[256] = {
45 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
46 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
47 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
48 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
49 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
50 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
51 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
52 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
53 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
54 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
55 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
56 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
57 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
58 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
59 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
60 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
63 int result = 0;
64 if (sizeof(T) > 4) {
65 if (mask >= 0x100000000ul) {
66 mask >>= 32;
67 result += 32;
70 if (mask >= 0x10000u) {
71 mask >>= 16;
72 result += 16;
74 if (mask >= 0x100u) {
75 mask >>= 8;
76 result += 8;
78 return result + flstab[mask];
79 #endif
82 namespace Xapian {
84 /// Shift left that's safe for shifts wider than the type.
85 template<typename T, typename U>
86 static constexpr inline
87 T safe_shl(T x, U shift)
89 return (shift >= sizeof(T) * 8 ? 0 : x << shift);
92 void
93 BitWriter::encode(Xapian::termpos value, Xapian::termpos outof)
95 Assert(value < outof);
96 unsigned bits = highest_order_bit(outof - Xapian::termpos(1));
97 const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
98 if (spare) {
99 /* If we have spare values, we can use one fewer bit to encode some
100 * values. We shorten the values in the middle of the range, as
101 * testing (on positional data) shows this works best. "Managing
102 * Gigabytes" suggests reversing this for the lowest level and encoding
103 * the end values of the range shorter, which is contrary to our
104 * testing (MG is talking about posting lists, which probably have
105 * different characteristics).
107 * For example, if outof is 11, the codes emitted are:
109 * value output
110 * 0 0000
111 * 1 0001
112 * 2 0010
113 * 3 011
114 * 4 100
115 * 5 101
116 * 6 110
117 * 7 111
118 * 8 1000
119 * 9 1001
120 * 10 1010
122 * Note the LSB comes first in the bitstream, so these codes need to be
123 * suffix-free to be decoded.
125 const Xapian::termpos mid_start = (outof - spare) / 2;
126 if (value >= mid_start + spare) {
127 value = (value - (mid_start + spare)) |
128 (Xapian::termpos(1) << (bits - 1));
129 } else if (value >= mid_start) {
130 --bits;
134 if (bits + n_bits > sizeof(acc) * 8) {
135 // We need to write more bits than there's empty room for in
136 // the accumulator. So we arrange to shift out 8 bits, then
137 // adjust things so we're adding 8 fewer bits.
138 Assert(bits <= sizeof(acc) * 8);
139 acc |= (value << n_bits);
140 buf += char(acc);
141 acc >>= 8;
142 value >>= 8;
143 bits -= 8;
145 acc |= (value << n_bits);
146 n_bits += bits;
147 while (n_bits >= 8) {
148 buf += char(acc);
149 acc >>= 8;
150 n_bits -= 8;
154 void
155 BitWriter::encode_interpolative(const vector<Xapian::termpos>& pos, int j, int k)
157 // "Interpolative code" - for an algorithm description, see "Managing
158 // Gigabytes" - pages 126-127 in the second edition. You can probably
159 // view those pages in google books.
160 while (j + 1 < k) {
161 const Xapian::termpos mid = j + (k - j) / 2;
162 // Encode one out of (pos[k] - pos[j] + 1) values
163 // (less some at either end because we must be able to fit
164 // all the intervening pos in)
165 const Xapian::termpos outof = pos[k] - pos[j] + j - k + 1;
166 const Xapian::termpos lowest = pos[j] + mid - j;
167 encode(pos[mid] - lowest, outof);
168 encode_interpolative(pos, j, mid);
169 j = mid;
173 Xapian::termpos
174 BitReader::decode(Xapian::termpos outof, bool force)
176 (void)force;
177 Assert(force == di_current.is_initialized());
178 Xapian::termpos bits = highest_order_bit(outof - Xapian::termpos(1));
179 const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
180 const Xapian::termpos mid_start = (outof - spare) / 2;
181 Xapian::termpos p;
182 if (spare) {
183 p = read_bits(bits - 1);
184 if (p < mid_start) {
185 if (read_bits(1)) p += mid_start + spare;
187 } else {
188 p = read_bits(bits);
190 Assert(p < outof);
191 return p;
194 Xapian::termpos
195 BitReader::read_bits(int count)
197 Xapian::termpos result;
198 if (count > int(sizeof(acc) * 8 - 7)) {
199 // If we need more than 7 bits less than fit in acc do the read in two
200 // goes to ensure that we don't overflow acc. This is a little more
201 // conservative than it needs to be, but such large values will
202 // inevitably be rare (because you can't fit very many of them into
203 // the full Xapian::termpos range).
204 Assert(count <= int(sizeof(acc) * 8));
205 const size_t half_the_bits = sizeof(acc) * 4;
206 result = read_bits(half_the_bits);
207 return result | (read_bits(count - half_the_bits) << half_the_bits);
209 while (n_bits < count) {
210 Assert(idx < buf.size());
211 unsigned char byte = buf[idx++];
212 acc |= Xapian::termpos(byte) << n_bits;
213 n_bits += 8;
215 result = acc & ((Xapian::termpos(1) << count) - Xapian::termpos(1));
216 acc >>= count;
217 n_bits -= count;
218 return result;
221 void
222 BitReader::decode_interpolative(int j, int k,
223 Xapian::termpos pos_j, Xapian::termpos pos_k)
225 Assert(!di_current.is_initialized());
226 di_stack.reserve(highest_order_bit(pos_k - pos_j));
227 di_current.set_j(j, pos_j);
228 di_current.set_k(k, pos_k);
231 Xapian::termpos
232 BitReader::decode_interpolative_next()
234 Assert(di_current.is_initialized());
235 while (!di_stack.empty() || di_current.is_next()) {
236 if (!di_current.is_next()) {
237 Xapian::termpos pos_ret = di_current.pos_k;
238 di_current = di_stack.back();
239 di_stack.pop_back();
240 int mid = (di_current.j + di_current.k) / 2;
241 di_current.set_j(mid, pos_ret);
242 return pos_ret;
244 di_stack.push_back(di_current);
245 int mid = (di_current.j + di_current.k) / 2;
246 Xapian::termpos pos_mid = decode(di_current.outof(), true) +
247 (di_current.pos_j + mid - di_current.j);
248 di_current.set_k(mid, pos_mid);
250 #ifdef XAPIAN_ASSERTIONS
251 di_current.uninit();
252 #endif
253 return di_current.pos_k;