liblzma: Prevent uninitialzed warning in mt stream encoder.
[xz/debian.git] / src / liblzma / lz / lz_encoder_mf.c
blob1fdc2d794909885d3220aebdd4ab509ee1266e58
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 /// \file lz_encoder_mf.c
4 /// \brief Match finders
5 ///
6 // Authors: Igor Pavlov
7 // Lasse Collin
8 //
9 // This file has been put into the public domain.
10 // You can do whatever you want with this file.
12 ///////////////////////////////////////////////////////////////////////////////
14 #include "lz_encoder.h"
15 #include "lz_encoder_hash.h"
16 #include "memcmplen.h"
19 /// \brief Find matches starting from the current byte
20 ///
21 /// \return The length of the longest match found
22 extern uint32_t
23 lzma_mf_find(lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches)
25 // Call the match finder. It returns the number of length-distance
26 // pairs found.
27 // FIXME: Minimum count is zero, what _exactly_ is the maximum?
28 const uint32_t count = mf->find(mf, matches);
30 // Length of the longest match; assume that no matches were found
31 // and thus the maximum length is zero.
32 uint32_t len_best = 0;
34 if (count > 0) {
35 #ifndef NDEBUG
36 // Validate the matches.
37 for (uint32_t i = 0; i < count; ++i) {
38 assert(matches[i].len <= mf->nice_len);
39 assert(matches[i].dist < mf->read_pos);
40 assert(memcmp(mf_ptr(mf) - 1,
41 mf_ptr(mf) - matches[i].dist - 2,
42 matches[i].len) == 0);
44 #endif
46 // The last used element in the array contains
47 // the longest match.
48 len_best = matches[count - 1].len;
50 // If a match of maximum search length was found, try to
51 // extend the match to maximum possible length.
52 if (len_best == mf->nice_len) {
53 // The limit for the match length is either the
54 // maximum match length supported by the LZ-based
55 // encoder or the number of bytes left in the
56 // dictionary, whichever is smaller.
57 uint32_t limit = mf_avail(mf) + 1;
58 if (limit > mf->match_len_max)
59 limit = mf->match_len_max;
61 // Pointer to the byte we just ran through
62 // the match finder.
63 const uint8_t *p1 = mf_ptr(mf) - 1;
65 // Pointer to the beginning of the match. We need -1
66 // here because the match distances are zero based.
67 const uint8_t *p2 = p1 - matches[count - 1].dist - 1;
69 len_best = lzma_memcmplen(p1, p2, len_best, limit);
73 *count_ptr = count;
75 // Finally update the read position to indicate that match finder was
76 // run for this dictionary offset.
77 ++mf->read_ahead;
79 return len_best;
83 /// Hash value to indicate unused element in the hash. Since we start the
84 /// positions from dict_size + 1, zero is always too far to qualify
85 /// as usable match position.
86 #define EMPTY_HASH_VALUE 0
89 /// Normalization must be done when lzma_mf.offset + lzma_mf.read_pos
90 /// reaches MUST_NORMALIZE_POS.
91 #define MUST_NORMALIZE_POS UINT32_MAX
94 /// \brief Normalizes hash values
95 ///
96 /// The hash arrays store positions of match candidates. The positions are
97 /// relative to an arbitrary offset that is not the same as the absolute
98 /// offset in the input stream. The relative position of the current byte
99 /// is lzma_mf.offset + lzma_mf.read_pos. The distances of the matches are
100 /// the differences of the current read position and the position found from
101 /// the hash.
103 /// To prevent integer overflows of the offsets stored in the hash arrays,
104 /// we need to "normalize" the stored values now and then. During the
105 /// normalization, we drop values that indicate distance greater than the
106 /// dictionary size, thus making space for new values.
107 static void
108 normalize(lzma_mf *mf)
110 assert(mf->read_pos + mf->offset == MUST_NORMALIZE_POS);
112 // In future we may not want to touch the lowest bits, because there
113 // may be match finders that use larger resolution than one byte.
114 const uint32_t subvalue
115 = (MUST_NORMALIZE_POS - mf->cyclic_size);
116 // & ~((UINT32_C(1) << 10) - 1);
118 for (uint32_t i = 0; i < mf->hash_count; ++i) {
119 // If the distance is greater than the dictionary size,
120 // we can simply mark the hash element as empty.
121 if (mf->hash[i] <= subvalue)
122 mf->hash[i] = EMPTY_HASH_VALUE;
123 else
124 mf->hash[i] -= subvalue;
127 for (uint32_t i = 0; i < mf->sons_count; ++i) {
128 // Do the same for mf->son.
130 // NOTE: There may be uninitialized elements in mf->son.
131 // Valgrind may complain that the "if" below depends on
132 // an uninitialized value. In this case it is safe to ignore
133 // the warning. See also the comments in lz_encoder_init()
134 // in lz_encoder.c.
135 if (mf->son[i] <= subvalue)
136 mf->son[i] = EMPTY_HASH_VALUE;
137 else
138 mf->son[i] -= subvalue;
141 // Update offset to match the new locations.
142 mf->offset -= subvalue;
144 return;
148 /// Mark the current byte as processed from point of view of the match finder.
149 static void
150 move_pos(lzma_mf *mf)
152 if (++mf->cyclic_pos == mf->cyclic_size)
153 mf->cyclic_pos = 0;
155 ++mf->read_pos;
156 assert(mf->read_pos <= mf->write_pos);
158 if (unlikely(mf->read_pos + mf->offset == UINT32_MAX))
159 normalize(mf);
163 /// When flushing, we cannot run the match finder unless there is nice_len
164 /// bytes available in the dictionary. Instead, we skip running the match
165 /// finder (indicating that no match was found), and count how many bytes we
166 /// have ignored this way.
168 /// When new data is given after the flushing was completed, the match finder
169 /// is restarted by rewinding mf->read_pos backwards by mf->pending. Then
170 /// the missed bytes are added to the hash using the match finder's skip
171 /// function (with small amount of input, it may start using mf->pending
172 /// again if flushing).
174 /// Due to this rewinding, we don't touch cyclic_pos or test for
175 /// normalization. It will be done when the match finder's skip function
176 /// catches up after a flush.
177 static void
178 move_pending(lzma_mf *mf)
180 ++mf->read_pos;
181 assert(mf->read_pos <= mf->write_pos);
182 ++mf->pending;
186 /// Calculate len_limit and determine if there is enough input to run
187 /// the actual match finder code. Sets up "cur" and "pos". This macro
188 /// is used by all find functions and binary tree skip functions. Hash
189 /// chain skip function doesn't need len_limit so a simpler code is used
190 /// in them.
191 #define header(is_bt, len_min, ret_op) \
192 uint32_t len_limit = mf_avail(mf); \
193 if (mf->nice_len <= len_limit) { \
194 len_limit = mf->nice_len; \
195 } else if (len_limit < (len_min) \
196 || (is_bt && mf->action == LZMA_SYNC_FLUSH)) { \
197 assert(mf->action != LZMA_RUN); \
198 move_pending(mf); \
199 ret_op; \
201 const uint8_t *cur = mf_ptr(mf); \
202 const uint32_t pos = mf->read_pos + mf->offset
205 /// Header for find functions. "return 0" indicates that zero matches
206 /// were found.
207 #define header_find(is_bt, len_min) \
208 header(is_bt, len_min, return 0); \
209 uint32_t matches_count = 0
212 /// Header for a loop in a skip function. "continue" tells to skip the rest
213 /// of the code in the loop.
214 #define header_skip(is_bt, len_min) \
215 header(is_bt, len_min, continue)
218 /// Calls hc_find_func() or bt_find_func() and calculates the total number
219 /// of matches found. Updates the dictionary position and returns the number
220 /// of matches found.
221 #define call_find(func, len_best) \
222 do { \
223 matches_count = (uint32_t)(func(len_limit, pos, cur, cur_match, \
224 mf->depth, mf->son, \
225 mf->cyclic_pos, mf->cyclic_size, \
226 matches + matches_count, len_best) \
227 - matches); \
228 move_pos(mf); \
229 return matches_count; \
230 } while (0)
233 ////////////////
234 // Hash Chain //
235 ////////////////
237 #if defined(HAVE_MF_HC3) || defined(HAVE_MF_HC4)
240 /// \param len_limit Don't look for matches longer than len_limit.
241 /// \param pos lzma_mf.read_pos + lzma_mf.offset
242 /// \param cur Pointer to current byte (mf_ptr(mf))
243 /// \param cur_match Start position of the current match candidate
244 /// \param depth Maximum length of the hash chain
245 /// \param son lzma_mf.son (contains the hash chain)
246 /// \param cyclic_pos lzma_mf.cyclic_pos
247 /// \param cyclic_size lzma_mf_cyclic_size
248 /// \param matches Array to hold the matches.
249 /// \param len_best The length of the longest match found so far.
250 static lzma_match *
251 hc_find_func(
252 const uint32_t len_limit,
253 const uint32_t pos,
254 const uint8_t *const cur,
255 uint32_t cur_match,
256 uint32_t depth,
257 uint32_t *const son,
258 const uint32_t cyclic_pos,
259 const uint32_t cyclic_size,
260 lzma_match *matches,
261 uint32_t len_best)
263 son[cyclic_pos] = cur_match;
265 while (true) {
266 const uint32_t delta = pos - cur_match;
267 if (depth-- == 0 || delta >= cyclic_size)
268 return matches;
270 const uint8_t *const pb = cur - delta;
271 cur_match = son[cyclic_pos - delta
272 + (delta > cyclic_pos ? cyclic_size : 0)];
274 if (pb[len_best] == cur[len_best] && pb[0] == cur[0]) {
275 uint32_t len = lzma_memcmplen(pb, cur, 1, len_limit);
277 if (len_best < len) {
278 len_best = len;
279 matches->len = len;
280 matches->dist = delta - 1;
281 ++matches;
283 if (len == len_limit)
284 return matches;
291 #define hc_find(len_best) \
292 call_find(hc_find_func, len_best)
295 #define hc_skip() \
296 do { \
297 mf->son[mf->cyclic_pos] = cur_match; \
298 move_pos(mf); \
299 } while (0)
301 #endif
304 #ifdef HAVE_MF_HC3
305 extern uint32_t
306 lzma_mf_hc3_find(lzma_mf *mf, lzma_match *matches)
308 header_find(false, 3);
310 hash_3_calc();
312 const uint32_t delta2 = pos - mf->hash[hash_2_value];
313 const uint32_t cur_match = mf->hash[FIX_3_HASH_SIZE + hash_value];
315 mf->hash[hash_2_value] = pos;
316 mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
318 uint32_t len_best = 2;
320 if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
321 len_best = lzma_memcmplen(cur - delta2, cur,
322 len_best, len_limit);
324 matches[0].len = len_best;
325 matches[0].dist = delta2 - 1;
326 matches_count = 1;
328 if (len_best == len_limit) {
329 hc_skip();
330 return 1; // matches_count
334 hc_find(len_best);
338 extern void
339 lzma_mf_hc3_skip(lzma_mf *mf, uint32_t amount)
341 do {
342 if (mf_avail(mf) < 3) {
343 move_pending(mf);
344 continue;
347 const uint8_t *cur = mf_ptr(mf);
348 const uint32_t pos = mf->read_pos + mf->offset;
350 hash_3_calc();
352 const uint32_t cur_match
353 = mf->hash[FIX_3_HASH_SIZE + hash_value];
355 mf->hash[hash_2_value] = pos;
356 mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
358 hc_skip();
360 } while (--amount != 0);
362 #endif
365 #ifdef HAVE_MF_HC4
366 extern uint32_t
367 lzma_mf_hc4_find(lzma_mf *mf, lzma_match *matches)
369 header_find(false, 4);
371 hash_4_calc();
373 uint32_t delta2 = pos - mf->hash[hash_2_value];
374 const uint32_t delta3
375 = pos - mf->hash[FIX_3_HASH_SIZE + hash_3_value];
376 const uint32_t cur_match = mf->hash[FIX_4_HASH_SIZE + hash_value];
378 mf->hash[hash_2_value ] = pos;
379 mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
380 mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
382 uint32_t len_best = 1;
384 if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
385 len_best = 2;
386 matches[0].len = 2;
387 matches[0].dist = delta2 - 1;
388 matches_count = 1;
391 if (delta2 != delta3 && delta3 < mf->cyclic_size
392 && *(cur - delta3) == *cur) {
393 len_best = 3;
394 matches[matches_count++].dist = delta3 - 1;
395 delta2 = delta3;
398 if (matches_count != 0) {
399 len_best = lzma_memcmplen(cur - delta2, cur,
400 len_best, len_limit);
402 matches[matches_count - 1].len = len_best;
404 if (len_best == len_limit) {
405 hc_skip();
406 return matches_count;
410 if (len_best < 3)
411 len_best = 3;
413 hc_find(len_best);
417 extern void
418 lzma_mf_hc4_skip(lzma_mf *mf, uint32_t amount)
420 do {
421 if (mf_avail(mf) < 4) {
422 move_pending(mf);
423 continue;
426 const uint8_t *cur = mf_ptr(mf);
427 const uint32_t pos = mf->read_pos + mf->offset;
429 hash_4_calc();
431 const uint32_t cur_match
432 = mf->hash[FIX_4_HASH_SIZE + hash_value];
434 mf->hash[hash_2_value] = pos;
435 mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
436 mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
438 hc_skip();
440 } while (--amount != 0);
442 #endif
445 /////////////////
446 // Binary Tree //
447 /////////////////
449 #if defined(HAVE_MF_BT2) || defined(HAVE_MF_BT3) || defined(HAVE_MF_BT4)
450 static lzma_match *
451 bt_find_func(
452 const uint32_t len_limit,
453 const uint32_t pos,
454 const uint8_t *const cur,
455 uint32_t cur_match,
456 uint32_t depth,
457 uint32_t *const son,
458 const uint32_t cyclic_pos,
459 const uint32_t cyclic_size,
460 lzma_match *matches,
461 uint32_t len_best)
463 uint32_t *ptr0 = son + (cyclic_pos << 1) + 1;
464 uint32_t *ptr1 = son + (cyclic_pos << 1);
466 uint32_t len0 = 0;
467 uint32_t len1 = 0;
469 while (true) {
470 const uint32_t delta = pos - cur_match;
471 if (depth-- == 0 || delta >= cyclic_size) {
472 *ptr0 = EMPTY_HASH_VALUE;
473 *ptr1 = EMPTY_HASH_VALUE;
474 return matches;
477 uint32_t *const pair = son + ((cyclic_pos - delta
478 + (delta > cyclic_pos ? cyclic_size : 0))
479 << 1);
481 const uint8_t *const pb = cur - delta;
482 uint32_t len = my_min(len0, len1);
484 if (pb[len] == cur[len]) {
485 len = lzma_memcmplen(pb, cur, len + 1, len_limit);
487 if (len_best < len) {
488 len_best = len;
489 matches->len = len;
490 matches->dist = delta - 1;
491 ++matches;
493 if (len == len_limit) {
494 *ptr1 = pair[0];
495 *ptr0 = pair[1];
496 return matches;
501 if (pb[len] < cur[len]) {
502 *ptr1 = cur_match;
503 ptr1 = pair + 1;
504 cur_match = *ptr1;
505 len1 = len;
506 } else {
507 *ptr0 = cur_match;
508 ptr0 = pair;
509 cur_match = *ptr0;
510 len0 = len;
516 static void
517 bt_skip_func(
518 const uint32_t len_limit,
519 const uint32_t pos,
520 const uint8_t *const cur,
521 uint32_t cur_match,
522 uint32_t depth,
523 uint32_t *const son,
524 const uint32_t cyclic_pos,
525 const uint32_t cyclic_size)
527 uint32_t *ptr0 = son + (cyclic_pos << 1) + 1;
528 uint32_t *ptr1 = son + (cyclic_pos << 1);
530 uint32_t len0 = 0;
531 uint32_t len1 = 0;
533 while (true) {
534 const uint32_t delta = pos - cur_match;
535 if (depth-- == 0 || delta >= cyclic_size) {
536 *ptr0 = EMPTY_HASH_VALUE;
537 *ptr1 = EMPTY_HASH_VALUE;
538 return;
541 uint32_t *pair = son + ((cyclic_pos - delta
542 + (delta > cyclic_pos ? cyclic_size : 0))
543 << 1);
544 const uint8_t *pb = cur - delta;
545 uint32_t len = my_min(len0, len1);
547 if (pb[len] == cur[len]) {
548 len = lzma_memcmplen(pb, cur, len + 1, len_limit);
550 if (len == len_limit) {
551 *ptr1 = pair[0];
552 *ptr0 = pair[1];
553 return;
557 if (pb[len] < cur[len]) {
558 *ptr1 = cur_match;
559 ptr1 = pair + 1;
560 cur_match = *ptr1;
561 len1 = len;
562 } else {
563 *ptr0 = cur_match;
564 ptr0 = pair;
565 cur_match = *ptr0;
566 len0 = len;
572 #define bt_find(len_best) \
573 call_find(bt_find_func, len_best)
575 #define bt_skip() \
576 do { \
577 bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, \
578 mf->son, mf->cyclic_pos, \
579 mf->cyclic_size); \
580 move_pos(mf); \
581 } while (0)
583 #endif
586 #ifdef HAVE_MF_BT2
587 extern uint32_t
588 lzma_mf_bt2_find(lzma_mf *mf, lzma_match *matches)
590 header_find(true, 2);
592 hash_2_calc();
594 const uint32_t cur_match = mf->hash[hash_value];
595 mf->hash[hash_value] = pos;
597 bt_find(1);
601 extern void
602 lzma_mf_bt2_skip(lzma_mf *mf, uint32_t amount)
604 do {
605 header_skip(true, 2);
607 hash_2_calc();
609 const uint32_t cur_match = mf->hash[hash_value];
610 mf->hash[hash_value] = pos;
612 bt_skip();
614 } while (--amount != 0);
616 #endif
619 #ifdef HAVE_MF_BT3
620 extern uint32_t
621 lzma_mf_bt3_find(lzma_mf *mf, lzma_match *matches)
623 header_find(true, 3);
625 hash_3_calc();
627 const uint32_t delta2 = pos - mf->hash[hash_2_value];
628 const uint32_t cur_match = mf->hash[FIX_3_HASH_SIZE + hash_value];
630 mf->hash[hash_2_value] = pos;
631 mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
633 uint32_t len_best = 2;
635 if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
636 len_best = lzma_memcmplen(
637 cur, cur - delta2, len_best, len_limit);
639 matches[0].len = len_best;
640 matches[0].dist = delta2 - 1;
641 matches_count = 1;
643 if (len_best == len_limit) {
644 bt_skip();
645 return 1; // matches_count
649 bt_find(len_best);
653 extern void
654 lzma_mf_bt3_skip(lzma_mf *mf, uint32_t amount)
656 do {
657 header_skip(true, 3);
659 hash_3_calc();
661 const uint32_t cur_match
662 = mf->hash[FIX_3_HASH_SIZE + hash_value];
664 mf->hash[hash_2_value] = pos;
665 mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
667 bt_skip();
669 } while (--amount != 0);
671 #endif
674 #ifdef HAVE_MF_BT4
675 extern uint32_t
676 lzma_mf_bt4_find(lzma_mf *mf, lzma_match *matches)
678 header_find(true, 4);
680 hash_4_calc();
682 uint32_t delta2 = pos - mf->hash[hash_2_value];
683 const uint32_t delta3
684 = pos - mf->hash[FIX_3_HASH_SIZE + hash_3_value];
685 const uint32_t cur_match = mf->hash[FIX_4_HASH_SIZE + hash_value];
687 mf->hash[hash_2_value] = pos;
688 mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
689 mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
691 uint32_t len_best = 1;
693 if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
694 len_best = 2;
695 matches[0].len = 2;
696 matches[0].dist = delta2 - 1;
697 matches_count = 1;
700 if (delta2 != delta3 && delta3 < mf->cyclic_size
701 && *(cur - delta3) == *cur) {
702 len_best = 3;
703 matches[matches_count++].dist = delta3 - 1;
704 delta2 = delta3;
707 if (matches_count != 0) {
708 len_best = lzma_memcmplen(
709 cur, cur - delta2, len_best, len_limit);
711 matches[matches_count - 1].len = len_best;
713 if (len_best == len_limit) {
714 bt_skip();
715 return matches_count;
719 if (len_best < 3)
720 len_best = 3;
722 bt_find(len_best);
726 extern void
727 lzma_mf_bt4_skip(lzma_mf *mf, uint32_t amount)
729 do {
730 header_skip(true, 4);
732 hash_4_calc();
734 const uint32_t cur_match
735 = mf->hash[FIX_4_HASH_SIZE + hash_value];
737 mf->hash[hash_2_value] = pos;
738 mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
739 mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
741 bt_skip();
743 } while (--amount != 0);
745 #endif