2 * Copyright (c) Yann Collet, Facebook, Inc.
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
11 #include "zstd_compress_internal.h"
12 #include "zstd_lazy.h"
15 /*-*************************************
17 ***************************************/
20 ZSTD_updateDUBT(ZSTD_matchState_t
* ms
,
21 const BYTE
* ip
, const BYTE
* iend
,
24 const ZSTD_compressionParameters
* const cParams
= &ms
->cParams
;
25 U32
* const hashTable
= ms
->hashTable
;
26 U32
const hashLog
= cParams
->hashLog
;
28 U32
* const bt
= ms
->chainTable
;
29 U32
const btLog
= cParams
->chainLog
- 1;
30 U32
const btMask
= (1 << btLog
) - 1;
32 const BYTE
* const base
= ms
->window
.base
;
33 U32
const target
= (U32
)(ip
- base
);
34 U32 idx
= ms
->nextToUpdate
;
37 DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
38 idx
, target
, ms
->window
.dictLimit
);
39 assert(ip
+ 8 <= iend
); /* condition for ZSTD_hashPtr */
42 assert(idx
>= ms
->window
.dictLimit
); /* condition for valid base+idx */
43 for ( ; idx
< target
; idx
++) {
44 size_t const h
= ZSTD_hashPtr(base
+ idx
, hashLog
, mls
); /* assumption : ip + 8 <= iend */
45 U32
const matchIndex
= hashTable
[h
];
47 U32
* const nextCandidatePtr
= bt
+ 2*(idx
&btMask
);
48 U32
* const sortMarkPtr
= nextCandidatePtr
+ 1;
50 DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx
);
51 hashTable
[h
] = idx
; /* Update Hash Table */
52 *nextCandidatePtr
= matchIndex
; /* update BT like a chain */
53 *sortMarkPtr
= ZSTD_DUBT_UNSORTED_MARK
;
55 ms
->nextToUpdate
= target
;
59 /* ZSTD_insertDUBT1() :
60 * sort one already inserted but unsorted position
61 * assumption : curr >= btlow == (curr - btmask)
64 ZSTD_insertDUBT1(const ZSTD_matchState_t
* ms
,
65 U32 curr
, const BYTE
* inputEnd
,
66 U32 nbCompares
, U32 btLow
,
67 const ZSTD_dictMode_e dictMode
)
69 const ZSTD_compressionParameters
* const cParams
= &ms
->cParams
;
70 U32
* const bt
= ms
->chainTable
;
71 U32
const btLog
= cParams
->chainLog
- 1;
72 U32
const btMask
= (1 << btLog
) - 1;
73 size_t commonLengthSmaller
=0, commonLengthLarger
=0;
74 const BYTE
* const base
= ms
->window
.base
;
75 const BYTE
* const dictBase
= ms
->window
.dictBase
;
76 const U32 dictLimit
= ms
->window
.dictLimit
;
77 const BYTE
* const ip
= (curr
>=dictLimit
) ? base
+ curr
: dictBase
+ curr
;
78 const BYTE
* const iend
= (curr
>=dictLimit
) ? inputEnd
: dictBase
+ dictLimit
;
79 const BYTE
* const dictEnd
= dictBase
+ dictLimit
;
80 const BYTE
* const prefixStart
= base
+ dictLimit
;
82 U32
* smallerPtr
= bt
+ 2*(curr
&btMask
);
83 U32
* largerPtr
= smallerPtr
+ 1;
84 U32 matchIndex
= *smallerPtr
; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
85 U32 dummy32
; /* to be nullified at the end */
86 U32
const windowValid
= ms
->window
.lowLimit
;
87 U32
const maxDistance
= 1U << cParams
->windowLog
;
88 U32
const windowLow
= (curr
- windowValid
> maxDistance
) ? curr
- maxDistance
: windowValid
;
91 DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
92 curr
, dictLimit
, windowLow
);
93 assert(curr
>= btLow
);
94 assert(ip
< iend
); /* condition for ZSTD_count */
96 for (; nbCompares
&& (matchIndex
> windowLow
); --nbCompares
) {
97 U32
* const nextPtr
= bt
+ 2*(matchIndex
& btMask
);
98 size_t matchLength
= MIN(commonLengthSmaller
, commonLengthLarger
); /* guaranteed minimum nb of common bytes */
99 assert(matchIndex
< curr
);
100 /* note : all candidates are now supposed sorted,
101 * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
102 * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
104 if ( (dictMode
!= ZSTD_extDict
)
105 || (matchIndex
+matchLength
>= dictLimit
) /* both in current segment*/
106 || (curr
< dictLimit
) /* both in extDict */) {
107 const BYTE
* const mBase
= ( (dictMode
!= ZSTD_extDict
)
108 || (matchIndex
+matchLength
>= dictLimit
)) ?
110 assert( (matchIndex
+matchLength
>= dictLimit
) /* might be wrong if extDict is incorrectly set to 0 */
111 || (curr
< dictLimit
) );
112 match
= mBase
+ matchIndex
;
113 matchLength
+= ZSTD_count(ip
+matchLength
, match
+matchLength
, iend
);
115 match
= dictBase
+ matchIndex
;
116 matchLength
+= ZSTD_count_2segments(ip
+matchLength
, match
+matchLength
, iend
, dictEnd
, prefixStart
);
117 if (matchIndex
+matchLength
>= dictLimit
)
118 match
= base
+ matchIndex
; /* preparation for next read of match[matchLength] */
121 DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
122 curr
, matchIndex
, (U32
)matchLength
);
124 if (ip
+matchLength
== iend
) { /* equal : no way to know if inf or sup */
125 break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
128 if (match
[matchLength
] < ip
[matchLength
]) { /* necessarily within buffer */
129 /* match is smaller than current */
130 *smallerPtr
= matchIndex
; /* update smaller idx */
131 commonLengthSmaller
= matchLength
; /* all smaller will now have at least this guaranteed common length */
132 if (matchIndex
<= btLow
) { smallerPtr
=&dummy32
; break; } /* beyond tree size, stop searching */
133 DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
134 matchIndex
, btLow
, nextPtr
[1]);
135 smallerPtr
= nextPtr
+1; /* new "candidate" => larger than match, which was smaller than target */
136 matchIndex
= nextPtr
[1]; /* new matchIndex, larger than previous and closer to current */
138 /* match is larger than current */
139 *largerPtr
= matchIndex
;
140 commonLengthLarger
= matchLength
;
141 if (matchIndex
<= btLow
) { largerPtr
=&dummy32
; break; } /* beyond tree size, stop searching */
142 DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
143 matchIndex
, btLow
, nextPtr
[0]);
145 matchIndex
= nextPtr
[0];
148 *smallerPtr
= *largerPtr
= 0;
153 ZSTD_DUBT_findBetterDictMatch (
154 const ZSTD_matchState_t
* ms
,
155 const BYTE
* const ip
, const BYTE
* const iend
,
160 const ZSTD_dictMode_e dictMode
)
162 const ZSTD_matchState_t
* const dms
= ms
->dictMatchState
;
163 const ZSTD_compressionParameters
* const dmsCParams
= &dms
->cParams
;
164 const U32
* const dictHashTable
= dms
->hashTable
;
165 U32
const hashLog
= dmsCParams
->hashLog
;
166 size_t const h
= ZSTD_hashPtr(ip
, hashLog
, mls
);
167 U32 dictMatchIndex
= dictHashTable
[h
];
169 const BYTE
* const base
= ms
->window
.base
;
170 const BYTE
* const prefixStart
= base
+ ms
->window
.dictLimit
;
171 U32
const curr
= (U32
)(ip
-base
);
172 const BYTE
* const dictBase
= dms
->window
.base
;
173 const BYTE
* const dictEnd
= dms
->window
.nextSrc
;
174 U32
const dictHighLimit
= (U32
)(dms
->window
.nextSrc
- dms
->window
.base
);
175 U32
const dictLowLimit
= dms
->window
.lowLimit
;
176 U32
const dictIndexDelta
= ms
->window
.lowLimit
- dictHighLimit
;
178 U32
* const dictBt
= dms
->chainTable
;
179 U32
const btLog
= dmsCParams
->chainLog
- 1;
180 U32
const btMask
= (1 << btLog
) - 1;
181 U32
const btLow
= (btMask
>= dictHighLimit
- dictLowLimit
) ? dictLowLimit
: dictHighLimit
- btMask
;
183 size_t commonLengthSmaller
=0, commonLengthLarger
=0;
186 assert(dictMode
== ZSTD_dictMatchState
);
188 for (; nbCompares
&& (dictMatchIndex
> dictLowLimit
); --nbCompares
) {
189 U32
* const nextPtr
= dictBt
+ 2*(dictMatchIndex
& btMask
);
190 size_t matchLength
= MIN(commonLengthSmaller
, commonLengthLarger
); /* guaranteed minimum nb of common bytes */
191 const BYTE
* match
= dictBase
+ dictMatchIndex
;
192 matchLength
+= ZSTD_count_2segments(ip
+matchLength
, match
+matchLength
, iend
, dictEnd
, prefixStart
);
193 if (dictMatchIndex
+matchLength
>= dictHighLimit
)
194 match
= base
+ dictMatchIndex
+ dictIndexDelta
; /* to prepare for next usage of match[matchLength] */
196 if (matchLength
> bestLength
) {
197 U32 matchIndex
= dictMatchIndex
+ dictIndexDelta
;
198 if ( (4*(int)(matchLength
-bestLength
)) > (int)(ZSTD_highbit32(curr
-matchIndex
+1) - ZSTD_highbit32((U32
)offsetPtr
[0]+1)) ) {
199 DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200 curr
, (U32
)bestLength
, (U32
)matchLength
, (U32
)*offsetPtr
, STORE_OFFSET(curr
- matchIndex
), dictMatchIndex
, matchIndex
);
201 bestLength
= matchLength
, *offsetPtr
= STORE_OFFSET(curr
- matchIndex
);
203 if (ip
+matchLength
== iend
) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
204 break; /* drop, to guarantee consistency (miss a little bit of compression) */
208 if (match
[matchLength
] < ip
[matchLength
]) {
209 if (dictMatchIndex
<= btLow
) { break; } /* beyond tree size, stop the search */
210 commonLengthSmaller
= matchLength
; /* all smaller will now have at least this guaranteed common length */
211 dictMatchIndex
= nextPtr
[1]; /* new matchIndex larger than previous (closer to current) */
213 /* match is larger than current */
214 if (dictMatchIndex
<= btLow
) { break; } /* beyond tree size, stop the search */
215 commonLengthLarger
= matchLength
;
216 dictMatchIndex
= nextPtr
[0];
220 if (bestLength
>= MINMATCH
) {
221 U32
const mIndex
= curr
- (U32
)STORED_OFFSET(*offsetPtr
); (void)mIndex
;
222 DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223 curr
, (U32
)bestLength
, (U32
)*offsetPtr
, mIndex
);
231 ZSTD_DUBT_findBestMatch(ZSTD_matchState_t
* ms
,
232 const BYTE
* const ip
, const BYTE
* const iend
,
235 const ZSTD_dictMode_e dictMode
)
237 const ZSTD_compressionParameters
* const cParams
= &ms
->cParams
;
238 U32
* const hashTable
= ms
->hashTable
;
239 U32
const hashLog
= cParams
->hashLog
;
240 size_t const h
= ZSTD_hashPtr(ip
, hashLog
, mls
);
241 U32 matchIndex
= hashTable
[h
];
243 const BYTE
* const base
= ms
->window
.base
;
244 U32
const curr
= (U32
)(ip
-base
);
245 U32
const windowLow
= ZSTD_getLowestMatchIndex(ms
, curr
, cParams
->windowLog
);
247 U32
* const bt
= ms
->chainTable
;
248 U32
const btLog
= cParams
->chainLog
- 1;
249 U32
const btMask
= (1 << btLog
) - 1;
250 U32
const btLow
= (btMask
>= curr
) ? 0 : curr
- btMask
;
251 U32
const unsortLimit
= MAX(btLow
, windowLow
);
253 U32
* nextCandidate
= bt
+ 2*(matchIndex
&btMask
);
254 U32
* unsortedMark
= bt
+ 2*(matchIndex
&btMask
) + 1;
255 U32 nbCompares
= 1U << cParams
->searchLog
;
256 U32 nbCandidates
= nbCompares
;
257 U32 previousCandidate
= 0;
259 DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr
);
260 assert(ip
<= iend
-8); /* required for h calculation */
261 assert(dictMode
!= ZSTD_dedicatedDictSearch
);
263 /* reach end of unsorted candidates list */
264 while ( (matchIndex
> unsortLimit
)
265 && (*unsortedMark
== ZSTD_DUBT_UNSORTED_MARK
)
266 && (nbCandidates
> 1) ) {
267 DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
269 *unsortedMark
= previousCandidate
; /* the unsortedMark becomes a reversed chain, to move up back to original position */
270 previousCandidate
= matchIndex
;
271 matchIndex
= *nextCandidate
;
272 nextCandidate
= bt
+ 2*(matchIndex
&btMask
);
273 unsortedMark
= bt
+ 2*(matchIndex
&btMask
) + 1;
277 /* nullify last candidate if it's still unsorted
278 * simplification, detrimental to compression ratio, beneficial for speed */
279 if ( (matchIndex
> unsortLimit
)
280 && (*unsortedMark
==ZSTD_DUBT_UNSORTED_MARK
) ) {
281 DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
283 *nextCandidate
= *unsortedMark
= 0;
286 /* batch sort stacked candidates */
287 matchIndex
= previousCandidate
;
288 while (matchIndex
) { /* will end on matchIndex == 0 */
289 U32
* const nextCandidateIdxPtr
= bt
+ 2*(matchIndex
&btMask
) + 1;
290 U32
const nextCandidateIdx
= *nextCandidateIdxPtr
;
291 ZSTD_insertDUBT1(ms
, matchIndex
, iend
,
292 nbCandidates
, unsortLimit
, dictMode
);
293 matchIndex
= nextCandidateIdx
;
297 /* find longest match */
298 { size_t commonLengthSmaller
= 0, commonLengthLarger
= 0;
299 const BYTE
* const dictBase
= ms
->window
.dictBase
;
300 const U32 dictLimit
= ms
->window
.dictLimit
;
301 const BYTE
* const dictEnd
= dictBase
+ dictLimit
;
302 const BYTE
* const prefixStart
= base
+ dictLimit
;
303 U32
* smallerPtr
= bt
+ 2*(curr
&btMask
);
304 U32
* largerPtr
= bt
+ 2*(curr
&btMask
) + 1;
305 U32 matchEndIdx
= curr
+ 8 + 1;
306 U32 dummy32
; /* to be nullified at the end */
307 size_t bestLength
= 0;
309 matchIndex
= hashTable
[h
];
310 hashTable
[h
] = curr
; /* Update Hash Table */
312 for (; nbCompares
&& (matchIndex
> windowLow
); --nbCompares
) {
313 U32
* const nextPtr
= bt
+ 2*(matchIndex
& btMask
);
314 size_t matchLength
= MIN(commonLengthSmaller
, commonLengthLarger
); /* guaranteed minimum nb of common bytes */
317 if ((dictMode
!= ZSTD_extDict
) || (matchIndex
+matchLength
>= dictLimit
)) {
318 match
= base
+ matchIndex
;
319 matchLength
+= ZSTD_count(ip
+matchLength
, match
+matchLength
, iend
);
321 match
= dictBase
+ matchIndex
;
322 matchLength
+= ZSTD_count_2segments(ip
+matchLength
, match
+matchLength
, iend
, dictEnd
, prefixStart
);
323 if (matchIndex
+matchLength
>= dictLimit
)
324 match
= base
+ matchIndex
; /* to prepare for next usage of match[matchLength] */
327 if (matchLength
> bestLength
) {
328 if (matchLength
> matchEndIdx
- matchIndex
)
329 matchEndIdx
= matchIndex
+ (U32
)matchLength
;
330 if ( (4*(int)(matchLength
-bestLength
)) > (int)(ZSTD_highbit32(curr
-matchIndex
+1) - ZSTD_highbit32((U32
)offsetPtr
[0]+1)) )
331 bestLength
= matchLength
, *offsetPtr
= STORE_OFFSET(curr
- matchIndex
);
332 if (ip
+matchLength
== iend
) { /* equal : no way to know if inf or sup */
333 if (dictMode
== ZSTD_dictMatchState
) {
334 nbCompares
= 0; /* in addition to avoiding checking any
335 * further in this loop, make sure we
336 * skip checking in the dictionary. */
338 break; /* drop, to guarantee consistency (miss a little bit of compression) */
342 if (match
[matchLength
] < ip
[matchLength
]) {
343 /* match is smaller than current */
344 *smallerPtr
= matchIndex
; /* update smaller idx */
345 commonLengthSmaller
= matchLength
; /* all smaller will now have at least this guaranteed common length */
346 if (matchIndex
<= btLow
) { smallerPtr
=&dummy32
; break; } /* beyond tree size, stop the search */
347 smallerPtr
= nextPtr
+1; /* new "smaller" => larger of match */
348 matchIndex
= nextPtr
[1]; /* new matchIndex larger than previous (closer to current) */
350 /* match is larger than current */
351 *largerPtr
= matchIndex
;
352 commonLengthLarger
= matchLength
;
353 if (matchIndex
<= btLow
) { largerPtr
=&dummy32
; break; } /* beyond tree size, stop the search */
355 matchIndex
= nextPtr
[0];
358 *smallerPtr
= *largerPtr
= 0;
360 assert(nbCompares
<= (1U << ZSTD_SEARCHLOG_MAX
)); /* Check we haven't underflowed. */
361 if (dictMode
== ZSTD_dictMatchState
&& nbCompares
) {
362 bestLength
= ZSTD_DUBT_findBetterDictMatch(
364 offsetPtr
, bestLength
, nbCompares
,
368 assert(matchEndIdx
> curr
+8); /* ensure nextToUpdate is increased */
369 ms
->nextToUpdate
= matchEndIdx
- 8; /* skip repetitive patterns */
370 if (bestLength
>= MINMATCH
) {
371 U32
const mIndex
= curr
- (U32
)STORED_OFFSET(*offsetPtr
); (void)mIndex
;
372 DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
373 curr
, (U32
)bestLength
, (U32
)*offsetPtr
, mIndex
);
380 /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */
381 FORCE_INLINE_TEMPLATE
size_t
382 ZSTD_BtFindBestMatch( ZSTD_matchState_t
* ms
,
383 const BYTE
* const ip
, const BYTE
* const iLimit
,
385 const U32 mls
/* template */,
386 const ZSTD_dictMode_e dictMode
)
388 DEBUGLOG(7, "ZSTD_BtFindBestMatch");
389 if (ip
< ms
->window
.base
+ ms
->nextToUpdate
) return 0; /* skipped area */
390 ZSTD_updateDUBT(ms
, ip
, iLimit
, mls
);
391 return ZSTD_DUBT_findBestMatch(ms
, ip
, iLimit
, offsetPtr
, mls
, dictMode
);
394 /* *********************************
395 * Dedicated dict search
396 ***********************************/
398 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t
* ms
, const BYTE
* const ip
)
400 const BYTE
* const base
= ms
->window
.base
;
401 U32
const target
= (U32
)(ip
- base
);
402 U32
* const hashTable
= ms
->hashTable
;
403 U32
* const chainTable
= ms
->chainTable
;
404 U32
const chainSize
= 1 << ms
->cParams
.chainLog
;
405 U32 idx
= ms
->nextToUpdate
;
406 U32
const minChain
= chainSize
< target
- idx
? target
- chainSize
: idx
;
407 U32
const bucketSize
= 1 << ZSTD_LAZY_DDSS_BUCKET_LOG
;
408 U32
const cacheSize
= bucketSize
- 1;
409 U32
const chainAttempts
= (1 << ms
->cParams
.searchLog
) - cacheSize
;
410 U32
const chainLimit
= chainAttempts
> 255 ? 255 : chainAttempts
;
412 /* We know the hashtable is oversized by a factor of `bucketSize`.
413 * We are going to temporarily pretend `bucketSize == 1`, keeping only a
414 * single entry. We will use the rest of the space to construct a temporary
417 U32
const hashLog
= ms
->cParams
.hashLog
- ZSTD_LAZY_DDSS_BUCKET_LOG
;
418 U32
* const tmpHashTable
= hashTable
;
419 U32
* const tmpChainTable
= hashTable
+ ((size_t)1 << hashLog
);
420 U32
const tmpChainSize
= (U32
)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG
) - 1) << hashLog
;
421 U32
const tmpMinChain
= tmpChainSize
< target
? target
- tmpChainSize
: idx
;
424 assert(ms
->cParams
.chainLog
<= 24);
425 assert(ms
->cParams
.hashLog
> ms
->cParams
.chainLog
);
427 assert(tmpMinChain
<= minChain
);
429 /* fill conventional hash table and conventional chain table */
430 for ( ; idx
< target
; idx
++) {
431 U32
const h
= (U32
)ZSTD_hashPtr(base
+ idx
, hashLog
, ms
->cParams
.minMatch
);
432 if (idx
>= tmpMinChain
) {
433 tmpChainTable
[idx
- tmpMinChain
] = hashTable
[h
];
435 tmpHashTable
[h
] = idx
;
438 /* sort chains into ddss chain table */
441 for (hashIdx
= 0; hashIdx
< (1U << hashLog
); hashIdx
++) {
443 U32 countBeyondMinChain
= 0;
444 U32 i
= tmpHashTable
[hashIdx
];
445 for (count
= 0; i
>= tmpMinChain
&& count
< cacheSize
; count
++) {
446 /* skip through the chain to the first position that won't be
447 * in the hash cache bucket */
449 countBeyondMinChain
++;
451 i
= tmpChainTable
[i
- tmpMinChain
];
453 if (count
== cacheSize
) {
454 for (count
= 0; count
< chainLimit
;) {
456 if (!i
|| ++countBeyondMinChain
> cacheSize
) {
457 /* only allow pulling `cacheSize` number of entries
458 * into the cache or chainTable beyond `minChain`,
459 * to replace the entries pulled out of the
460 * chainTable into the cache. This lets us reach
461 * back further without increasing the total number
462 * of entries in the chainTable, guaranteeing the
463 * DDSS chain table will fit into the space
464 * allocated for the regular one. */
468 chainTable
[chainPos
++] = i
;
470 if (i
< tmpMinChain
) {
473 i
= tmpChainTable
[i
- tmpMinChain
];
479 tmpHashTable
[hashIdx
] = ((chainPos
- count
) << 8) + count
;
481 tmpHashTable
[hashIdx
] = 0;
484 assert(chainPos
<= chainSize
); /* I believe this is guaranteed... */
487 /* move chain pointers into the last entry of each hash bucket */
488 for (hashIdx
= (1 << hashLog
); hashIdx
; ) {
489 U32
const bucketIdx
= --hashIdx
<< ZSTD_LAZY_DDSS_BUCKET_LOG
;
490 U32
const chainPackedPointer
= tmpHashTable
[hashIdx
];
492 for (i
= 0; i
< cacheSize
; i
++) {
493 hashTable
[bucketIdx
+ i
] = 0;
495 hashTable
[bucketIdx
+ bucketSize
- 1] = chainPackedPointer
;
498 /* fill the buckets of the hash table */
499 for (idx
= ms
->nextToUpdate
; idx
< target
; idx
++) {
500 U32
const h
= (U32
)ZSTD_hashPtr(base
+ idx
, hashLog
, ms
->cParams
.minMatch
)
501 << ZSTD_LAZY_DDSS_BUCKET_LOG
;
503 /* Shift hash cache down 1. */
504 for (i
= cacheSize
- 1; i
; i
--)
505 hashTable
[h
+ i
] = hashTable
[h
+ i
- 1];
509 ms
->nextToUpdate
= target
;
512 /* Returns the longest match length found in the dedicated dict search structure.
513 * If none are longer than the argument ml, then ml will be returned.
515 FORCE_INLINE_TEMPLATE
516 size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr
, size_t ml
, U32 nbAttempts
,
517 const ZSTD_matchState_t
* const dms
,
518 const BYTE
* const ip
, const BYTE
* const iLimit
,
519 const BYTE
* const prefixStart
, const U32 curr
,
520 const U32 dictLimit
, const size_t ddsIdx
) {
521 const U32 ddsLowestIndex
= dms
->window
.dictLimit
;
522 const BYTE
* const ddsBase
= dms
->window
.base
;
523 const BYTE
* const ddsEnd
= dms
->window
.nextSrc
;
524 const U32 ddsSize
= (U32
)(ddsEnd
- ddsBase
);
525 const U32 ddsIndexDelta
= dictLimit
- ddsSize
;
526 const U32 bucketSize
= (1 << ZSTD_LAZY_DDSS_BUCKET_LOG
);
527 const U32 bucketLimit
= nbAttempts
< bucketSize
- 1 ? nbAttempts
: bucketSize
- 1;
531 for (ddsAttempt
= 0; ddsAttempt
< bucketSize
- 1; ddsAttempt
++) {
532 PREFETCH_L1(ddsBase
+ dms
->hashTable
[ddsIdx
+ ddsAttempt
]);
536 U32
const chainPackedPointer
= dms
->hashTable
[ddsIdx
+ bucketSize
- 1];
537 U32
const chainIndex
= chainPackedPointer
>> 8;
539 PREFETCH_L1(&dms
->chainTable
[chainIndex
]);
542 for (ddsAttempt
= 0; ddsAttempt
< bucketLimit
; ddsAttempt
++) {
545 matchIndex
= dms
->hashTable
[ddsIdx
+ ddsAttempt
];
546 match
= ddsBase
+ matchIndex
;
552 /* guaranteed by table construction */
553 (void)ddsLowestIndex
;
554 assert(matchIndex
>= ddsLowestIndex
);
555 assert(match
+4 <= ddsEnd
);
556 if (MEM_read32(match
) == MEM_read32(ip
)) {
557 /* assumption : matchIndex <= dictLimit-4 (by table construction) */
558 currentMl
= ZSTD_count_2segments(ip
+4, match
+4, iLimit
, ddsEnd
, prefixStart
) + 4;
561 /* save best solution */
562 if (currentMl
> ml
) {
564 *offsetPtr
= STORE_OFFSET(curr
- (matchIndex
+ ddsIndexDelta
));
565 if (ip
+currentMl
== iLimit
) {
566 /* best possible, avoids read overflow on next attempt */
573 U32
const chainPackedPointer
= dms
->hashTable
[ddsIdx
+ bucketSize
- 1];
574 U32 chainIndex
= chainPackedPointer
>> 8;
575 U32
const chainLength
= chainPackedPointer
& 0xFF;
576 U32
const chainAttempts
= nbAttempts
- ddsAttempt
;
577 U32
const chainLimit
= chainAttempts
> chainLength
? chainLength
: chainAttempts
;
580 for (chainAttempt
= 0 ; chainAttempt
< chainLimit
; chainAttempt
++) {
581 PREFETCH_L1(ddsBase
+ dms
->chainTable
[chainIndex
+ chainAttempt
]);
584 for (chainAttempt
= 0 ; chainAttempt
< chainLimit
; chainAttempt
++, chainIndex
++) {
587 matchIndex
= dms
->chainTable
[chainIndex
];
588 match
= ddsBase
+ matchIndex
;
590 /* guaranteed by table construction */
591 assert(matchIndex
>= ddsLowestIndex
);
592 assert(match
+4 <= ddsEnd
);
593 if (MEM_read32(match
) == MEM_read32(ip
)) {
594 /* assumption : matchIndex <= dictLimit-4 (by table construction) */
595 currentMl
= ZSTD_count_2segments(ip
+4, match
+4, iLimit
, ddsEnd
, prefixStart
) + 4;
598 /* save best solution */
599 if (currentMl
> ml
) {
601 *offsetPtr
= STORE_OFFSET(curr
- (matchIndex
+ ddsIndexDelta
));
602 if (ip
+currentMl
== iLimit
) break; /* best possible, avoids read overflow on next attempt */
610 /* *********************************
612 ***********************************/
613 #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
615 /* Update chains up to ip (excluded)
616 Assumption : always within prefix (i.e. not within extDict) */
617 FORCE_INLINE_TEMPLATE U32
ZSTD_insertAndFindFirstIndex_internal(
618 ZSTD_matchState_t
* ms
,
619 const ZSTD_compressionParameters
* const cParams
,
620 const BYTE
* ip
, U32
const mls
)
622 U32
* const hashTable
= ms
->hashTable
;
623 const U32 hashLog
= cParams
->hashLog
;
624 U32
* const chainTable
= ms
->chainTable
;
625 const U32 chainMask
= (1 << cParams
->chainLog
) - 1;
626 const BYTE
* const base
= ms
->window
.base
;
627 const U32 target
= (U32
)(ip
- base
);
628 U32 idx
= ms
->nextToUpdate
;
630 while(idx
< target
) { /* catch up */
631 size_t const h
= ZSTD_hashPtr(base
+idx
, hashLog
, mls
);
632 NEXT_IN_CHAIN(idx
, chainMask
) = hashTable
[h
];
637 ms
->nextToUpdate
= target
;
638 return hashTable
[ZSTD_hashPtr(ip
, hashLog
, mls
)];
641 U32
ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t
* ms
, const BYTE
* ip
) {
642 const ZSTD_compressionParameters
* const cParams
= &ms
->cParams
;
643 return ZSTD_insertAndFindFirstIndex_internal(ms
, cParams
, ip
, ms
->cParams
.minMatch
);
646 /* inlining is important to hardwire a hot branch (template emulation) */
647 FORCE_INLINE_TEMPLATE
648 size_t ZSTD_HcFindBestMatch(
649 ZSTD_matchState_t
* ms
,
650 const BYTE
* const ip
, const BYTE
* const iLimit
,
652 const U32 mls
, const ZSTD_dictMode_e dictMode
)
654 const ZSTD_compressionParameters
* const cParams
= &ms
->cParams
;
655 U32
* const chainTable
= ms
->chainTable
;
656 const U32 chainSize
= (1 << cParams
->chainLog
);
657 const U32 chainMask
= chainSize
-1;
658 const BYTE
* const base
= ms
->window
.base
;
659 const BYTE
* const dictBase
= ms
->window
.dictBase
;
660 const U32 dictLimit
= ms
->window
.dictLimit
;
661 const BYTE
* const prefixStart
= base
+ dictLimit
;
662 const BYTE
* const dictEnd
= dictBase
+ dictLimit
;
663 const U32 curr
= (U32
)(ip
-base
);
664 const U32 maxDistance
= 1U << cParams
->windowLog
;
665 const U32 lowestValid
= ms
->window
.lowLimit
;
666 const U32 withinMaxDistance
= (curr
- lowestValid
> maxDistance
) ? curr
- maxDistance
: lowestValid
;
667 const U32 isDictionary
= (ms
->loadedDictEnd
!= 0);
668 const U32 lowLimit
= isDictionary
? lowestValid
: withinMaxDistance
;
669 const U32 minChain
= curr
> chainSize
? curr
- chainSize
: 0;
670 U32 nbAttempts
= 1U << cParams
->searchLog
;
673 const ZSTD_matchState_t
* const dms
= ms
->dictMatchState
;
674 const U32 ddsHashLog
= dictMode
== ZSTD_dedicatedDictSearch
675 ? dms
->cParams
.hashLog
- ZSTD_LAZY_DDSS_BUCKET_LOG
: 0;
676 const size_t ddsIdx
= dictMode
== ZSTD_dedicatedDictSearch
677 ? ZSTD_hashPtr(ip
, ddsHashLog
, mls
) << ZSTD_LAZY_DDSS_BUCKET_LOG
: 0;
681 if (dictMode
== ZSTD_dedicatedDictSearch
) {
682 const U32
* entry
= &dms
->hashTable
[ddsIdx
];
686 /* HC4 match finder */
687 matchIndex
= ZSTD_insertAndFindFirstIndex_internal(ms
, cParams
, ip
, mls
);
689 for ( ; (matchIndex
>=lowLimit
) & (nbAttempts
>0) ; nbAttempts
--) {
691 if ((dictMode
!= ZSTD_extDict
) || matchIndex
>= dictLimit
) {
692 const BYTE
* const match
= base
+ matchIndex
;
693 assert(matchIndex
>= dictLimit
); /* ensures this is true if dictMode != ZSTD_extDict */
694 if (match
[ml
] == ip
[ml
]) /* potentially better */
695 currentMl
= ZSTD_count(ip
, match
, iLimit
);
697 const BYTE
* const match
= dictBase
+ matchIndex
;
698 assert(match
+4 <= dictEnd
);
699 if (MEM_read32(match
) == MEM_read32(ip
)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
700 currentMl
= ZSTD_count_2segments(ip
+4, match
+4, iLimit
, dictEnd
, prefixStart
) + 4;
703 /* save best solution */
704 if (currentMl
> ml
) {
706 *offsetPtr
= STORE_OFFSET(curr
- matchIndex
);
707 if (ip
+currentMl
== iLimit
) break; /* best possible, avoids read overflow on next attempt */
710 if (matchIndex
<= minChain
) break;
711 matchIndex
= NEXT_IN_CHAIN(matchIndex
, chainMask
);
714 assert(nbAttempts
<= (1U << ZSTD_SEARCHLOG_MAX
)); /* Check we haven't underflowed. */
715 if (dictMode
== ZSTD_dedicatedDictSearch
) {
716 ml
= ZSTD_dedicatedDictSearch_lazy_search(offsetPtr
, ml
, nbAttempts
, dms
,
717 ip
, iLimit
, prefixStart
, curr
, dictLimit
, ddsIdx
);
718 } else if (dictMode
== ZSTD_dictMatchState
) {
719 const U32
* const dmsChainTable
= dms
->chainTable
;
720 const U32 dmsChainSize
= (1 << dms
->cParams
.chainLog
);
721 const U32 dmsChainMask
= dmsChainSize
- 1;
722 const U32 dmsLowestIndex
= dms
->window
.dictLimit
;
723 const BYTE
* const dmsBase
= dms
->window
.base
;
724 const BYTE
* const dmsEnd
= dms
->window
.nextSrc
;
725 const U32 dmsSize
= (U32
)(dmsEnd
- dmsBase
);
726 const U32 dmsIndexDelta
= dictLimit
- dmsSize
;
727 const U32 dmsMinChain
= dmsSize
> dmsChainSize
? dmsSize
- dmsChainSize
: 0;
729 matchIndex
= dms
->hashTable
[ZSTD_hashPtr(ip
, dms
->cParams
.hashLog
, mls
)];
731 for ( ; (matchIndex
>=dmsLowestIndex
) & (nbAttempts
>0) ; nbAttempts
--) {
733 const BYTE
* const match
= dmsBase
+ matchIndex
;
734 assert(match
+4 <= dmsEnd
);
735 if (MEM_read32(match
) == MEM_read32(ip
)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
736 currentMl
= ZSTD_count_2segments(ip
+4, match
+4, iLimit
, dmsEnd
, prefixStart
) + 4;
738 /* save best solution */
739 if (currentMl
> ml
) {
741 assert(curr
> matchIndex
+ dmsIndexDelta
);
742 *offsetPtr
= STORE_OFFSET(curr
- (matchIndex
+ dmsIndexDelta
));
743 if (ip
+currentMl
== iLimit
) break; /* best possible, avoids read overflow on next attempt */
746 if (matchIndex
<= dmsMinChain
) break;
748 matchIndex
= dmsChainTable
[matchIndex
& dmsChainMask
];
755 /* *********************************
756 * (SIMD) Row-based matchfinder
757 ***********************************/
758 /* Constants for row-based hash */
759 #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
760 #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
761 #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
762 #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
764 #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
766 typedef U64 ZSTD_VecMask
; /* Clarifies when we are interacting with a U64 representing a mask of matches */
768 /* ZSTD_VecMask_next():
769 * Starting from the LSB, returns the idx of the next non-zero bit.
770 * Basically counting the nb of trailing zeroes.
772 static U32
ZSTD_VecMask_next(ZSTD_VecMask val
) {
774 # if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
775 if (sizeof(size_t) == 4) {
776 U32 mostSignificantWord
= (U32
)(val
>> 32);
777 U32 leastSignificantWord
= (U32
)val
;
778 if (leastSignificantWord
== 0) {
779 return 32 + (U32
)__builtin_ctz(mostSignificantWord
);
781 return (U32
)__builtin_ctz(leastSignificantWord
);
784 return (U32
)__builtin_ctzll(val
);
787 /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
788 * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
790 val
= ~val
& (val
- 1ULL); /* Lowest set bit mask */
791 val
= val
- ((val
>> 1) & 0x5555555555555555);
792 val
= (val
& 0x3333333333333333ULL
) + ((val
>> 2) & 0x3333333333333333ULL
);
793 return (U32
)((((val
+ (val
>> 4)) & 0xF0F0F0F0F0F0F0FULL
) * 0x101010101010101ULL
) >> 56);
797 /* ZSTD_rotateRight_*():
798 * Rotates a bitfield to the right by "count" bits.
799 * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
801 FORCE_INLINE_TEMPLATE
802 U64
ZSTD_rotateRight_U64(U64
const value
, U32 count
) {
804 count
&= 0x3F; /* for fickle pattern recognition */
805 return (value
>> count
) | (U64
)(value
<< ((0U - count
) & 0x3F));
808 FORCE_INLINE_TEMPLATE
809 U32
ZSTD_rotateRight_U32(U32
const value
, U32 count
) {
811 count
&= 0x1F; /* for fickle pattern recognition */
812 return (value
>> count
) | (U32
)(value
<< ((0U - count
) & 0x1F));
815 FORCE_INLINE_TEMPLATE
816 U16
ZSTD_rotateRight_U16(U16
const value
, U32 count
) {
818 count
&= 0x0F; /* for fickle pattern recognition */
819 return (value
>> count
) | (U16
)(value
<< ((0U - count
) & 0x0F));
822 /* ZSTD_row_nextIndex():
823 * Returns the next index to insert at within a tagTable row, and updates the "head"
824 * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
826 FORCE_INLINE_TEMPLATE U32
ZSTD_row_nextIndex(BYTE
* const tagRow
, U32
const rowMask
) {
827 U32
const next
= (*tagRow
- 1) & rowMask
;
828 *tagRow
= (BYTE
)next
;
833 * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
835 MEM_STATIC
int ZSTD_isAligned(void const* ptr
, size_t align
) {
836 assert((align
& (align
- 1)) == 0);
837 return (((size_t)ptr
) & (align
- 1)) == 0;
840 /* ZSTD_row_prefetch():
841 * Performs prefetching for the hashTable and tagTable at a given row.
843 FORCE_INLINE_TEMPLATE
void ZSTD_row_prefetch(U32
const* hashTable
, U16
const* tagTable
, U32
const relRow
, U32
const rowLog
) {
844 PREFETCH_L1(hashTable
+ relRow
);
846 PREFETCH_L1(hashTable
+ relRow
+ 16);
847 /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
849 PREFETCH_L1(tagTable
+ relRow
);
851 PREFETCH_L1(tagTable
+ relRow
+ 32);
853 assert(rowLog
== 4 || rowLog
== 5 || rowLog
== 6);
854 assert(ZSTD_isAligned(hashTable
+ relRow
, 64)); /* prefetched hash row always 64-byte aligned */
855 assert(ZSTD_isAligned(tagTable
+ relRow
, (size_t)1 << rowLog
)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
858 /* ZSTD_row_fillHashCache():
859 * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
860 * but not beyond iLimit.
862 FORCE_INLINE_TEMPLATE
void ZSTD_row_fillHashCache(ZSTD_matchState_t
* ms
, const BYTE
* base
,
863 U32
const rowLog
, U32
const mls
,
864 U32 idx
, const BYTE
* const iLimit
)
866 U32
const* const hashTable
= ms
->hashTable
;
867 U16
const* const tagTable
= ms
->tagTable
;
868 U32
const hashLog
= ms
->rowHashLog
;
869 U32
const maxElemsToPrefetch
= (base
+ idx
) > iLimit
? 0 : (U32
)(iLimit
- (base
+ idx
) + 1);
870 U32
const lim
= idx
+ MIN(ZSTD_ROW_HASH_CACHE_SIZE
, maxElemsToPrefetch
);
872 for (; idx
< lim
; ++idx
) {
873 U32
const hash
= (U32
)ZSTD_hashPtr(base
+ idx
, hashLog
+ ZSTD_ROW_HASH_TAG_BITS
, mls
);
874 U32
const row
= (hash
>> ZSTD_ROW_HASH_TAG_BITS
) << rowLog
;
875 ZSTD_row_prefetch(hashTable
, tagTable
, row
, rowLog
);
876 ms
->hashCache
[idx
& ZSTD_ROW_HASH_CACHE_MASK
] = hash
;
879 DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms
->hashCache
[0], ms
->hashCache
[1],
880 ms
->hashCache
[2], ms
->hashCache
[3], ms
->hashCache
[4],
881 ms
->hashCache
[5], ms
->hashCache
[6], ms
->hashCache
[7]);
884 /* ZSTD_row_nextCachedHash():
885 * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
886 * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
888 FORCE_INLINE_TEMPLATE U32
ZSTD_row_nextCachedHash(U32
* cache
, U32
const* hashTable
,
889 U16
const* tagTable
, BYTE
const* base
,
890 U32 idx
, U32
const hashLog
,
891 U32
const rowLog
, U32
const mls
)
893 U32
const newHash
= (U32
)ZSTD_hashPtr(base
+idx
+ZSTD_ROW_HASH_CACHE_SIZE
, hashLog
+ ZSTD_ROW_HASH_TAG_BITS
, mls
);
894 U32
const row
= (newHash
>> ZSTD_ROW_HASH_TAG_BITS
) << rowLog
;
895 ZSTD_row_prefetch(hashTable
, tagTable
, row
, rowLog
);
896 { U32
const hash
= cache
[idx
& ZSTD_ROW_HASH_CACHE_MASK
];
897 cache
[idx
& ZSTD_ROW_HASH_CACHE_MASK
] = newHash
;
902 /* ZSTD_row_update_internalImpl():
903 * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
905 FORCE_INLINE_TEMPLATE
void ZSTD_row_update_internalImpl(ZSTD_matchState_t
* ms
,
906 U32 updateStartIdx
, U32
const updateEndIdx
,
907 U32
const mls
, U32
const rowLog
,
908 U32
const rowMask
, U32
const useCache
)
910 U32
* const hashTable
= ms
->hashTable
;
911 U16
* const tagTable
= ms
->tagTable
;
912 U32
const hashLog
= ms
->rowHashLog
;
913 const BYTE
* const base
= ms
->window
.base
;
915 DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx
, updateEndIdx
);
916 for (; updateStartIdx
< updateEndIdx
; ++updateStartIdx
) {
917 U32
const hash
= useCache
? ZSTD_row_nextCachedHash(ms
->hashCache
, hashTable
, tagTable
, base
, updateStartIdx
, hashLog
, rowLog
, mls
)
918 : (U32
)ZSTD_hashPtr(base
+ updateStartIdx
, hashLog
+ ZSTD_ROW_HASH_TAG_BITS
, mls
);
919 U32
const relRow
= (hash
>> ZSTD_ROW_HASH_TAG_BITS
) << rowLog
;
920 U32
* const row
= hashTable
+ relRow
;
921 BYTE
* tagRow
= (BYTE
*)(tagTable
+ relRow
); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
922 Explicit cast allows us to get exact desired position within each row */
923 U32
const pos
= ZSTD_row_nextIndex(tagRow
, rowMask
);
925 assert(hash
== ZSTD_hashPtr(base
+ updateStartIdx
, hashLog
+ ZSTD_ROW_HASH_TAG_BITS
, mls
));
926 ((BYTE
*)tagRow
)[pos
+ ZSTD_ROW_HASH_TAG_OFFSET
] = hash
& ZSTD_ROW_HASH_TAG_MASK
;
927 row
[pos
] = updateStartIdx
;
931 /* ZSTD_row_update_internal():
932 * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
933 * Skips sections of long matches as is necessary.
935 FORCE_INLINE_TEMPLATE
void ZSTD_row_update_internal(ZSTD_matchState_t
* ms
, const BYTE
* ip
,
936 U32
const mls
, U32
const rowLog
,
937 U32
const rowMask
, U32
const useCache
)
939 U32 idx
= ms
->nextToUpdate
;
940 const BYTE
* const base
= ms
->window
.base
;
941 const U32 target
= (U32
)(ip
- base
);
942 const U32 kSkipThreshold
= 384;
943 const U32 kMaxMatchStartPositionsToUpdate
= 96;
944 const U32 kMaxMatchEndPositionsToUpdate
= 32;
947 /* Only skip positions when using hash cache, i.e.
948 * if we are loading a dict, don't skip anything.
949 * If we decide to skip, then we only update a set number
950 * of positions at the beginning and end of the match.
952 if (UNLIKELY(target
- idx
> kSkipThreshold
)) {
953 U32
const bound
= idx
+ kMaxMatchStartPositionsToUpdate
;
954 ZSTD_row_update_internalImpl(ms
, idx
, bound
, mls
, rowLog
, rowMask
, useCache
);
955 idx
= target
- kMaxMatchEndPositionsToUpdate
;
956 ZSTD_row_fillHashCache(ms
, base
, rowLog
, mls
, idx
, ip
+1);
959 assert(target
>= idx
);
960 ZSTD_row_update_internalImpl(ms
, idx
, target
, mls
, rowLog
, rowMask
, useCache
);
961 ms
->nextToUpdate
= target
;
964 /* ZSTD_row_update():
965 * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
968 void ZSTD_row_update(ZSTD_matchState_t
* const ms
, const BYTE
* ip
) {
969 const U32 rowLog
= BOUNDED(4, ms
->cParams
.searchLog
, 6);
970 const U32 rowMask
= (1u << rowLog
) - 1;
971 const U32 mls
= MIN(ms
->cParams
.minMatch
, 6 /* mls caps out at 6 */);
973 DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog
);
974 ZSTD_row_update_internal(ms
, ip
, mls
, rowLog
, rowMask
, 0 /* dont use cache */);
977 #if defined(ZSTD_ARCH_X86_SSE2)
978 FORCE_INLINE_TEMPLATE ZSTD_VecMask
979 ZSTD_row_getSSEMask(int nbChunks
, const BYTE
* const src
, const BYTE tag
, const U32 head
)
981 const __m128i comparisonMask
= _mm_set1_epi8((char)tag
);
982 int matches
[4] = {0};
984 assert(nbChunks
== 1 || nbChunks
== 2 || nbChunks
== 4);
985 for (i
=0; i
<nbChunks
; i
++) {
986 const __m128i chunk
= _mm_loadu_si128((const __m128i
*)(const void*)(src
+ 16*i
));
987 const __m128i equalMask
= _mm_cmpeq_epi8(chunk
, comparisonMask
);
988 matches
[i
] = _mm_movemask_epi8(equalMask
);
990 if (nbChunks
== 1) return ZSTD_rotateRight_U16((U16
)matches
[0], head
);
991 if (nbChunks
== 2) return ZSTD_rotateRight_U32((U32
)matches
[1] << 16 | (U32
)matches
[0], head
);
992 assert(nbChunks
== 4);
993 return ZSTD_rotateRight_U64((U64
)matches
[3] << 48 | (U64
)matches
[2] << 32 | (U64
)matches
[1] << 16 | (U64
)matches
[0], head
);
997 /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
998 * the hash at the nth position in a row of the tagTable.
999 * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1000 * to match up with the actual layout of the entries within the hashTable */
1001 FORCE_INLINE_TEMPLATE ZSTD_VecMask
1002 ZSTD_row_getMatchMask(const BYTE
* const tagRow
, const BYTE tag
, const U32 head
, const U32 rowEntries
)
1004 const BYTE
* const src
= tagRow
+ ZSTD_ROW_HASH_TAG_OFFSET
;
1005 assert((rowEntries
== 16) || (rowEntries
== 32) || rowEntries
== 64);
1006 assert(rowEntries
<= ZSTD_ROW_HASH_MAX_ENTRIES
);
1008 #if defined(ZSTD_ARCH_X86_SSE2)
1010 return ZSTD_row_getSSEMask(rowEntries
/ 16, src
, tag
, head
);
1012 #else /* SW or NEON-LE */
1014 # if defined(ZSTD_ARCH_ARM_NEON)
1015 /* This NEON path only works for little endian - otherwise use SWAR below */
1016 if (MEM_isLittleEndian()) {
1017 if (rowEntries
== 16) {
1018 const uint8x16_t chunk
= vld1q_u8(src
);
1019 const uint16x8_t equalMask
= vreinterpretq_u16_u8(vceqq_u8(chunk
, vdupq_n_u8(tag
)));
1020 const uint16x8_t t0
= vshlq_n_u16(equalMask
, 7);
1021 const uint32x4_t t1
= vreinterpretq_u32_u16(vsriq_n_u16(t0
, t0
, 14));
1022 const uint64x2_t t2
= vreinterpretq_u64_u32(vshrq_n_u32(t1
, 14));
1023 const uint8x16_t t3
= vreinterpretq_u8_u64(vsraq_n_u64(t2
, t2
, 28));
1024 const U16 hi
= (U16
)vgetq_lane_u8(t3
, 8);
1025 const U16 lo
= (U16
)vgetq_lane_u8(t3
, 0);
1026 return ZSTD_rotateRight_U16((hi
<< 8) | lo
, head
);
1027 } else if (rowEntries
== 32) {
1028 const uint16x8x2_t chunk
= vld2q_u16((const U16
*)(const void*)src
);
1029 const uint8x16_t chunk0
= vreinterpretq_u8_u16(chunk
.val
[0]);
1030 const uint8x16_t chunk1
= vreinterpretq_u8_u16(chunk
.val
[1]);
1031 const uint8x16_t equalMask0
= vceqq_u8(chunk0
, vdupq_n_u8(tag
));
1032 const uint8x16_t equalMask1
= vceqq_u8(chunk1
, vdupq_n_u8(tag
));
1033 const int8x8_t pack0
= vqmovn_s16(vreinterpretq_s16_u8(equalMask0
));
1034 const int8x8_t pack1
= vqmovn_s16(vreinterpretq_s16_u8(equalMask1
));
1035 const uint8x8_t t0
= vreinterpret_u8_s8(pack0
);
1036 const uint8x8_t t1
= vreinterpret_u8_s8(pack1
);
1037 const uint8x8_t t2
= vsri_n_u8(t1
, t0
, 2);
1038 const uint8x8x2_t t3
= vuzp_u8(t2
, t0
);
1039 const uint8x8_t t4
= vsri_n_u8(t3
.val
[1], t3
.val
[0], 4);
1040 const U32 matches
= vget_lane_u32(vreinterpret_u32_u8(t4
), 0);
1041 return ZSTD_rotateRight_U32(matches
, head
);
1042 } else { /* rowEntries == 64 */
1043 const uint8x16x4_t chunk
= vld4q_u8(src
);
1044 const uint8x16_t dup
= vdupq_n_u8(tag
);
1045 const uint8x16_t cmp0
= vceqq_u8(chunk
.val
[0], dup
);
1046 const uint8x16_t cmp1
= vceqq_u8(chunk
.val
[1], dup
);
1047 const uint8x16_t cmp2
= vceqq_u8(chunk
.val
[2], dup
);
1048 const uint8x16_t cmp3
= vceqq_u8(chunk
.val
[3], dup
);
1050 const uint8x16_t t0
= vsriq_n_u8(cmp1
, cmp0
, 1);
1051 const uint8x16_t t1
= vsriq_n_u8(cmp3
, cmp2
, 1);
1052 const uint8x16_t t2
= vsriq_n_u8(t1
, t0
, 2);
1053 const uint8x16_t t3
= vsriq_n_u8(t2
, t2
, 4);
1054 const uint8x8_t t4
= vshrn_n_u16(vreinterpretq_u16_u8(t3
), 4);
1055 const U64 matches
= vget_lane_u64(vreinterpret_u64_u8(t4
), 0);
1056 return ZSTD_rotateRight_U64(matches
, head
);
1059 # endif /* ZSTD_ARCH_ARM_NEON */
1061 { const size_t chunkSize
= sizeof(size_t);
1062 const size_t shiftAmount
= ((chunkSize
* 8) - chunkSize
);
1063 const size_t xFF
= ~((size_t)0);
1064 const size_t x01
= xFF
/ 0xFF;
1065 const size_t x80
= x01
<< 7;
1066 const size_t splatChar
= tag
* x01
;
1067 ZSTD_VecMask matches
= 0;
1068 int i
= rowEntries
- chunkSize
;
1069 assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
1070 if (MEM_isLittleEndian()) { /* runtime check so have two loops */
1071 const size_t extractMagic
= (xFF
/ 0x7F) >> chunkSize
;
1073 size_t chunk
= MEM_readST(&src
[i
]);
1075 chunk
= (((chunk
| x80
) - x01
) | chunk
) & x80
;
1076 matches
<<= chunkSize
;
1077 matches
|= (chunk
* extractMagic
) >> shiftAmount
;
1080 } else { /* big endian: reverse bits during extraction */
1081 const size_t msb
= xFF
^ (xFF
>> 1);
1082 const size_t extractMagic
= (msb
/ 0x1FF) | msb
;
1084 size_t chunk
= MEM_readST(&src
[i
]);
1086 chunk
= (((chunk
| x80
) - x01
) | chunk
) & x80
;
1087 matches
<<= chunkSize
;
1088 matches
|= ((chunk
>> 7) * extractMagic
) >> shiftAmount
;
1093 if (rowEntries
== 16) {
1094 return ZSTD_rotateRight_U16((U16
)matches
, head
);
1095 } else if (rowEntries
== 32) {
1096 return ZSTD_rotateRight_U32((U32
)matches
, head
);
1098 return ZSTD_rotateRight_U64((U64
)matches
, head
);
1104 /* The high-level approach of the SIMD row based match finder is as follows:
1105 * - Figure out where to insert the new entry:
1106 * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
1107 * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
1108 * which row to insert into.
1109 * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
1110 * be considered as a circular buffer with a "head" index that resides in the tagTable.
1111 * - Also insert the "tag" into the equivalent row and position in the tagTable.
1112 * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
1113 * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
1114 * for alignment/performance reasons, leaving some bytes unused.
1115 * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
1116 * generate a bitfield that we can cycle through to check the collisions in the hash table.
1117 * - Pick the longest match.
1119 FORCE_INLINE_TEMPLATE
1120 size_t ZSTD_RowFindBestMatch(
1121 ZSTD_matchState_t
* ms
,
1122 const BYTE
* const ip
, const BYTE
* const iLimit
,
1124 const U32 mls
, const ZSTD_dictMode_e dictMode
,
1127 U32
* const hashTable
= ms
->hashTable
;
1128 U16
* const tagTable
= ms
->tagTable
;
1129 U32
* const hashCache
= ms
->hashCache
;
1130 const U32 hashLog
= ms
->rowHashLog
;
1131 const ZSTD_compressionParameters
* const cParams
= &ms
->cParams
;
1132 const BYTE
* const base
= ms
->window
.base
;
1133 const BYTE
* const dictBase
= ms
->window
.dictBase
;
1134 const U32 dictLimit
= ms
->window
.dictLimit
;
1135 const BYTE
* const prefixStart
= base
+ dictLimit
;
1136 const BYTE
* const dictEnd
= dictBase
+ dictLimit
;
1137 const U32 curr
= (U32
)(ip
-base
);
1138 const U32 maxDistance
= 1U << cParams
->windowLog
;
1139 const U32 lowestValid
= ms
->window
.lowLimit
;
1140 const U32 withinMaxDistance
= (curr
- lowestValid
> maxDistance
) ? curr
- maxDistance
: lowestValid
;
1141 const U32 isDictionary
= (ms
->loadedDictEnd
!= 0);
1142 const U32 lowLimit
= isDictionary
? lowestValid
: withinMaxDistance
;
1143 const U32 rowEntries
= (1U << rowLog
);
1144 const U32 rowMask
= rowEntries
- 1;
1145 const U32 cappedSearchLog
= MIN(cParams
->searchLog
, rowLog
); /* nb of searches is capped at nb entries per row */
1146 U32 nbAttempts
= 1U << cappedSearchLog
;
1149 /* DMS/DDS variables that may be referenced laster */
1150 const ZSTD_matchState_t
* const dms
= ms
->dictMatchState
;
1152 /* Initialize the following variables to satisfy static analyzer */
1154 U32 ddsExtraAttempts
= 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1157 BYTE
* dmsTagRow
= NULL
;
1159 if (dictMode
== ZSTD_dedicatedDictSearch
) {
1160 const U32 ddsHashLog
= dms
->cParams
.hashLog
- ZSTD_LAZY_DDSS_BUCKET_LOG
;
1161 { /* Prefetch DDS hashtable entry */
1162 ddsIdx
= ZSTD_hashPtr(ip
, ddsHashLog
, mls
) << ZSTD_LAZY_DDSS_BUCKET_LOG
;
1163 PREFETCH_L1(&dms
->hashTable
[ddsIdx
]);
1165 ddsExtraAttempts
= cParams
->searchLog
> rowLog
? 1U << (cParams
->searchLog
- rowLog
) : 0;
1168 if (dictMode
== ZSTD_dictMatchState
) {
1169 /* Prefetch DMS rows */
1170 U32
* const dmsHashTable
= dms
->hashTable
;
1171 U16
* const dmsTagTable
= dms
->tagTable
;
1172 U32
const dmsHash
= (U32
)ZSTD_hashPtr(ip
, dms
->rowHashLog
+ ZSTD_ROW_HASH_TAG_BITS
, mls
);
1173 U32
const dmsRelRow
= (dmsHash
>> ZSTD_ROW_HASH_TAG_BITS
) << rowLog
;
1174 dmsTag
= dmsHash
& ZSTD_ROW_HASH_TAG_MASK
;
1175 dmsTagRow
= (BYTE
*)(dmsTagTable
+ dmsRelRow
);
1176 dmsRow
= dmsHashTable
+ dmsRelRow
;
1177 ZSTD_row_prefetch(dmsHashTable
, dmsTagTable
, dmsRelRow
, rowLog
);
1180 /* Update the hashTable and tagTable up to (but not including) ip */
1181 ZSTD_row_update_internal(ms
, ip
, mls
, rowLog
, rowMask
, 1 /* useCache */);
1182 { /* Get the hash for ip, compute the appropriate row */
1183 U32
const hash
= ZSTD_row_nextCachedHash(hashCache
, hashTable
, tagTable
, base
, curr
, hashLog
, rowLog
, mls
);
1184 U32
const relRow
= (hash
>> ZSTD_ROW_HASH_TAG_BITS
) << rowLog
;
1185 U32
const tag
= hash
& ZSTD_ROW_HASH_TAG_MASK
;
1186 U32
* const row
= hashTable
+ relRow
;
1187 BYTE
* tagRow
= (BYTE
*)(tagTable
+ relRow
);
1188 U32
const head
= *tagRow
& rowMask
;
1189 U32 matchBuffer
[ZSTD_ROW_HASH_MAX_ENTRIES
];
1190 size_t numMatches
= 0;
1191 size_t currMatch
= 0;
1192 ZSTD_VecMask matches
= ZSTD_row_getMatchMask(tagRow
, (BYTE
)tag
, head
, rowEntries
);
1194 /* Cycle through the matches and prefetch */
1195 for (; (matches
> 0) && (nbAttempts
> 0); --nbAttempts
, matches
&= (matches
- 1)) {
1196 U32
const matchPos
= (head
+ ZSTD_VecMask_next(matches
)) & rowMask
;
1197 U32
const matchIndex
= row
[matchPos
];
1198 assert(numMatches
< rowEntries
);
1199 if (matchIndex
< lowLimit
)
1201 if ((dictMode
!= ZSTD_extDict
) || matchIndex
>= dictLimit
) {
1202 PREFETCH_L1(base
+ matchIndex
);
1204 PREFETCH_L1(dictBase
+ matchIndex
);
1206 matchBuffer
[numMatches
++] = matchIndex
;
1209 /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
1210 in ZSTD_row_update_internal() at the next search. */
1212 U32
const pos
= ZSTD_row_nextIndex(tagRow
, rowMask
);
1213 tagRow
[pos
+ ZSTD_ROW_HASH_TAG_OFFSET
] = (BYTE
)tag
;
1214 row
[pos
] = ms
->nextToUpdate
++;
1217 /* Return the longest match */
1218 for (; currMatch
< numMatches
; ++currMatch
) {
1219 U32
const matchIndex
= matchBuffer
[currMatch
];
1221 assert(matchIndex
< curr
);
1222 assert(matchIndex
>= lowLimit
);
1224 if ((dictMode
!= ZSTD_extDict
) || matchIndex
>= dictLimit
) {
1225 const BYTE
* const match
= base
+ matchIndex
;
1226 assert(matchIndex
>= dictLimit
); /* ensures this is true if dictMode != ZSTD_extDict */
1227 if (match
[ml
] == ip
[ml
]) /* potentially better */
1228 currentMl
= ZSTD_count(ip
, match
, iLimit
);
1230 const BYTE
* const match
= dictBase
+ matchIndex
;
1231 assert(match
+4 <= dictEnd
);
1232 if (MEM_read32(match
) == MEM_read32(ip
)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
1233 currentMl
= ZSTD_count_2segments(ip
+4, match
+4, iLimit
, dictEnd
, prefixStart
) + 4;
1236 /* Save best solution */
1237 if (currentMl
> ml
) {
1239 *offsetPtr
= STORE_OFFSET(curr
- matchIndex
);
1240 if (ip
+currentMl
== iLimit
) break; /* best possible, avoids read overflow on next attempt */
1245 assert(nbAttempts
<= (1U << ZSTD_SEARCHLOG_MAX
)); /* Check we haven't underflowed. */
1246 if (dictMode
== ZSTD_dedicatedDictSearch
) {
1247 ml
= ZSTD_dedicatedDictSearch_lazy_search(offsetPtr
, ml
, nbAttempts
+ ddsExtraAttempts
, dms
,
1248 ip
, iLimit
, prefixStart
, curr
, dictLimit
, ddsIdx
);
1249 } else if (dictMode
== ZSTD_dictMatchState
) {
1250 /* TODO: Measure and potentially add prefetching to DMS */
1251 const U32 dmsLowestIndex
= dms
->window
.dictLimit
;
1252 const BYTE
* const dmsBase
= dms
->window
.base
;
1253 const BYTE
* const dmsEnd
= dms
->window
.nextSrc
;
1254 const U32 dmsSize
= (U32
)(dmsEnd
- dmsBase
);
1255 const U32 dmsIndexDelta
= dictLimit
- dmsSize
;
1257 { U32
const head
= *dmsTagRow
& rowMask
;
1258 U32 matchBuffer
[ZSTD_ROW_HASH_MAX_ENTRIES
];
1259 size_t numMatches
= 0;
1260 size_t currMatch
= 0;
1261 ZSTD_VecMask matches
= ZSTD_row_getMatchMask(dmsTagRow
, (BYTE
)dmsTag
, head
, rowEntries
);
1263 for (; (matches
> 0) && (nbAttempts
> 0); --nbAttempts
, matches
&= (matches
- 1)) {
1264 U32
const matchPos
= (head
+ ZSTD_VecMask_next(matches
)) & rowMask
;
1265 U32
const matchIndex
= dmsRow
[matchPos
];
1266 if (matchIndex
< dmsLowestIndex
)
1268 PREFETCH_L1(dmsBase
+ matchIndex
);
1269 matchBuffer
[numMatches
++] = matchIndex
;
1272 /* Return the longest match */
1273 for (; currMatch
< numMatches
; ++currMatch
) {
1274 U32
const matchIndex
= matchBuffer
[currMatch
];
1276 assert(matchIndex
>= dmsLowestIndex
);
1277 assert(matchIndex
< curr
);
1279 { const BYTE
* const match
= dmsBase
+ matchIndex
;
1280 assert(match
+4 <= dmsEnd
);
1281 if (MEM_read32(match
) == MEM_read32(ip
))
1282 currentMl
= ZSTD_count_2segments(ip
+4, match
+4, iLimit
, dmsEnd
, prefixStart
) + 4;
1285 if (currentMl
> ml
) {
1287 assert(curr
> matchIndex
+ dmsIndexDelta
);
1288 *offsetPtr
= STORE_OFFSET(curr
- (matchIndex
+ dmsIndexDelta
));
1289 if (ip
+currentMl
== iLimit
) break;
1299 * Generate search functions templated on (dictMode, mls, rowLog).
1300 * These functions are outlined for code size & compilation time.
1301 * ZSTD_searchMax() dispatches to the correct implementation function.
1303 * TODO: The start of the search function involves loading and calculating a
1304 * bunch of constants from the ZSTD_matchState_t. These computations could be
1305 * done in an initialization function, and saved somewhere in the match state.
1306 * Then we could pass a pointer to the saved state instead of the match state,
1307 * and avoid duplicate computations.
1309 * TODO: Move the match re-winding into searchMax. This improves compression
1310 * ratio, and unlocks further simplifications with the next TODO.
1312 * TODO: Try moving the repcode search into searchMax. After the re-winding
1313 * and repcode search are in searchMax, there is no more logic in the match
1314 * finder loop that requires knowledge about the dictMode. So we should be
1315 * able to avoid force inlining it, and we can join the extDict loop with
1316 * the single segment loop. It should go in searchMax instead of its own
1317 * function to avoid having multiple virtual function calls per search.
1320 #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
1321 #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
1322 #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
1324 #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
1326 #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
1327 ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
1328 ZSTD_matchState_t* ms, \
1329 const BYTE* ip, const BYTE* const iLimit, \
1330 size_t* offBasePtr) \
1332 assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1333 return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
1336 #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
1337 ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
1338 ZSTD_matchState_t* ms, \
1339 const BYTE* ip, const BYTE* const iLimit, \
1340 size_t* offsetPtr) \
1342 assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1343 return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1346 #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1347 ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
1348 ZSTD_matchState_t* ms, \
1349 const BYTE* ip, const BYTE* const iLimit, \
1350 size_t* offsetPtr) \
1352 assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1353 assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
1354 return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
1357 #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1358 X(dictMode, mls, 4) \
1359 X(dictMode, mls, 5) \
1362 #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
1363 ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
1364 ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
1365 ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
1367 #define ZSTD_FOR_EACH_MLS(X, dictMode) \
1372 #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
1373 X(__VA_ARGS__, noDict) \
1374 X(__VA_ARGS__, extDict) \
1375 X(__VA_ARGS__, dictMatchState) \
1376 X(__VA_ARGS__, dedicatedDictSearch)
1378 /* Generate row search fns for each combination of (dictMode, mls, rowLog) */
1379 ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG
, GEN_ZSTD_ROW_SEARCH_FN
)
1380 /* Generate binary Tree search fns for each combination of (dictMode, mls) */
1381 ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS
, GEN_ZSTD_BT_SEARCH_FN
)
1382 /* Generate hash chain search fns for each combination of (dictMode, mls) */
1383 ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS
, GEN_ZSTD_HC_SEARCH_FN
)
1385 typedef enum { search_hashChain
=0, search_binaryTree
=1, search_rowHash
=2 } searchMethod_e
;
1387 #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
1389 return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1390 #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
1392 return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1393 #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1395 return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
#define ZSTD_SWITCH_MLS(X, dictMode)   \
    switch (mls) {                     \
        ZSTD_FOR_EACH_MLS(X, dictMode) \
    }

#define ZSTD_SWITCH_ROWLOG(dictMode, mls)                                    \
    case mls:                                                                \
        switch (rowLog) {                                                    \
            ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
        }                                                                    \
        ZSTD_UNREACHABLE;                                                    \
        break;

#define ZSTD_SWITCH_SEARCH_METHOD(dictMode)                       \
    switch (searchMethod) {                                       \
        case search_hashChain:                                    \
            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
            break;                                                \
        case search_binaryTree:                                   \
            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
            break;                                                \
        case search_rowHash:                                      \
            ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode)         \
            break;                                                \
    }                                                             \
    ZSTD_UNREACHABLE;

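/* Illustration only (assumes searchMethod and dictMode are compile-time
 * constants at every call site, as the NOTE below states): for
 * (search_rowHash, noDict) the dispatch above is expected to fold down to
 *
 *   switch (mls) { case 4: switch (rowLog) { case 4:
 *       return ZSTD_RowFindBestMatch_noDict_4_4(ms, ip, iend, offsetPtr); ... } ... }
 *
 * i.e. only the mls/rowLog switches survive after inlining. */
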
/**
 * Searches for the longest match at @p ip.
 * Dispatches to the correct implementation function based on the
 * (searchMethod, dictMode, mls, rowLog). We use switch statements
 * here instead of using an indirect function call through a function
 * pointer because after Spectre and Meltdown mitigations, indirect
 * function calls can be very costly, especially in the kernel.
 *
 * NOTE: dictMode and searchMethod should be templated, so those switch
 * statements should be optimized out. Only the mls & rowLog switches
 * should be left.
 *
 * @param ms The match state.
 * @param ip The position to search at.
 * @param iend The end of the input data.
 * @param[out] offsetPtr Stores the match offset into this pointer.
 * @param mls The minimum search length, in the range [4, 6].
 * @param rowLog The row log (if applicable), in the range [4, 6].
 * @param searchMethod The search method to use (templated).
 * @param dictMode The dictMode (templated).
 *
 * @returns The length of the longest match found, or < mls if no match is found.
 * If a match is found its offset is stored in @p offsetPtr.
 */
FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
    ZSTD_matchState_t* ms,
    const BYTE* ip,
    const BYTE* iend,
    size_t* offsetPtr,
    U32 const mls,
    U32 const rowLog,
    searchMethod_e const searchMethod,
    ZSTD_dictMode_e const dictMode)
{
    if (dictMode == ZSTD_noDict) {
        ZSTD_SWITCH_SEARCH_METHOD(noDict)
    } else if (dictMode == ZSTD_extDict) {
        ZSTD_SWITCH_SEARCH_METHOD(extDict)
    } else if (dictMode == ZSTD_dictMatchState) {
        ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
    } else if (dictMode == ZSTD_dedicatedDictSearch) {
        ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
    }
    ZSTD_UNREACHABLE;
    return 0;
}

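/* Usage sketch (illustration only, mirroring the call sites further below):
 *
 *   size_t offsetFound = 999999999;
 *   size_t const ml = ZSTD_searchMax(ms, ip, iend, &offsetFound,
 *                                    mls, rowLog, searchMethod, dictMode);
 *   if (ml >= 4) { ... accept a match of length ml with offset code offsetFound ... }
 *
 * searchMethod and dictMode are always passed as compile-time constants, so
 * the dispatch is expected to reduce to the mls/rowLog switches only. */
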
/* *******************************
*  Common parser - lazy strategy
*********************************/

FORCE_INLINE_TEMPLATE size_t
ZSTD_compressBlock_lazy_generic(
                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
                        U32 rep[ZSTD_REP_NUM],
                        const void* src, size_t srcSize,
                        const searchMethod_e searchMethod, const U32 depth,
                        ZSTD_dictMode_e const dictMode)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
    const BYTE* const base = ms->window.base;
    const U32 prefixLowestIndex = ms->window.dictLimit;
    const BYTE* const prefixLowest = base + prefixLowestIndex;
    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;

    const int isDMS = dictMode == ZSTD_dictMatchState;
    const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
    const int isDxS = isDMS || isDDS;
    const ZSTD_matchState_t* const dms = ms->dictMatchState;
    const U32 dictLowestIndex      = isDxS ? dms->window.dictLimit : 0;
    const BYTE* const dictBase     = isDxS ? dms->window.base : NULL;
    const BYTE* const dictLowest   = isDxS ? dictBase + dictLowestIndex : NULL;
    const BYTE* const dictEnd      = isDxS ? dms->window.nextSrc : NULL;
    const U32 dictIndexDelta       = isDxS ?
                                     prefixLowestIndex - (U32)(dictEnd - dictBase) :
                                     0;
    const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));

    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
    ip += (dictAndPrefixLength == 0);
    if (dictMode == ZSTD_noDict) {
        U32 const curr = (U32)(ip - base);
        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
        U32 const maxRep = curr - windowLow;
        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
    }
    if (isDxS) {
        /* dictMatchState repCode checks don't currently handle repCode == 0
         * disabling. */
        assert(offset_1 <= dictAndPrefixLength);
        assert(offset_2 <= dictAndPrefixLength);
    }

    if (searchMethod == search_rowHash) {
        ZSTD_row_fillHashCache(ms, base, rowLog,
                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
                            ms->nextToUpdate, ilimit);
    }

    /* Match Loop */
#if defined(__x86_64__)
    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
     */
    __asm__(".p2align 5");
#endif
    while (ip < ilimit) {
        size_t matchLength=0;
        size_t offcode=STORE_REPCODE_1;
        const BYTE* start=ip+1;
        DEBUGLOG(7, "search baseline (depth 0)");

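        /* Note (descriptive, illustration of the trick used in the repcode
         * checks below): `(U32)((prefixLowestIndex-1) - repIndex) >= 3` relies
         * on intentional unsigned underflow. It rejects repIndex values in
         * [prefixLowestIndex-3, prefixLowestIndex-1], i.e. candidates whose
         * 4-byte comparison would straddle the dictionary/prefix boundary,
         * while any in-prefix repIndex wraps to a large value and passes. */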
        /* check repCode */
        if (isDxS) {
            const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
            const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
                                && repIndex < prefixLowestIndex) ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
            if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
                && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
                const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                if (depth==0) goto _storeSequence;
            }
        }
        if ( dictMode == ZSTD_noDict
          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
            if (depth==0) goto _storeSequence;
        }

        /* first search (depth 0) */
        {   size_t offsetFound = 999999999;
            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offcode=offsetFound;
        }

        if (matchLength < 4) {
            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            continue;
        }

        /* let's try to find a better solution */
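        /* Worked example of the gain heuristic used below (illustration only):
         * a candidate of length ml2 is scored as ml2*4 - highbit(OFFBASE), and
         * only replaces the current match if it beats the current score plus a
         * small constant. E.g. with a current match of length 6 whose OFFBASE
         * has highbit 10, a new candidate of length 8 with OFFBASE highbit 16
         * scores 32-16 = 16 against 24-10+4 = 18, so the shorter but closer
         * match is kept. */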
        if (depth>=1)
        while (ip<ilimit) {
            DEBUGLOG(7, "search depth 1");
            ip ++;
            if ( (dictMode == ZSTD_noDict)
              && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                int const gain2 = (int)(mlRep * 3);
                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                if ((mlRep >= 4) && (gain2 > gain1))
                    matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
            }
            if (isDxS) {
                const U32 repIndex = (U32)(ip - base) - offset_1;
                const BYTE* repMatch = repIndex < prefixLowestIndex ?
                               dictBase + (repIndex - dictIndexDelta) :
                               base + repIndex;
                if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                    const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                    size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                    int const gain2 = (int)(mlRep * 3);
                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                    if ((mlRep >= 4) && (gain2 > gain1))
                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
                }
            }
            {   size_t offset2=999999999;
                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
                    matchLength = ml2, offcode = offset2, start = ip;
                    continue;   /* search a better one */
            }   }

            /* let's find an even better one */
            if ((depth==2) && (ip<ilimit)) {
                DEBUGLOG(7, "search depth 2");
                ip ++;
                if ( (dictMode == ZSTD_noDict)
                  && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                    size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                    int const gain2 = (int)(mlRep * 4);
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                    if ((mlRep >= 4) && (gain2 > gain1))
                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
                }
                if (isDxS) {
                    const U32 repIndex = (U32)(ip - base) - offset_1;
                    const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
                    if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
                        && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                        const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                        size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                        int const gain2 = (int)(mlRep * 4);
                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                        if ((mlRep >= 4) && (gain2 > gain1))
                            matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
                    }
                }
                {   size_t offset2=999999999;
                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
                        matchLength = ml2, offcode = offset2, start = ip;
                        continue;
            }   }   }
            break;  /* nothing found : store previous solution */
        }

        /* NOTE:
         * Pay attention that `start[-value]` can lead to strange undefined behavior
         * notably if `value` is unsigned, resulting in a large positive `-value`.
         */
        /* catch up */
        if (STORED_IS_OFFSET(offcode)) {
            if (dictMode == ZSTD_noDict) {
                while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
                     && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) )  /* only search for offset within prefix */
                    { start--; matchLength++; }
            }
            if (isDxS) {
                U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
                const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
                const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
                while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
            }
            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
        }
        /* store sequence */
_storeSequence:
        {   size_t const litLength = (size_t)(start - anchor);
            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
            anchor = ip = start + matchLength;
        }

        /* check immediate repcode */
        if (isDxS) {
            while (ip <= ilimit) {
                U32 const current2 = (U32)(ip-base);
                U32 const repIndex = current2 - offset_2;
                const BYTE* repMatch = repIndex < prefixLowestIndex ?
                        dictBase - dictIndexDelta + repIndex :
                        base + repIndex;
                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
                   && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                    const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                    matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
                    offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset_2 <=> offset_1 */
                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
                    ip += matchLength;
                    anchor = ip;
                    continue;
                }
                break;
            }
        }

        if (dictMode == ZSTD_noDict) {
            while ( ((ip <= ilimit) & (offset_2>0))
                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                /* store sequence */
                matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
    }   }   }

    /* Save reps for next block */
    rep[0] = offset_1 ? offset_1 : savedOffset;
    rep[1] = offset_2 ? offset_2 : savedOffset;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}

size_t ZSTD_compressBlock_btlazy2(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy2(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
}

size_t ZSTD_compressBlock_greedy(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
}

size_t ZSTD_compressBlock_btlazy2_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy2_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_greedy_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
}

/* Row-based matchfinder */
size_t ZSTD_compressBlock_lazy2_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
}

size_t ZSTD_compressBlock_greedy_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy_dictMatchState_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_greedy_dictMatchState_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
}

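/* Note (descriptive, added for clarity): the extDict variant below mirrors
 * ZSTD_compressBlock_lazy_generic, but match candidates may reside in the
 * ext-dict segment (ms->window.dictBase), so every repcode and catch-up test
 * selects between dictBase and base, and the lowest valid index is recomputed
 * per position with ZSTD_getLowestMatchIndex(). */
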
FORCE_INLINE_TEMPLATE
size_t ZSTD_compressBlock_lazy_extDict_generic(
                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
                        U32 rep[ZSTD_REP_NUM],
                        const void* src, size_t srcSize,
                        const searchMethod_e searchMethod, const U32 depth)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
    const BYTE* const base = ms->window.base;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const dictEnd  = dictBase + dictLimit;
    const BYTE* const dictStart  = dictBase + ms->window.lowLimit;
    const U32 windowLog = ms->cParams.windowLog;
    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

    U32 offset_1 = rep[0], offset_2 = rep[1];

    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

    /* init */
    ip += (ip == prefixStart);
    if (searchMethod == search_rowHash) {
        ZSTD_row_fillHashCache(ms, base, rowLog,
                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
                               ms->nextToUpdate, ilimit);
    }

    /* Match Loop */
#if defined(__x86_64__)
    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
     */
    __asm__(".p2align 5");
#endif
    while (ip < ilimit) {
        size_t matchLength=0;
        size_t offcode=STORE_REPCODE_1;
        const BYTE* start=ip+1;
        U32 curr = (U32)(ip-base);

        /* check repCode */
        {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
            const U32 repIndex = (U32)(curr+1 - offset_1);
            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
               & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
                /* repcode detected we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                if (depth==0) goto _storeSequence;
        }   }

        /* first search (depth 0) */
        {   size_t offsetFound = 999999999;
            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offcode=offsetFound;
        }

        if (matchLength < 4) {
            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            continue;
        }

        /* let's try to find a better solution */
        if (depth>=1)
        while (ip<ilimit) {
            ip ++;
            curr++;
            /* check repCode */
            if (offcode) {
                const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                const U32 repIndex = (U32)(curr - offset_1);
                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                const BYTE* const repMatch = repBase + repIndex;
                if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
                   & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                if (MEM_read32(ip) == MEM_read32(repMatch)) {
                    /* repcode detected */
                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                    int const gain2 = (int)(repLength * 3);
                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                    if ((repLength >= 4) && (gain2 > gain1))
                        matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
            }   }

            /* search match, depth 1 */
            {   size_t offset2=999999999;
                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
                    matchLength = ml2, offcode = offset2, start = ip;
                    continue;   /* search a better one */
            }   }

            /* let's find an even better one */
            if ((depth==2) && (ip<ilimit)) {
                ip ++;
                curr++;
                /* check repCode */
                if (offcode) {
                    const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                    const U32 repIndex = (U32)(curr - offset_1);
                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                    const BYTE* const repMatch = repBase + repIndex;
                    if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
                       & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
                        /* repcode detected */
                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                        int const gain2 = (int)(repLength * 4);
                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                        if ((repLength >= 4) && (gain2 > gain1))
                            matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
                }   }

                /* search match, depth 2 */
                {   size_t offset2=999999999;
                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
                        matchLength = ml2, offcode = offset2, start = ip;
                        continue;
            }   }   }
            break;  /* nothing found : store previous solution */
        }

        /* catch up */
        if (STORED_IS_OFFSET(offcode)) {
            U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
        }

        /* store sequence */
_storeSequence:
        {   size_t const litLength = (size_t)(start - anchor);
            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
            anchor = ip = start + matchLength;
        }

        /* check immediate repcode */
        while (ip <= ilimit) {
            const U32 repCurrent = (U32)(ip-base);
            const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
            const U32 repIndex = repCurrent - offset_2;
            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
               & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
            if (MEM_read32(ip) == MEM_read32(repMatch)) {
                /* repcode detected we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset history */
                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
            }
            break;
    }   }

    /* Save reps for next block */
    rep[0] = offset_1;
    rep[1] = offset_2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}

size_t ZSTD_compressBlock_greedy_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
}

size_t ZSTD_compressBlock_lazy_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
}

size_t ZSTD_compressBlock_lazy2_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
}

size_t ZSTD_compressBlock_btlazy2_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
}

size_t ZSTD_compressBlock_greedy_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
}

size_t ZSTD_compressBlock_lazy_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
}

size_t ZSTD_compressBlock_lazy2_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
}