/*
 * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */
/* zstd_decompress_block :
 * this module takes care of decompressing _compressed_ blocks */
/*-*******************************************************
*  Dependencies
*********************************************************/
#include <string.h>      /* memcpy, memmove, memset */
#include "../common/compiler.h"    /* prefetch */
#include "../common/cpu.h"         /* bmi2 */
#include "../common/mem.h"         /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "../common/fse.h"
#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include "../common/zstd_internal.h"
#include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
#include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
#include "zstd_decompress_block.h"
/*_*******************************************************
*  Macros
**********************************************************/

/* These two optional macros force the use of one or the other of the two
 * ZSTD_decompressSequences implementations. You can't force both directions
 * at the same time.
 */
#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
#endif
/*_*******************************************************
*  Memory operations
**********************************************************/
static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
/*-*************************************************************
 *   Block decoding
 ***************************************************************/

/*! ZSTD_getcBlockSize() :
 *  Provides the size of compressed block from block header `src` */
size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                          blockProperties_t* bpPtr)
{
    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");

    {   U32 const cBlockHeader = MEM_readLE24(src);
        U32 const cSize = cBlockHeader >> 3;
        bpPtr->lastBlock = cBlockHeader & 1;
        bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
        bpPtr->origSize = cSize;   /* only useful for RLE */
        if (bpPtr->blockType == bt_rle) return 1;
        RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
        return cSize;
    }
}
/* Hidden declaration for fullbench */
size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize);

/*! ZSTD_decodeLiteralsBlock() :
 * @return : nb of bytes read from src (< srcSize)
 *  note : symbol not declared but exposed for fullbench */
size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
{
    DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");

    {   const BYTE* const istart = (const BYTE*) src;
        symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);

        switch(litEncType)
        {
        case set_repeat:
            DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
            RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
            /* fall-through */

        case set_compressed:
            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
            {   size_t lhSize, litSize, litCSize;
                U32 singleStream=0;
                U32 const lhlCode = (istart[0] >> 2) & 3;
                U32 const lhc = MEM_readLE32(istart);
                size_t hufSuccess;
                switch(lhlCode)
                {
                case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
                    /* 2 - 2 - 10 - 10 */
                    singleStream = !lhlCode;
                    lhSize = 3;
                    litSize  = (lhc >> 4) & 0x3FF;
                    litCSize = (lhc >> 14) & 0x3FF;
                    break;
                case 2:
                    /* 2 - 2 - 14 - 14 */
                    lhSize = 4;
                    litSize  = (lhc >> 4) & 0x3FFF;
                    litCSize = lhc >> 18;
                    break;
                case 3:
                    /* 2 - 2 - 18 - 18 */
                    lhSize = 5;
                    litSize  = (lhc >> 4) & 0x3FFFF;
                    litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
                    break;
                }
                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
                RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");

                /* prefetch huffman table if cold */
                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
                }

                if (litEncType==set_repeat) {
                    if (singleStream) {
                        hufSuccess = HUF_decompress1X_usingDTable_bmi2(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, dctx->bmi2);
                    } else {
                        hufSuccess = HUF_decompress4X_usingDTable_bmi2(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, dctx->bmi2);
                    }
                } else {
                    if (singleStream) {
#if defined(HUF_FORCE_DECOMPRESS_X2)
                        hufSuccess = HUF_decompress1X_DCtx_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace));
#else
                        hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), dctx->bmi2);
#endif
                    } else {
                        hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), dctx->bmi2);
                    }
                }

                RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                dctx->litEntropy = 1;
                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
                return litCSize + lhSize;
            }

        case set_basic:
            {   size_t litSize, lhSize;
                U32 const lhlCode = ((istart[0]) >> 2) & 3;
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    litSize = MEM_readLE24(istart) >> 4;
                    break;
                }

                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
                    RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
                    memcpy(dctx->litBuffer, istart+lhSize, litSize);
                    dctx->litPtr = dctx->litBuffer;
                    dctx->litSize = litSize;
                    memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
                    return lhSize+litSize;
                }
                /* direct reference into compressed stream */
                dctx->litPtr = istart+lhSize;
                dctx->litSize = litSize;
                return lhSize+litSize;
            }

        case set_rle:
            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t litSize, lhSize;
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    litSize = MEM_readLE24(istart) >> 4;
                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                    break;
                }
                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
                memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                return lhSize+1;
            }

        default:
            RETURN_ERROR(corruption_detected, "impossible");
        }
    }
}
/* Default FSE distribution tables.
 * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
 * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
 * They were generated programmatically with the following method :
 * - start from default distributions, present in /lib/common/zstd_internal.h
 * - generate tables normally, using ZSTD_buildFSETable()
 * - printout the content of tables
 * - prettify output, report below, test with fuzzer to ensure it's correct */
/* Default FSE distribution table for Literal Lengths */
static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
     {  1,  1,  1, LL_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
     /* nextState, nbAddBits, nbBits, baseVal */
     { 0, 0, 4, 0}, { 16, 0, 4, 0},
     { 32, 0, 5, 1}, { 0, 0, 5, 3},
     { 0, 0, 5, 4}, { 0, 0, 5, 6},
     { 0, 0, 5, 7}, { 0, 0, 5, 9},
     { 0, 0, 5, 10}, { 0, 0, 5, 12},
     { 0, 0, 6, 14}, { 0, 1, 5, 16},
     { 0, 1, 5, 20}, { 0, 1, 5, 22},
     { 0, 2, 5, 28}, { 0, 3, 5, 32},
     { 0, 4, 5, 48}, { 32, 6, 5, 64},
     { 0, 7, 5, 128}, { 0, 8, 6, 256},
     { 0, 10, 6, 1024}, { 0, 12, 6, 4096},
     { 32, 0, 4, 0}, { 0, 0, 4, 1},
     { 0, 0, 5, 2}, { 32, 0, 5, 4},
     { 0, 0, 5, 5}, { 32, 0, 5, 7},
     { 0, 0, 5, 8}, { 32, 0, 5, 10},
     { 0, 0, 5, 11}, { 0, 0, 6, 13},
     { 32, 1, 5, 16}, { 0, 1, 5, 18},
     { 32, 1, 5, 22}, { 0, 2, 5, 24},
     { 32, 3, 5, 32}, { 0, 3, 5, 40},
     { 0, 6, 4, 64}, { 16, 6, 4, 64},
     { 32, 7, 5, 128}, { 0, 9, 6, 512},
     { 0, 11, 6, 2048}, { 48, 0, 4, 0},
     { 16, 0, 4, 1}, { 32, 0, 5, 2},
     { 32, 0, 5, 3}, { 32, 0, 5, 5},
     { 32, 0, 5, 6}, { 32, 0, 5, 8},
     { 32, 0, 5, 9}, { 32, 0, 5, 11},
     { 32, 0, 5, 12}, { 0, 0, 6, 15},
     { 32, 1, 5, 18}, { 32, 1, 5, 20},
     { 32, 2, 5, 24}, { 32, 2, 5, 28},
     { 32, 3, 5, 40}, { 32, 4, 5, 48},
     { 0, 16, 6,65536}, { 0, 15, 6,32768},
     { 0, 14, 6,16384}, { 0, 13, 6, 8192},
};   /* LL_defaultDTable */
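/* Reading example (illustrative, not part of the original source) :
 * the entry { 0, 7, 5, 128} above decodes a literal length as baseValue 128
 * plus 7 additional bits, i.e. a value in [128, 255]; the decoder then reads
 * 5 bits and adds them to nextState 0 to select the next table cell.
 * The first line is the table header { fastMode, tableLog }. */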
/* Default FSE distribution table for Offset Codes */
static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, OF_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    { 0, 0, 5, 0}, { 0, 6, 4, 61},
    { 0, 9, 5, 509}, { 0, 15, 5,32765},
    { 0, 21, 5,2097149}, { 0, 3, 5, 5},
    { 0, 7, 4, 125}, { 0, 12, 5, 4093},
    { 0, 18, 5,262141}, { 0, 23, 5,8388605},
    { 0, 5, 5, 29}, { 0, 8, 4, 253},
    { 0, 14, 5,16381}, { 0, 20, 5,1048573},
    { 0, 2, 5, 1}, { 16, 7, 4, 125},
    { 0, 11, 5, 2045}, { 0, 17, 5,131069},
    { 0, 22, 5,4194301}, { 0, 4, 5, 13},
    { 16, 8, 4, 253}, { 0, 13, 5, 8189},
    { 0, 19, 5,524285}, { 0, 1, 5, 1},
    { 16, 6, 4, 61}, { 0, 10, 5, 1021},
    { 0, 16, 5,65533}, { 0, 28, 5,268435453},
    { 0, 27, 5,134217725}, { 0, 26, 5,67108861},
    { 0, 25, 5,33554429}, { 0, 24, 5,16777213},
};   /* OF_defaultDTable */
/* Default FSE distribution table for Match Lengths */
static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, ML_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    { 0, 0, 6, 3}, { 0, 0, 4, 4},
    { 32, 0, 5, 5}, { 0, 0, 5, 6},
    { 0, 0, 5, 8}, { 0, 0, 5, 9},
    { 0, 0, 5, 11}, { 0, 0, 6, 13},
    { 0, 0, 6, 16}, { 0, 0, 6, 19},
    { 0, 0, 6, 22}, { 0, 0, 6, 25},
    { 0, 0, 6, 28}, { 0, 0, 6, 31},
    { 0, 0, 6, 34}, { 0, 1, 6, 37},
    { 0, 1, 6, 41}, { 0, 2, 6, 47},
    { 0, 3, 6, 59}, { 0, 4, 6, 83},
    { 0, 7, 6, 131}, { 0, 9, 6, 515},
    { 16, 0, 4, 4}, { 0, 0, 4, 5},
    { 32, 0, 5, 6}, { 0, 0, 5, 7},
    { 32, 0, 5, 9}, { 0, 0, 5, 10},
    { 0, 0, 6, 12}, { 0, 0, 6, 15},
    { 0, 0, 6, 18}, { 0, 0, 6, 21},
    { 0, 0, 6, 24}, { 0, 0, 6, 27},
    { 0, 0, 6, 30}, { 0, 0, 6, 33},
    { 0, 1, 6, 35}, { 0, 1, 6, 39},
    { 0, 2, 6, 43}, { 0, 3, 6, 51},
    { 0, 4, 6, 67}, { 0, 5, 6, 99},
    { 0, 8, 6, 259}, { 32, 0, 4, 4},
    { 48, 0, 4, 4}, { 16, 0, 4, 5},
    { 32, 0, 5, 7}, { 32, 0, 5, 8},
    { 32, 0, 5, 10}, { 32, 0, 5, 11},
    { 0, 0, 6, 14}, { 0, 0, 6, 17},
    { 0, 0, 6, 20}, { 0, 0, 6, 23},
    { 0, 0, 6, 26}, { 0, 0, 6, 29},
    { 0, 0, 6, 32}, { 0, 16, 6,65539},
    { 0, 15, 6,32771}, { 0, 14, 6,16387},
    { 0, 13, 6, 8195}, { 0, 12, 6, 4099},
    { 0, 11, 6, 2051}, { 0, 10, 6, 1027},
};   /* ML_defaultDTable */
static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
{
    void* ptr = dt;
    ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
    ZSTD_seqSymbol* const cell = dt + 1;

    DTableH->tableLog = 0;
    DTableH->fastMode = 0;

    cell->nbBits = 0;
    cell->nextState = 0;
    assert(nbAddBits < 255);
    cell->nbAdditionalBits = (BYTE)nbAddBits;
    cell->baseValue = baseValue;
}
/* ZSTD_buildFSETable() :
 * generate FSE decoding table for one symbol (ll, ml or off)
 * cannot fail if input is valid =>
 * all inputs are presumed validated at this stage */
void
ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U32* nbAdditionalBits,
            unsigned tableLog)
{
    ZSTD_seqSymbol* const tableDecode = dt+1;
    U16 symbolNext[MaxSeq+1];

    U32 const maxSV1 = maxSymbolValue + 1;
    U32 const tableSize = 1 << tableLog;
    U32 highThreshold = tableSize-1;

    /* Sanity Checks */
    assert(maxSymbolValue <= MaxSeq);
    assert(tableLog <= MaxFSELog);

    /* Init, lay down lowprob symbols */
    {   ZSTD_seqSymbol_header DTableH;
        DTableH.tableLog = tableLog;
        DTableH.fastMode = 1;
        {   S16 const largeLimit = (S16)(1 << (tableLog-1));
            U32 s;
            for (s=0; s<maxSV1; s++) {
                if (normalizedCounter[s]==-1) {
                    tableDecode[highThreshold--].baseValue = s;
                    symbolNext[s] = 1;
                } else {
                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
                    assert(normalizedCounter[s]>=0);
                    symbolNext[s] = (U16)normalizedCounter[s];
        }   }   }
        memcpy(dt, &DTableH, sizeof(DTableH));
    }

    /* Spread symbols */
    {   U32 const tableMask = tableSize-1;
        U32 const step = FSE_TABLESTEP(tableSize);
        U32 s, position = 0;
        for (s=0; s<maxSV1; s++) {
            int i;
            for (i=0; i<normalizedCounter[s]; i++) {
                tableDecode[position].baseValue = s;
                position = (position + step) & tableMask;
                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
        }   }
        assert(position == 0);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
    }

    /* Build Decoding table */
    {   U32 u;
        for (u=0; u<tableSize; u++) {
            U32 const symbol = tableDecode[u].baseValue;
            U32 const nextState = symbolNext[symbol]++;
            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
            tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
            assert(nbAdditionalBits[symbol] < 255);
            tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
            tableDecode[u].baseValue = baseValue[symbol];
    }   }
}
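/* Worked example (illustrative, not part of the original source) :
 * with tableLog==6 (tableSize==64), a symbol whose normalizedCounter is 2
 * starts with symbolNext==2. Its first cell gets nextState==2 :
 * nbBits = 6 - highbit32(2) = 5 and nextState' = (2<<5) - 64 = 0; its second
 * cell gets nextState==3 : nbBits = 5 and nextState' = (3<<5) - 64 = 32.
 * Each of the 2 cells thus reads 5 bits and maps into its own half of the
 * 64-state range, which is exactly the FSE state-machine invariant. */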
/*! ZSTD_buildSeqTable() :
 * @return : nb bytes read from src,
 *           or an error code if it fails */
static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
                                 symbolEncodingType_e type, unsigned max, U32 maxLog,
                                 const void* src, size_t srcSize,
                                 const U32* baseValue, const U32* nbAdditionalBits,
                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
                                 int ddictIsCold, int nbSeq)
{
    switch(type)
    {
    case set_rle :
        RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
        {   U32 const symbol = *(const BYTE*)src;
            U32 const baseline = baseValue[symbol];
            U32 const nbBits = nbAdditionalBits[symbol];
            ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
        }
        *DTablePtr = DTableSpace;
        return 1;
    case set_basic :
        *DTablePtr = defaultTable;
        return 0;
    case set_repeat:
        RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
        /* prefetch FSE table if used */
        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
            const void* const pStart = *DTablePtr;
            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
            PREFETCH_AREA(pStart, pSize);
        }
        return 0;
    case set_compressed :
        {   unsigned tableLog;
            S16 norm[MaxSeq+1];
            size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
            RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
            RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
            ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
            *DTablePtr = DTableSpace;
            return headerSize;
        }
    default :
        assert(0);
        RETURN_ERROR(GENERIC, "impossible");
    }
}
size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                             const void* src, size_t srcSize)
{
    const BYTE* const istart = (const BYTE* const)src;
    const BYTE* const iend = istart + srcSize;
    const BYTE* ip = istart;
    int nbSeq;
    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

    /* check */
    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");

    /* SeqHead */
    nbSeq = *ip++;
    if (!nbSeq) {
        *nbSeqPtr=0;
        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
        return 1;
    }
    if (nbSeq > 0x7F) {
        if (nbSeq == 0xFF) {
            RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
            nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
        } else {
            RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
        }
    }
    *nbSeqPtr = nbSeq;

    /* FSE table descriptors */
    RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, "");   /* minimum possible size: 1 byte for symbol encoding types */
    {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
        symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
        symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
        ip++;

        /* Build DTables */
        {   size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
                                                      LLtype, MaxLL, LLFSELog,
                                                      ip, iend-ip,
                                                      LL_base, LL_bits,
                                                      LL_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq);
            RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += llhSize;
        }

        {   size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
                                                      OFtype, MaxOff, OffFSELog,
                                                      ip, iend-ip,
                                                      OF_base, OF_bits,
                                                      OF_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq);
            RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += ofhSize;
        }

        {   size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
                                                      MLtype, MaxML, MLFSELog,
                                                      ip, iend-ip,
                                                      ML_base, ML_bits,
                                                      ML_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq);
            RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += mlhSize;
        }
    }

    return ip-istart;
}
typedef struct {
    size_t state;
    const ZSTD_seqSymbol* table;
} ZSTD_fseState;

typedef struct {
    BIT_DStream_t DStream;
    ZSTD_fseState stateLL;
    ZSTD_fseState stateOffb;
    ZSTD_fseState stateML;
    size_t prevOffset[ZSTD_REP_NUM];
} seqState_t;
/*! ZSTD_overlapCopy8() :
 *  Copies 8 bytes from ip to op and updates op and ip where ip <= op.
 *  If the offset is < 8 then the offset is spread to at least 8 bytes.
 *
 *  Precondition: *ip <= *op
 *  Postcondition: *op - *ip >= 8
 */
HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
    assert(*ip <= *op);
    if (offset < 8) {
        /* close range match, overlap */
        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
        int const sub2 = dec64table[offset];
        (*op)[0] = (*ip)[0];
        (*op)[1] = (*ip)[1];
        (*op)[2] = (*ip)[2];
        (*op)[3] = (*ip)[3];
        *ip += dec32table[offset];
        ZSTD_copy4(*op+4, *ip);
        *ip -= sub2;
    } else {
        ZSTD_copy8(*op, *ip);
    }
    *ip += 8;
    *op += 8;
    assert(*op - *ip >= 8);
}
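/* Worked example (illustrative, not part of the original source) :
 * with offset==2 and a repeating 2-byte pattern, the first ZSTD_copy4()
 * writes pattern bytes {0,1,0,1}, then *ip advances by dec32table[2]==2 so
 * the second ZSTD_copy4() reads the 4 bytes just written. Subtracting
 * dec64table[2]==8 and applying the final +8 adjustments leaves
 * *op - *ip == 8, so subsequent wildcopies can proceed 8 bytes at a time
 * without overlap. */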
/*! ZSTD_safecopy() :
 *  Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
 *  and write up to 16 bytes past oend_w (op >= oend_w is allowed).
 *  This function is only called in the uncommon case where the sequence is near the end of the block. It
 *  should be fast for a single long sequence, but can be slow for several short sequences.
 *
 *  @param ovtype controls the overlap detection
 *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
 *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
 *           The src buffer must be before the dst buffer.
 */
static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
           (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));

    if (length < 8) {
        /* Handle short lengths. */
        while (op < oend) *op++ = *ip++;
        return;
    }
    if (ovtype == ZSTD_overlap_src_before_dst) {
        /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
        assert(length >= 8);
        ZSTD_overlapCopy8(&op, &ip, diff);
        assert(op - ip >= 8);
        assert(op <= oend);
    }

    if (oend <= oend_w) {
        /* No risk of overwrite. */
        ZSTD_wildcopy(op, ip, length, ovtype);
        return;
    }
    if (op <= oend_w) {
        /* Wildcopy until we get close to the end. */
        assert(oend > oend_w);
        ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
        ip += oend_w - op;
        op = oend_w;
    }
    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}
/* ZSTD_execSequenceEnd():
 * This version handles cases that are near the end of the output buffer. It requires
 * more careful checks to make sure there is no overflow. By separating out these hard
 * and unlikely cases, we can speed up the common cases.
 *
 * NOTE: This function needs to be fast for a single long sequence, but doesn't need
 * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
 */
FORCE_NOINLINE
size_t ZSTD_execSequenceEnd(BYTE* op,
                            BYTE* const oend, seq_t sequence,
                            const BYTE** litPtr, const BYTE* const litLimit,
                            const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;
    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;

    /* bounds checks : careful of address space overflow in 32-bit mode */
    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
    assert(op < op + sequenceLength);
    assert(oLitEnd < op + sequenceLength);

    /* copy literals */
    ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
    op = oLitEnd;
    *litPtr = iLitEnd;

    /* copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
        match = dictEnd - (prefixStart-match);
        if (match + sequence.matchLength <= dictEnd) {
            memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
    }   }
    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
    return sequenceLength;
}
HINT_INLINE
size_t ZSTD_execSequence(BYTE* op,
                         BYTE* const oend, seq_t sequence,
                         const BYTE** litPtr, const BYTE* const litLimit,
                         const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;   /* risk : address space underflow on oend=NULL */
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;

    assert(op != NULL /* Precondition */);
    assert(oend_w < oend /* No underflow */);
    /* Handle edge cases in a slow path:
     *   - Read beyond end of literals
     *   - Match end is within WILDCOPY_OVERLIMIT of oend
     *   - 32-bit mode and the match length overflows
     */
    if (UNLIKELY(
            iLitEnd > litLimit ||
            oMatchEnd > oend_w ||
            (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
        return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
    assert(op <= oLitEnd /* No overflow */);
    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
    assert(oMatchEnd <= oend /* No underflow */);
    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);

    /* Copy Literals:
     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
     * We likely don't need the full 32-byte wildcopy.
     */
    assert(WILDCOPY_OVERLENGTH >= 16);
    ZSTD_copy16(op, (*litPtr));
    if (UNLIKELY(sequence.litLength > 16)) {
        ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
    }
    op = oLitEnd;
    *litPtr = iLitEnd;   /* update for next sequence */

    /* Copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix -> go into extDict */
        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
        match = dictEnd + (match - prefixStart);
        if (match + sequence.matchLength <= dictEnd) {
            memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
    }   }
    /* Match within prefix of 1 or more bytes */
    assert(op <= oMatchEnd);
    assert(oMatchEnd <= oend_w);
    assert(match >= prefixStart);
    assert(sequence.matchLength >= 1);

    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
     * without overlap checking.
     */
    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
        /* We bet on a full wildcopy for matches, since we expect matches to be
         * longer than literals (in general). In silesia, ~10% of matches are longer
         * than 16 bytes.
         */
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
        return sequenceLength;
    }
    assert(sequence.offset < WILDCOPY_VECLEN);

    /* Copy 8 bytes and spread the offset to be >= 8. */
    ZSTD_overlapCopy8(&op, &match, sequence.offset);

    /* If the match length is > 8 bytes, then continue with the wildcopy. */
    if (sequence.matchLength > 8) {
        assert(op < oMatchEnd);
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
    }
    return sequenceLength;
}
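/* Worked example (illustrative, not part of the original source) :
 * the sequence {litLength=3, matchLength=4, offset=1} applied after the
 * literals "abc" first emits "abc", then copies 4 bytes starting one byte
 * back, replicating 'c' : the regenerated output is "abccccc" (7 bytes). */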
static void
ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
{
    const void* ptr = dt;
    const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr;
    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
    DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",
                (U32)DStatePtr->state, DTableH->tableLog);
    BIT_reloadDStream(bitD);
    DStatePtr->table = dt + 1;
}
FORCE_INLINE_TEMPLATE void
ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
{
    ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
    U32 const nbBits = DInfo.nbBits;
    size_t const lowBits = BIT_readBits(bitD, nbBits);
    DStatePtr->state = DInfo.nextState + lowBits;
}

FORCE_INLINE_TEMPLATE void
ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
{
    U32 const nbBits = DInfo.nbBits;
    size_t const lowBits = BIT_readBits(bitD, nbBits);
    DStatePtr->state = DInfo.nextState + lowBits;
}
/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
 * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
 * bits before reloading. This value is the maximum number of bits we read
 * after reloading when we are decoding long offsets.
 */
#define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
    (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
        ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32  \
        : 0)
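/* Note (illustrative, not part of the original source) : with the current
 * constants (ZSTD_WINDOWLOG_MAX_32 == 30, STREAM_ACCUMULATOR_MIN_32 == 25)
 * the macro above evaluates to 5, which ZSTD_decodeSequence() below checks
 * with a static assert. */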
typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
{
    seq_t seq;
    ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
    ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
    ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
    U32 const llBase = llDInfo.baseValue;
    U32 const mlBase = mlDInfo.baseValue;
    U32 const ofBase = ofDInfo.baseValue;
    BYTE const llBits = llDInfo.nbAdditionalBits;
    BYTE const mlBits = mlDInfo.nbAdditionalBits;
    BYTE const ofBits = ofDInfo.nbAdditionalBits;
    BYTE const totalBits = llBits+mlBits+ofBits;

    /* sequence */
    {   size_t offset;
        if (ofBits > 1) {
            ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
            ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
            assert(ofBits <= MaxOff);
            if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
                U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
                offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
                BIT_reloadDStream(&seqState->DStream);
                if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
                assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
            } else {
                offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
            }
            seqState->prevOffset[2] = seqState->prevOffset[1];
            seqState->prevOffset[1] = seqState->prevOffset[0];
            seqState->prevOffset[0] = offset;
        } else {
            U32 const ll0 = (llBase == 0);
            if (LIKELY((ofBits == 0))) {
                if (LIKELY(!ll0))
                    offset = seqState->prevOffset[0];
                else {
                    offset = seqState->prevOffset[1];
                    seqState->prevOffset[1] = seqState->prevOffset[0];
                    seqState->prevOffset[0] = offset;
                }
            } else {
                offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
                {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
                    temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
                    if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                    seqState->prevOffset[1] = seqState->prevOffset[0];
                    seqState->prevOffset[0] = offset = temp;
        }   }   }
        seq.offset = offset;
    }

    seq.matchLength = mlBase;
    if (mlBits > 0)
        seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);

    if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
        BIT_reloadDStream(&seqState->DStream);
    if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
        BIT_reloadDStream(&seqState->DStream);
    /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
    ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);

    seq.litLength = llBase;
    if (llBits > 0)
        seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);

    if (MEM_32bits())
        BIT_reloadDStream(&seqState->DStream);

    DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);

    /* ANS state update
     * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
     * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
     * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
     * better option, so it is the default for other compilers. But, if you
     * measure that it is worse, please put up a pull request.
     */
    {
#if defined(__GNUC__) && !defined(__clang__)
        const int kUseUpdateFseState = 1;
#else
        const int kUseUpdateFseState = 0;
#endif
        if (kUseUpdateFseState) {
            ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
            ZSTD_updateFseState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
            ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
        } else {
            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo);    /* <=  9 bits */
            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo);    /* <=  9 bits */
            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo);  /* <=  8 bits */
        }
    }

    return seq;
}
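/* Worked example (illustrative, not part of the original source) of the
 * repcode update above, starting with prevOffset == {8, 4, 1} :
 *  - ofBits==0, litLength>0  : offset = prevOffset[0] = 8, history unchanged.
 *  - ofBits==0, litLength==0 : offset = prevOffset[1] = 4, history -> {4, 8, 1}.
 *  - decoded value 3         : offset = prevOffset[0]-1 = 7, history -> {7, 8, 4}.
 */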
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
{
    size_t const windowSize = dctx->fParams.windowSize;
    /* No dictionary used. */
    if (dctx->dictContentEndForFuzzing == NULL) return 0;
    /* Dictionary is our prefix. */
    if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
    /* Dictionary is not our ext-dict. */
    if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
    /* Dictionary is not within our window size. */
    if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
    /* Dictionary is active. */
    return 1;
}
MEM_STATIC void ZSTD_assertValidSequence(
        ZSTD_DCtx const* dctx,
        BYTE const* op, BYTE const* oend,
        seq_t const seq,
        BYTE const* prefixStart, BYTE const* virtualStart)
{
    size_t const windowSize = dctx->fParams.windowSize;
    size_t const sequenceSize = seq.litLength + seq.matchLength;
    BYTE const* const oLitEnd = op + seq.litLength;
    DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
            (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
    assert(op <= oend);
    assert((size_t)(oend - op) >= sequenceSize);
    assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
    if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
        size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
        /* Offset must be within the dictionary. */
        assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
        assert(seq.offset <= windowSize + dictSize);
    } else {
        /* Offset must be within our window. */
        assert(seq.offset <= windowSize);
    }
}
#endif
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
FORCE_INLINE_TEMPLATE size_t
DONT_VECTORIZE
ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
                               void* dst, size_t maxDstSize,
                         const void* seqStart, size_t seqSize, int nbSeq,
                         const ZSTD_longOffset_e isLongOffset,
                         const int frame)
{
    const BYTE* ip = (const BYTE*)seqStart;
    const BYTE* const iend = ip + seqSize;
    BYTE* const ostart = (BYTE* const)dst;
    BYTE* const oend = ostart + maxDstSize;
    BYTE* op = ostart;
    const BYTE* litPtr = dctx->litPtr;
    const BYTE* const litEnd = litPtr + dctx->litSize;
    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
    const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
    DEBUGLOG(5, "ZSTD_decompressSequences_body");
    (void)frame;

    /* Regen sequences */
    if (nbSeq) {
        seqState_t seqState;
        size_t error = 0;
        dctx->fseEntropy = 1;
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
        RETURN_ERROR_IF(
            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
            corruption_detected, "");
        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
        assert(dst != NULL);

        ZSTD_STATIC_ASSERT(
                BIT_DStream_unfinished < BIT_DStream_completed &&
                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
                BIT_DStream_completed < BIT_DStream_overflow);

#if defined(__GNUC__) && defined(__x86_64__)
        /* Align the decompression loop to 32 + 16 bytes.
         *
         * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
         * speed swings based on the alignment of the decompression loop. This
         * performance swing is caused by parts of the decompression loop falling
         * out of the DSB. The entire decompression loop should fit in the DSB,
         * when it can't we get much worse performance. You can measure if you've
         * hit the good case or the bad case with this perf command for some
         * compressed file test.zst:
         *
         *     perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
         *             -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
         *
         * If you see most cycles served out of the MITE you've hit the bad case.
         * If you see most cycles served out of the DSB you've hit the good case.
         * If it is pretty even then you may be in an okay case.
         *
         * I've been able to reproduce this issue on the following CPUs:
         * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
         *             Use Instruments->Counters to get DSB/MITE cycles.
         *             I never got performance swings, but I was able to
         *             go from the good case of mostly DSB to half of the
         *             cycles served from MITE.
         * - Coffeelake: Intel i9-9900k
         *
         * I haven't been able to reproduce the instability or DSB misses on any
         * of the following CPUs:
         * - Haswell
         * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
         * - Skylake
         *
         * If you are seeing performance instability, this script can help test.
         * It tests on 4 commits in zstd where I saw performance change.
         *
         * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
         */
        __asm__(".p2align 5");
        __asm__("nop");
        __asm__(".p2align 4");
#endif
        for ( ; ; ) {
            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
            assert(!ZSTD_isError(oneSeqSize));
            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
#endif
            DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
            BIT_reloadDStream(&(seqState.DStream));
            /* gcc and clang both don't like early returns in this loop.
             * gcc doesn't like early breaks either.
             * Instead save an error and report it at the end.
             * When there is an error, don't increment op, so we don't
             * overwrite.
             */
            if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize;
            else op += oneSeqSize;
            if (UNLIKELY(!--nbSeq)) break;
        }

        /* check if reached exact end */
        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
        if (ZSTD_isError(error)) return error;
        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
        /* save reps for next block */
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
    }

    /* last literal segment */
    {   size_t const lastLLSize = litEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
        if (op != NULL) {
            memcpy(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
    }

    return op-ostart;
}

static size_t
ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
                                 void* dst, size_t maxDstSize,
                           const void* seqStart, size_t seqSize, int nbSeq,
                           const ZSTD_longOffset_e isLongOffset,
                           const int frame)
{
    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT

FORCE_INLINE_TEMPLATE size_t
ZSTD_prefetchMatch(size_t prefixPos, seq_t const sequence,
                   const BYTE* const prefixStart, const BYTE* const dictEnd)
{
    prefixPos += sequence.litLength;
    {   const BYTE* const matchBase = (sequence.offset > prefixPos) ? dictEnd : prefixStart;
        const BYTE* const match = matchBase + prefixPos - sequence.offset;  /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
                                                                             * No consequence though : no memory access will occur, offset is only used for prefetching */
        PREFETCH_L1(match); PREFETCH_L1(match + sequence.matchLength - 1);  /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
    }
    return prefixPos + sequence.matchLength;
}
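/* Illustration (not part of the original source) : the long variant below
 * keeps a ring of STORED_SEQS==4 decoded sequences. While sequence N is
 * executed, sequence N+ADVANCED_SEQS has just been decoded and its match
 * prefetched, giving the prefetch roughly 4 sequences of lead time to hide
 * a main-memory cache miss. */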
/* This decoding function employs prefetching
 * to reduce latency impact of cache misses.
 * It's generally employed when block contains a significant portion of long-distance matches
 * or when coupled with a "cold" dictionary */
FORCE_INLINE_TEMPLATE size_t
ZSTD_decompressSequencesLong_body(
                               ZSTD_DCtx* dctx,
                               void* dst, size_t maxDstSize,
                         const void* seqStart, size_t seqSize, int nbSeq,
                         const ZSTD_longOffset_e isLongOffset,
                         const int frame)
{
    const BYTE* ip = (const BYTE*)seqStart;
    const BYTE* const iend = ip + seqSize;
    BYTE* const ostart = (BYTE* const)dst;
    BYTE* const oend = ostart + maxDstSize;
    BYTE* op = ostart;
    const BYTE* litPtr = dctx->litPtr;
    const BYTE* const litEnd = litPtr + dctx->litSize;
    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
    const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
    (void)frame;

    /* Regen sequences */
    if (nbSeq) {
#define STORED_SEQS 4
#define STORED_SEQS_MASK (STORED_SEQS-1)
#define ADVANCED_SEQS 4
        seq_t sequences[STORED_SEQS];
        int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
        seqState_t seqState;
        int seqNb;
        size_t prefixPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */

        dctx->fseEntropy = 1;
        { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
        assert(dst != NULL);
        assert(iend >= ip);
        RETURN_ERROR_IF(
            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
            corruption_detected, "");
        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);

        /* prepare in advance */
        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            prefixPos = ZSTD_prefetchMatch(prefixPos, sequence, prefixStart, dictEnd);
            sequences[seqNb] = sequence;
        }
        RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");

        /* decode and decompress */
        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
            assert(!ZSTD_isError(oneSeqSize));
            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
#endif
            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;

            prefixPos = ZSTD_prefetchMatch(prefixPos, sequence, prefixStart, dictEnd);
            sequences[seqNb & STORED_SEQS_MASK] = sequence;
            op += oneSeqSize;
        }
        RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");

        /* finish queue */
        seqNb -= seqAdvance;
        for ( ; seqNb<nbSeq ; seqNb++) {
            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
            assert(!ZSTD_isError(oneSeqSize));
            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
#endif
            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
            op += oneSeqSize;
        }

        /* save reps for next block */
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
    }

    /* last literal segment */
    {   size_t const lastLLSize = litEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
        if (op != NULL) {
            memcpy(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
    }

    return op-ostart;
}

static size_t
ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
                                 void* dst, size_t maxDstSize,
                           const void* seqStart, size_t seqSize, int nbSeq,
                           const ZSTD_longOffset_e isLongOffset,
                           const int frame)
{
    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
#if DYNAMIC_BMI2

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
static TARGET_ATTRIBUTE("bmi2") size_t
DONT_VECTORIZE
ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
                                 void* dst, size_t maxDstSize,
                           const void* seqStart, size_t seqSize, int nbSeq,
                           const ZSTD_longOffset_e isLongOffset,
                           const int frame)
{
    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
static TARGET_ATTRIBUTE("bmi2") size_t
ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
                                 void* dst, size_t maxDstSize,
                           const void* seqStart, size_t seqSize, int nbSeq,
                           const ZSTD_longOffset_e isLongOffset,
                           const int frame)
{
    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

#endif /* DYNAMIC_BMI2 */
typedef size_t (*ZSTD_decompressSequences_t)(
                            ZSTD_DCtx* dctx,
                            void* dst, size_t maxDstSize,
                            const void* seqStart, size_t seqSize, int nbSeq,
                            const ZSTD_longOffset_e isLongOffset,
                            const int frame);
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
static size_t
ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
                   const void* seqStart, size_t seqSize, int nbSeq,
                   const ZSTD_longOffset_e isLongOffset,
                   const int frame)
{
    DEBUGLOG(5, "ZSTD_decompressSequences");
#if DYNAMIC_BMI2
    if (dctx->bmi2) {
        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    }
#endif
    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
/* ZSTD_decompressSequencesLong() :
 * decompression function triggered when a minimum share of offsets is considered "long",
 * aka out of cache.
 * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
 * This function will try to mitigate main memory latency through the use of prefetching */
static size_t
ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
                             void* dst, size_t maxDstSize,
                             const void* seqStart, size_t seqSize, int nbSeq,
                             const ZSTD_longOffset_e isLongOffset,
                             const int frame)
{
    DEBUGLOG(5, "ZSTD_decompressSequencesLong");
#if DYNAMIC_BMI2
    if (dctx->bmi2) {
        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    }
#endif
    return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
/* ZSTD_getLongOffsetsShare() :
 * condition : offTable must be valid
 * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
 *           compared to maximum possible of (1<<OffFSELog) */
static unsigned
ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
{
    const void* ptr = offTable;
    U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
    const ZSTD_seqSymbol* table = offTable + 1;
    U32 const max = 1 << tableLog;
    U32 u, total = 0;
    DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);

    assert(max <= (1 << OffFSELog));  /* max not too large */
    for (u=0; u<max; u++) {
        if (table[u].nbAdditionalBits > 22) total += 1;
    }

    assert(tableLog <= OffFSELog);
    total <<= (OffFSELog - tableLog);  /* scale to OffFSELog */

    return total;
}
#endif
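/* Worked example (illustrative, not part of the original source) : an offset
 * cell with nbAdditionalBits == 23 covers offsets of roughly [1<<23, 1<<24),
 * hence the "> 22" test above. If 4 of the 32 cells of a tableLog==5 table
 * pass that test, total == 4 << (8-5) == 32, i.e. a share of 32/256 == 12.5%,
 * above both heuristic thresholds (minShare 7 or 20) used in
 * ZSTD_decompressBlock_internal() below. */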
size_t
ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                              void* dst, size_t dstCapacity,
                        const void* src, size_t srcSize, const int frame)
{   /* blockType == blockCompressed */
    const BYTE* ip = (const BYTE*)src;
    /* isLongOffset must be true if there are long offsets.
     * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
     * We don't expect that to be the case in 64-bit mode.
     * In block mode, window size is not known, so we have to be conservative.
     * (note: but it could be evaluated from current-lowLimit)
     */
    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
    DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);

    RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");

    /* Decode literals section */
    {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
        if (ZSTD_isError(litCSize)) return litCSize;
        ip += litCSize;
        srcSize -= litCSize;
    }

    /* Build Decoding Tables */
    {
        /* These macros control at build-time which decompressor implementation
         * we use. If neither is defined, we do some inspection and dispatch at
         * runtime.
         */
#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
        int usePrefetchDecoder = dctx->ddictIsCold;
#endif
        int nbSeq;
        size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
        if (ZSTD_isError(seqHSize)) return seqHSize;
        ip += seqHSize;
        srcSize -= seqHSize;

        RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");

#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
        if ( !usePrefetchDecoder
          && (!frame || (dctx->fParams.windowSize > (1<<24)))
          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
            U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
            U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
            usePrefetchDecoder = (shareLongOffsets >= minShare);
        }
#endif

        dctx->ddictIsCold = 0;

#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
        if (usePrefetchDecoder)
#endif
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
#endif

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
        /* else */
        return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
#endif
    }
}
void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
{
    if (dst != dctx->previousDstEnd) {   /* not contiguous */
        dctx->dictEnd = dctx->previousDstEnd;
        dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
        dctx->prefixStart = dst;
        dctx->previousDstEnd = dst;
    }
}
* dctx
,
1430 void* dst
, size_t dstCapacity
,
1431 const void* src
, size_t srcSize
)
1434 ZSTD_checkContinuity(dctx
, dst
);
1435 dSize
= ZSTD_decompressBlock_internal(dctx
, dst
, dstCapacity
, src
, srcSize
, /* frame */ 0);
1436 dctx
->previousDstEnd
= (char*)dst
+ dSize
;