2 * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
11 /*-*************************************
13 ***************************************/
14 #include "zstd_compress_superblock.h"
16 #include "../common/zstd_internal.h" /* ZSTD_getSequenceLength */
17 #include "hist.h" /* HIST_countFast_wksp */
18 #include "zstd_compress_internal.h"
19 #include "zstd_compress_sequences.h"
20 #include "zstd_compress_literals.h"
22 /*-*************************************
23 * Superblock entropy buffer structs
24 ***************************************/
25 /** ZSTD_hufCTablesMetadata_t :
26 * Stores Literals Block Type for a super-block in hType, and
27 * huffman tree description in hufDesBuffer.
28 * hufDesSize refers to the size of huffman tree description in bytes.
29 * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */
31 symbolEncodingType_e hType
;
32 BYTE hufDesBuffer
[500]; /* TODO give name to this value */
34 } ZSTD_hufCTablesMetadata_t
;
36 /** ZSTD_fseCTablesMetadata_t :
37 * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and
38 * fse tables in fseTablesBuffer.
39 * fseTablesSize refers to the size of fse tables in bytes.
40 * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */
42 symbolEncodingType_e llType
;
43 symbolEncodingType_e ofType
;
44 symbolEncodingType_e mlType
;
45 BYTE fseTablesBuffer
[500]; /* TODO give name to this value */
47 size_t lastCountSize
; /* This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences() */
48 } ZSTD_fseCTablesMetadata_t
;
51 ZSTD_hufCTablesMetadata_t hufMetadata
;
52 ZSTD_fseCTablesMetadata_t fseMetadata
;
53 } ZSTD_entropyCTablesMetadata_t
;
56 /** ZSTD_buildSuperBlockEntropy_literal() :
57 * Builds entropy for the super-block literals.
58 * Stores literals block type (raw, rle, compressed, repeat) and
59 * huffman description table to hufMetadata.
60 * @return : size of huffman description table or error code */
61 static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src
, size_t srcSize
,
62 const ZSTD_hufCTables_t
* prevHuf
,
63 ZSTD_hufCTables_t
* nextHuf
,
64 ZSTD_hufCTablesMetadata_t
* hufMetadata
,
65 const int disableLiteralsCompression
,
66 void* workspace
, size_t wkspSize
)
68 BYTE
* const wkspStart
= (BYTE
*)workspace
;
69 BYTE
* const wkspEnd
= wkspStart
+ wkspSize
;
70 BYTE
* const countWkspStart
= wkspStart
;
71 unsigned* const countWksp
= (unsigned*)workspace
;
72 const size_t countWkspSize
= (HUF_SYMBOLVALUE_MAX
+ 1) * sizeof(unsigned);
73 BYTE
* const nodeWksp
= countWkspStart
+ countWkspSize
;
74 const size_t nodeWkspSize
= wkspEnd
-nodeWksp
;
75 unsigned maxSymbolValue
= 255;
76 unsigned huffLog
= HUF_TABLELOG_DEFAULT
;
77 HUF_repeat repeat
= prevHuf
->repeatMode
;
79 DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize
);
81 /* Prepare nextEntropy assuming reusing the existing table */
82 memcpy(nextHuf
, prevHuf
, sizeof(*prevHuf
));
84 if (disableLiteralsCompression
) {
85 DEBUGLOG(5, "set_basic - disabled");
86 hufMetadata
->hType
= set_basic
;
90 /* small ? don't even attempt compression (speed opt) */
91 # define COMPRESS_LITERALS_SIZE_MIN 63
92 { size_t const minLitSize
= (prevHuf
->repeatMode
== HUF_repeat_valid
) ? 6 : COMPRESS_LITERALS_SIZE_MIN
;
93 if (srcSize
<= minLitSize
) {
94 DEBUGLOG(5, "set_basic - too small");
95 hufMetadata
->hType
= set_basic
;
100 /* Scan input and build symbol stats */
101 { size_t const largest
= HIST_count_wksp (countWksp
, &maxSymbolValue
, (const BYTE
*)src
, srcSize
, workspace
, wkspSize
);
102 FORWARD_IF_ERROR(largest
, "HIST_count_wksp failed");
103 if (largest
== srcSize
) {
104 DEBUGLOG(5, "set_rle");
105 hufMetadata
->hType
= set_rle
;
108 if (largest
<= (srcSize
>> 7)+4) {
109 DEBUGLOG(5, "set_basic - no gain");
110 hufMetadata
->hType
= set_basic
;
115 /* Validate the previous Huffman table */
116 if (repeat
== HUF_repeat_check
&& !HUF_validateCTable((HUF_CElt
const*)prevHuf
->CTable
, countWksp
, maxSymbolValue
)) {
117 repeat
= HUF_repeat_none
;
120 /* Build Huffman Tree */
121 memset(nextHuf
->CTable
, 0, sizeof(nextHuf
->CTable
));
122 huffLog
= HUF_optimalTableLog(huffLog
, srcSize
, maxSymbolValue
);
123 { size_t const maxBits
= HUF_buildCTable_wksp((HUF_CElt
*)nextHuf
->CTable
, countWksp
,
124 maxSymbolValue
, huffLog
,
125 nodeWksp
, nodeWkspSize
);
126 FORWARD_IF_ERROR(maxBits
, "HUF_buildCTable_wksp");
127 huffLog
= (U32
)maxBits
;
128 { /* Build and write the CTable */
129 size_t const newCSize
= HUF_estimateCompressedSize(
130 (HUF_CElt
*)nextHuf
->CTable
, countWksp
, maxSymbolValue
);
131 size_t const hSize
= HUF_writeCTable(
132 hufMetadata
->hufDesBuffer
, sizeof(hufMetadata
->hufDesBuffer
),
133 (HUF_CElt
*)nextHuf
->CTable
, maxSymbolValue
, huffLog
);
134 /* Check against repeating the previous CTable */
135 if (repeat
!= HUF_repeat_none
) {
136 size_t const oldCSize
= HUF_estimateCompressedSize(
137 (HUF_CElt
const*)prevHuf
->CTable
, countWksp
, maxSymbolValue
);
138 if (oldCSize
< srcSize
&& (oldCSize
<= hSize
+ newCSize
|| hSize
+ 12 >= srcSize
)) {
139 DEBUGLOG(5, "set_repeat - smaller");
140 memcpy(nextHuf
, prevHuf
, sizeof(*prevHuf
));
141 hufMetadata
->hType
= set_repeat
;
145 if (newCSize
+ hSize
>= srcSize
) {
146 DEBUGLOG(5, "set_basic - no gains");
147 memcpy(nextHuf
, prevHuf
, sizeof(*prevHuf
));
148 hufMetadata
->hType
= set_basic
;
151 DEBUGLOG(5, "set_compressed (hSize=%u)", (U32
)hSize
);
152 hufMetadata
->hType
= set_compressed
;
153 nextHuf
->repeatMode
= HUF_repeat_check
;
159 /** ZSTD_buildSuperBlockEntropy_sequences() :
160 * Builds entropy for the super-block sequences.
161 * Stores symbol compression modes and fse table to fseMetadata.
162 * @return : size of fse tables or error code */
163 static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t
* seqStorePtr
,
164 const ZSTD_fseCTables_t
* prevEntropy
,
165 ZSTD_fseCTables_t
* nextEntropy
,
166 const ZSTD_CCtx_params
* cctxParams
,
167 ZSTD_fseCTablesMetadata_t
* fseMetadata
,
168 void* workspace
, size_t wkspSize
)
170 BYTE
* const wkspStart
= (BYTE
*)workspace
;
171 BYTE
* const wkspEnd
= wkspStart
+ wkspSize
;
172 BYTE
* const countWkspStart
= wkspStart
;
173 unsigned* const countWksp
= (unsigned*)workspace
;
174 const size_t countWkspSize
= (MaxSeq
+ 1) * sizeof(unsigned);
175 BYTE
* const cTableWksp
= countWkspStart
+ countWkspSize
;
176 const size_t cTableWkspSize
= wkspEnd
-cTableWksp
;
177 ZSTD_strategy
const strategy
= cctxParams
->cParams
.strategy
;
178 FSE_CTable
* CTable_LitLength
= nextEntropy
->litlengthCTable
;
179 FSE_CTable
* CTable_OffsetBits
= nextEntropy
->offcodeCTable
;
180 FSE_CTable
* CTable_MatchLength
= nextEntropy
->matchlengthCTable
;
181 const BYTE
* const ofCodeTable
= seqStorePtr
->ofCode
;
182 const BYTE
* const llCodeTable
= seqStorePtr
->llCode
;
183 const BYTE
* const mlCodeTable
= seqStorePtr
->mlCode
;
184 size_t const nbSeq
= seqStorePtr
->sequences
- seqStorePtr
->sequencesStart
;
185 BYTE
* const ostart
= fseMetadata
->fseTablesBuffer
;
186 BYTE
* const oend
= ostart
+ sizeof(fseMetadata
->fseTablesBuffer
);
189 assert(cTableWkspSize
>= (1 << MaxFSELog
) * sizeof(FSE_FUNCTION_TYPE
));
190 DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq
);
191 memset(workspace
, 0, wkspSize
);
193 fseMetadata
->lastCountSize
= 0;
194 /* convert length/distances into codes */
195 ZSTD_seqToCodes(seqStorePtr
);
196 /* build CTable for Literal Lengths */
198 unsigned max
= MaxLL
;
199 size_t const mostFrequent
= HIST_countFast_wksp(countWksp
, &max
, llCodeTable
, nbSeq
, workspace
, wkspSize
); /* can't fail */
200 DEBUGLOG(5, "Building LL table");
201 nextEntropy
->litlength_repeatMode
= prevEntropy
->litlength_repeatMode
;
202 LLtype
= ZSTD_selectEncodingType(&nextEntropy
->litlength_repeatMode
,
203 countWksp
, max
, mostFrequent
, nbSeq
,
204 LLFSELog
, prevEntropy
->litlengthCTable
,
205 LL_defaultNorm
, LL_defaultNormLog
,
206 ZSTD_defaultAllowed
, strategy
);
207 assert(set_basic
< set_compressed
&& set_rle
< set_compressed
);
208 assert(!(LLtype
< set_compressed
&& nextEntropy
->litlength_repeatMode
!= FSE_repeat_none
)); /* We don't copy tables */
209 { size_t const countSize
= ZSTD_buildCTable(op
, oend
- op
, CTable_LitLength
, LLFSELog
, (symbolEncodingType_e
)LLtype
,
210 countWksp
, max
, llCodeTable
, nbSeq
, LL_defaultNorm
, LL_defaultNormLog
, MaxLL
,
211 prevEntropy
->litlengthCTable
, sizeof(prevEntropy
->litlengthCTable
),
212 cTableWksp
, cTableWkspSize
);
213 FORWARD_IF_ERROR(countSize
, "ZSTD_buildCTable for LitLens failed");
214 if (LLtype
== set_compressed
)
215 fseMetadata
->lastCountSize
= countSize
;
217 fseMetadata
->llType
= (symbolEncodingType_e
) LLtype
;
219 /* build CTable for Offsets */
221 unsigned max
= MaxOff
;
222 size_t const mostFrequent
= HIST_countFast_wksp(countWksp
, &max
, ofCodeTable
, nbSeq
, workspace
, wkspSize
); /* can't fail */
223 /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
224 ZSTD_defaultPolicy_e
const defaultPolicy
= (max
<= DefaultMaxOff
) ? ZSTD_defaultAllowed
: ZSTD_defaultDisallowed
;
225 DEBUGLOG(5, "Building OF table");
226 nextEntropy
->offcode_repeatMode
= prevEntropy
->offcode_repeatMode
;
227 Offtype
= ZSTD_selectEncodingType(&nextEntropy
->offcode_repeatMode
,
228 countWksp
, max
, mostFrequent
, nbSeq
,
229 OffFSELog
, prevEntropy
->offcodeCTable
,
230 OF_defaultNorm
, OF_defaultNormLog
,
231 defaultPolicy
, strategy
);
232 assert(!(Offtype
< set_compressed
&& nextEntropy
->offcode_repeatMode
!= FSE_repeat_none
)); /* We don't copy tables */
233 { size_t const countSize
= ZSTD_buildCTable(op
, oend
- op
, CTable_OffsetBits
, OffFSELog
, (symbolEncodingType_e
)Offtype
,
234 countWksp
, max
, ofCodeTable
, nbSeq
, OF_defaultNorm
, OF_defaultNormLog
, DefaultMaxOff
,
235 prevEntropy
->offcodeCTable
, sizeof(prevEntropy
->offcodeCTable
),
236 cTableWksp
, cTableWkspSize
);
237 FORWARD_IF_ERROR(countSize
, "ZSTD_buildCTable for Offsets failed");
238 if (Offtype
== set_compressed
)
239 fseMetadata
->lastCountSize
= countSize
;
241 fseMetadata
->ofType
= (symbolEncodingType_e
) Offtype
;
243 /* build CTable for MatchLengths */
245 unsigned max
= MaxML
;
246 size_t const mostFrequent
= HIST_countFast_wksp(countWksp
, &max
, mlCodeTable
, nbSeq
, workspace
, wkspSize
); /* can't fail */
247 DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend
-op
));
248 nextEntropy
->matchlength_repeatMode
= prevEntropy
->matchlength_repeatMode
;
249 MLtype
= ZSTD_selectEncodingType(&nextEntropy
->matchlength_repeatMode
,
250 countWksp
, max
, mostFrequent
, nbSeq
,
251 MLFSELog
, prevEntropy
->matchlengthCTable
,
252 ML_defaultNorm
, ML_defaultNormLog
,
253 ZSTD_defaultAllowed
, strategy
);
254 assert(!(MLtype
< set_compressed
&& nextEntropy
->matchlength_repeatMode
!= FSE_repeat_none
)); /* We don't copy tables */
255 { size_t const countSize
= ZSTD_buildCTable(op
, oend
- op
, CTable_MatchLength
, MLFSELog
, (symbolEncodingType_e
)MLtype
,
256 countWksp
, max
, mlCodeTable
, nbSeq
, ML_defaultNorm
, ML_defaultNormLog
, MaxML
,
257 prevEntropy
->matchlengthCTable
, sizeof(prevEntropy
->matchlengthCTable
),
258 cTableWksp
, cTableWkspSize
);
259 FORWARD_IF_ERROR(countSize
, "ZSTD_buildCTable for MatchLengths failed");
260 if (MLtype
== set_compressed
)
261 fseMetadata
->lastCountSize
= countSize
;
263 fseMetadata
->mlType
= (symbolEncodingType_e
) MLtype
;
265 assert((size_t) (op
-ostart
) <= sizeof(fseMetadata
->fseTablesBuffer
));
270 /** ZSTD_buildSuperBlockEntropy() :
271 * Builds entropy for the super-block.
272 * @return : 0 on success or error code */
274 ZSTD_buildSuperBlockEntropy(seqStore_t
* seqStorePtr
,
275 const ZSTD_entropyCTables_t
* prevEntropy
,
276 ZSTD_entropyCTables_t
* nextEntropy
,
277 const ZSTD_CCtx_params
* cctxParams
,
278 ZSTD_entropyCTablesMetadata_t
* entropyMetadata
,
279 void* workspace
, size_t wkspSize
)
281 size_t const litSize
= seqStorePtr
->lit
- seqStorePtr
->litStart
;
282 DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy");
283 entropyMetadata
->hufMetadata
.hufDesSize
=
284 ZSTD_buildSuperBlockEntropy_literal(seqStorePtr
->litStart
, litSize
,
285 &prevEntropy
->huf
, &nextEntropy
->huf
,
286 &entropyMetadata
->hufMetadata
,
287 ZSTD_disableLiteralsCompression(cctxParams
),
288 workspace
, wkspSize
);
289 FORWARD_IF_ERROR(entropyMetadata
->hufMetadata
.hufDesSize
, "ZSTD_buildSuperBlockEntropy_literal failed");
290 entropyMetadata
->fseMetadata
.fseTablesSize
=
291 ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr
,
292 &prevEntropy
->fse
, &nextEntropy
->fse
,
294 &entropyMetadata
->fseMetadata
,
295 workspace
, wkspSize
);
296 FORWARD_IF_ERROR(entropyMetadata
->fseMetadata
.fseTablesSize
, "ZSTD_buildSuperBlockEntropy_sequences failed");
300 /** ZSTD_compressSubBlock_literal() :
301 * Compresses literals section for a sub-block.
302 * When we have to write the Huffman table we will sometimes choose a header
303 * size larger than necessary. This is because we have to pick the header size
304 * before we know the table size + compressed size, so we have a bound on the
305 * table size. If we guessed incorrectly, we fall back to uncompressed literals.
307 * We write the header when writeEntropy=1 and set entropyWrriten=1 when we succeeded
308 * in writing the header, otherwise it is set to 0.
310 * hufMetadata->hType has literals block type info.
311 * If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block.
312 * If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block.
313 * If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block
314 * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block
315 * and the following sub-blocks' literals sections will be Treeless_Literals_Block.
316 * @return : compressed size of literals section of a sub-block
317 * Or 0 if it unable to compress.
319 static size_t ZSTD_compressSubBlock_literal(const HUF_CElt
* hufTable
,
320 const ZSTD_hufCTablesMetadata_t
* hufMetadata
,
321 const BYTE
* literals
, size_t litSize
,
322 void* dst
, size_t dstSize
,
323 const int bmi2
, int writeEntropy
, int* entropyWritten
)
325 size_t const header
= writeEntropy
? 200 : 0;
326 size_t const lhSize
= 3 + (litSize
>= (1 KB
- header
)) + (litSize
>= (16 KB
- header
));
327 BYTE
* const ostart
= (BYTE
*)dst
;
328 BYTE
* const oend
= ostart
+ dstSize
;
329 BYTE
* op
= ostart
+ lhSize
;
330 U32
const singleStream
= lhSize
== 3;
331 symbolEncodingType_e hType
= writeEntropy
? hufMetadata
->hType
: set_repeat
;
334 (void)bmi2
; /* TODO bmi2... */
336 DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize
, lhSize
, writeEntropy
);
339 if (litSize
== 0 || hufMetadata
->hType
== set_basic
) {
340 DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal");
341 return ZSTD_noCompressLiterals(dst
, dstSize
, literals
, litSize
);
342 } else if (hufMetadata
->hType
== set_rle
) {
343 DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal");
344 return ZSTD_compressRleLiteralsBlock(dst
, dstSize
, literals
, litSize
);
348 assert(hufMetadata
->hType
== set_compressed
|| hufMetadata
->hType
== set_repeat
);
350 if (writeEntropy
&& hufMetadata
->hType
== set_compressed
) {
351 memcpy(op
, hufMetadata
->hufDesBuffer
, hufMetadata
->hufDesSize
);
352 op
+= hufMetadata
->hufDesSize
;
353 cLitSize
+= hufMetadata
->hufDesSize
;
354 DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata
->hufDesSize
);
358 { const size_t cSize
= singleStream
? HUF_compress1X_usingCTable(op
, oend
-op
, literals
, litSize
, hufTable
)
359 : HUF_compress4X_usingCTable(op
, oend
-op
, literals
, litSize
, hufTable
);
362 if (cSize
== 0 || ERR_isError(cSize
)) {
363 DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize
));
366 /* If we expand and we aren't writing a header then emit uncompressed */
367 if (!writeEntropy
&& cLitSize
>= litSize
) {
368 DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible");
369 return ZSTD_noCompressLiterals(dst
, dstSize
, literals
, litSize
);
371 /* If we are writing headers then allow expansion that doesn't change our header size. */
372 if (lhSize
< (size_t)(3 + (cLitSize
>= 1 KB
) + (cLitSize
>= 16 KB
))) {
373 assert(cLitSize
> litSize
);
374 DEBUGLOG(5, "Literals expanded beyond allowed header size");
375 return ZSTD_noCompressLiterals(dst
, dstSize
, literals
, litSize
);
377 DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize
);
383 case 3: /* 2 - 2 - 10 - 10 */
384 { U32
const lhc
= hType
+ ((!singleStream
) << 2) + ((U32
)litSize
<<4) + ((U32
)cLitSize
<<14);
385 MEM_writeLE24(ostart
, lhc
);
388 case 4: /* 2 - 2 - 14 - 14 */
389 { U32
const lhc
= hType
+ (2 << 2) + ((U32
)litSize
<<4) + ((U32
)cLitSize
<<18);
390 MEM_writeLE32(ostart
, lhc
);
393 case 5: /* 2 - 2 - 18 - 18 */
394 { U32
const lhc
= hType
+ (3 << 2) + ((U32
)litSize
<<4) + ((U32
)cLitSize
<<22);
395 MEM_writeLE32(ostart
, lhc
);
396 ostart
[4] = (BYTE
)(cLitSize
>> 10);
399 default: /* not possible : lhSize is {3,4,5} */
403 DEBUGLOG(5, "Compressed literals: %u -> %u", (U32
)litSize
, (U32
)(op
-ostart
));
407 static size_t ZSTD_seqDecompressedSize(seqStore_t
const* seqStore
, const seqDef
* sequences
, size_t nbSeq
, size_t litSize
, int lastSequence
) {
408 const seqDef
* const sstart
= sequences
;
409 const seqDef
* const send
= sequences
+ nbSeq
;
410 const seqDef
* sp
= sstart
;
411 size_t matchLengthSum
= 0;
412 size_t litLengthSum
__attribute__ ((unused
)) = 0;
413 while (send
-sp
> 0) {
414 ZSTD_sequenceLength
const seqLen
= ZSTD_getSequenceLength(seqStore
, sp
);
415 litLengthSum
+= seqLen
.litLength
;
416 matchLengthSum
+= seqLen
.matchLength
;
419 assert(litLengthSum
<= litSize
);
421 assert(litLengthSum
== litSize
);
423 return matchLengthSum
+ litSize
;
426 /** ZSTD_compressSubBlock_sequences() :
427 * Compresses sequences section for a sub-block.
428 * fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have
429 * symbol compression modes for the super-block.
430 * The first successfully compressed block will have these in its header.
431 * We set entropyWritten=1 when we succeed in compressing the sequences.
432 * The following sub-blocks will always have repeat mode.
433 * @return : compressed size of sequences section of a sub-block
434 * Or 0 if it is unable to compress
436 static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t
* fseTables
,
437 const ZSTD_fseCTablesMetadata_t
* fseMetadata
,
438 const seqDef
* sequences
, size_t nbSeq
,
439 const BYTE
* llCode
, const BYTE
* mlCode
, const BYTE
* ofCode
,
440 const ZSTD_CCtx_params
* cctxParams
,
441 void* dst
, size_t dstCapacity
,
442 const int bmi2
, int writeEntropy
, int* entropyWritten
)
444 const int longOffsets
= cctxParams
->cParams
.windowLog
> STREAM_ACCUMULATOR_MIN
;
445 BYTE
* const ostart
= (BYTE
*)dst
;
446 BYTE
* const oend
= ostart
+ dstCapacity
;
450 DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq
, writeEntropy
, longOffsets
);
453 /* Sequences Header */
454 RETURN_ERROR_IF((oend
-op
) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
455 dstSize_tooSmall
, "");
458 else if (nbSeq
< LONGNBSEQ
)
459 op
[0] = (BYTE
)((nbSeq
>>8) + 0x80), op
[1] = (BYTE
)nbSeq
, op
+=2;
461 op
[0]=0xFF, MEM_writeLE16(op
+1, (U16
)(nbSeq
- LONGNBSEQ
)), op
+=3;
466 /* seqHead : flags for FSE encoding type */
469 DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op
-ostart
));
472 const U32 LLtype
= fseMetadata
->llType
;
473 const U32 Offtype
= fseMetadata
->ofType
;
474 const U32 MLtype
= fseMetadata
->mlType
;
475 DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata
->fseTablesSize
);
476 *seqHead
= (BYTE
)((LLtype
<<6) + (Offtype
<<4) + (MLtype
<<2));
477 memcpy(op
, fseMetadata
->fseTablesBuffer
, fseMetadata
->fseTablesSize
);
478 op
+= fseMetadata
->fseTablesSize
;
480 const U32 repeat
= set_repeat
;
481 *seqHead
= (BYTE
)((repeat
<<6) + (repeat
<<4) + (repeat
<<2));
484 { size_t const bitstreamSize
= ZSTD_encodeSequences(
486 fseTables
->matchlengthCTable
, mlCode
,
487 fseTables
->offcodeCTable
, ofCode
,
488 fseTables
->litlengthCTable
, llCode
,
491 FORWARD_IF_ERROR(bitstreamSize
, "ZSTD_encodeSequences failed");
493 /* zstd versions <= 1.3.4 mistakenly report corruption when
494 * FSE_readNCount() receives a buffer < 4 bytes.
495 * Fixed by https://github.com/facebook/zstd/pull/1146.
496 * This can happen when the last set_compressed table present is 2
497 * bytes and the bitstream is only one byte.
498 * In this exceedingly rare case, we will simply emit an uncompressed
499 * block, since it isn't worth optimizing.
501 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
502 if (writeEntropy
&& fseMetadata
->lastCountSize
&& fseMetadata
->lastCountSize
+ bitstreamSize
< 4) {
503 /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
504 assert(fseMetadata
->lastCountSize
+ bitstreamSize
== 3);
505 DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
506 "emitting an uncompressed block.");
510 DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize
);
513 /* zstd versions <= 1.4.0 mistakenly report error when
514 * sequences section body size is less than 3 bytes.
515 * Fixed by https://github.com/facebook/zstd/pull/1664.
516 * This can happen when the previous sequences section block is compressed
517 * with rle mode and the current block's sequences section is compressed
518 * with repeat mode where sequences section body size can be 1 byte.
520 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
521 if (op
-seqHead
< 4) {
522 DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting "
523 "an uncompressed block when sequences are < 4 bytes");
532 /** ZSTD_compressSubBlock() :
533 * Compresses a single sub-block.
534 * @return : compressed size of the sub-block
535 * Or 0 if it failed to compress. */
536 static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t
* entropy
,
537 const ZSTD_entropyCTablesMetadata_t
* entropyMetadata
,
538 const seqDef
* sequences
, size_t nbSeq
,
539 const BYTE
* literals
, size_t litSize
,
540 const BYTE
* llCode
, const BYTE
* mlCode
, const BYTE
* ofCode
,
541 const ZSTD_CCtx_params
* cctxParams
,
542 void* dst
, size_t dstCapacity
,
544 int writeLitEntropy
, int writeSeqEntropy
,
545 int* litEntropyWritten
, int* seqEntropyWritten
,
548 BYTE
* const ostart
= (BYTE
*)dst
;
549 BYTE
* const oend
= ostart
+ dstCapacity
;
550 BYTE
* op
= ostart
+ ZSTD_blockHeaderSize
;
551 DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)",
552 litSize
, nbSeq
, writeLitEntropy
, writeSeqEntropy
, lastBlock
);
553 { size_t cLitSize
= ZSTD_compressSubBlock_literal((const HUF_CElt
*)entropy
->huf
.CTable
,
554 &entropyMetadata
->hufMetadata
, literals
, litSize
,
555 op
, oend
-op
, bmi2
, writeLitEntropy
, litEntropyWritten
);
556 FORWARD_IF_ERROR(cLitSize
, "ZSTD_compressSubBlock_literal failed");
557 if (cLitSize
== 0) return 0;
560 { size_t cSeqSize
= ZSTD_compressSubBlock_sequences(&entropy
->fse
,
561 &entropyMetadata
->fseMetadata
,
563 llCode
, mlCode
, ofCode
,
566 bmi2
, writeSeqEntropy
, seqEntropyWritten
);
567 FORWARD_IF_ERROR(cSeqSize
, "ZSTD_compressSubBlock_sequences failed");
568 if (cSeqSize
== 0) return 0;
571 /* Write block header */
572 { size_t cSize
= (op
-ostart
)-ZSTD_blockHeaderSize
;
573 U32
const cBlockHeader24
= lastBlock
+ (((U32
)bt_compressed
)<<1) + (U32
)(cSize
<< 3);
574 MEM_writeLE24(ostart
, cBlockHeader24
);
579 static size_t ZSTD_estimateSubBlockSize_literal(const BYTE
* literals
, size_t litSize
,
580 const ZSTD_hufCTables_t
* huf
,
581 const ZSTD_hufCTablesMetadata_t
* hufMetadata
,
582 void* workspace
, size_t wkspSize
,
585 unsigned* const countWksp
= (unsigned*)workspace
;
586 unsigned maxSymbolValue
= 255;
587 size_t literalSectionHeaderSize
= 3; /* Use hard coded size of 3 bytes */
589 if (hufMetadata
->hType
== set_basic
) return litSize
;
590 else if (hufMetadata
->hType
== set_rle
) return 1;
591 else if (hufMetadata
->hType
== set_compressed
|| hufMetadata
->hType
== set_repeat
) {
592 size_t const largest
= HIST_count_wksp (countWksp
, &maxSymbolValue
, (const BYTE
*)literals
, litSize
, workspace
, wkspSize
);
593 if (ZSTD_isError(largest
)) return litSize
;
594 { size_t cLitSizeEstimate
= HUF_estimateCompressedSize((const HUF_CElt
*)huf
->CTable
, countWksp
, maxSymbolValue
);
595 if (writeEntropy
) cLitSizeEstimate
+= hufMetadata
->hufDesSize
;
596 return cLitSizeEstimate
+ literalSectionHeaderSize
;
598 assert(0); /* impossible */
602 static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type
,
603 const BYTE
* codeTable
, unsigned maxCode
,
604 size_t nbSeq
, const FSE_CTable
* fseCTable
,
605 const U32
* additionalBits
,
606 short const* defaultNorm
, U32 defaultNormLog
, U32 defaultMax
,
607 void* workspace
, size_t wkspSize
)
609 unsigned* const countWksp
= (unsigned*)workspace
;
610 const BYTE
* ctp
= codeTable
;
611 const BYTE
* const ctStart
= ctp
;
612 const BYTE
* const ctEnd
= ctStart
+ nbSeq
;
613 size_t cSymbolTypeSizeEstimateInBits
= 0;
614 unsigned max
= maxCode
;
616 HIST_countFast_wksp(countWksp
, &max
, codeTable
, nbSeq
, workspace
, wkspSize
); /* can't fail */
617 if (type
== set_basic
) {
618 /* We selected this encoding type, so it must be valid. */
619 assert(max
<= defaultMax
);
620 cSymbolTypeSizeEstimateInBits
= max
<= defaultMax
621 ? ZSTD_crossEntropyCost(defaultNorm
, defaultNormLog
, countWksp
, max
)
623 } else if (type
== set_rle
) {
624 cSymbolTypeSizeEstimateInBits
= 0;
625 } else if (type
== set_compressed
|| type
== set_repeat
) {
626 cSymbolTypeSizeEstimateInBits
= ZSTD_fseBitCost(fseCTable
, countWksp
, max
);
628 if (ZSTD_isError(cSymbolTypeSizeEstimateInBits
)) return nbSeq
* 10;
629 while (ctp
< ctEnd
) {
630 if (additionalBits
) cSymbolTypeSizeEstimateInBits
+= additionalBits
[*ctp
];
631 else cSymbolTypeSizeEstimateInBits
+= *ctp
; /* for offset, offset code is also the number of additional bits */
634 return cSymbolTypeSizeEstimateInBits
/ 8;
637 static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE
* ofCodeTable
,
638 const BYTE
* llCodeTable
,
639 const BYTE
* mlCodeTable
,
641 const ZSTD_fseCTables_t
* fseTables
,
642 const ZSTD_fseCTablesMetadata_t
* fseMetadata
,
643 void* workspace
, size_t wkspSize
,
646 size_t sequencesSectionHeaderSize
= 3; /* Use hard coded size of 3 bytes */
647 size_t cSeqSizeEstimate
= 0;
648 cSeqSizeEstimate
+= ZSTD_estimateSubBlockSize_symbolType(fseMetadata
->ofType
, ofCodeTable
, MaxOff
,
649 nbSeq
, fseTables
->offcodeCTable
, NULL
,
650 OF_defaultNorm
, OF_defaultNormLog
, DefaultMaxOff
,
651 workspace
, wkspSize
);
652 cSeqSizeEstimate
+= ZSTD_estimateSubBlockSize_symbolType(fseMetadata
->llType
, llCodeTable
, MaxLL
,
653 nbSeq
, fseTables
->litlengthCTable
, LL_bits
,
654 LL_defaultNorm
, LL_defaultNormLog
, MaxLL
,
655 workspace
, wkspSize
);
656 cSeqSizeEstimate
+= ZSTD_estimateSubBlockSize_symbolType(fseMetadata
->mlType
, mlCodeTable
, MaxML
,
657 nbSeq
, fseTables
->matchlengthCTable
, ML_bits
,
658 ML_defaultNorm
, ML_defaultNormLog
, MaxML
,
659 workspace
, wkspSize
);
660 if (writeEntropy
) cSeqSizeEstimate
+= fseMetadata
->fseTablesSize
;
661 return cSeqSizeEstimate
+ sequencesSectionHeaderSize
;
664 static size_t ZSTD_estimateSubBlockSize(const BYTE
* literals
, size_t litSize
,
665 const BYTE
* ofCodeTable
,
666 const BYTE
* llCodeTable
,
667 const BYTE
* mlCodeTable
,
669 const ZSTD_entropyCTables_t
* entropy
,
670 const ZSTD_entropyCTablesMetadata_t
* entropyMetadata
,
671 void* workspace
, size_t wkspSize
,
672 int writeLitEntropy
, int writeSeqEntropy
) {
673 size_t cSizeEstimate
= 0;
674 cSizeEstimate
+= ZSTD_estimateSubBlockSize_literal(literals
, litSize
,
675 &entropy
->huf
, &entropyMetadata
->hufMetadata
,
676 workspace
, wkspSize
, writeLitEntropy
);
677 cSizeEstimate
+= ZSTD_estimateSubBlockSize_sequences(ofCodeTable
, llCodeTable
, mlCodeTable
,
678 nbSeq
, &entropy
->fse
, &entropyMetadata
->fseMetadata
,
679 workspace
, wkspSize
, writeSeqEntropy
);
680 return cSizeEstimate
+ ZSTD_blockHeaderSize
;
683 static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t
const* fseMetadata
)
685 if (fseMetadata
->llType
== set_compressed
|| fseMetadata
->llType
== set_rle
)
687 if (fseMetadata
->mlType
== set_compressed
|| fseMetadata
->mlType
== set_rle
)
689 if (fseMetadata
->ofType
== set_compressed
|| fseMetadata
->ofType
== set_rle
)
694 /** ZSTD_compressSubBlock_multi() :
695 * Breaks super-block into multiple sub-blocks and compresses them.
696 * Entropy will be written to the first block.
697 * The following blocks will use repeat mode to compress.
698 * All sub-blocks are compressed blocks (no raw or rle blocks).
699 * @return : compressed size of the super block (which is multiple ZSTD blocks)
700 * Or 0 if it failed to compress. */
701 static size_t ZSTD_compressSubBlock_multi(const seqStore_t
* seqStorePtr
,
702 const ZSTD_compressedBlockState_t
* prevCBlock
,
703 ZSTD_compressedBlockState_t
* nextCBlock
,
704 const ZSTD_entropyCTablesMetadata_t
* entropyMetadata
,
705 const ZSTD_CCtx_params
* cctxParams
,
706 void* dst
, size_t dstCapacity
,
707 const void* src
, size_t srcSize
,
708 const int bmi2
, U32 lastBlock
,
709 void* workspace
, size_t wkspSize
)
711 const seqDef
* const sstart
= seqStorePtr
->sequencesStart
;
712 const seqDef
* const send
= seqStorePtr
->sequences
;
713 const seqDef
* sp
= sstart
;
714 const BYTE
* const lstart
= seqStorePtr
->litStart
;
715 const BYTE
* const lend
= seqStorePtr
->lit
;
716 const BYTE
* lp
= lstart
;
717 BYTE
const* ip
= (BYTE
const*)src
;
718 BYTE
const* const iend
= ip
+ srcSize
;
719 BYTE
* const ostart
= (BYTE
*)dst
;
720 BYTE
* const oend
= ostart
+ dstCapacity
;
722 const BYTE
* llCodePtr
= seqStorePtr
->llCode
;
723 const BYTE
* mlCodePtr
= seqStorePtr
->mlCode
;
724 const BYTE
* ofCodePtr
= seqStorePtr
->ofCode
;
725 size_t targetCBlockSize
= cctxParams
->targetCBlockSize
;
726 size_t litSize
, seqCount
;
727 int writeLitEntropy
= entropyMetadata
->hufMetadata
.hType
== set_compressed
;
728 int writeSeqEntropy
= 1;
729 int lastSequence
= 0;
731 DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
732 (unsigned)(lend
-lp
), (unsigned)(send
-sstart
));
737 size_t cBlockSizeEstimate
= 0;
738 if (sstart
== send
) {
741 const seqDef
* const sequence
= sp
+ seqCount
;
742 lastSequence
= sequence
== send
- 1;
743 litSize
+= ZSTD_getSequenceLength(seqStorePtr
, sequence
).litLength
;
748 assert(litSize
<= (size_t)(lend
- lp
));
749 litSize
= (size_t)(lend
- lp
);
751 /* I think there is an optimization opportunity here.
752 * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
753 * since it recalculates estimate from scratch.
754 * For example, it would recount literal distribution and symbol codes everytime.
756 cBlockSizeEstimate
= ZSTD_estimateSubBlockSize(lp
, litSize
, ofCodePtr
, llCodePtr
, mlCodePtr
, seqCount
,
757 &nextCBlock
->entropy
, entropyMetadata
,
758 workspace
, wkspSize
, writeLitEntropy
, writeSeqEntropy
);
759 if (cBlockSizeEstimate
> targetCBlockSize
|| lastSequence
) {
760 int litEntropyWritten
= 0;
761 int seqEntropyWritten
= 0;
762 const size_t decompressedSize
= ZSTD_seqDecompressedSize(seqStorePtr
, sp
, seqCount
, litSize
, lastSequence
);
763 const size_t cSize
= ZSTD_compressSubBlock(&nextCBlock
->entropy
, entropyMetadata
,
766 llCodePtr
, mlCodePtr
, ofCodePtr
,
769 bmi2
, writeLitEntropy
, writeSeqEntropy
,
770 &litEntropyWritten
, &seqEntropyWritten
,
771 lastBlock
&& lastSequence
);
772 FORWARD_IF_ERROR(cSize
, "ZSTD_compressSubBlock failed");
773 if (cSize
> 0 && cSize
< decompressedSize
) {
774 DEBUGLOG(5, "Committed the sub-block");
775 assert(ip
+ decompressedSize
<= iend
);
776 ip
+= decompressedSize
;
780 llCodePtr
+= seqCount
;
781 mlCodePtr
+= seqCount
;
782 ofCodePtr
+= seqCount
;
785 /* Entropy only needs to be written once */
786 if (litEntropyWritten
) {
789 if (seqEntropyWritten
) {
794 } while (!lastSequence
);
795 if (writeLitEntropy
) {
796 DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
797 memcpy(&nextCBlock
->entropy
.huf
, &prevCBlock
->entropy
.huf
, sizeof(prevCBlock
->entropy
.huf
));
799 if (writeSeqEntropy
&& ZSTD_needSequenceEntropyTables(&entropyMetadata
->fseMetadata
)) {
800 /* If we haven't written our entropy tables, then we've violated our contract and
801 * must emit an uncompressed block.
803 DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
807 size_t const cSize
= ZSTD_noCompressBlock(op
, oend
- op
, ip
, iend
- ip
, lastBlock
);
808 DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend
- ip
));
809 FORWARD_IF_ERROR(cSize
, "ZSTD_noCompressBlock failed");
812 /* We have to regenerate the repcodes because we've skipped some sequences */
816 memcpy(&rep
, prevCBlock
->rep
, sizeof(rep
));
817 for (seq
= sstart
; seq
< sp
; ++seq
) {
818 rep
= ZSTD_updateRep(rep
.rep
, seq
->offset
- 1, ZSTD_getSequenceLength(seqStorePtr
, seq
).litLength
== 0);
820 memcpy(nextCBlock
->rep
, &rep
, sizeof(rep
));
823 DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
827 size_t ZSTD_compressSuperBlock(ZSTD_CCtx
* zc
,
828 void* dst
, size_t dstCapacity
,
829 void const* src
, size_t srcSize
,
830 unsigned lastBlock
) {
831 ZSTD_entropyCTablesMetadata_t entropyMetadata
;
833 FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc
->seqStore
,
834 &zc
->blockState
.prevCBlock
->entropy
,
835 &zc
->blockState
.nextCBlock
->entropy
,
838 zc
->entropyWorkspace
, HUF_WORKSPACE_SIZE
/* statically allocated in resetCCtx */), "");
840 return ZSTD_compressSubBlock_multi(&zc
->seqStore
,
841 zc
->blockState
.prevCBlock
,
842 zc
->blockState
.nextCBlock
,
848 zc
->entropyWorkspace
, HUF_WORKSPACE_SIZE
/* statically allocated in resetCCtx */);