Merge branch 'maint-0.4.8'
[tor.git] / src / lib / compress / compress.c
blob346e77f07d9f43dceef544da4faedd8965c84605
1 /* Copyright (c) 2004, Roger Dingledine.
2 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
3 * Copyright (c) 2007-2021, The Tor Project, Inc. */
4 /* See LICENSE for licensing information */
6 /**
7 * \file compress.c
8 * \brief Common compression API implementation.
10 * This file provides a unified interface to all the compression libraries Tor
11 * knows how to use.
12 **/
14 #include "orconfig.h"
16 #include <stdlib.h>
17 #include <stdio.h>
18 #include <string.h>
19 #include "lib/cc/torint.h"
21 #ifdef HAVE_NETINET_IN_H
22 #include <netinet/in.h>
23 #endif
25 #include "lib/log/log.h"
26 #include "lib/log/util_bug.h"
27 #include "lib/arch/bytes.h"
28 #include "lib/ctime/di_ops.h"
29 #include "lib/compress/compress.h"
30 #include "lib/compress/compress_lzma.h"
31 #include "lib/compress/compress_none.h"
32 #include "lib/compress/compress_sys.h"
33 #include "lib/compress/compress_zlib.h"
34 #include "lib/compress/compress_zstd.h"
35 #include "lib/intmath/cmp.h"
36 #include "lib/malloc/malloc.h"
37 #include "lib/subsys/subsys.h"
38 #include "lib/thread/threads.h"
40 /** Total number of bytes allocated for compression state overhead. */
41 static atomic_counter_t total_compress_allocation;
43 /** @{ */
44 /* These macros define the maximum allowable compression factor. Anything of
45 * size greater than CHECK_FOR_COMPRESSION_BOMB_AFTER is not allowed to
46 * have an uncompression factor (uncompressed size:compressed size ratio) of
47 * any greater than MAX_UNCOMPRESSION_FACTOR.
49 * Picking a value for MAX_UNCOMPRESSION_FACTOR is a trade-off: we want it to
50 * be small to limit the attack multiplier, but we also want it to be large
51 * enough so that no legitimate document --even ones we might invent in the
52 * future -- ever compresses by a factor of greater than
53 * MAX_UNCOMPRESSION_FACTOR. Within those parameters, there's a reasonably
54 * large range of possible values. IMO, anything over 8 is probably safe; IMO
55 * anything under 50 is probably sufficient.
57 #define MAX_UNCOMPRESSION_FACTOR 25
58 #define CHECK_FOR_COMPRESSION_BOMB_AFTER (1024*64)
59 /** @} */
61 /** Return true if uncompressing an input of size <b>in_size</b> to an input of
62 * size at least <b>size_out</b> looks like a compression bomb. */
63 MOCK_IMPL(int,
64 tor_compress_is_compression_bomb,(size_t size_in, size_t size_out))
66 if (size_in == 0 || size_out < CHECK_FOR_COMPRESSION_BOMB_AFTER)
67 return 0;
69 if (size_out / size_in > MAX_UNCOMPRESSION_FACTOR) {
70 log_warn(LD_GENERAL,
71 "Detected possible compression bomb with "
72 "input size = %"TOR_PRIuSZ " and output size = %"TOR_PRIuSZ,
73 size_in, size_out);
74 return 1;
77 return 0;
80 /** Guess the size that <b>in_len</b> will be after compression or
81 * decompression. */
82 static size_t
83 guess_compress_size(int compress, compress_method_t method,
84 compression_level_t compression_level,
85 size_t in_len)
87 // ignore these for now.
88 (void)compression_level;
89 if (method == NO_METHOD) {
90 /* Guess that we'll need an extra byte, to avoid a needless realloc
91 * for nul-termination */
92 return (in_len < SIZE_MAX) ? in_len + 1 : in_len;
95 /* Always guess a factor of 2. */
96 if (compress) {
97 in_len /= 2;
98 } else {
99 if (in_len < SIZE_T_CEILING/2)
100 in_len *= 2;
102 return MAX(in_len, 1024);
105 /** Internal function to implement tor_compress/tor_uncompress, depending on
106 * whether <b>compress</b> is set. All arguments are as for tor_compress or
107 * tor_uncompress. */
108 static int
109 tor_compress_impl(int compress,
110 char **out, size_t *out_len,
111 const char *in, size_t in_len,
112 compress_method_t method,
113 compression_level_t compression_level,
114 int complete_only,
115 int protocol_warn_level)
117 tor_compress_state_t *stream;
118 int rv;
120 stream = tor_compress_new(compress, method, compression_level);
122 if (stream == NULL) {
123 log_warn(LD_GENERAL, "NULL stream while %scompressing",
124 compress?"":"de");
125 log_debug(LD_GENERAL, "method: %d level: %d at len: %lu",
126 method, compression_level, (unsigned long)in_len);
127 return -1;
130 size_t in_len_orig = in_len;
131 size_t out_remaining, out_alloc;
132 char *outptr;
134 out_remaining = out_alloc =
135 guess_compress_size(compress, method, compression_level, in_len);
136 *out = outptr = tor_malloc(out_remaining);
138 const int finish = complete_only || compress;
140 while (1) {
141 switch (tor_compress_process(stream,
142 &outptr, &out_remaining,
143 &in, &in_len, finish)) {
144 case TOR_COMPRESS_DONE:
145 if (in_len == 0 || compress) {
146 goto done;
147 } else {
148 // More data is present, and we're decompressing. So we may need to
149 // reinitialize the stream if we are handling multiple concatenated
150 // inputs.
151 tor_compress_free(stream);
152 stream = tor_compress_new(compress, method, compression_level);
153 if (stream == NULL) {
154 log_warn(LD_GENERAL, "NULL stream while %scompressing",
155 compress?"":"de");
156 goto err;
159 break;
160 case TOR_COMPRESS_OK:
161 if (compress || complete_only) {
162 log_fn(protocol_warn_level, LD_PROTOCOL,
163 "Unexpected %s while %scompressing",
164 complete_only?"end of input":"result",
165 compress?"":"de");
166 log_debug(LD_GENERAL, "method: %d level: %d at len: %lu",
167 method, compression_level, (unsigned long)in_len);
168 goto err;
169 } else {
170 if (in_len == 0) {
171 goto done;
174 break;
175 case TOR_COMPRESS_BUFFER_FULL: {
176 if (!compress && outptr < *out+out_alloc) {
177 // A buffer error in this case means that we have a problem
178 // with our input.
179 log_fn(protocol_warn_level, LD_PROTOCOL,
180 "Possible truncated or corrupt compressed data");
181 goto err;
183 if (out_alloc >= SIZE_T_CEILING / 2) {
184 log_warn(LD_GENERAL, "While %scompressing data: ran out of space.",
185 compress?"":"un");
186 goto err;
188 if (!compress &&
189 tor_compress_is_compression_bomb(in_len_orig, out_alloc)) {
190 // This should already have been caught down in the backend logic.
191 // LCOV_EXCL_START
192 tor_assert_nonfatal_unreached();
193 goto err;
194 // LCOV_EXCL_STOP
196 const size_t offset = outptr - *out;
197 out_alloc *= 2;
198 *out = tor_realloc(*out, out_alloc);
199 outptr = *out + offset;
200 out_remaining = out_alloc - offset;
201 break;
203 case TOR_COMPRESS_ERROR:
204 log_fn(protocol_warn_level, LD_GENERAL,
205 "Error while %scompressing data: bad input?",
206 compress?"":"un");
207 goto err; // bad data.
209 // LCOV_EXCL_START
210 default:
211 tor_assert_nonfatal_unreached();
212 goto err;
213 // LCOV_EXCL_STOP
216 done:
217 *out_len = outptr - *out;
218 if (compress && tor_compress_is_compression_bomb(*out_len, in_len_orig)) {
219 log_warn(LD_BUG, "We compressed something and got an insanely high "
220 "compression factor; other Tors would think this was a "
221 "compression bomb.");
222 goto err;
224 if (!compress) {
225 // NUL-terminate our output.
226 if (out_alloc == *out_len)
227 *out = tor_realloc(*out, out_alloc + 1);
228 (*out)[*out_len] = '\0';
230 rv = 0;
231 goto out;
233 err:
234 tor_free(*out);
235 *out_len = 0;
236 rv = -1;
237 goto out;
239 out:
240 tor_compress_free(stream);
241 return rv;
244 /** Given <b>in_len</b> bytes at <b>in</b>, compress them into a newly
245 * allocated buffer, using the method described in <b>method</b>. Store the
246 * compressed string in *<b>out</b>, and its length in *<b>out_len</b>.
247 * Return 0 on success, -1 on failure.
250 tor_compress(char **out, size_t *out_len,
251 const char *in, size_t in_len,
252 compress_method_t method)
254 return tor_compress_impl(1, out, out_len, in, in_len, method,
255 BEST_COMPRESSION,
256 1, LOG_WARN);
259 /** Given zero or more compressed strings of total length <b>in_len</b> bytes
260 * at <b>in</b>, uncompress them into a newly allocated buffer, using the
261 * method described in <b>method</b>. Store the uncompressed string in
262 * *<b>out</b>, and its length in *<b>out_len</b>. Return 0 on success, -1 on
263 * failure.
265 * If any bytes are written to <b>out</b>, an extra byte NUL is always
266 * written at the end, but not counted in <b>out_len</b>. This is a
267 * safety feature to ensure that the output can be treated as a
268 * NUL-terminated string -- though of course, callers should check
269 * out_len anyway.
271 * If <b>complete_only</b> is true, we consider a truncated input as a
272 * failure; otherwise we decompress as much as we can. Warn about truncated
273 * or corrupt inputs at <b>protocol_warn_level</b>.
276 tor_uncompress(char **out, size_t *out_len,
277 const char *in, size_t in_len,
278 compress_method_t method,
279 int complete_only,
280 int protocol_warn_level)
282 return tor_compress_impl(0, out, out_len, in, in_len, method,
283 BEST_COMPRESSION,
284 complete_only, protocol_warn_level);
287 /** Try to tell whether the <b>in_len</b>-byte string in <b>in</b> is likely
288 * to be compressed or not. If it is, return the likeliest compression method.
289 * Otherwise, return UNKNOWN_METHOD.
291 compress_method_t
292 detect_compression_method(const char *in, size_t in_len)
294 if (in_len > 2 && fast_memeq(in, "\x1f\x8b", 2)) {
295 return GZIP_METHOD;
296 } else if (in_len > 2 && (in[0] & 0x0f) == 8 &&
297 (tor_ntohs(get_uint16(in)) % 31) == 0) {
298 return ZLIB_METHOD;
299 } else if (in_len > 2 &&
300 fast_memeq(in, "\x5d\x00\x00", 3)) {
301 return LZMA_METHOD;
302 } else if (in_len > 3 &&
303 fast_memeq(in, "\x28\xb5\x2f\xfd", 4)) {
304 return ZSTD_METHOD;
305 } else {
306 return UNKNOWN_METHOD;
310 /** Return 1 if a given <b>method</b> is supported; otherwise 0. */
312 tor_compress_supports_method(compress_method_t method)
314 switch (method) {
315 case GZIP_METHOD:
316 case ZLIB_METHOD:
317 return tor_zlib_method_supported();
318 case LZMA_METHOD:
319 return tor_lzma_method_supported();
320 case ZSTD_METHOD:
321 return tor_zstd_method_supported();
322 case NO_METHOD:
323 return 1;
324 case UNKNOWN_METHOD:
325 default:
326 return 0;
331 * Return a bitmask of the supported compression types, where 1&lt;&lt;m is
332 * set in the bitmask if and only if compression with method <b>m</b> is
333 * supported.
335 unsigned
336 tor_compress_get_supported_method_bitmask(void)
338 static unsigned supported = 0;
339 if (supported == 0) {
340 compress_method_t m;
341 for (m = NO_METHOD; m <= UNKNOWN_METHOD; ++m) {
342 if (tor_compress_supports_method(m)) {
343 supported |= (1u << m);
347 return supported;
350 /** Table of compression method names. These should have an "x-" prefix,
351 * if they are not listed in the IANA content coding registry. */
352 static const struct {
353 const char *name;
354 compress_method_t method;
355 } compression_method_names[] = {
356 { "gzip", GZIP_METHOD },
357 { "deflate", ZLIB_METHOD },
358 // We call this "x-tor-lzma" rather than "x-lzma", because we impose a
359 // lower maximum memory usage on the decoding side.
360 { "x-tor-lzma", LZMA_METHOD },
361 { "x-zstd" , ZSTD_METHOD },
362 { "identity", NO_METHOD },
364 /* Later entries in this table are not canonical; these are recognized but
365 * not emitted. */
366 { "x-gzip", GZIP_METHOD },
369 /** Return the canonical string representation of the compression method
370 * <b>method</b>, or NULL if the method isn't recognized. */
371 const char *
372 compression_method_get_name(compress_method_t method)
374 unsigned i;
375 for (i = 0; i < ARRAY_LENGTH(compression_method_names); ++i) {
376 if (method == compression_method_names[i].method)
377 return compression_method_names[i].name;
379 return NULL;
382 /** Table of compression human readable method names. */
383 static const struct {
384 compress_method_t method;
385 const char *name;
386 } compression_method_human_names[] = {
387 { NO_METHOD, "uncompressed" },
388 { GZIP_METHOD, "gzipped" },
389 { ZLIB_METHOD, "deflated" },
390 { LZMA_METHOD, "LZMA compressed" },
391 { ZSTD_METHOD, "Zstandard compressed" },
392 { UNKNOWN_METHOD, "unknown encoding" },
395 /** Return a human readable string representation of the compression method
396 * <b>method</b>, or NULL if the method isn't recognized. */
397 const char *
398 compression_method_get_human_name(compress_method_t method)
400 unsigned i;
401 for (i = 0; i < ARRAY_LENGTH(compression_method_human_names); ++i) {
402 if (method == compression_method_human_names[i].method)
403 return compression_method_human_names[i].name;
405 return NULL;
408 /** Return the compression method represented by the string <b>name</b>, or
409 * UNKNOWN_METHOD if the string isn't recognized. */
410 compress_method_t
411 compression_method_get_by_name(const char *name)
413 unsigned i;
414 for (i = 0; i < ARRAY_LENGTH(compression_method_names); ++i) {
415 if (!strcmp(compression_method_names[i].name, name))
416 return compression_method_names[i].method;
418 return UNKNOWN_METHOD;
421 /** Return a string representation of the version of the library providing the
422 * compression method given in <b>method</b>. Returns NULL if <b>method</b> is
423 * unknown or unsupported. */
424 const char *
425 tor_compress_version_str(compress_method_t method)
427 switch (method) {
428 case GZIP_METHOD:
429 case ZLIB_METHOD:
430 return tor_zlib_get_version_str();
431 case LZMA_METHOD:
432 return tor_lzma_get_version_str();
433 case ZSTD_METHOD:
434 return tor_zstd_get_version_str();
435 case NO_METHOD:
436 case UNKNOWN_METHOD:
437 default:
438 return NULL;
442 /** Return a string representation of the version of the library, found at
443 * compile time, providing the compression method given in <b>method</b>.
444 * Returns NULL if <b>method</b> is unknown or unsupported. */
445 const char *
446 tor_compress_header_version_str(compress_method_t method)
448 switch (method) {
449 case GZIP_METHOD:
450 case ZLIB_METHOD:
451 return tor_zlib_get_header_version_str();
452 case LZMA_METHOD:
453 return tor_lzma_get_header_version_str();
454 case ZSTD_METHOD:
455 return tor_zstd_get_header_version_str();
456 case NO_METHOD:
457 case UNKNOWN_METHOD:
458 default:
459 return NULL;
463 /** Return the approximate number of bytes allocated for all
464 * supported compression schemas. */
465 size_t
466 tor_compress_get_total_allocation(void)
468 return atomic_counter_get(&total_compress_allocation) +
469 tor_zlib_get_total_allocation() +
470 tor_lzma_get_total_allocation() +
471 tor_zstd_get_total_allocation();
474 /** Internal state for an incremental compression/decompression. The body of
475 * this struct is not exposed. */
476 struct tor_compress_state_t {
477 compress_method_t method; /**< The compression method. */
479 union {
480 tor_zlib_compress_state_t *zlib_state;
481 tor_lzma_compress_state_t *lzma_state;
482 tor_zstd_compress_state_t *zstd_state;
483 } u; /**< Compression backend state. */
486 /** Construct and return a tor_compress_state_t object using <b>method</b>. If
487 * <b>compress</b>, it's for compression; otherwise it's for decompression. */
488 tor_compress_state_t *
489 tor_compress_new(int compress, compress_method_t method,
490 compression_level_t compression_level)
492 tor_compress_state_t *state;
494 state = tor_malloc_zero(sizeof(tor_compress_state_t));
495 state->method = method;
497 switch (method) {
498 case GZIP_METHOD:
499 case ZLIB_METHOD: {
500 tor_zlib_compress_state_t *zlib_state =
501 tor_zlib_compress_new(compress, method, compression_level);
503 if (zlib_state == NULL)
504 goto err;
506 state->u.zlib_state = zlib_state;
507 break;
509 case LZMA_METHOD: {
510 tor_lzma_compress_state_t *lzma_state =
511 tor_lzma_compress_new(compress, method, compression_level);
513 if (lzma_state == NULL)
514 goto err;
516 state->u.lzma_state = lzma_state;
517 break;
519 case ZSTD_METHOD: {
520 tor_zstd_compress_state_t *zstd_state =
521 tor_zstd_compress_new(compress, method, compression_level);
523 if (zstd_state == NULL)
524 goto err;
526 state->u.zstd_state = zstd_state;
527 break;
529 case NO_METHOD: {
530 break;
532 case UNKNOWN_METHOD:
533 goto err;
536 atomic_counter_add(&total_compress_allocation,
537 sizeof(tor_compress_state_t));
538 return state;
540 err:
541 tor_free(state);
542 return NULL;
545 /** Compress/decompress some bytes using <b>state</b>. Read up to
546 * *<b>in_len</b> bytes from *<b>in</b>, and write up to *<b>out_len</b> bytes
547 * to *<b>out</b>, adjusting the values as we go. If <b>finish</b> is true,
548 * we've reached the end of the input.
550 * Return TOR_COMPRESS_DONE if we've finished the entire
551 * compression/decompression.
552 * Return TOR_COMPRESS_OK if we're processed everything from the input.
553 * Return TOR_COMPRESS_BUFFER_FULL if we're out of space on <b>out</b>.
554 * Return TOR_COMPRESS_ERROR if the stream is corrupt.
556 tor_compress_output_t
557 tor_compress_process(tor_compress_state_t *state,
558 char **out, size_t *out_len,
559 const char **in, size_t *in_len,
560 int finish)
562 tor_assert(state != NULL);
563 const size_t in_len_orig = *in_len;
564 const size_t out_len_orig = *out_len;
565 tor_compress_output_t rv;
567 if (*out_len == 0 && (*in_len > 0 || finish)) {
568 // If we still have input data, but no space for output data, we might as
569 // well return early and let the caller do the reallocation of the out
570 // variable.
571 return TOR_COMPRESS_BUFFER_FULL;
574 switch (state->method) {
575 case GZIP_METHOD:
576 case ZLIB_METHOD:
577 rv = tor_zlib_compress_process(state->u.zlib_state,
578 out, out_len, in, in_len,
579 finish);
580 break;
581 case LZMA_METHOD:
582 rv = tor_lzma_compress_process(state->u.lzma_state,
583 out, out_len, in, in_len,
584 finish);
585 break;
586 case ZSTD_METHOD:
587 rv = tor_zstd_compress_process(state->u.zstd_state,
588 out, out_len, in, in_len,
589 finish);
590 break;
591 case NO_METHOD:
592 rv = tor_cnone_compress_process(out, out_len, in, in_len,
593 finish);
594 break;
595 default:
596 case UNKNOWN_METHOD:
597 goto err;
599 if (BUG((rv == TOR_COMPRESS_OK) &&
600 *in_len == in_len_orig &&
601 *out_len == out_len_orig)) {
602 log_warn(LD_GENERAL,
603 "More info on the bug: method == %s, finish == %d, "
604 " *in_len == in_len_orig == %lu, "
605 "*out_len == out_len_orig == %lu",
606 compression_method_get_human_name(state->method), finish,
607 (unsigned long)in_len_orig, (unsigned long)out_len_orig);
608 return TOR_COMPRESS_ERROR;
611 return rv;
612 err:
613 return TOR_COMPRESS_ERROR;
616 /** Deallocate <b>state</b>. */
617 void
618 tor_compress_free_(tor_compress_state_t *state)
620 if (state == NULL)
621 return;
623 switch (state->method) {
624 case GZIP_METHOD:
625 case ZLIB_METHOD:
626 tor_zlib_compress_free(state->u.zlib_state);
627 break;
628 case LZMA_METHOD:
629 tor_lzma_compress_free(state->u.lzma_state);
630 break;
631 case ZSTD_METHOD:
632 tor_zstd_compress_free(state->u.zstd_state);
633 break;
634 case NO_METHOD:
635 break;
636 case UNKNOWN_METHOD:
637 break;
640 atomic_counter_sub(&total_compress_allocation,
641 sizeof(tor_compress_state_t));
642 tor_free(state);
645 /** Return the approximate number of bytes allocated for <b>state</b>. */
646 size_t
647 tor_compress_state_size(const tor_compress_state_t *state)
649 tor_assert(state != NULL);
651 size_t size = sizeof(tor_compress_state_t);
653 switch (state->method) {
654 case GZIP_METHOD:
655 case ZLIB_METHOD:
656 size += tor_zlib_compress_state_size(state->u.zlib_state);
657 break;
658 case LZMA_METHOD:
659 size += tor_lzma_compress_state_size(state->u.lzma_state);
660 break;
661 case ZSTD_METHOD:
662 size += tor_zstd_compress_state_size(state->u.zstd_state);
663 break;
664 case NO_METHOD:
665 case UNKNOWN_METHOD:
666 break;
669 return size;
672 /** Initialize all compression modules. */
674 tor_compress_init(void)
676 atomic_counter_init(&total_compress_allocation);
678 tor_zlib_init();
679 tor_lzma_init();
680 tor_zstd_init();
682 return 0;
685 /** Warn if we had any problems while setting up our compression libraries.
687 * (This isn't part of tor_compress_init, since the logs aren't set up yet.)
689 void
690 tor_compress_log_init_warnings(void)
692 // XXXX can we move this into tor_compress_init() after all? log.c queues
693 // XXXX log messages at startup.
694 tor_zstd_warn_if_version_mismatched();
697 static int
698 subsys_compress_initialize(void)
700 return tor_compress_init();
703 const subsys_fns_t sys_compress = {
704 .name = "compress",
705 SUBSYS_DECLARE_LOCATION(),
706 .supported = true,
707 .level = -55,
708 .initialize = subsys_compress_initialize,