2 * String utility routines
4 * Wireshark - Network traffic analyzer
5 * By Gerald Combs <gerald@wireshark.org>
6 * Copyright 1998 Gerald Combs
8 * SPDX-License-Identifier: GPL-2.0-or-later
19 #include <ws_codepoints.h>
21 #include <wsutil/to_str.h>
24 static const char hex
[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
25 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
28 wmem_strconcat(wmem_allocator_t
*allocator
, const char *first
, ...)
39 len
= 1 + strlen(first
);
40 va_start(args
, first
);
41 while ((s
= va_arg(args
, char*))) {
46 ptr
= concat
= (char *)wmem_alloc(allocator
, len
);
48 ptr
= g_stpcpy(ptr
, first
);
49 va_start(args
, first
);
50 while ((s
= va_arg(args
, char*))) {
51 ptr
= g_stpcpy(ptr
, s
);
59 wmem_strjoin(wmem_allocator_t
*allocator
,
60 const char *separator
, const char *first
, ...)
72 if (separator
== NULL
) {
76 separator_len
= strlen (separator
);
78 len
= 1 + strlen(first
); /* + 1 for null byte */
79 va_start(args
, first
);
80 while ((s
= va_arg(args
, char*))) {
81 len
+= (separator_len
+ strlen(s
));
85 ptr
= concat
= (char *)wmem_alloc(allocator
, len
);
86 ptr
= g_stpcpy(ptr
, first
);
87 va_start(args
, first
);
88 while ((s
= va_arg(args
, char*))) {
89 ptr
= g_stpcpy(ptr
, separator
);
90 ptr
= g_stpcpy(ptr
, s
);
99 wmem_strjoinv(wmem_allocator_t
*allocator
,
100 const char *separator
, char **str_array
)
104 ws_return_val_if(!str_array
, NULL
);
106 if (separator
== NULL
) {
113 size_t len
, separator_len
;
115 separator_len
= strlen(separator
);
117 /* Get first part of length. Plus one for null byte. */
118 len
= 1 + strlen(str_array
[0]);
119 /* Get the full length, including the separators. */
120 for (i
= 1; str_array
[i
] != NULL
; i
++) {
121 len
+= separator_len
;
122 len
+= strlen(str_array
[i
]);
125 /* Allocate and build the string. */
126 string
= (char *)wmem_alloc(allocator
, len
);
127 ptr
= g_stpcpy(string
, str_array
[0]);
128 for (i
= 1; str_array
[i
] != NULL
; i
++) {
129 ptr
= g_stpcpy(ptr
, separator
);
130 ptr
= g_stpcpy(ptr
, str_array
[i
]);
133 string
= wmem_strdup(allocator
, "");
141 wmem_strsplit(wmem_allocator_t
*allocator
, const char *src
,
142 const char *delimiter
, int max_tokens
)
151 if (!src
|| !delimiter
|| !delimiter
[0])
154 /* An empty string results in an empty vector. */
156 vec
= wmem_new0(allocator
, char *);
160 splitted
= wmem_strdup(allocator
, src
);
161 sep_len
= (unsigned)strlen(delimiter
);
164 max_tokens
= INT_MAX
;
166 /* Calculate the number of fields. */
169 while (tokens
< (unsigned)max_tokens
&& (s
= strstr(s
, delimiter
))) {
174 vec
= wmem_alloc_array(allocator
, char *, tokens
+ 1);
176 /* Populate the array of string tokens. */
180 while (tokens
< (unsigned)max_tokens
&& (s
= strstr(s
, delimiter
))) {
181 for (i
= 0; i
< sep_len
; i
++)
195 * wmem_ascii_strdown:
196 * based on g_ascii_strdown.
199 wmem_ascii_strdown(wmem_allocator_t
*allocator
, const char *str
, ssize_t len
)
203 g_return_val_if_fail (str
!= NULL
, NULL
);
208 result
= wmem_strndup(allocator
, str
, len
);
209 for (s
= result
; *s
; s
++)
210 *s
= g_ascii_tolower (*s
);
229 case 'a': case 'A': return 10;
230 case 'b': case 'B': return 11;
231 case 'c': case 'C': return 12;
232 case 'd': case 'D': return 13;
233 case 'e': case 'E': return 14;
234 case 'f': case 'F': return 15;
239 /* Convert all ASCII letters to lower case, in place. */
241 ascii_strdown_inplace(char *str
)
245 for (s
= str
; *s
; s
++)
246 /* What 'g_ascii_tolower (char c)' does, this should be slightly more efficient */
247 *s
= g_ascii_isupper (*s
) ? *s
- 'A' + 'a' : *s
;
252 /* Convert all ASCII letters to upper case, in place. */
254 ascii_strup_inplace(char *str
)
258 for (s
= str
; *s
; s
++)
259 /* What 'g_ascii_toupper (char c)' does, this should be slightly more efficient */
260 *s
= g_ascii_islower (*s
) ? *s
- 'a' + 'A' : *s
;
265 /* Check if an entire string is printable. */
267 isprint_string(const char *str
)
271 /* Loop until we reach the end of the string (a null) */
272 for(pos
= 0; str
[pos
] != '\0'; pos
++){
273 if(!g_ascii_isprint(str
[pos
])){
274 /* The string contains a non-printable character */
279 /* The string contains only printable characters */
283 /* Check if an entire UTF-8 string is printable. */
285 isprint_utf8_string(const char *str
, const unsigned length
)
287 const char *strend
= str
+ length
;
289 if (!g_utf8_validate(str
, length
, NULL
)) {
293 while (str
< strend
) {
294 /* This returns false for G_UNICODE_CONTROL | G_UNICODE_FORMAT |
295 * G_UNICODE_UNASSIGNED | G_UNICODE_SURROGATE
296 * XXX: Could it be ok to have certain format characters, e.g.
297 * U+00AD SOFT HYPHEN? If so, format_text() should be changed too.
299 if (!g_unichar_isprint(g_utf8_get_char(str
))) {
302 str
= g_utf8_next_char(str
);
308 /* Check if an entire string is digits. */
310 isdigit_string(const unsigned char *str
)
314 /* Loop until we reach the end of the string (a null) */
315 for(pos
= 0; str
[pos
] != '\0'; pos
++){
316 if(!g_ascii_isdigit(str
[pos
])){
317 /* The string contains a non-digit character */
322 /* The string contains only digits */
327 ws_ascii_strcasestr(const char *haystack
, const char *needle
)
329 /* Do not use strcasestr() here, even if a system has it, as it is
330 * locale-dependent (and has different results for e.g. Turkic languages.)
331 * FreeBSD, NetBSD, macOS have a strcasestr_l() that could be used.
333 size_t hlen
= strlen(haystack
);
334 size_t nlen
= strlen(needle
);
336 while (hlen
-- >= nlen
) {
337 if (!g_ascii_strncasecmp(haystack
, needle
, nlen
))
344 /* Return the last occurrence of ch in the n bytes of haystack.
345 * If not found or n is 0, return NULL. */
347 ws_memrchr(const void *_haystack
, int ch
, size_t n
)
350 return memrchr(_haystack
, ch
, n
);
352 /* A generic implementation. This could be optimized considerably,
353 * e.g. by fetching a word at a time.
358 const uint8_t *haystack
= _haystack
;
360 uint8_t c
= (uint8_t)ch
;
362 const uint8_t *const end
= haystack
+ n
- 1;
364 for (p
= end
; p
>= haystack
; --p
) {
371 #endif /* HAVE_MEMRCHR */
374 #define FORMAT_SIZE_UNIT_MASK 0x00ff
375 #define FORMAT_SIZE_PFX_MASK 0xff00
377 static const char *thousands_grouping_fmt
;
378 static const char *thousands_grouping_fmt_flt
;
381 static void test_printf_thousands_grouping(void) {
382 /* test whether wmem_strbuf works with "'" flag character */
383 wmem_strbuf_t
*buf
= wmem_strbuf_new(NULL
, NULL
);
384 wmem_strbuf_append_printf(buf
, "%'d", 22);
385 if (g_strcmp0(wmem_strbuf_get_str(buf
), "22") == 0) {
386 thousands_grouping_fmt
= "%'"PRId64
;
387 thousands_grouping_fmt_flt
= "%'.*f";
390 thousands_grouping_fmt
= "%"PRId64
;
391 thousands_grouping_fmt_flt
= "%.*f";
393 wmem_strbuf_destroy(buf
);
397 static const char* decimal_point
= NULL
;
399 static void truncate_numeric_strbuf(wmem_strbuf_t
*strbuf
, int n
) {
401 const char *s
= wmem_strbuf_get_str(strbuf
);
405 if (decimal_point
== NULL
) {
406 decimal_point
= localeconv()->decimal_point
;
409 p
= strchr(s
, decimal_point
[0]);
424 if (*p
!= decimal_point
[0]) {
427 wmem_strbuf_truncate(strbuf
, p
- s
);
431 /* Given a floating point value, return it in a human-readable format,
432 * using units with metric prefixes (falling back to scientific notation
433 * with the base units if outside the range.)
436 format_units(wmem_allocator_t
*allocator
, double size
,
437 format_size_units_e unit
, uint16_t flags
,
440 wmem_strbuf_t
*human_str
= wmem_strbuf_new(allocator
, NULL
);
441 double power
= 1000.0;
443 bool is_small
= false;
444 /* is_small is when to use the longer, spelled out unit.
445 * We use it for inf, NaN, 0, and unprefixed small values,
446 * but not for unprefixed values using scientific notation
447 * because the value is outside the supported prefix range.
449 bool scientific
= false;
450 double abs_size
= fabs(size
);
452 static const char * const si_prefix
[] = {" a", " f", " p", " n", " μ", " m", " ", " k", " M", " G", " T", " P", " E"};
453 static const char * const iec_prefix
[] = {" ", " Ki", " Mi", " Gi", " Ti", " Pi", " Ei"};
454 const char * const *prefix
= si_prefix
;
455 int max_exp
= (int)G_N_ELEMENTS(si_prefix
) - 1;
459 if (thousands_grouping_fmt
== NULL
)
460 test_printf_thousands_grouping();
462 if (flags
& FORMAT_SIZE_PREFIX_IEC
) {
464 max_exp
= (int)G_N_ELEMENTS(iec_prefix
) - 1;
468 if (isfinite(size
) && size
!= 0.0) {
470 double comp
= precision
== 0 ? 10.0 : 1.0;
472 /* For precision 0, use the range [10, 10*power) because only
473 * one significant digit is not as useful. This is what format_size
474 * does for integers. ("ls -h" uses one digit after the decimal
475 * point only for the [1, 10) range, g_format_size() always displays
476 * tenths.) Prefer non-prefixed units for the range [1,10), though.
478 * We have a limited number of units to check, so this (which
479 * can be unrolled) is presumably faster than log + floor + pow/exp
481 if (abs_size
< 1.0) {
482 while (abs_size
< comp
) {
485 if ((exponent
+ pfx_off
) < 0) {
491 while (abs_size
>= comp
*power
) {
494 if ((exponent
+ pfx_off
) > max_exp
) {
503 wmem_strbuf_append_printf(human_str
, "%.*g", precision
+ 1, size
);
509 size
= copysign(abs_size
, size
);
510 // Truncate trailing zeros, but do it this way because we know
511 // we don't want scientific notation, and we don't want %g to
512 // switch to that if precision is small. (We could always use
513 // %g when precision is large.)
514 wmem_strbuf_append_printf(human_str
, thousands_grouping_fmt_flt
, precision
, size
);
515 truncate_numeric_strbuf(human_str
, precision
);
516 // XXX - when rounding to a certain precision, printf might
517 // round up to "power" from something like 999.99999995, which
518 // looks a little odd on a graph when transitioning from 1,000 bytes
519 // (for values just under 1 kB) to 1 kB (for values 1 kB and larger.)
520 // Due to edge cases in binary fp representation and how printf might
521 // round things, the right way to handle it is taking the printf output
522 // and comparing it to "1000" and "1024" and adjusting the exponent
523 // if so - though we need to compare to the version with the thousands
524 // separator if we have that (which makes it harder to use strnatcmp
528 if ((size_t)(pfx_off
+ exponent
) < G_N_ELEMENTS(si_prefix
)) {
529 wmem_strbuf_append(human_str
, prefix
[pfx_off
+exponent
]);
533 case FORMAT_SIZE_UNIT_NONE
:
535 case FORMAT_SIZE_UNIT_BYTES
:
536 wmem_strbuf_append(human_str
, is_small
? "bytes" : "B");
538 case FORMAT_SIZE_UNIT_BITS
:
539 wmem_strbuf_append(human_str
, is_small
? "bits" : "b");
541 case FORMAT_SIZE_UNIT_BITS_S
:
542 wmem_strbuf_append(human_str
, is_small
? "bits/s" : "bps");
544 case FORMAT_SIZE_UNIT_BYTES_S
:
545 wmem_strbuf_append(human_str
, is_small
? "bytes/s" : "Bps");
547 case FORMAT_SIZE_UNIT_PACKETS
:
548 wmem_strbuf_append(human_str
, is_small
? "packets" : "pkts");
550 case FORMAT_SIZE_UNIT_PACKETS_S
:
551 wmem_strbuf_append(human_str
, is_small
? "packets/s" : "pkts/s");
553 case FORMAT_SIZE_UNIT_EVENTS
:
554 wmem_strbuf_append(human_str
, is_small
? "events" : "evts");
556 case FORMAT_SIZE_UNIT_EVENTS_S
:
557 wmem_strbuf_append(human_str
, is_small
? "events/s" : "evts/s");
559 case FORMAT_SIZE_UNIT_FIELDS
:
560 wmem_strbuf_append(human_str
, is_small
? "fields" : "flds");
562 case FORMAT_SIZE_UNIT_SECONDS
:
563 wmem_strbuf_append(human_str
, is_small
? "seconds" : "s");
565 case FORMAT_SIZE_UNIT_ERLANGS
:
566 wmem_strbuf_append(human_str
, is_small
? "erlangs" : "E");
569 ws_assert_not_reached();
572 ret_val
= wmem_strbuf_finalize(human_str
);
573 /* Convention is a space between the value and the units. If we have
574 * a prefix, the space is before the prefix. There are two possible
575 * uses of FORMAT_SIZE_UNIT_NONE:
576 * 1. Add a unit immediately after the string returned. In this case,
577 * we would want the string to end with a space if there's no prefix.
578 * 2. The unit appears somewhere else, e.g. in a legend, header, or
579 * different column. In this case, we don't want the string to end
580 * with a space if there's no prefix.
581 * chomping the string here, as we've traditionally done, optimizes for
582 * the latter case but makes the former case harder.
583 * Perhaps the right approach is to distinguish the cases with a new
586 return g_strchomp(ret_val
);
589 /* Given a size, return its value in a human-readable format */
590 /* This doesn't handle fractional values. We might want to just
591 * call the version with the double and precision 0 (possibly
592 * slower due to the use of floating point math, but do we care?)
595 format_size_wmem(wmem_allocator_t
*allocator
, int64_t size
,
596 format_size_units_e unit
, uint16_t flags
)
598 wmem_strbuf_t
*human_str
= wmem_strbuf_new(allocator
, NULL
);
601 bool is_small
= false;
602 static const char *prefix
[] = {" T", " G", " M", " k", " Ti", " Gi", " Mi", " Ki"};
605 if (thousands_grouping_fmt
== NULL
)
606 test_printf_thousands_grouping();
608 if (flags
& FORMAT_SIZE_PREFIX_IEC
) {
613 if (size
/ power
/ power
/ power
/ power
>= 10) {
614 wmem_strbuf_append_printf(human_str
, thousands_grouping_fmt
, size
/ power
/ power
/ power
/ power
);
615 wmem_strbuf_append(human_str
, prefix
[pfx_off
]);
616 } else if (size
/ power
/ power
/ power
>= 10) {
617 wmem_strbuf_append_printf(human_str
, thousands_grouping_fmt
, size
/ power
/ power
/ power
);
618 wmem_strbuf_append(human_str
, prefix
[pfx_off
+1]);
619 } else if (size
/ power
/ power
>= 10) {
620 wmem_strbuf_append_printf(human_str
, thousands_grouping_fmt
, size
/ power
/ power
);
621 wmem_strbuf_append(human_str
, prefix
[pfx_off
+2]);
622 } else if (size
/ power
>= 10) {
623 wmem_strbuf_append_printf(human_str
, thousands_grouping_fmt
, size
/ power
);
624 wmem_strbuf_append(human_str
, prefix
[pfx_off
+3]);
626 wmem_strbuf_append_printf(human_str
, thousands_grouping_fmt
, size
);
631 case FORMAT_SIZE_UNIT_NONE
:
633 case FORMAT_SIZE_UNIT_BYTES
:
634 wmem_strbuf_append(human_str
, is_small
? " bytes" : "B");
636 case FORMAT_SIZE_UNIT_BITS
:
637 wmem_strbuf_append(human_str
, is_small
? " bits" : "b");
639 case FORMAT_SIZE_UNIT_BITS_S
:
640 wmem_strbuf_append(human_str
, is_small
? " bits/s" : "bps");
642 case FORMAT_SIZE_UNIT_BYTES_S
:
643 wmem_strbuf_append(human_str
, is_small
? " bytes/s" : "Bps");
645 case FORMAT_SIZE_UNIT_PACKETS
:
646 wmem_strbuf_append(human_str
, is_small
? " packets" : "packets");
648 case FORMAT_SIZE_UNIT_PACKETS_S
:
649 wmem_strbuf_append(human_str
, is_small
? " packets/s" : "packets/s");
651 case FORMAT_SIZE_UNIT_FIELDS
:
652 wmem_strbuf_append(human_str
, is_small
? " fields" : "fields");
654 /* These aren't that practical to use with integers, but
655 * perhaps better than asserting.
657 case FORMAT_SIZE_UNIT_SECONDS
:
658 wmem_strbuf_append(human_str
, is_small
? " seconds" : "s");
660 case FORMAT_SIZE_UNIT_ERLANGS
:
661 wmem_strbuf_append(human_str
, is_small
? " erlangs" : "E");
664 ws_assert_not_reached();
667 ret_val
= wmem_strbuf_finalize(human_str
);
668 return g_strchomp(ret_val
);
672 printable_char_or_period(char c
)
674 return g_ascii_isprint(c
) ? c
: '.';
678 * This is used by the display filter engine and must be compatible
679 * with display filter syntax.
682 escape_char(char c
, char *p
)
688 * backslashes and double-quotes must be escaped (double-quotes
689 * are escaped by passing '"' as quote_char in escape_string_len)
690 * whitespace is also escaped.
693 case '\a': r
= 'a'; break;
694 case '\b': r
= 'b'; break;
695 case '\f': r
= 'f'; break;
696 case '\n': r
= 'n'; break;
697 case '\r': r
= 'r'; break;
698 case '\t': r
= 't'; break;
699 case '\v': r
= 'v'; break;
700 case '\\': r
= '\\'; break;
701 case '\0': r
= '0'; break;
712 escape_null(char c
, char *p
)
723 escape_string_len(wmem_allocator_t
*alloc
, const char *string
, ssize_t len
,
724 bool (*escape_func
)(char c
, char *p
), bool add_quotes
,
725 char quote_char
, bool double_quote
)
733 len
= strlen(string
);
739 buf
= wmem_strbuf_new_sized(alloc
, alloc_size
);
741 if (add_quotes
&& quote_char
!= '\0')
742 wmem_strbuf_append_c(buf
, quote_char
);
744 for (i
= 0; i
< len
; i
++) {
746 if ((escape_func(c
, &r
))) {
747 wmem_strbuf_append_c(buf
, '\\');
748 wmem_strbuf_append_c(buf
, r
);
750 else if (c
== quote_char
&& quote_char
!= '\0') {
751 /* If quoting, we must escape the quote_char somehow. */
753 wmem_strbuf_append_c(buf
, c
);
754 wmem_strbuf_append_c(buf
, c
);
756 wmem_strbuf_append_c(buf
, '\\');
757 wmem_strbuf_append_c(buf
, c
);
760 else if (c
== '\\' && quote_char
!= '\0' && !double_quote
) {
761 /* If quoting, and escaping the quote_char with a backslash,
762 * then backslash must be escaped, even if escape_func doesn't. */
763 wmem_strbuf_append_c(buf
, '\\');
764 wmem_strbuf_append_c(buf
, '\\');
767 /* Other UTF-8 bytes are passed through. */
768 wmem_strbuf_append_c(buf
, c
);
772 if (add_quotes
&& quote_char
!= '\0')
773 wmem_strbuf_append_c(buf
, quote_char
);
775 return wmem_strbuf_finalize(buf
);
779 ws_escape_string_len(wmem_allocator_t
*alloc
, const char *string
, ssize_t len
, bool add_quotes
)
781 return escape_string_len(alloc
, string
, len
, escape_char
, add_quotes
, '"', false);
785 ws_escape_string(wmem_allocator_t
*alloc
, const char *string
, bool add_quotes
)
787 return escape_string_len(alloc
, string
, -1, escape_char
, add_quotes
, '"', false);
790 char *ws_escape_null(wmem_allocator_t
*alloc
, const char *string
, size_t len
, bool add_quotes
)
792 /* XXX: The existing behavior (maintained) here is not to escape
793 * backslashes even though NUL is escaped.
795 return escape_string_len(alloc
, string
, len
, escape_null
, add_quotes
, add_quotes
? '"' : '\0', false);
798 char *ws_escape_csv(wmem_allocator_t
*alloc
, const char *string
, bool add_quotes
, char quote_char
, bool double_quote
, bool escape_whitespace
)
800 if (escape_whitespace
)
801 return escape_string_len(alloc
, string
, -1, escape_char
, add_quotes
, quote_char
, double_quote
);
803 return escape_string_len(alloc
, string
, -1, escape_null
, add_quotes
, quote_char
, double_quote
);
807 ws_strerrorname_r(int errnum
, char *buf
, size_t buf_size
)
809 #ifdef HAVE_STRERRORNAME_NP
810 const char *errstr
= strerrorname_np(errnum
);
811 if (errstr
!= NULL
) {
812 (void)g_strlcpy(buf
, errstr
, buf_size
);
816 snprintf(buf
, buf_size
, "Errno(%d)", errnum
);
821 ws_strdup_underline(wmem_allocator_t
*allocator
, long offset
, size_t len
)
826 wmem_strbuf_t
*buf
= wmem_strbuf_new_sized(allocator
, offset
+ len
);
828 for (int i
= 0; i
< offset
; i
++) {
829 wmem_strbuf_append_c(buf
, ' ');
831 wmem_strbuf_append_c(buf
, '^');
833 for (size_t l
= len
; l
> 1; l
--) {
834 wmem_strbuf_append_c(buf
, '~');
837 return wmem_strbuf_finalize(buf
);
840 #define INITIAL_FMTBUF_SIZE 128
843 * Declare, and initialize, the variables used for an output buffer.
845 #define FMTBUF_VARS \
846 char *fmtbuf = (char*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE); \
847 unsigned fmtbuf_len = INITIAL_FMTBUF_SIZE; \
851 * Expand the buffer to be large enough to add nbytes bytes, plus a
854 #define FMTBUF_EXPAND(nbytes) \
856 * Is there enough room for those bytes and also enough room for \
857 * a terminating '\0'? \
859 if (column+(nbytes+1) >= fmtbuf_len) { \
861 * Double the buffer's size if it's not big enough. \
862 * The size of the buffer starts at 128, so doubling its size \
863 * adds at least another 128 bytes, which is more than enough \
864 * for one more character plus a terminating '\0'. \
867 fmtbuf = (char *)wmem_realloc(allocator, fmtbuf, fmtbuf_len); \
871 * Put a byte into the buffer; space must have been ensured for it.
873 #define FMTBUF_PUTCHAR(b) \
874 fmtbuf[column] = (b); \
878 * Add the one-byte argument, as an octal escape sequence, to the end
881 #define FMTBUF_PUTBYTE_OCTAL(b) \
882 FMTBUF_PUTCHAR((((b)>>6)&03) + '0'); \
883 FMTBUF_PUTCHAR((((b)>>3)&07) + '0'); \
884 FMTBUF_PUTCHAR((((b)>>0)&07) + '0')
887 * Add the one-byte argument, as a hex escape sequence, to the end
890 #define FMTBUF_PUTBYTE_HEX(b) \
891 FMTBUF_PUTCHAR('\\'); \
892 FMTBUF_PUTCHAR('x'); \
893 FMTBUF_PUTCHAR(hex[((b) >> 4) & 0xF]); \
894 FMTBUF_PUTCHAR(hex[((b) >> 0) & 0xF])
897 * Put the trailing '\0' at the end of the buffer.
899 #define FMTBUF_ENDSTR \
900 fmtbuf[column] = '\0'
903 format_text_internal(wmem_allocator_t
*allocator
,
904 const unsigned char *string
, size_t len
,
908 const unsigned char *stringend
= string
+ len
;
911 while (string
< stringend
) {
913 * Get the first byte of this character.
916 if (g_ascii_isprint(c
)) {
918 * Printable ASCII, so not part of a multi-byte UTF-8 sequence.
919 * Make sure there's enough room for one more byte, and add
924 } else if (replace_space
&& g_ascii_isspace(c
)) {
926 * ASCII, so not part of a multi-byte UTF-8 sequence, but
927 * not printable, but is a space character; show it as a
930 * Make sure there's enough room for one more byte, and add
935 } else if (c
< 128) {
937 * ASCII, so not part of a multi-byte UTF-8 sequence, but not
940 * That requires a minimum of 2 bytes, one for the backslash
941 * and one for a letter, so make sure we have enough room
942 * for that, plus a trailing '\0'.
945 FMTBUF_PUTCHAR('\\');
953 FMTBUF_PUTCHAR('b'); /* BS */
957 FMTBUF_PUTCHAR('f'); /* FF */
961 FMTBUF_PUTCHAR('n'); /* NL */
965 FMTBUF_PUTCHAR('r'); /* CR */
969 FMTBUF_PUTCHAR('t'); /* tab */
978 * We've already put the backslash, but this
979 * will put 3 more characters for the octal
980 * number; make sure we have enough room for
981 * that, plus the trailing '\0'.
984 FMTBUF_PUTBYTE_OCTAL(c
);
989 * We've fetched the first byte of a multi-byte UTF-8
997 if ((c
& 0xe0) == 0xc0) {
998 /* Starts a 2-byte UTF-8 sequence; 1 byte left */
1001 } else if ((c
& 0xf0) == 0xe0) {
1002 /* Starts a 3-byte UTF-8 sequence; 2 bytes left */
1005 } else if ((c
& 0xf8) == 0xf0) {
1006 /* Starts a 4-byte UTF-8 sequence; 3 bytes left */
1009 } else if ((c
& 0xfc) == 0xf8) {
1010 /* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
1013 } else if ((c
& 0xfe) == 0xfc) {
1014 /* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
1018 /* 0xfe or 0xff or a continuation byte - not valid */
1022 /* Try to construct the Unicode character */
1024 for (int i
= 0; i
< utf8_len
; i
++) {
1025 if (string
>= stringend
) {
1027 * Ran out of octets, so the character is
1028 * incomplete. Put in a REPLACEMENT CHARACTER
1029 * instead, and then continue the loop, which
1032 uc
= UNICODE_REPLACEMENT_CHARACTER
;
1036 if ((c
& 0xc0) != 0x80) {
1038 * Not valid UTF-8 continuation character; put in
1039 * a replacement character, and then re-process
1040 * this octet as the beginning of a new character.
1042 uc
= UNICODE_REPLACEMENT_CHARACTER
;
1046 uc
= (uc
<< 6) | (c
& 0x3f);
1050 * If this isn't a valid Unicode character, put in
1051 * a REPLACEMENT CHARACTER.
1053 if (!g_unichar_validate(uc
))
1054 uc
= UNICODE_REPLACEMENT_CHARACTER
;
1056 /* 0xfe or 0xff; put in a REPLACEMENT CHARACTER */
1057 uc
= UNICODE_REPLACEMENT_CHARACTER
;
1061 * OK, is it a printable Unicode character?
1063 if (g_unichar_isprint(uc
)) {
1065 * Yes - put it into the string as UTF-8.
1066 * This means that if it was an overlong
1067 * encoding, this will put out the right
1073 } else if (uc
< 0x800) {
1076 } else if (uc
< 0x10000) {
1079 } else if (uc
< 0x200000) {
1082 } else if (uc
< 0x4000000) {
1084 * This should never happen, as Unicode doesn't
1091 * This should never happen, as Unicode doesn't
1097 FMTBUF_EXPAND(utf8_len
);
1098 for (int i
= utf8_len
- 1; i
> 0; i
--) {
1099 fmtbuf
[column
+ i
] = (uc
& 0x3f) | 0x80;
1102 fmtbuf
[column
] = uc
| first
;
1104 } else if (replace_space
&& g_unichar_isspace(uc
)) {
1106 * Not printable, but is a space character; show it
1109 * Make sure there's enough room for one more byte,
1110 * and add the blank.
1113 FMTBUF_PUTCHAR(' ');
1114 } else if (c
< 128) {
1116 * ASCII, but not printable.
1117 * Yes, this could happen with an overlong encoding.
1119 * That requires a minimum of 2 bytes, one for the
1120 * backslash and one for a letter, so make sure we
1121 * have enough room for that, plus a trailing '\0'.
1124 FMTBUF_PUTCHAR('\\');
1128 FMTBUF_PUTCHAR('a');
1132 FMTBUF_PUTCHAR('b'); /* BS */
1136 FMTBUF_PUTCHAR('f'); /* FF */
1140 FMTBUF_PUTCHAR('n'); /* NL */
1144 FMTBUF_PUTCHAR('r'); /* CR */
1148 FMTBUF_PUTCHAR('t'); /* tab */
1152 FMTBUF_PUTCHAR('v');
1157 * We've already put the backslash, but this
1158 * will put 3 more characters for the octal
1159 * number; make sure we have enough room for
1160 * that, plus the trailing '\0'.
1163 FMTBUF_PUTBYTE_OCTAL(c
);
1168 * Unicode, but not printable, and not ASCII;
1169 * put it out as \uxxxx or \Uxxxxxxxx.
1173 FMTBUF_PUTCHAR('\\');
1174 FMTBUF_PUTCHAR('u');
1175 FMTBUF_PUTCHAR(hex
[(uc
>> 12) & 0xF]);
1176 FMTBUF_PUTCHAR(hex
[(uc
>> 8) & 0xF]);
1177 FMTBUF_PUTCHAR(hex
[(uc
>> 4) & 0xF]);
1178 FMTBUF_PUTCHAR(hex
[(uc
>> 0) & 0xF]);
1181 FMTBUF_PUTCHAR('\\');
1182 FMTBUF_PUTCHAR('U');
1183 FMTBUF_PUTCHAR(hex
[(uc
>> 28) & 0xF]);
1184 FMTBUF_PUTCHAR(hex
[(uc
>> 24) & 0xF]);
1185 FMTBUF_PUTCHAR(hex
[(uc
>> 20) & 0xF]);
1186 FMTBUF_PUTCHAR(hex
[(uc
>> 16) & 0xF]);
1187 FMTBUF_PUTCHAR(hex
[(uc
>> 12) & 0xF]);
1188 FMTBUF_PUTCHAR(hex
[(uc
>> 8) & 0xF]);
1189 FMTBUF_PUTCHAR(hex
[(uc
>> 4) & 0xF]);
1190 FMTBUF_PUTCHAR(hex
[(uc
>> 0) & 0xF]);
1202 * Given a wmem scope, a not-necessarily-null-terminated string,
1203 * expected to be in UTF-8 but possibly containing invalid sequences
1204 * (as it may have come from packet data), and the length of the string,
1205 * generate a valid UTF-8 string from it, allocated in the specified
1208 * shows printable Unicode characters as themselves;
1210 * shows non-printable ASCII characters as C-style escapes (octal
1211 * if not one of the standard ones such as LF -> '\n');
1213 * shows non-printable Unicode-but-not-ASCII characters as
1214 * their universal character names;
1216 * shows illegal UTF-8 sequences as a sequence of bytes represented
1217 * as C-style hex escapes (XXX: Does not actually do this. Some illegal
1218 * sequences, such as overlong encodings, the sequences reserved for
1219 * UTF-16 surrogate halves (paired or unpaired), and values outside
1220 * Unicode (i.e., the old sequences for code points above U+10FFFF)
1221 * will be decoded in a permissive way. Other illegal sequences,
1222 * such as 0xFE and 0xFF and the presence of a continuation byte where
1223 * not expected (or vice versa its absence), are replaced with
1224 * REPLACEMENT CHARACTER.)
1226 * and return a pointer to it.
1229 format_text(wmem_allocator_t
*allocator
,
1230 const char *string
, size_t len
)
1232 return format_text_internal(allocator
, string
, len
, false);
1235 /** Given a wmem scope and a null-terminated string, expected to be in
1236 * UTF-8 but possibly containing invalid sequences (as it may have come
1237 * from packet data), measure its length with strlen() and generate a valid
1238 * UTF-8 string from it, allocated in the specified wmem scope, that:
1240 * shows printable Unicode characters as themselves;
1242 * shows non-printable ASCII characters as C-style escapes (octal
1243 * if not one of the standard ones such as LF -> '\n');
1245 * shows non-printable Unicode-but-not-ASCII characters as
1246 * their universal character names;
1248 * shows illegal UTF-8 sequences as a sequence of bytes represented
1249 * as C-style hex escapes;
1251 * and return a pointer to it.
1254 format_text_string(wmem_allocator_t
* allocator
, const char *string
)
1256 return format_text_internal(allocator
, string
, strlen(string
), false);
1260 * Given a string, generate a string from it that shows non-printable
1261 * characters as C-style escapes except a whitespace character
1262 * (space, tab, carriage return, new line, vertical tab, or formfeed)
1263 * which will be replaced by a space, and return a pointer to it.
1266 format_text_wsp(wmem_allocator_t
* allocator
, const char *string
, size_t len
)
1268 return format_text_internal(allocator
, string
, len
, true);
1272 * Given a string, generate a string from it that shows non-printable
1273 * characters as the chr parameter passed, except a whitespace character
1274 * (space, tab, carriage return, new line, vertical tab, or formfeed)
1275 * which will be replaced by a space, and return a pointer to it.
1277 * This does *not* treat the input string as UTF-8.
1279 * This is useful for displaying binary data that frequently but not always
1280 * contains text; otherwise the number of C escape codes makes it unreadable.
1283 format_text_chr(wmem_allocator_t
*allocator
, const char *string
, size_t len
, char chr
)
1287 buf
= wmem_strbuf_new_sized(allocator
, len
+ 1);
1288 for (const char *p
= string
; p
< string
+ len
; p
++) {
1289 if (g_ascii_isprint(*p
)) {
1290 wmem_strbuf_append_c(buf
, *p
);
1292 else if (g_ascii_isspace(*p
)) {
1293 wmem_strbuf_append_c(buf
, ' ');
1296 wmem_strbuf_append_c(buf
, chr
);
1299 return wmem_strbuf_finalize(buf
);
1303 format_char(wmem_allocator_t
*allocator
, char c
)
1308 if (g_ascii_isprint(c
)) {
1309 buf
= wmem_alloc_array(allocator
, char, 2);
1314 if (escape_char(c
, &r
)) {
1315 buf
= wmem_alloc_array(allocator
, char, 3);
1321 buf
= wmem_alloc_array(allocator
, char, 5);
1324 buf
[2] = hex
[((uint8_t)c
>> 4) & 0xF];
1325 buf
[3] = hex
[((uint8_t)c
>> 0) & 0xF];
1331 ws_utf8_truncate(char *string
, size_t len
)
1335 /* Ensure that it is null terminated */
1337 last_char
= g_utf8_find_prev_char(string
, string
+ len
);
1338 if (last_char
!= NULL
&& g_utf8_get_char_validated(last_char
, -1) == (gunichar
)-2) {
1339 /* The last UTF-8 character was truncated into a partial sequence. */
1345 /* ASCII/EBCDIC conversion tables from
1346 * https://web.archive.org/web/20060813174742/http://www.room42.com/store/computer_center/code_tables.shtml
1349 static const uint8_t ASCII_translate_EBCDIC
[ 256 ] = {
1350 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
1351 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
1352 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
1353 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
1354 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D,
1355 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
1356 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
1357 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
1358 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8,
1359 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
1360 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
1361 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D,
1362 0x7D, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88,
1363 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
1364 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
1365 0xA8, 0xA9, 0xC0, 0x6A, 0xD0, 0xA1, 0x4B,
1366 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1367 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1368 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1369 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1370 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1371 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1372 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1373 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1374 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1375 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1376 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1377 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1378 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1379 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1380 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
1381 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B
1385 ASCII_to_EBCDIC(uint8_t *buf
, unsigned bytes
)
1392 for (i
= 0; i
< bytes
; i
++, bufptr
++) {
1393 *bufptr
= ASCII_translate_EBCDIC
[*bufptr
];
1398 ASCII_to_EBCDIC1(uint8_t c
)
1400 return ASCII_translate_EBCDIC
[c
];
/*
 * Lookup table used by EBCDIC_to_ASCII() and EBCDIC_to_ASCII1().
 *
 * The table is indexed by the input (EBCDIC) byte value; the entry is the
 * byte that replaces it.  Rows hold 8 entries each, so the index range
 * covered by a row is noted at the end of the row.
 *
 * Many entries are 0x2E, which is ASCII '.'; presumably that is the
 * placeholder for EBCDIC codes that have no ASCII equivalent
 * (NOTE(review): confirm against the intended EBCDIC code page).
 */
1404 static const uint8_t EBCDIC_translate_ASCII
[ 256 ] = {
1405 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
1406 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, /* 0x08-0x0F */
1407 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
1408 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, /* 0x18-0x1F */
1409 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
1410 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, /* 0x28-0x2F */
1411 0x2E, 0x2E, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
1412 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x2E, 0x3F, /* 0x38-0x3F */
1413 0x20, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0x40-0x47 */
1414 0x2E, 0x2E, 0x2E, 0x2E, 0x3C, 0x28, 0x2B, 0x7C, /* 0x48-0x4F */
1415 0x26, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0x50-0x57 */
1416 0x2E, 0x2E, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E, /* 0x58-0x5F */
1417 0x2D, 0x2F, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0x60-0x67 */
1418 0x2E, 0x2E, 0x7C, 0x2C, 0x25, 0x5F, 0x3E, 0x3F, /* 0x68-0x6F */
1419 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0x70-0x77 */
1420 0x2E, 0x2E, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22, /* 0x78-0x7F */
1421 0x2E, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x80-0x87 */
1422 0x68, 0x69, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0x88-0x8F */
1423 0x2E, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, /* 0x90-0x97 */
1424 0x71, 0x72, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0x98-0x9F */
1425 0x2E, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, /* 0xA0-0xA7 */
1426 0x79, 0x7A, 0x2E, 0x2E, 0x2E, 0x5B, 0x2E, 0x2E, /* 0xA8-0xAF */
1427 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0xB0-0xB7 */
1428 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x5D, 0x2E, 0x2E, /* 0xB8-0xBF */
1429 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0xC0-0xC7 */
1430 0x48, 0x49, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0xC8-0xCF */
1431 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, /* 0xD0-0xD7 */
1432 0x51, 0x52, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0xD8-0xDF */
1433 0x5C, 0x2E, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, /* 0xE0-0xE7 */
1434 0x59, 0x5A, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, /* 0xE8-0xEF */
1435 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0xF0-0xF7 */
1436 0x38, 0x39, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E /* 0xF8-0xFF */
1440 EBCDIC_to_ASCII(uint8_t *buf
, unsigned bytes
)
1447 for (i
= 0; i
< bytes
; i
++, bufptr
++) {
1448 *bufptr
= EBCDIC_translate_ASCII
[*bufptr
];
1453 EBCDIC_to_ASCII1(uint8_t c
)
1455 return EBCDIC_translate_ASCII
[c
];
1459 * This routine is based on a routine created by Dan Lasley
1460 * <DLASLEY@PROMUS.com>.
1462 * It was modified for Wireshark by Gilbert Ramirez and others.
/* Maximum length of the hex offset printed at the start of a line. */
#define MAX_OFFSET_LEN  8

/* Maximum number of byte values printed on a line. */
#define BYTES_PER_LINE  16

/*
 * Maximum number of characters the hex dump takes:
 * 2 digits plus a trailing blank for every byte.
 */
#define HEX_DUMP_LEN    (BYTES_PER_LINE*3)

/*
 * Number of characters the dumped bytes take on a line:
 *   3 characters per byte of hex dump,
 *   2 blanks separating hex from ASCII,
 *   2 optional ASCII dump delimiters,
 *   1 character per byte of ASCII dump.
 */
#define DATA_DUMP_LEN   (HEX_DUMP_LEN + 2 + 2 + BYTES_PER_LINE)

/*
 * Number of characters per line:
 * the offset, 2 blanks separating the offset from the data dump,
 * and the data dump itself.
 */
#define MAX_LINE_LEN    (MAX_OFFSET_LEN + 2 + DATA_DUMP_LEN)
/*
 * Format a buffer as a hex dump and hand it out one text line at a time.
 *
 * print_line   - callback invoked as print_line(fp, line) for every
 *                finished line; its result is checked after each call
 * fp           - opaque pointer passed through to print_line unchanged
 * cp           - the bytes to dump
 * length       - number of bytes to dump
 * encoding     - when HEXDUMP_ENC_EBCDIC, each byte is run through
 *                EBCDIC_to_ASCII1() before it is placed in the text column
 * ascii_option - HEXDUMP_ASCII_EXCLUDE leaves the text column out;
 *                HEXDUMP_ASCII_DELIMIT gets extra handling at the start and
 *                at the end of the text column (the 2 optional delimiters
 *                accounted for in DATA_DUMP_LEN)
 *
 * Line layout: a hex offset of use_digits digits (4 to 8, picked from
 * length - 1), then the data area, which is pre-filled with DATA_DUMP_LEN
 * blanks.  Each byte is written into the data area as two lowercase hex
 * digits taken from binhex[].  The text column starts HEX_DUMP_LEN + 2
 * characters after the start of the data area; bytes outside the range
 * ' ' .. 0x7e are shown there as '.'.
 *
 * A new line is started whenever (i & 15) == 0, and the current line is
 * passed to print_line when that condition holds again or when i == length.
 *
 * NOTE(review): the statements that follow a failed print_line() call, and
 * the value returned by this function, are not visible in this excerpt;
 * presumably a failure stops the dump and is reported to the caller --
 * confirm.
 */
1482 hex_dump_buffer(bool (*print_line
)(void *, const char *), void *fp
,
1483 const unsigned char *cp
, unsigned length
,
1484 hex_dump_enc encoding
,
1485 unsigned ascii_option
)
1487 register unsigned int ad
, i
, j
, k
, l
;
1489 char line
[MAX_LINE_LEN
+ 1];
1490 unsigned int use_digits
;
1492 static char binhex
[16] = {
1493 '0', '1', '2', '3', '4', '5', '6', '7',
1494 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
1497 * How many of the leading digits of the offset will we supply?
1498 * We always supply at least 4 digits, but if the maximum offset
1499 * won't fit in 4 digits, we use as many digits as will be needed.
1501 if (((length
- 1) & 0xF0000000) != 0)
1502 use_digits
= 8; /* need all 8 digits */
1503 else if (((length
- 1) & 0x0F000000) != 0)
1504 use_digits
= 7; /* need 7 digits */
1505 else if (((length
- 1) & 0x00F00000) != 0)
1506 use_digits
= 6; /* need 6 digits */
1507 else if (((length
- 1) & 0x000F0000) != 0)
1508 use_digits
= 5; /* need 5 digits */
1510 use_digits
= 4; /* we'll supply 4 digits */
1516 while (i
< length
) {
1517 if ((i
& 15) == 0) {
1519 * Start of a new line.
1525 c
= (ad
>> (l
*4)) & 0xF;
1526 line
[j
++] = binhex
[c
];
1530 memset(line
+j
, ' ', DATA_DUMP_LEN
);
1533 * Offset in line of ASCII dump.
1535 k
= j
+ HEX_DUMP_LEN
+ 2;
1536 if (ascii_option
== HEXDUMP_ASCII_DELIMIT
)
1540 line
[j
++] = binhex
[c
>>4];
1541 line
[j
++] = binhex
[c
&0xf];
1543 if (ascii_option
!= HEXDUMP_ASCII_EXCLUDE
) {
1544 if (encoding
== HEXDUMP_ENC_EBCDIC
) {
1545 c
= EBCDIC_to_ASCII1(c
);
1547 line
[k
++] = ((c
>= ' ') && (c
< 0x7f)) ? c
: '.';
1550 if (((i
& 15) == 0) || (i
== length
)) {
1552 * We'll be starting a new line, or
1553 * we're finished printing this buffer;
1554 * dump out the line we've constructed,
1555 * and advance the offset.
1557 if (ascii_option
== HEXDUMP_ASCII_DELIMIT
)
1560 if (!print_line(fp
, line
))
1569 * Editor modelines - https://www.wireshark.org/tools/modelines.html
1574 * indent-tabs-mode: nil
1577 * vi: set shiftwidth=4 tabstop=8 expandtab:
1578 * :indentSize=4:tabSize=8:noTabs=true: