2 * String utility definitions
4 * Wireshark - Network traffic analyzer
5 * By Gerald Combs <gerald@wireshark.org>
6 * Copyright 1998 Gerald Combs
8 * SPDX-License-Identifier: GPL-2.0-or-later
11 #ifndef __STR_UTIL_H__
12 #define __STR_UTIL_H__
14 #include <wireshark.h>
15 #include <wsutil/wmem/wmem.h>
19 #endif /* __cplusplus */
23 wmem_strconcat(wmem_allocator_t
*allocator
, const char *first
, ...)
24 G_GNUC_MALLOC G_GNUC_NULL_TERMINATED
;
28 wmem_strjoin(wmem_allocator_t
*allocator
,
29 const char *separator
, const char *first
, ...)
30 G_GNUC_MALLOC G_GNUC_NULL_TERMINATED
;
33 * As g_strjoinv, with the returned string wmem allocated.
34 * Joins a number of strings together to form one long string,
35 * with the optional separator inserted between each of them.
37 * @param allocator The wmem scope to use to allocate the returned string
38 * @param separator A string to insert between each of the strings, or NULL.
39 * @param str_array A NULL-terminated array of strings to join
41 * @note If str_array has no items, the return value is an empty string.
42 * str_array should not be NULL (NULL is returned with a warning.)
43 * NULL as a separator is equivalent to the empty string.
47 wmem_strjoinv(wmem_allocator_t
*allocator
,
48 const char *separator
, char **str_array
)
52 * Splits a string into a maximum of max_tokens pieces, using the given
53 * delimiter. If max_tokens is reached, the remainder of string is appended
54 * to the last token. Successive delimiters are not folded and will instead result
55 * in an empty string as an element.
57 * If src or delimiter are NULL, or if delimiter is empty, this will return
60 * Do not use with a NULL allocator, use g_strsplit instead.
64 wmem_strsplit(wmem_allocator_t
*allocator
, const char *src
,
65 const char *delimiter
, int max_tokens
);
69 * Based on g_ascii_strdown
70 * @param allocator The wmem scope to use to allocate the returned string.
71 * @param str a string.
72 * @param len length of str in bytes, or -1 if str is nul-terminated.
74 * Converts all upper case ASCII letters to lower case ASCII letters.
76 * Return value: a newly-allocated string, with all the upper case
77 * characters in str converted to lower case, with
78 * semantics that exactly match g_ascii_tolower(). (Note
79 * that this is unlike the old g_strdown(), which modified
80 * the string in place.)
84 wmem_ascii_strdown(wmem_allocator_t
*allocator
, const char *str
, ssize_t len
);
86 /** Convert all upper-case ASCII letters to their ASCII lower-case
87 * equivalents, in place, with a simple non-locale-dependent
88 * ASCII mapping (A-Z -> a-z).
89 * All other characters are left unchanged, as the mapping to
90 * lower case may be locale-dependent.
92 * The string is assumed to be in a character encoding, such as
93 * an ISO 8859 or other EUC encoding, or UTF-8, in which all
94 * bytes in the range 0x00 through 0x7F are ASCII characters and
95 * non-ASCII characters are constructed from one or more bytes in
96 * the range 0x80 through 0xFF.
98 * @param str The string to be lower-cased.
99 * @return ptr to the string
/* Lower-cases the ASCII letters (A-Z -> a-z) of str in place and returns a
 * pointer to the string. str is non-const because the caller's buffer itself
 * is rewritten; bytes outside A-Z are left unchanged. */
102 char *ascii_strdown_inplace(char *str
);
104 /** Convert all lower-case ASCII letters to their ASCII upper-case
105 * equivalents, in place, with a simple non-locale-dependent
106 * ASCII mapping (a-z -> A-Z).
107 * All other characters are left unchanged, as the mapping to
108 * upper case may be locale-dependent.
110 * The string is assumed to be in a character encoding, such as
111 * an ISO 8859 or other EUC encoding, or UTF-8, in which all
112 * bytes in the range 0x00 through 0x7F are ASCII characters and
113 * non-ASCII characters are constructed from one or more bytes in
114 * the range 0x80 through 0xFF.
116 * @param str The string to be upper-cased.
117 * @return ptr to the string
/* Upper-cases the ASCII letters (a-z -> A-Z) of str in place and returns a
 * pointer to the string. str is non-const because the caller's buffer itself
 * is rewritten; bytes outside a-z are left unchanged. */
120 char *ascii_strup_inplace(char *str
);
122 /** Check if an entire string consists of printable characters
124 * @param str The string to be checked
125 * @return true if the entire string is printable, otherwise false
/* Returns true only when the entire string str is printable, false otherwise.
 * No length is passed, so str is presumably NUL-terminated -- confirm against
 * the implementation. */
128 bool isprint_string(const char *str
);
130 /** Given a not-necessarily-null-terminated string, expected to be in
131 * UTF-8 but possibly containing invalid sequences (as it may have come
132 * from packet data), and the length of the string, determine if the
133 * string is valid UTF-8 consisting entirely of printable characters.
135 * This means that it:
137 * does not contain an illegal UTF-8 sequence (including overlong encodings,
138 * the sequences reserved for UTF-16 surrogate halves, and the values for
139 * code points above U+10FFFF that are no longer in Unicode)
141 * does not contain a non-printable Unicode character such as control
142 * characters (including internal NULL bytes)
144 * does not end in a partial sequence that could begin a valid character;
146 * does not start with a partial sequence that could end a valid character;
148 * and thus guarantees that the result of format_text() would be the same as
149 * that of wmem_strndup() with the same parameters.
151 * @param str The string to be checked
152 * @param length The number of bytes to validate
153 * @return true if the entire string is valid and printable UTF-8,
/* Validates `length` bytes starting at str; str need not be NUL-terminated.
 * Returns true only if those bytes form valid UTF-8 made up entirely of
 * printable characters, with no partial sequence at either end. */
157 bool isprint_utf8_string(const char *str
, const unsigned length
);
159 /** Check if an entire string consists of digits
161 * @param str The string to be checked
162 * @return true if the entire string is digits, otherwise false
/* Returns true if the entire string str consists of digits, false otherwise.
 * Note that the parameter is `const unsigned char *`, whereas isprint_string()
 * takes `const char *`; callers holding a plain char pointer may need a cast.
 * NOTE(review): the result for an empty string is not visible here -- confirm. */
165 bool isdigit_string(const unsigned char *str
);
167 /** Finds the first occurrence of string 'needle' in string 'haystack'.
168 * The matching is done ignoring the case of ASCII characters in a
169 * non-locale-dependent way.
171 * The string is assumed to be in a character encoding, such as
172 * an ISO 8859 or other EUC encoding, or UTF-8, in which all
173 * bytes in the range 0x00 through 0x7F are ASCII characters and
174 * non-ASCII characters are constructed from one or more bytes in
175 * the range 0x80 through 0xFF.
177 * @param haystack The string possibly containing the substring
178 * @param needle The substring to be searched for
179 * @return A pointer into 'haystack' where 'needle' is first found.
180 * Otherwise it returns NULL.
/* Case-insensitive substring search that folds only ASCII letters, in a
 * non-locale-dependent way. Returns a pointer into haystack at the first
 * occurrence of needle, or NULL when needle is not found. Neither argument
 * is modified (both are const). */
183 const char *ws_ascii_strcasestr(const char *haystack
, const char *needle
);
185 /** Like the memchr() function, except it scans backwards from the end.
187 * @param haystack Pointer to the bytes of memory to search
188 * @param ch The character to search for
189 * @param n The length of bytes to search from the end
190 * @return A pointer to the last occurrence of "ch" in "haystack".
191 * If "ch" isn't found or "n" is 0, returns NULL.
/* Backwards-scanning counterpart of memchr(): looks for ch within the n bytes
 * at haystack, starting from the end. Returns a pointer to the last occurrence,
 * or NULL when ch is not found or n is 0. The result is typed const uint8_t *
 * even though haystack is accepted as const void *. */
194 const uint8_t *ws_memrchr(const void *haystack
, int ch
, size_t n
);
/* Declares ws_escape_string(alloc, string, add_quotes), returning char *.
 * NOTE(review): the doc comment for this prototype is not visible here.
 * Presumably it returns an escaped copy of the NUL-terminated `string`,
 * allocated from `alloc`, wrapped in quotes when add_quotes is true --
 * confirm against the implementation. */
197 char *ws_escape_string(wmem_allocator_t
*alloc
, const char *string
, bool add_quotes
);
/* Length-taking variant of ws_escape_string(): same parameters plus `len`.
 * NOTE(review): the doc comment for this prototype is not visible here.
 * len is signed (ssize_t); presumably a negative value means `string` is
 * NUL-terminated, as documented for wmem_ascii_strdown() -- confirm against
 * the implementation. */
200 char *ws_escape_string_len(wmem_allocator_t
*alloc
, const char *string
, ssize_t len
, bool add_quotes
);
202 /* Replace null bytes with "\0". */
/* Escapes null bytes in `string` as "\0". Because the input may contain
 * embedded null bytes, the length is passed explicitly as an unsigned size_t
 * (contrast the ssize_t len of ws_escape_string_len()).
 * NOTE(review): presumably the result is allocated from `alloc` and quoted
 * when add_quotes is true -- confirm against the implementation. */
204 char *ws_escape_null(wmem_allocator_t
*alloc
, const char *string
, size_t len
, bool add_quotes
);
206 /* Escape as in a number of CSV dialects.
208 * @param alloc The wmem scope to use to allocate the returned string
209 * @param string The input string to escape
210 * @param add_quotes Whether to surround the string with quote_char
211 * @param quote_char The quote character, always escaped in some way.
212 * @param double_quote Whether to escape the quote character by doubling it
213 * @param escape_whitespace Whether to escape whitespace with a backslash
214 * @return The escaped string
216 * @note If double_quote is false, then quote_char is escaped with a
217 * backslash ('\'). The quote character can be '\0', in which case it is
218 * ignored. If any character is being escaped with a backslash (i.e.,
219 * quote_char is not '\0' and double_quote is false, or escape_whitespace
220 * is true), then backslash is also escaped. If add_quotes is false, then
221 * quote_char can either be a quote character (if the string will be quoted
222 * later after further manipulation) or the delimiter (to escape it, since
223 * the string is not being quoted.).
/* CSV-style escaping of `string`; returns the escaped string.
 * Parameter order: alloc, string, add_quotes, quote_char, double_quote,
 * escape_whitespace. The three bool parameters are adjacent to a char
 * parameter, so take care not to transpose arguments at call sites.
 * No length is passed, so `string` is presumably NUL-terminated -- confirm. */
226 char *ws_escape_csv(wmem_allocator_t
*alloc
, const char *string
, bool add_quotes
, char quote_char
, bool double_quote
, bool escape_whitespace
);
/* Declares ws_xton(ch): takes one char and returns an int.
 * NOTE(review): no doc comment is visible for this prototype. Judging by the
 * name it presumably converts a hexadecimal digit character to its numeric
 * value; how non-hex input is reported is not shown here -- confirm against
 * the implementation. */
229 int ws_xton(char ch
);
232 FORMAT_SIZE_UNIT_NONE
, /**< No unit will be appended. You must supply your own. */
233 /* XXX - This does not append a trailing space if there is no prefix.
234 * That's good if you intend to list the unit somewhere else, e.g. in a
235 * legend, header, or other column, but doesn't work well if intending
236 * to append your own unit. You can test whether there's a prefix or
237 * not with g_ascii_isdigit() (plus special handling for inf and NaN).
239 FORMAT_SIZE_UNIT_BYTES
, /**< "bytes" for un-prefixed sizes, "B" otherwise. */
240 FORMAT_SIZE_UNIT_BITS
, /**< "bits" for un-prefixed sizes, "b" otherwise. */
241 FORMAT_SIZE_UNIT_BITS_S
, /**< "bits/s" for un-prefixed sizes, "bps" otherwise. */
242 FORMAT_SIZE_UNIT_BYTES_S
, /**< "bytes/s" for un-prefixed sizes, "Bps" otherwise. */
243 FORMAT_SIZE_UNIT_PACKETS
, /**< "packets" */
244 FORMAT_SIZE_UNIT_PACKETS_S
, /**< "packets/s" */
245 FORMAT_SIZE_UNIT_EVENTS
, /**< "events" */
246 FORMAT_SIZE_UNIT_EVENTS_S
, /**< "events/s" */
247 FORMAT_SIZE_UNIT_FIELDS
, /**< "fields" */
248 /* These next two aren't really for format_size (which takes an int) */
249 FORMAT_SIZE_UNIT_SECONDS
, /**< "seconds" for un-prefixed sizes, "s" otherwise. */
250 FORMAT_SIZE_UNIT_ERLANGS
, /**< "erlangs" for un-prefixed sizes, "E" otherwise. */
251 } format_size_units_e
;
/*
 * Prefix-selection bits for the uint16_t `flags` argument of the size/unit
 * formatting functions. They are unsigned constants, like the HEXDUMP_*
 * option macros in this header, so that masking and complementing them is
 * done in unsigned arithmetic rather than on signed int.
 */
#define FORMAT_SIZE_PREFIX_SI   (1U << 0) /**< SI (power of 1000) prefixes will be used. */
#define FORMAT_SIZE_PREFIX_IEC  (1U << 1) /**< IEC (power of 1024) prefixes will be used. */
256 /** Given a floating point value, return it in a human-readable format
258 * Prefixes up to "E/Ei" (exa, exbi) and down to "a" (atto; negative
259 * prefixes are SI-only) are currently supported. Values outside that
260 * range will use scientific notation.
262 * @param size The size value
263 * @param unit The unit of measurement to use (a format_size_units_e value).
264 * @param flags FORMAT_SIZE_PREFIX_* flags selecting SI vs IEC prefixes.
265 * @param precision Maximum number of digits to appear after the
266 * decimal point. Trailing zeros are removed, as is the decimal
267 * point if no digits follow it.
268 * @return A newly-allocated string representing the value.
271 char *format_units(wmem_allocator_t
*allocator
, double size
,
272 format_size_units_e unit
, uint16_t flags
,
275 /** Given a size, return its value in a human-readable format
277 * Prefixes up to "T/Ti" (tera, tebi) are currently supported.
279 * @param size The size value
280 * @param unit The unit of measurement to use (a format_size_units_e value).
281 * @param flags FORMAT_SIZE_PREFIX_* flags selecting SI vs IEC prefixes.
282 * @return A newly-allocated string representing the value.
/* Formats the int64_t `size` as a human-readable string and returns a newly
 * allocated string. `unit` picks the unit text (a format_size_units_e value)
 * and `flags` carries the FORMAT_SIZE_PREFIX_* bits.
 * NOTE(review): the result is presumably allocated from `allocator`; the
 * format_size() macro passes NULL for it -- confirm who frees in that case. */
285 char *format_size_wmem(wmem_allocator_t
*allocator
, int64_t size
,
286 format_size_units_e unit
, uint16_t flags
);
/* Shorthand for format_size_wmem() with NULL passed as the allocator. */
#define format_size(size, unit, flags) format_size_wmem(NULL, size, unit, flags)
/* Declares printable_char_or_period(c): takes one char and returns one char.
 * NOTE(review): no doc comment is visible for this prototype. Judging by the
 * name it presumably returns c unchanged when c is printable and '.'
 * otherwise -- confirm against the implementation. */
292 char printable_char_or_period(char c
);
/* Exported via WS_DLL_PUBLIC. Takes an error number plus a caller-supplied
 * buffer `buf` of `buf_size` bytes and returns a const string.
 * NOTE(review): no doc comment is visible here. The WS_RETNONNULL annotation
 * presumably marks the return value as never NULL, and the function
 * presumably yields the symbolic name of errnum, using buf as scratch space
 * when needed -- confirm against the implementation. */
294 WS_DLL_PUBLIC WS_RETNONNULL
295 const char *ws_strerrorname_r(int errnum
, char *buf
, size_t buf_size
);
/* Declares ws_strdup_underline(allocator, offset, len), returning char *.
 * Note the mixed signedness: offset is a signed long, len an unsigned size_t.
 * NOTE(review): no doc comment is visible here. Presumably it builds a newly
 * allocated string that underlines `len` characters starting at column
 * `offset` -- confirm against the implementation, including how a negative
 * offset is treated. */
298 char *ws_strdup_underline(wmem_allocator_t
*allocator
, long offset
, size_t len
);
300 /** Given a wmem scope, a not-necessarily-null-terminated string,
301 * expected to be in UTF-8 but possibly containing invalid sequences
302 * (as it may have come from packet data), and the length of the string,
303 * generate a valid UTF-8 string from it, allocated in the specified
306 * shows printable Unicode characters as themselves;
308 * shows non-printable ASCII characters as C-style escapes (octal
309 * if not one of the standard ones such as LF -> '\n');
311 * shows non-printable Unicode-but-not-ASCII characters as
312 * their universal character names;
314 * replaces illegal UTF-8 sequences with U+FFFD (replacement character);
316 * and return a pointer to it.
318 * @param allocator The wmem scope
319 * @param string A pointer to the input string
320 * @param len The length of the input string
321 * @return A pointer to the formatted string
323 * @see tvb_format_text()
/* Produces a valid UTF-8 rendering of the `len` bytes at `string`, which need
 * not be NUL-terminated and may contain invalid sequences. The result is
 * allocated in the given wmem scope; the input is not modified (const). */
326 char *format_text(wmem_allocator_t
* allocator
, const char *string
, size_t len
);
328 /** Same as format_text() but accepts a nul-terminated string.
330 * @param allocator The wmem scope
331 * @param string A pointer to the input string
332 * @return A pointer to the formatted string
334 * @see tvb_format_text()
/* Variant of format_text() for a nul-terminated input: no length argument is
 * taken. Returns a pointer to the formatted string. */
337 char *format_text_string(wmem_allocator_t
* allocator
, const char *string
);
340 * Same as format_text() but replaces any whitespace characters
341 * (space, tab, carriage return, new line, vertical tab, or formfeed)
344 * @param allocator The wmem scope
345 * @param line A pointer to the input string
346 * @param len The length of the input string
347 * @return A pointer to the formatted string
/* Variant of format_text() that additionally replaces whitespace characters
 * (space, tab, carriage return, new line, vertical tab, formfeed).
 * NOTE(review): the replacement character is not visible in the comment here;
 * presumably a space, as for format_text_chr() -- confirm.
 * The input parameter is named `line` here rather than `string`. */
351 char *format_text_wsp(wmem_allocator_t
* allocator
, const char *line
, size_t len
);
354 * Given a string, generate a string from it that shows non-printable
355 * characters as the chr parameter passed, except a whitespace character
356 * (space, tab, carriage return, new line, vertical tab, or formfeed)
357 * which will be replaced by a space, and return a pointer to it.
359 * This does *not* treat the input string as UTF-8.
361 * This is useful for displaying binary data that frequently but not always
362 * contains text; otherwise the number of C escape codes makes it unreadable.
364 * @param allocator The wmem scope
365 * @param string A pointer to the input string
366 * @param len The length of the input string
367 * @param chr The character to use to replace non-printable characters
368 * @return A pointer to the formatted string
/* Renders the `len` bytes at `string` with every non-printable character
 * replaced by `chr`, except that whitespace characters become a space.
 * Unlike format_text(), the input is not treated as UTF-8. Returns a pointer
 * to the formatted string. */
372 char *format_text_chr(wmem_allocator_t
*allocator
,
373 const char *string
, size_t len
, char chr
);
375 /** Given a wmem scope and an 8-bit character
376 * generate a valid UTF-8 string from it, allocated in the specified
379 * shows printable Unicode characters as themselves;
381 * shows non-printable ASCII characters as C-style escapes (hex
382 * if not one of the standard ones such as LF -> '\n');
384 * and return a pointer to it.
386 * @param allocator The wmem scope
387 * @param c A character to format
388 * @return A pointer to the formatted string
/* Single-character counterpart of the format_text() family: turns the 8-bit
 * character c into a valid UTF-8 string allocated in the given wmem scope,
 * using C-style escapes (hex when there is no standard escape) for
 * non-printable ASCII. Returns a pointer to that string. */
391 char *format_char(wmem_allocator_t
*allocator
, char c
);
394 * Truncate a UTF-8 string in place so that it is no larger than len bytes,
395 * ensuring that the string is null terminated and ends with a complete
396 * character instead of a partial sequence (e.g., possibly truncating up
397 * to 3 additional bytes if the terminal character is 4 bytes long).
399 * The buffer holding the string must be large enough (at least len + 1
400 * including the null terminator), and the first len bytes of the buffer
401 * must be a valid UTF-8 string, except for possibly ending in a partial
402 * sequence or not being null terminated. This is a convenience function
403 * that for speed does not check either of those conditions.
405 * A common use case is when a valid UTF-8 string has been copied into a
406 * buffer of length len+1 via snprintf, strlcpy, or strlcat and truncated,
407 * to ensure that the final UTF-8 character is not a partial sequence.
409 * @param string A pointer to the input string
410 * @param len The maximum length to truncate to
411 * @return ptr to the string
/* Truncates `string` in place to at most len bytes, null-terminating it and
 * dropping any trailing partial UTF-8 sequence. The caller must guarantee the
 * buffer holds at least len + 1 bytes and that its first len bytes are valid
 * UTF-8 apart from a possible trailing partial sequence; neither condition is
 * checked. Returns a pointer to the string. */
414 char* ws_utf8_truncate(char *string
, size_t len
);
/* Declares EBCDIC_to_ASCII(buf, bytes): buf is a non-const byte buffer and
 * nothing is returned.
 * NOTE(review): no doc comment is visible here. Presumably it converts the
 * first `bytes` bytes of buf from EBCDIC to ASCII in place -- confirm against
 * the implementation. */
417 void EBCDIC_to_ASCII(uint8_t *buf
, unsigned bytes
);
/* Declares EBCDIC_to_ASCII1(c): takes one byte by value and returns one byte.
 * NOTE(review): no doc comment is visible here. Presumably the single-byte
 * form of EBCDIC_to_ASCII(), returning the ASCII equivalent of the EBCDIC
 * byte c -- confirm against the implementation. */
420 uint8_t EBCDIC_to_ASCII1(uint8_t c
);
422 /* Types of character encodings */
424 HEXDUMP_ENC_ASCII
= 0, /* ASCII */
425 HEXDUMP_ENC_EBCDIC
= 1 /* EBCDIC */
429 * Hexdump options for ASCII:
/* The low two bits of a hexdump option value select the ASCII-section mode. */
#define HEXDUMP_ASCII_MASK            (0x0003U)

/* Isolate the ASCII-section mode from a combined option value. */
#define HEXDUMP_ASCII_OPTION(option)  (HEXDUMP_ASCII_MASK & (option))

/* include ASCII section no delimiters (legacy tshark behavior) */
#define HEXDUMP_ASCII_INCLUDE         (0x0000U)
/* include ASCII section with delimiters, useful for reliable detection of last hexdata */
#define HEXDUMP_ASCII_DELIMIT         (0x0001U)
/* exclude ASCII section from hexdump reports, if we really don't want or need it */
#define HEXDUMP_ASCII_EXCLUDE         (0x0002U)
/* Declares hex_dump_buffer(): dumps the `length` bytes at `cp` through a
 * caller-supplied line callback.
 *   print_line   callback taking (void *, const char *) and returning bool
 *   fp           opaque pointer supplied by the caller
 *   cp, length   the bytes to dump and how many of them
 *   encoding     character encoding for the dump (a hex_dump_enc value)
 *   ascii_option ASCII-section handling; see the HEXDUMP_ASCII_* macros
 * NOTE(review): no doc comment is visible here. Presumably fp is forwarded as
 * the first argument of print_line, one call per output line, and the bool
 * result reports whether every line was printed successfully -- confirm
 * against the implementation. */
440 bool hex_dump_buffer(bool (*print_line
)(void *, const char *), void *fp
,
441 const unsigned char *cp
, unsigned length
,
442 hex_dump_enc encoding
,
443 unsigned ascii_option
);
/* Choose between a singular and a plural form: yields s when d equals 1,
 * and p for every other value of d. */
#define plurality(d, s, p)   (((d) != 1) ? (p) : (s))

/* Spell a truth value as the string "TRUE" or "FALSE". */
#define true_or_false(val)   (!(val) ? "FALSE" : "TRUE")

/* Yield val itself when it is non-null, otherwise the placeholder "[NULL]".
 * val is evaluated twice, so avoid arguments with side effects. */
#define string_or_null(val)  (!(val) ? "[NULL]" : (val))
454 #endif /* __cplusplus */
456 #endif /* __STR_UTIL_H__ */