wsutil/str_util.h

   1 /** @file
   2  * String utility definitions
   3  *
   4  * Wireshark - Network traffic analyzer
   5  * By Gerald Combs <gerald@wireshark.org>
   6  * Copyright 1998 Gerald Combs
   7  *
   8  * SPDX-License-Identifier: GPL-2.0-or-later
   9  */
  10
  11 #ifndef __STR_UTIL_H__
  12 #define __STR_UTIL_H__
  13
  14 #include <wireshark.h>
  15 #include <wsutil/wmem/wmem.h>
  16
  17 #ifdef __cplusplus
  18 extern "C" {
  19 #endif /* __cplusplus */
  20
  /**
   * Variadic string concatenation returning a newly allocated string.
   *
   * @param allocator The wmem allocator supplied by the caller
   * @param first The first string of the argument list
   * @param ... Further strings; the list must end with a NULL sentinel
   *            (enforced at compile time by G_GNUC_NULL_TERMINATED)
   *
   * NOTE(review): presumably the wmem counterpart of g_strconcat(), with the
   * result allocated from the given allocator - confirm against str_util.c.
   */
  21 WS_DLL_PUBLIC
  22 char *
  23 wmem_strconcat(wmem_allocator_t *allocator, const char *first, ...)
  24 G_GNUC_MALLOC G_GNUC_NULL_TERMINATED;
  25
  /**
   * Variadic string join returning a newly allocated string.
   *
   * @param allocator The wmem allocator supplied by the caller
   * @param separator The string placed between the joined strings
   * @param first The first string of the argument list
   * @param ... Further strings; the list must end with a NULL sentinel
   *            (enforced at compile time by G_GNUC_NULL_TERMINATED)
   *
   * NOTE(review): presumably the wmem counterpart of g_strjoin(); whether a
   * NULL separator is accepted is not visible here - confirm against
   * str_util.c.
   */
  26 WS_DLL_PUBLIC
  27 char *
  28 wmem_strjoin(wmem_allocator_t *allocator,
  29              const char *separator, const char *first, ...)
  30 G_GNUC_MALLOC G_GNUC_NULL_TERMINATED;
  31
  32 /**
  33  * As g_strjoinv, with the returned string wmem allocated.
  34  * Joins a number of strings together to form one long string,
  35  * with the optional separator inserted between each of them.
  36  *
  37  * @param allocator  The wmem scope to use to allocate the returned string
  38  * @param separator A string to insert between each of the strings, or NULL.
  39  * @param str_array A NULL-terminated array of strings to join
  40  *
  41  * @note If str_array has no items, the return value is an empty string.
  42  * str_array should not be NULL (NULL is returned with a warning.)
  43  * NULL as a separator is equivalent to the empty string.
  44  */
  45 WS_DLL_PUBLIC
  46 char *
  47 wmem_strjoinv(wmem_allocator_t *allocator,
  48               const char *separator, char **str_array)
  49 G_GNUC_MALLOC;
  50
  51 /**
  52  * Splits a string into a maximum of max_tokens pieces, using the given
  53  * delimiter. If max_tokens is reached, the remainder of string is appended
  54  * to the last token. Successive delimiters are not folded and will instead
  55  * result in an empty string as element.
  56  *
  57  * If src or delimiter are NULL, or if delimiter is empty, this will return
  58  * NULL.
  59  *
  60  * Do not use with a NULL allocator, use g_strsplit instead.
  61  */
  62 WS_DLL_PUBLIC
  63 char **
  64 wmem_strsplit(wmem_allocator_t *allocator, const char *src,
  65         const char *delimiter, int max_tokens);
  66
  67 /**
  68  * wmem_ascii_strdown:
  69  * Based on g_ascii_strdown
  70  * @param allocator  The wmem scope to use to allocate the returned string.
  71  * @param str a string.
  72  * @param len length of str in bytes, or -1 if str is nul-terminated.
  73  *
  74  * Converts all upper case ASCII letters to lower case ASCII letters.
  75  *
  76  * Return value: a newly-allocated string, with all the upper case
  77  *               characters in str converted to lower case, with
  78  *               semantics that exactly match g_ascii_tolower(). (Note
  79  *               that this is unlike the old g_strdown(), which modified
  80  *               the string in place.)
  81  **/
  82 WS_DLL_PUBLIC
  83 char*
  84 wmem_ascii_strdown(wmem_allocator_t *allocator, const char *str, ssize_t len);
  85
  86 /** Convert all upper-case ASCII letters to their ASCII lower-case
  87  *  equivalents, in place, with a simple non-locale-dependent
  88  *  ASCII mapping (A-Z -> a-z).
  89  *  All other characters are left unchanged, as the mapping to
  90  *  lower case may be locale-dependent.
  91  *
  92  *  The string is assumed to be in a character encoding, such as
  93  *  an ISO 8859 or other EUC encoding, or UTF-8, in which all
  94  *  bytes in the range 0x00 through 0x7F are ASCII characters and
  95  *  non-ASCII characters are constructed from one or more bytes in
  96  *  the range 0x80 through 0xFF.
  97  *
  98  * @param str The string to be lower-cased.
  99  * @return    ptr to the string
 100  */
 101 WS_DLL_PUBLIC
 102 char *ascii_strdown_inplace(char *str);
 103
 104 /** Convert all lower-case ASCII letters to their ASCII upper-case
 105  *  equivalents, in place, with a simple non-locale-dependent
 106  *  ASCII mapping (a-z -> A-Z).
 107  *  All other characters are left unchanged, as the mapping to
 108  *  upper case may be locale-dependent.
 109  *
 110  *  The string is assumed to be in a character encoding, such as
 111  *  an ISO 8859 or other EUC encoding, or UTF-8, in which all
 112  *  bytes in the range 0x00 through 0x7F are ASCII characters and
 113  *  non-ASCII characters are constructed from one or more bytes in
 114  *  the range 0x80 through 0xFF.
 115  *
 116  * @param str The string to be upper-cased.
 117  * @return    ptr to the string
 118  */
 119 WS_DLL_PUBLIC
 120 char *ascii_strup_inplace(char *str);
 121
 122 /** Check if an entire string consists of printable characters
 123  *
 124  * @param str    The string to be checked
 125  * @return       true if the entire string is printable, otherwise false
 126  */
 127 WS_DLL_PUBLIC
 128 bool isprint_string(const char *str);
 129
 130 /** Given a not-necessarily-null-terminated string, expected to be in
 131  *  UTF-8 but possibly containing invalid sequences (as it may have come
 132  *  from packet data), and the length of the string, determine if the
 133  *  string is valid UTF-8 consisting entirely of printable characters.
 134  *
 135  *  This means that it:
 136  *
 137  *   does not contain an illegal UTF-8 sequence (including overlong encodings,
 138  *   the sequences reserved for UTF-16 surrogate halves, and the values for
 139  *   code points above U+10FFFF that are no longer in Unicode);
 140  *
 141  *   does not contain a non-printable Unicode character such as control
 142  *   characters (including internal NULL bytes);
 143  *
 144  *   does not end in a partial sequence that could begin a valid character;
 145  *
 146  *   does not start with a partial sequence that could end a valid character;
 147  *
 148  * and thus guarantees that the result of format_text() would be the same as
 149  * that of wmem_strndup() with the same parameters.
 150  *
 151  * @param str    The string to be checked
 152  * @param length The number of bytes to validate
 153  * @return       true if the entire string is valid and printable UTF-8,
 154  *               otherwise false
 155  */
 156 WS_DLL_PUBLIC
 157 bool isprint_utf8_string(const char *str, const unsigned length);
 158
 159 /** Check if an entire string consists of digits
 160  *
 161  * @param str    The string to be checked
 162  * @return       true if the entire string is digits, otherwise false
 163  */
 164 WS_DLL_PUBLIC
 165 bool isdigit_string(const unsigned char *str);
 166
 167 /** Finds the first occurrence of string 'needle' in string 'haystack'.
 168  *  The matching is done ignoring the case of ASCII characters in a
 169  *  non-locale-dependent way.
 170  *
 171  *  The string is assumed to be in a character encoding, such as
 172  *  an ISO 8859 or other EUC encoding, or UTF-8, in which all
 173  *  bytes in the range 0x00 through 0x7F are ASCII characters and
 174  *  non-ASCII characters are constructed from one or more bytes in
 175  *  the range 0x80 through 0xFF.
 176  *
 177  * @param haystack The string possibly containing the substring
 178  * @param needle The substring to be searched
 179  * @return A pointer into 'haystack' where 'needle' is first found.
 180  *   Otherwise it returns NULL.
 181  */
 182 WS_DLL_PUBLIC
 183 const char *ws_ascii_strcasestr(const char *haystack, const char *needle);
 184
 185 /** Like the memchr() function, except it scans backwards from the end.
 186  *
 187  * @param haystack Pointer to the bytes of memory to search
 188  * @param ch The character to search
 189  * @param n The length of bytes to search from the end
 190  * @return A pointer to the last occurrence of "ch" in "haystack".
 191  * If "ch" isn't found or "n" is 0, returns NULL.
 192  */
 193 WS_DLL_PUBLIC
 194 const uint8_t *ws_memrchr(const void *haystack, int ch, size_t n);
 195
 /**
  * Produce an escaped version of a string.
  *
  * @param alloc The wmem allocator supplied by the caller
  * @param string The input string. NOTE(review): presumably nul-terminated,
  *               since no length is passed - confirm.
  * @param add_quotes NOTE(review): presumably wraps the result in quote
  *                   characters when true - confirm.
  * @return The escaped string. NOTE(review): the exact escaping rules are
  *         not visible in this header; see str_util.c.
  */
 196 WS_DLL_PUBLIC
 197 char *ws_escape_string(wmem_allocator_t *alloc, const char *string, bool add_quotes);
 198
 /**
  * Produce an escaped version of a string, with the input length given
  * explicitly.
  *
  * @param alloc The wmem allocator supplied by the caller
  * @param string The input string
  * @param len Length of the input. NOTE(review): the type is signed, so a
  *            negative value presumably means "nul-terminated" - confirm.
  * @param add_quotes NOTE(review): presumably wraps the result in quote
  *                   characters when true - confirm.
  * @return The escaped string. NOTE(review): the exact escaping rules are
  *         not visible in this header; see str_util.c.
  */
 199 WS_DLL_PUBLIC
 200 char *ws_escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len, bool add_quotes);
 201
 202 /* Replace null bytes with "\0". */
 203 WS_DLL_PUBLIC
 204 char *ws_escape_null(wmem_allocator_t *alloc, const char *string, size_t len, bool add_quotes);
 205
 206 /** Escape as in a number of CSV dialects.
 207  *
 208  * @param alloc  The wmem scope to use to allocate the returned string
 209  * @param string  The input string to escape
 210  * @param add_quotes  Whether to surround the string with quote_char
 211  * @param quote_char  The quote character, always escaped in some way.
 212  * @param double_quote  Whether to escape the quote character by doubling it
 213  * @param escape_whitespace  Whether to escape whitespace with a backslash
 214  * @return  The escaped string
 215  *
 216  * @note If double_quote is false, then quote_char is escaped with a
 217  * backslash ('\'). The quote character can be '\0', in which case it is
 218  * ignored. If any character is being escaped with a backslash (i.e.,
 219  * quote_char is not '\0' and double_quote is false, or escape_whitespace
 220  * is true), then backslash is also escaped.  If add_quotes is false, then
 221  * quote_char can either be a quote character (if the string will be quoted
 222  * later after further manipulation) or the delimiter (to escape it, since
 223  * the string is not being quoted).
 224  */
 225 WS_DLL_PUBLIC
 226 char *ws_escape_csv(wmem_allocator_t *alloc, const char *string, bool add_quotes, char quote_char, bool double_quote, bool escape_whitespace);
 227
 /**
  * Map a single character to an integer.
  *
  * NOTE(review): judging by the name, this converts one hexadecimal digit
  * to its numeric value; the result for a non-hex character is not visible
  * in this header - confirm against str_util.c.
  *
  * @param ch The character to convert
  */
 228 WS_DLL_PUBLIC
 229 int ws_xton(char ch);
 230
 /** Unit selector taken by format_units() and format_size_wmem(). */
 231 typedef enum {
 232     FORMAT_SIZE_UNIT_NONE,          /**< No unit will be appended. You must supply your own. */
 233     /* XXX - This does not append a trailing space if there is no prefix.
 234      * That's good if you intend to list the unit somewhere else, e.g. in a
 235      * legend, header, or other column, but doesn't work well if intending
 236      * to append your own unit. You can test whether there's a prefix or
 237      * not with g_ascii_isdigit() (plus special handling for inf and NaN).
 238      */
 239     FORMAT_SIZE_UNIT_BYTES,         /**< "bytes" for un-prefixed sizes, "B" otherwise. */
 240     FORMAT_SIZE_UNIT_BITS,          /**< "bits" for un-prefixed sizes, "b" otherwise. */
 241     FORMAT_SIZE_UNIT_BITS_S,        /**< "bits/s" for un-prefixed sizes, "bps" otherwise. */
 242     FORMAT_SIZE_UNIT_BYTES_S,       /**< "bytes/s" for un-prefixed sizes, "Bps" otherwise. */
 243     FORMAT_SIZE_UNIT_PACKETS,       /**< "packets" */
 244     FORMAT_SIZE_UNIT_PACKETS_S,     /**< "packets/s" */
 245     FORMAT_SIZE_UNIT_EVENTS,        /**< "events" */
 246     FORMAT_SIZE_UNIT_EVENTS_S,      /**< "events/s" */
 247     FORMAT_SIZE_UNIT_FIELDS,        /**< "fields" */
 248     /* These next two aren't really for format_size (which takes an integer size) */
 249     FORMAT_SIZE_UNIT_SECONDS,       /**< "seconds" for un-prefixed sizes, "s" otherwise. */
 250     FORMAT_SIZE_UNIT_ERLANGS,       /**< "erlangs" for un-prefixed sizes, "E" otherwise. */
 251 } format_size_units_e;
 252
 253 #define FORMAT_SIZE_PREFIX_SI   (1 << 0)    /**< SI (power of 1000) prefixes will be used. */
 254 #define FORMAT_SIZE_PREFIX_IEC  (1 << 1)    /**< IEC (power of 1024) prefixes will be used. */
 255
 256 /** Given a floating point value, return it in a human-readable format
 257  *
 258  * Prefixes up to "E/Ei" (exa, exbi) and down to "a" (atto; negative
 259  * prefixes are SI-only) are currently supported. Values outside that
 260  * range will use scientific notation.
 261  *
 262  * @param allocator The wmem scope to use to allocate the returned string
 263  * @param size The size value
 264  * @param unit The unit of measurement to append (see format_size_units_e)
 265  * @param flags Output flags, e.g. FORMAT_SIZE_PREFIX_SI or FORMAT_SIZE_PREFIX_IEC
 266  * @param precision Maximum number of digits to appear after the decimal
 267  * point. Trailing zeros are removed, as is the decimal point if no digits follow it.
 268  * @return A newly-allocated string representing the value.
 269  */
 270 WS_DLL_PUBLIC
 271 char *format_units(wmem_allocator_t *allocator, double size,
 272                    format_size_units_e unit, uint16_t flags,
 273                    int precision);
 274
 275 /** Given a size, return its value in a human-readable format
 276  *
 277  * Prefixes up to "T/Ti" (tera, tebi) are currently supported.
 278  *
      * @param allocator The wmem scope to use to allocate the returned string
 279  * @param size The size value
      * @param unit The unit of measurement to append (see format_size_units_e)
 280  * @param flags Flags to control the output, e.g. FORMAT_SIZE_PREFIX_SI
 281  * or FORMAT_SIZE_PREFIX_IEC.
 282  * @return A newly-allocated string representing the value.
 283  */
 284 WS_DLL_PUBLIC
 285 char *format_size_wmem(wmem_allocator_t *allocator, int64_t size,
 286                         format_size_units_e unit, uint16_t flags);
 287
 /** Shorthand for format_size_wmem() called with a NULL allocator. */
 288 #define format_size(size, unit, flags) \
 289     format_size_wmem(NULL, size, unit, flags)
 290
 /**
  * Map a character to one that is safe to display.
  *
  * NOTE(review): judging by the name, returns c unchanged when it is
  * printable and '.' otherwise - confirm against str_util.c.
  *
  * @param c The character to examine
  */
 291 WS_DLL_PUBLIC
 292 char printable_char_or_period(char c);
 293
 /**
  * Return a name for an errno value. Declared WS_RETNONNULL, so callers
  * need not check the result for NULL.
  *
  * @param errnum The error number
  * @param buf Caller-supplied scratch buffer
  * @param buf_size Size of buf in bytes
  *
  * NOTE(review): presumably yields the symbolic constant name (e.g.
  * "ENOENT") and only writes into buf when errnum has no known name -
  * confirm against str_util.c.
  */
 294 WS_DLL_PUBLIC WS_RETNONNULL
 295 const char *ws_strerrorname_r(int errnum, char *buf, size_t buf_size);
 296
 /**
  * Build an "underline" marker string.
  *
  * NOTE(review): presumably produces offset characters of padding followed
  * by len marker characters, for pointing at a span of a line printed
  * above it; the exact characters and the handling of a negative offset
  * are not visible in this header - confirm against str_util.c.
  *
  * @param allocator The wmem allocator supplied by the caller
  * @param offset Column at which the marked span starts
  * @param len Length of the marked span
  */
 297 WS_DLL_PUBLIC
 298 char *ws_strdup_underline(wmem_allocator_t *allocator, long offset, size_t len);
 299
 300 /** Given a wmem scope, a not-necessarily-null-terminated string,
 301  *  expected to be in UTF-8 but possibly containing invalid sequences
 302  *  (as it may have come from packet data), and the length of the string,
 303  *  generate a valid UTF-8 string from it, allocated in the specified
 304  *  wmem scope, that:
 305  *
 306  *   shows printable Unicode characters as themselves;
 307  *
 308  *   shows non-printable ASCII characters as C-style escapes (octal
 309  *   if not one of the standard ones such as LF -> '\n');
 310  *
 311  *   shows non-printable Unicode-but-not-ASCII characters as
 312  *   their universal character names;
 313  *
 314  *   replaces illegal UTF-8 sequences with U+FFFD (replacement character);
 315  *
 316  *  and return a pointer to it.
 317  *
 318  * @param allocator The wmem scope
 319  * @param string A pointer to the input string
 320  * @param len The length of the input string
 321  * @return A pointer to the formatted string
 322  *
 323  * @see tvb_format_text()
 324  */
 325 WS_DLL_PUBLIC
 326 char *format_text(wmem_allocator_t* allocator, const char *string, size_t len);
 327
 328 /** Same as format_text() but accepts a nul-terminated string.
 329  *
 330  * @param allocator The wmem scope
 331  * @param string A pointer to the input string
 332  * @return A pointer to the formatted string
 333  *
 334  * @see tvb_format_text()
 335  */
 336 WS_DLL_PUBLIC
 337 char *format_text_string(wmem_allocator_t* allocator, const char *string);
 338
 339 /**
 340  * Same as format_text() but replaces any whitespace characters
 341  * (space, tab, carriage return, new line, vertical tab, or formfeed)
 342  * with a space.
 343  *
 344  * @param allocator The wmem scope
 345  * @param line A pointer to the input string
 346  * @param len The length of the input string
 347  * @return A pointer to the formatted string
 348  *
 349  */
 350 WS_DLL_PUBLIC
 351 char *format_text_wsp(wmem_allocator_t* allocator, const char *line, size_t len);
 352
 353 /**
 354  * Given a string, generate a string from it that shows non-printable
 355  * characters as the chr parameter passed, except a whitespace character
 356  * (space, tab, carriage return, new line, vertical tab, or formfeed)
 357  * which will be replaced by a space, and return a pointer to it.
 358  *
 359  * This does *not* treat the input string as UTF-8.
 360  *
 361  * This is useful for displaying binary data that frequently but not always
 362  * contains text; otherwise the number of C escape codes makes it unreadable.
 363  *
 364  * @param allocator The wmem scope
 365  * @param string A pointer to the input string
 366  * @param len The length of the input string
 367  * @param chr The character to use to replace non-printable characters
 368  * @return A pointer to the formatted string
 369  *
 370  */
 371 WS_DLL_PUBLIC
 372 char *format_text_chr(wmem_allocator_t *allocator,
 373                         const char *string, size_t len, char chr);
 374
 375 /** Given a wmem scope and an 8-bit character
 376  *  generate a valid UTF-8 string from it, allocated in the specified
 377  *  wmem scope, that:
 378  *
 379  *   shows printable Unicode characters as themselves;
 380  *
 381  *   shows non-printable ASCII characters as C-style escapes (hex
 382  *   if not one of the standard ones such as LF -> '\n');
 383  *
 384  *  and return a pointer to it.
 385  *
 386  * @param allocator The wmem scope
 387  * @param c A character to format
 388  * @return A pointer to the formatted string
 389  */
 390 WS_DLL_PUBLIC
 391 char *format_char(wmem_allocator_t *allocator, char c);
 392
 393 /**
 394  * Truncate a UTF-8 string in place so that it is no larger than len bytes,
 395  * ensuring that the string is null terminated and ends with a complete
 396  * character instead of a partial sequence (e.g., possibly truncating up
 397  * to 3 additional bytes if the terminal character is 4 bytes long).
 398  *
 399  * The buffer holding the string must be large enough (at least len + 1
 400  * including the null terminator), and the first len bytes of the buffer
 401  * must be a valid UTF-8 string, except for possibly ending in a partial
 402  * sequence or not being null terminated. This is a convenience function
 403  * that for speed does not check either of those conditions.
 404  *
 405  * A common use case is when a valid UTF-8 string has been copied into a
 406  * buffer of length len+1 via snprintf, strlcpy, or strlcat and truncated,
 407  * to ensure that the final UTF-8 character is not a partial sequence.
 408  *
 409  * @param string A pointer to the input string
 410  * @param len The maximum length to truncate to
 411  * @return    ptr to the string
 412  */
 413 WS_DLL_PUBLIC
 414 char* ws_utf8_truncate(char *string, size_t len);
 415
 /**
  * EBCDIC-to-ASCII conversion over a byte buffer.
  *
  * NOTE(review): the buffer is non-const and nothing is returned, so the
  * conversion is presumably done in place - confirm against str_util.c.
  *
  * @param buf The bytes to convert
  * @param bytes Number of bytes in buf
  */
 416 WS_DLL_PUBLIC
 417 void EBCDIC_to_ASCII(uint8_t *buf, unsigned bytes);
 418
 /**
  * EBCDIC-to-ASCII conversion of a single byte.
  *
  * NOTE(review): which EBCDIC code page is assumed is not visible in this
  * header - confirm against str_util.c.
  *
  * @param c The EBCDIC byte
  * @return The converted byte
  */
 419 WS_DLL_PUBLIC
 420 uint8_t EBCDIC_to_ASCII1(uint8_t c);
 421
 422 /* Types of character encodings */
 423 typedef enum {
 424     HEXDUMP_ENC_ASCII     = 0, /* ASCII */
 425     HEXDUMP_ENC_EBCDIC    = 1  /* EBCDIC */
 426 } hex_dump_enc;
 427
 428 /*
 429  * Hexdump options for ASCII:
 430  */
 431
 432 #define HEXDUMP_ASCII_MASK            (0x0003U)
 433 #define HEXDUMP_ASCII_OPTION(option)  ((option) & HEXDUMP_ASCII_MASK)
 434
 435 #define HEXDUMP_ASCII_INCLUDE         (0x0000U) /* include ASCII section no delimiters (legacy tshark behavior) */
 436 #define HEXDUMP_ASCII_DELIMIT         (0x0001U) /* include ASCII section with delimiters, useful for reliable detection of last hexdata */
 437 #define HEXDUMP_ASCII_EXCLUDE         (0x0002U) /* exclude ASCII section from hexdump reports, if we really don't want or need it */
 438
 /**
  * Hex-dump a byte buffer through a caller-supplied line callback.
  *
  * @param print_line Callback taking an opaque pointer and a string and
  *                   returning bool
  * @param fp Opaque pointer. NOTE(review): presumably forwarded as the
  *           first argument of print_line - confirm.
  * @param cp The bytes to dump
  * @param length Number of bytes at cp
  * @param encoding Character encoding of the data: HEXDUMP_ENC_ASCII or
  *                 HEXDUMP_ENC_EBCDIC
  * @param ascii_option One of HEXDUMP_ASCII_INCLUDE, HEXDUMP_ASCII_DELIMIT
  *                     or HEXDUMP_ASCII_EXCLUDE
  * @return NOTE(review): presumably false as soon as print_line reports
  *         failure, true otherwise - confirm against str_util.c.
  */
 439 WS_DLL_PUBLIC
 440 bool hex_dump_buffer(bool (*print_line)(void *, const char *), void *fp,
 441                                     const unsigned char *cp, unsigned length,
 442                                     hex_dump_enc encoding,
 443                                     unsigned ascii_option);
 444
 445 /* To pass one of two strings, singular or plural */
 446 #define plurality(d,s,p) ((d) == 1 ? (s) : (p))
 447
 /* Expands to the string "TRUE" when val is non-zero, "FALSE" otherwise. */
 448 #define true_or_false(val) ((val) ? "TRUE" : "FALSE")
 449
 /* Expands to val itself when it is non-NULL, otherwise to the literal
  * "[NULL]". val appears twice in the expansion, so a non-NULL argument is
  * evaluated twice; do not pass an expression with side effects. */
 450 #define string_or_null(val) ((val) ? (val) : "[NULL]")
 451
 452 #ifdef __cplusplus
 453 }
 454 #endif /* __cplusplus */
 455
 456 #endif /* __STR_UTIL_H__ */