epan/strutil.c

   1 /* strutil.c
   2  * String utility routines
   3  *
   4  * Wireshark - Network traffic analyzer
   5  * By Gerald Combs <gerald@wireshark.org>
   6  * Copyright 1998 Gerald Combs
   7  *
   8  * SPDX-License-Identifier: GPL-2.0-or-later
   9  */
  10
  11 #include "config.h"
  12
  13 #include <stdlib.h>
  14 #include <string.h>
  15 #include <glib.h>
  16 #include "strutil.h"
  17
  18 #include <wsutil/str_util.h>
  19 #include <wsutil/unicode-utils.h>
  20 #include <epan/proto.h>
  21
  22 #ifdef _WIN32
  23 #include <windows.h>
  24 #include <tchar.h>
  25 #include <wchar.h>
  26 #endif
  27
  28
  29 /*
  30  * Given a pointer into a data buffer, and to the end of the buffer,
  31  * find the end of the (putative) line at that position in the data
  32  * buffer.
  33  * Return a pointer to the EOL character(s) in "*eol".
  34  */
  35 const unsigned char *
  36 find_line_end(const unsigned char *data, const unsigned char *dataend, const unsigned char **eol)
  37 {
  38     const unsigned char *lineend;
  39
  40     lineend = (unsigned char *)memchr(data, '\n', dataend - data);
  41     if (lineend == NULL) {
  42         /*
  43          * No LF - line is probably continued in next TCP segment.
  44          */
  45         lineend = dataend;
  46         *eol = dataend;
  47     } else {
  48         /*
  49          * Is the LF at the beginning of the line?
  50          */
  51         if (lineend > data) {
  52             /*
  53              * No - is it preceded by a carriage return?
  54              * (Perhaps it's supposed to be, but that's not guaranteed....)
  55              */
  56             if (*(lineend - 1) == '\r') {
  57                 /*
  58                  * Yes.  The EOL starts with the CR.
  59                  */
  60                 *eol = lineend - 1;
  61             } else {
  62                 /*
  63                  * No.  The EOL starts with the LF.
  64                  */
  65                 *eol = lineend;
  66
  67                 /*
  68                  * I seem to remember that we once saw lines ending with LF-CR
  69                  * in an HTTP request or response, so check if it's *followed*
  70                  * by a carriage return.
  71                  */
  72                 if (lineend < (dataend - 1) && *(lineend + 1) == '\r') {
  73                     /*
  74                      * It's <non-LF><LF><CR>; say it ends with the CR.
  75                      */
  76                     lineend++;
  77                 }
  78             }
  79         } else {
  80             /*
  81              * Yes - the EOL starts with the LF.
  82              */
  83             *eol = lineend;
  84         }
  85
  86         /*
  87          * Point to the character after the last character.
  88          */
  89         lineend++;
  90     }
  91     return lineend;
  92 }
  93
  94 /*
  95  * Get the length of the next token in a line, and the beginning of the
  96  * next token after that (if any).
  97  * Return 0 if there is no next token.
  98  */
  99 int
 100 get_token_len(const unsigned char *linep, const unsigned char *lineend,
 101         const unsigned char **next_token)
 102 {
 103     const unsigned char *tokenp;
 104     int token_len;
 105
 106     tokenp = linep;
 107
 108     /*
 109      * Search for a blank, a CR or an LF, or the end of the buffer.
 110      */
 111     while (linep < lineend && *linep != ' ' && *linep != '\r' && *linep != '\n')
 112         linep++;
 113     token_len = (int) (linep - tokenp);
 114
 115     /*
 116      * Skip trailing blanks.
 117      */
 118     while (linep < lineend && *linep == ' ')
 119         linep++;
 120
 121     *next_token = linep;
 122
 123     return token_len;
 124 }
 125
 126 static bool
 127 is_byte_sep(uint8_t c)
 128 {
 129     return (c == '-' || c == ':' || c == '.');
 130 }
 131
 132 /* Turn a string of hex digits with optional separators (defined by
 133  * is_byte_sep() into a byte array.
 134  *
 135  * XXX - This function is perhaps too generous in what it accepts.
 136  * It allows the separator to change from one character to another,
 137  * or to and from no separator if force_separators is false.
 138  */
 139 bool
 140 hex_str_to_bytes(const char *hex_str, GByteArray *bytes, bool force_separators)
 141 {
 142     uint8_t       val;
 143     const char     *p, *q, *r, *s, *punct;
 144     char        four_digits_first_half[3];
 145     char        four_digits_second_half[3];
 146     char        two_digits[3];
 147     char        one_digit[2];
 148
 149     if (! hex_str || ! bytes) {
 150         return false;
 151     }
 152     g_byte_array_set_size(bytes, 0);
 153     p = hex_str;
 154     while (*p) {
 155         q = p+1;
 156         r = p+2;
 157         s = p+3;
 158
 159         if (*q && *r
 160                 && g_ascii_isxdigit(*p) && g_ascii_isxdigit(*q) &&
 161                 g_ascii_isxdigit(*r)) {
 162
 163             /*
 164              * Three hex bytes in a row, followed by a non hex byte
 165              * (possibly the end of the string). We don't accept an
 166              * odd number of hex digits except for single digits
 167              * by themselves or after a separator.
 168              */
 169             if (!g_ascii_isxdigit(*s)) {
 170                 return false;
 171             }
 172             four_digits_first_half[0] = *p;
 173             four_digits_first_half[1] = *q;
 174             four_digits_first_half[2] = '\0';
 175             four_digits_second_half[0] = *r;
 176             four_digits_second_half[1] = *s;
 177             four_digits_second_half[2] = '\0';
 178
 179             /*
 180              * Four or more hex digits in a row.
 181              */
 182             val = (uint8_t) strtoul(four_digits_first_half, NULL, 16);
 183             g_byte_array_append(bytes, &val, 1);
 184             val = (uint8_t) strtoul(four_digits_second_half, NULL, 16);
 185             g_byte_array_append(bytes, &val, 1);
 186
 187             punct = s + 1;
 188             if (*punct) {
 189                 /*
 190                  * Make sure the character after
 191                  * the fourth hex digit is a byte
 192                  * separator, i.e. that we don't have
 193                  * more than four hex digits, or a
 194                  * bogus character.
 195                  */
 196                 if (is_byte_sep(*punct)) {
 197                     p = punct + 1;
 198                     continue;
 199                 }
 200                 else if (force_separators) {
 201                     return false;
 202                 }
 203             }
 204             p = punct;
 205             continue;
 206         }
 207         else if (*q && g_ascii_isxdigit(*p) && g_ascii_isxdigit(*q)) {
 208             two_digits[0] = *p;
 209             two_digits[1] = *q;
 210             two_digits[2] = '\0';
 211
 212             /*
 213              * Two hex digits in a row.
 214              */
 215             val = (uint8_t) strtoul(two_digits, NULL, 16);
 216             g_byte_array_append(bytes, &val, 1);
 217             punct = q + 1;
 218             if (*punct) {
 219                 /*
 220                  * Make sure the character after
 221                  * the second hex digit is a byte
 222                  * separator, i.e. that we don't have
 223                  * more than two hex digits, or a
 224                  * bogus character.
 225                  */
 226                 if (is_byte_sep(*punct)) {
 227                     p = punct + 1;
 228                     continue;
 229                 }
 230                 else if (force_separators) {
 231                     return false;
 232                 }
 233             }
 234             p = punct;
 235             continue;
 236         }
 237         else if (*q && g_ascii_isxdigit(*p) && is_byte_sep(*q)) {
 238             one_digit[0] = *p;
 239             one_digit[1] = '\0';
 240
 241             /*
 242              * Only one hex digit (not at the end of the string)
 243              */
 244             val = (uint8_t) strtoul(one_digit, NULL, 16);
 245             g_byte_array_append(bytes, &val, 1);
 246             p = q + 1;
 247             continue;
 248         }
 249         else if (!*q && g_ascii_isxdigit(*p)) {
 250             one_digit[0] = *p;
 251             one_digit[1] = '\0';
 252
 253             /*
 254              * Only one hex digit (at the end of the string)
 255              */
 256             val = (uint8_t) strtoul(one_digit, NULL, 16);
 257             g_byte_array_append(bytes, &val, 1);
 258             p = q;
 259             continue;
 260         }
 261         else {
 262             return false;
 263         }
 264     }
 265     return true;
 266 }
 267
 268 static inline char
 269 get_valid_byte_sep(char c, const unsigned encoding)
 270 {
 271     char retval = -1; /* -1 means failure */
 272
 273     switch (c) {
 274         case ':':
 275             if (encoding & ENC_SEP_COLON)
 276                 retval = c;
 277             break;
 278         case '-':
 279             if (encoding & ENC_SEP_DASH)
 280                 retval = c;
 281             break;
 282         case '.':
 283             if (encoding & ENC_SEP_DOT)
 284                 retval = c;
 285             break;
 286         case ' ':
 287             if (encoding & ENC_SEP_SPACE)
 288                 retval = c;
 289             break;
 290         case '\0':
 291             /* we were given the end of the string, so it's fine */
 292             retval = 0;
 293             break;
 294         default:
 295             if (g_ascii_isxdigit(c) && (encoding & ENC_SEP_NONE))
 296                 retval = 0;
 297             /* anything else means we've got a failure */
 298             break;
 299     }
 300
 301     return retval;
 302 }
 303
 304 /* Turn a string of hex digits with optional separators (defined by is_byte_sep())
 305  * into a byte array. Unlike hex_str_to_bytes(), this will read as many hex-char
 306  * pairs as possible and not error if it hits a non-hex-char; instead it just ends
 307  * there. (i.e., like strtol()/atoi()/etc.) Unless fail_if_partial is true.
 308  *
 309  * The **endptr, if not NULL, is set to the char after the last hex character.
 310  */
 311 bool
 312 hex_str_to_bytes_encoding(const char *hex_str, GByteArray *bytes, const char **endptr,
 313                           const unsigned encoding, const bool fail_if_partial)
 314 {
 315     int8_t c, d;
 316     uint8_t val;
 317     const char *end = hex_str;
 318     bool retval = false;
 319     char sep = -1;
 320
 321     /* a map from ASCII hex chars to their value */
 322     static const int8_t str_to_nibble[256] = {
 323         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 324         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 325         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 326          0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,
 327         -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 328         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 329         -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 330         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 331         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 332         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 333         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 334         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 335         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 336         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 337         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
 338         -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
 339     };
 340
 341     /* we must see two hex chars at the beginning, or fail */
 342     if (bytes && *end && g_ascii_isxdigit(*end) && g_ascii_isxdigit(*(end+1))) {
 343         retval = true;
 344
 345         /* set the separator character we'll allow; if this returns a -1, it means something's
 346          * invalid after the hex, but we'll let the while-loop grab the first hex-pair anyway
 347          */
 348         sep = get_valid_byte_sep(*(end+2), encoding);
 349
 350         while (*end) {
 351             c = str_to_nibble[(unsigned char)*end];
 352             if (c < 0) {
 353                 if (fail_if_partial) retval = false;
 354                 break;
 355             }
 356
 357             d = str_to_nibble[(unsigned char)*(end+1)];
 358             if (d < 0) {
 359                 if (fail_if_partial) retval = false;
 360                 break;
 361             }
 362             val = ((uint8_t)c * 16) + d;
 363             g_byte_array_append(bytes, &val, 1);
 364             end += 2;
 365
 366             /* check for separator and peek at next char to make sure we should keep going */
 367             if (sep > 0 && *end == sep && str_to_nibble[(unsigned char)*(end+1)] > -1) {
 368                 /* yes, it's the right sep and followed by more hex, so skip the sep */
 369                 ++end;
 370             } else if (sep != 0 && *end) {
 371                 /* we either need a separator, but we don't see one; or the get_valid_byte_sep()
 372                    earlier didn't find a valid one to begin with */
 373                 if (fail_if_partial) retval = false;
 374                 break;
 375             }
 376             /* otherwise, either no separator allowed, or *end is null, or *end is an invalid
 377              * sep, or *end is a valid sep but after it is not a hex char - in all those
 378              * cases, just loop back up and let it fail later naturally.
 379              */
 380         }
 381     }
 382
 383     if (!retval) {
 384         if (bytes) g_byte_array_set_size(bytes, 0);
 385         end = hex_str;
 386     }
 387
 388     if (endptr) *endptr = end;
 389
 390     return retval;
 391 }
 392
 393 /*
 394  * Turn an RFC 3986 percent-encoded array of characters, not
 395  * necessarily null-terminated, into a byte array.
 396  * XXX - We don't check for reserved characters.
 397  * XXX - g_uri_unescape_bytes is superior, but limited to
 398  * glib >= 2.66
 399  */
 400 #define HEX_DIGIT_BUF_LEN 3
 401 bool
 402 uri_to_bytes(const char *uri_str, GByteArray *bytes, size_t len)
 403 {
 404     uint8_t       val;
 405     const char   *p;
 406     const char   *uri_end = uri_str + len;
 407     char          hex_digit[HEX_DIGIT_BUF_LEN];
 408
 409     g_byte_array_set_size(bytes, 0);
 410     if (! uri_str) {
 411         return false;
 412     }
 413
 414     p = uri_str;
 415
 416     while (p < uri_end) {
 417         if (!g_ascii_isprint(*p))
 418             return false;
 419         if (*p == '%') {
 420             p++;
 421             if (*p == '\0') return false;
 422             hex_digit[0] = *p;
 423             p++;
 424             if (*p == '\0') return false;
 425             hex_digit[1] = *p;
 426             hex_digit[2] = '\0';
 427             if (! g_ascii_isxdigit(hex_digit[0]) || ! g_ascii_isxdigit(hex_digit[1]))
 428                 return false;
 429             val = (uint8_t) strtoul(hex_digit, NULL, 16);
 430             g_byte_array_append(bytes, &val, 1);
 431         } else {
 432             g_byte_array_append(bytes, (const uint8_t *) p, 1);
 433         }
 434         p++;
 435
 436     }
 437     return true;
 438 }
 439
 440 /*
 441  * Turn an RFC 3986 percent-encoded string into a byte array.
 442  * XXX - We don't check for reserved characters.
 443  * XXX - Just use g_uri_unescape_string instead?
 444  */
 445 bool
 446 uri_str_to_bytes(const char *uri_str, GByteArray *bytes)
 447 {
 448     return uri_to_bytes(uri_str, bytes, strlen(uri_str));
 449 }
 450
 451 /**
 452  * Create a copy of a GByteArray
 453  *
 454  * @param ba The byte array to be copied.
 455  * @return If ba exists, a freshly allocated copy.  NULL otherwise.
 456  *
 457  */
 458 GByteArray *
 459 byte_array_dup(const GByteArray *ba)
 460 {
 461     GByteArray *new_ba;
 462
 463     if (!ba)
 464         return NULL;
 465
 466     new_ba = g_byte_array_new();
 467     g_byte_array_append(new_ba, ba->data, ba->len);
 468     return new_ba;
 469 }
 470
 471 #define SUBID_BUF_LEN 5
 472 bool
 473 oid_str_to_bytes(const char *oid_str, GByteArray *bytes)
 474 {
 475     return rel_oid_str_to_bytes(oid_str, bytes, true);
 476 }
 477 bool
 478 rel_oid_str_to_bytes(const char *oid_str, GByteArray *bytes, bool is_absolute)
 479 {
 480     uint32_t subid0, subid, sicnt, i;
 481     const char *p, *dot;
 482     uint8_t buf[SUBID_BUF_LEN];
 483
 484     g_byte_array_set_size(bytes, 0);
 485
 486     /* check syntax */
 487     p = oid_str;
 488     dot = NULL;
 489     while (*p) {
 490         if (!g_ascii_isdigit(*p) && (*p != '.')) return false;
 491         if (*p == '.') {
 492             if (p == oid_str && is_absolute) return false;
 493             if (!*(p+1)) return false;
 494             if ((p-1) == dot) return false;
 495             dot = p;
 496         }
 497         p++;
 498     }
 499     if (!dot) return false;
 500
 501     p = oid_str;
 502     sicnt = is_absolute ? 0 : 2;
 503     if (!is_absolute) p++;
 504     subid0 = 0;    /* squelch GCC complaints */
 505     while (*p) {
 506         subid = 0;
 507         while (g_ascii_isdigit(*p)) {
 508             subid *= 10;
 509             subid += *p - '0';
 510             p++;
 511         }
 512         if (sicnt == 0) {
 513             subid0 = subid;
 514             if (subid0 > 2) return false;
 515         } else if (sicnt == 1) {
 516             if ((subid0 < 2) && (subid > 39)) return false;
 517             subid += 40 * subid0;
 518         }
 519         if (sicnt) {
 520             i = SUBID_BUF_LEN;
 521             do {
 522                 i--;
 523                 buf[i] = 0x80 | (subid % 0x80);
 524                 subid >>= 7;
 525             } while (subid && i);
 526             buf[SUBID_BUF_LEN-1] &= 0x7F;
 527             g_byte_array_append(bytes, buf + i, SUBID_BUF_LEN - i);
 528         }
 529         sicnt++;
 530         if (*p) p++;
 531     }
 532
 533     return true;
 534 }
 535
 536 /**
 537  * Compare the contents of two GByteArrays
 538  *
 539  * @param ba1 A byte array
 540  * @param ba2 A byte array
 541  * @return If both arrays are non-NULL and their lengths are equal and
 542  *         their contents are equal, returns true.  Otherwise, returns
 543  *         false.
 544  *
 545  * XXX - Should this be in strutil.c?
 546  */
 547 bool
 548 byte_array_equal(GByteArray *ba1, GByteArray *ba2)
 549 {
 550     if (!ba1 || !ba2)
 551         return false;
 552
 553     if (ba1->len != ba2->len)
 554         return false;
 555
 556     if (memcmp(ba1->data, ba2->data, ba1->len) != 0)
 557         return false;
 558
 559     return true;
 560 }
 561
 562
 563 /* Return a XML escaped representation of the unescaped string.
 564  * The returned string must be freed when no longer in use. */
 565 char *
 566 xml_escape(const char *unescaped)
 567 {
 568     GString *buffer = g_string_sized_new(128);
 569     const char *p;
 570     char c;
 571
 572     p = unescaped;
 573     while ( (c = *p++) ) {
 574         switch (c) {
 575             case '<':
 576                 g_string_append(buffer, "&lt;");
 577                 break;
 578             case '>':
 579                 g_string_append(buffer, "&gt;");
 580                 break;
 581             case '&':
 582                 g_string_append(buffer, "&amp;");
 583                 break;
 584             case '\'':
 585                 g_string_append(buffer, "&#x27;");
 586                 break;
 587             case '"':
 588                 g_string_append(buffer, "&quot;");
 589                 break;
 590             case '\t':
 591             case '\n':
 592             case '\r':
 593                 g_string_append_c(buffer, c);
 594                 break;
 595             default:
 596                 /* XML 1.0 doesn't allow ASCII control characters, except
 597                  * for the three whitespace ones above (which do *not*
 598                  * include '\v' and '\f', so not the same group as isspace),
 599                  * even as character references.
 600                  * There's no official way to escape them, so we'll do this. */
 601                 if (g_ascii_iscntrl(c)) {
 602                     g_string_append_printf(buffer, "\\x%x", c);
 603                 } else {
 604                     g_string_append_c(buffer, c);
 605                 }
 606                 break;
 607         }
 608     }
 609     /* Return the string value contained within the GString
 610      * after getting rid of the GString structure.
 611      * This is the way to do this, see the GLib reference. */
 612     return g_string_free(buffer, FALSE);
 613 }
 614
 615 /*
 616  * Scan the search string to make sure it's valid hex.  Return the
 617  * number of bytes in nbytes.
 618  */
 619 uint8_t *
 620 convert_string_to_hex(const char *string, size_t *nbytes)
 621 {
 622     size_t n_bytes;
 623     const char *p;
 624     char c;
 625     uint8_t *bytes, *q, byte_val;
 626
 627     n_bytes = 0;
 628     p = &string[0];
 629     for (;;) {
 630         c = *p++;
 631         if (c == '\0')
 632             break;
 633         if (g_ascii_isspace(c))
 634             continue;    /* allow white space */
 635         if (c==':' || c=='.' || c=='-')
 636             continue; /* skip any ':', '.', or '-' between bytes */
 637         if (!g_ascii_isxdigit(c)) {
 638             /* Not a valid hex digit - fail */
 639             return NULL;
 640         }
 641
 642         /*
 643          * We can only match bytes, not nibbles; we must have a valid
 644          * hex digit immediately after that hex digit.
 645          */
 646         c = *p++;
 647         if (!g_ascii_isxdigit(c))
 648             return NULL;
 649
 650         /* 2 hex digits = 1 byte */
 651         n_bytes++;
 652     }
 653
 654     /*
 655      * Were we given any hex digits?
 656      */
 657     if (n_bytes == 0) {
 658         /* No. */
 659         return NULL;
 660     }
 661
 662     /*
 663      * OK, it's valid, and it generates "n_bytes" bytes; generate the
 664      * raw byte array.
 665      */
 666     bytes = (uint8_t *)g_malloc(n_bytes);
 667     p = &string[0];
 668     q = &bytes[0];
 669     for (;;) {
 670         c = *p++;
 671         if (c == '\0')
 672             break;
 673         if (g_ascii_isspace(c))
 674             continue;    /* allow white space */
 675         if (c==':' || c=='.' || c=='-')
 676             continue; /* skip any ':', '.', or '-' between bytes */
 677         /* From the loop above, we know this is a hex digit */
 678         byte_val = ws_xton(c);
 679         byte_val <<= 4;
 680
 681         /* We also know this is a hex digit */
 682         c = *p++;
 683         byte_val |= ws_xton(c);
 684
 685         *q++ = byte_val;
 686     }
 687     *nbytes = n_bytes;
 688     return bytes;
 689 }
 690
 691 /*
 692  * Copy if it's a case-sensitive search; uppercase it if it's
 693  * a case-insensitive search.
 694  */
 695 char *
 696 convert_string_case(const char *string, bool case_insensitive)
 697 {
 698
 699     if (case_insensitive) {
 700         return g_utf8_strup(string, -1);
 701     } else {
 702         return g_strdup(string);
 703     }
 704 }
 705
 706 #define GN_CHAR_ALPHABET_SIZE 128
 707
 708 static gunichar IA5_default_alphabet[GN_CHAR_ALPHABET_SIZE] = {
 709
 710     /*ITU-T recommendation T.50 specifies International Reference Alphabet 5 (IA5) */
 711
 712     '?', '?', '?', '?', '?', '?', '?', '?',
 713     '?', '?', '?', '?', '?', '?', '?', '?',
 714     '?', '?', '?', '?', '?', '?', '?', '?',
 715     '?', '?', '?', '?', '?', '?', '?', '?',
 716     ' ', '!', '\"','#', '$', '%', '&', '\'',
 717     '(', ')', '*', '+', ',', '-', '.', '/',
 718     '0', '1', '2', '3', '4', '5', '6', '7',
 719     '8', '9', ':', ';', '<', '=', '>', '?',
 720     '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
 721     'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 722     'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
 723     'X',  'Y',  'Z',  '[',  '\\',  ']',  '^',  '_',
 724     '`', 'a',  'b',  'c',  'd',  'e',  'f',  'g',
 725     'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
 726     'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 727     'x',  'y',  'z',  '{',  '|',  '}',  '~',  '?'
 728 };
 729
 730 static gunichar
 731 char_def_ia5_alphabet_decode(unsigned char value)
 732 {
 733     if (value < GN_CHAR_ALPHABET_SIZE) {
 734         return IA5_default_alphabet[value];
 735     }
 736     else {
 737         return '?';
 738     }
 739 }
 740
 741 void
 742 IA5_7BIT_decode(unsigned char * dest, const unsigned char* src, int len)
 743 {
 744     int i, j;
 745     gunichar buf;
 746
 747     for (i = 0, j = 0; j < len;  j++) {
 748         buf = char_def_ia5_alphabet_decode(src[j]);
 749         i += g_unichar_to_utf8(buf,&(dest[i]));
 750     }
 751     dest[i]=0;
 752 }
 753
 754 /* chars allowed: lower case letters, digits, '-', "_", and ".". */
 755 static
 756 const uint8_t module_valid_chars_lower_case[256] = {
 757     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x0F */
 758     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1F */
 759     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, /* 0x20-0x2F '-', '.'      */
 760     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 0x30-0x3F '0'-'9'       */
 761     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40-0x4F */
 762     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x50-0x5F '_' */
 763     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F 'a'-'o'       */
 764     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70-0x7F 'p'-'z'       */
 765     /* upper 128 empty-initialized to 0 */
 766 };
 767
 768 /* chars allowed: alphanumerics, '-', "_", and ".". */
 769 static
 770 const uint8_t module_valid_chars[256] = {
 771     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x0F */
 772     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1F */
 773     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, /* 0x20-0x2F '-', '.'      */
 774     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 0x30-0x3F '0'-'9'       */
 775     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F 'A'-'O'       */
 776     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 0x50-0x5F 'P'-'Z', '_' */
 777     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F 'a'-'o'       */
 778     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70-0x7F 'p'-'z'       */
 779     /* upper 128 empty-initialized to 0 */
 780 };
 781
 782 unsigned char
 783 module_check_valid_name(const char *name, bool lower_only)
 784 {
 785     const char *p = name;
 786     unsigned char c = '.', lastc;
 787     const uint8_t *chars;
 788
 789     /* First character cannot be '-'. */
 790     if (name[0] == '-')
 791         return '-';
 792
 793     if (lower_only)
 794         chars = module_valid_chars_lower_case;
 795     else
 796         chars = module_valid_chars;
 797
 798     do {
 799         lastc = c;
 800         c = *(p++);
 801         /* Leading '.' or substring ".." are disallowed. */
 802         if (c == '.' && lastc == '.') {
 803             break;
 804         }
 805     } while (chars[c]);
 806
 807     /* Trailing '.' is disallowed. */
 808     if (lastc == '.') {
 809         return '.';
 810     }
 811     return c;
 812 }
 813
 814 static const char _hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
 815                               '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
 816
 817 /*
 818  * Copy byte by byte without UTF-8 truncation (assume valid UTF-8 input).
 819  * Return byte size written, or that would have been
 820  * written with enough space.
 821  */
 822 size_t
 823 ws_label_strcpy(char *label_str, size_t buf_size, size_t pos,
 824                 const uint8_t *str, int flags)
 825 {
 826     if (pos >= buf_size)
 827         return pos;
 828
 829     uint8_t r = 0;
 830     ssize_t chlen;
 831     ssize_t idx, src_len;
 832     ssize_t free_len;
 833
 834     label_str[pos] = '\0';
 835
 836     ws_return_val_if(str == NULL, pos);
 837     idx = 0;
 838     src_len = strlen(str);
 839     free_len = buf_size - pos - 1;
 840
 841     while (idx < src_len) {
 842         chlen = ws_utf8_char_len(str[idx]);
 843         if (chlen <= 0) {
 844             /* We were passed invalid UTF-8. This is an error. Complain and do... something. */
 845             ws_log_utf8(str, -1, NULL);
 846             /*
 847              * XXX If we are going to return here instead of trying to recover maybe the log level should
 848              * be higher than DEBUG.
 849              */
 850             return pos;
 851         }
 852
 853         /* ASCII */
 854         if (chlen == 1) {
 855             if (flags & FORMAT_LABEL_REPLACE_SPACE && g_ascii_isspace(str[idx])) {
 856                 if (free_len >= 1) {
 857                     label_str[pos] = ' ';
 858                     label_str[pos+1] = '\0';
 859                 }
 860                 pos++;
 861                 idx++;
 862                 free_len--;
 863                 continue;
 864             }
 865
 866             r = 0;
 867             switch (str[idx]) {
 868                 case '\a': r = 'a'; break;
 869                 case '\b': r = 'b'; break;
 870                 case '\f': r = 'f'; break;
 871                 case '\n': r = 'n'; break;
 872                 case '\r': r = 'r'; break;
 873                 case '\t': r = 't'; break;
 874                 case '\v': r = 'v'; break;
 875             }
 876             if (r != 0) {
 877                 if (free_len >= 2) {
 878                     label_str[pos] = '\\';
 879                     label_str[pos+1] = r;
 880                     label_str[pos+2] = '\0';
 881                 }
 882                 pos += 2;
 883                 idx += 1;
 884                 free_len -= 2;
 885                 continue;
 886             }
 887
 888             if (g_ascii_isprint(str[idx])) {
 889                 if (free_len >= 1) {
 890                     label_str[pos] = str[idx];
 891                     label_str[pos+1] = '\0';
 892                 }
 893                 pos++;
 894                 idx++;
 895                 free_len--;
 896                 continue;
 897             }
 898
 899             if (free_len >= 4) {
 900                 label_str[pos+0] = '\\';
 901                 label_str[pos+1] = 'x';
 902
 903                 uint8_t ch = str[idx];
 904                 label_str[pos+2] = _hex[ch >> 4];
 905                 label_str[pos+3] = _hex[ch & 0x0F];
 906                 label_str[pos+4] = '\0';
 907             }
 908             pos += 4;
 909             idx += chlen;
 910             free_len -= 4;
 911             continue;
 912         }
 913
 914         /* UTF-8 multibyte */
 915         if (chlen == 2 && str[idx] == 0xC2 &&
 916                                 str[idx+1] >= 0x80 && str[idx+1] <= 0x9F) {
 917             /*
 918              * Escape the C1 control codes. C0 (covered above) and C1 are
 919              * inband signalling and transparent to Unicode.
 920              * Anything else probably has text semantics should not be removed.
 921              */
 922             /*
 923              * Special case: The second UTF-8 byte is the same as the Unicode
 924              * code point for range U+0080 - U+009F.
 925              */
 926             if (free_len >= 6) {
 927                 label_str[pos+0] = '\\';
 928                 label_str[pos+1] = 'u';
 929                 label_str[pos+2] = '0';
 930                 label_str[pos+3] = '0';
 931
 932                 uint8_t ch = str[idx+1];
 933                 label_str[pos+4] = _hex[ch >> 4];
 934                 label_str[pos+5] = _hex[ch & 0x0F];
 935                 label_str[pos+6] = '\0';
 936             }
 937             pos += 6;
 938             idx += chlen;
 939             free_len -= 6;
 940             continue;
 941         }
 942
 943         /* Just copy */
 944         if (free_len >= chlen) {
 945             for (ssize_t j = 0; j < chlen; j++) {
 946                 label_str[pos+j] = str[idx+j];
 947             }
 948             label_str[pos+chlen] = '\0';
 949         }
 950         pos += chlen;
 951         idx += chlen;
 952         free_len -= chlen;
 953     }
 954
 955     return pos;
 956 }
 957
 958 size_t
 959 ws_label_strcat(char *label_str, size_t bufsize, const uint8_t *str, int flags)
 960 {
 961     return ws_label_strcpy(label_str, bufsize, strlen(label_str), str, flags);
 962 }
 963
 964 /*
 965  * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
 966  *
 967  * Local variables:
 968  * c-basic-offset: 4
 969  * tab-width: 8
 970  * indent-tabs-mode: nil
 971  * End:
 972  *
 973  * vi: set shiftwidth=4 tabstop=8 expandtab:
 974  * :indentSize=4:tabSize=8:noTabs=true:
 975  */