usr/src/lib/gss_mechs/mech_krb5/support/utf8.c

   1 /*
   2  * util/support/utf8.c
   3  *
   4  * Copyright 2008 by the Massachusetts Institute of Technology.
   5  * All Rights Reserved.
   6  *
   7  * Export of this software from the United States of America may
   8  *   require a specific license from the United States Government.
   9  *   It is the responsibility of any person or organization contemplating
  10  *   export to obtain such a license before exporting.
  11  *
  12  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
  13  * distribute this software and its documentation for any purpose and
  14  * without fee is hereby granted, provided that the above copyright
  15  * notice appear in all copies and that both that copyright notice and
  16  * this permission notice appear in supporting documentation, and that
  17  * the name of M.I.T. not be used in advertising or publicity pertaining
  18  * to distribution of the software without specific, written prior
  19  * permission.  Furthermore if you modify this software you must label
  20  * your software as modified software and not distribute it in such a
  21  * fashion that it might be confused with the original M.I.T. software.
  22  * M.I.T. makes no representations about the suitability of
  23  * this software for any purpose.  It is provided "as is" without express
  24  * or implied warranty.
  25  */
  26 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
  27  *
  28  * Copyright 1998-2008 The OpenLDAP Foundation.
  29  * All rights reserved.
  30  *
  31  * Redistribution and use in source and binary forms, with or without
  32  * modification, are permitted only as authorized by the OpenLDAP
  33  * Public License.
  34  *
  35  * A copy of this license is available in the file LICENSE in the
  36  * top-level directory of the distribution or, alternatively, at
  37  * <http://www.OpenLDAP.org/license.html>.
  38  */
  39 /* Basic UTF-8 routines
  40  *
  41  * These routines are "dumb".  Though they understand UTF-8,
  42  * they don't grok Unicode.  That is, they can push bits,
  43  * but don't have a clue what the bits represent.  That's
  44  * good enough for use with the KRB5 Client SDK.
  45  *
  46  * These routines are not optimized.
  47  */
  48
  49 #include "k5-platform.h"
  50 #include "k5-utf8.h"
  51 #include "supp-int.h"
  52
  53 /*
  54  * return the number of bytes required to hold the
  55  * NULL-terminated UTF-8 string NOT INCLUDING the
  56  * termination.
  57  */
  58 size_t krb5int_utf8_bytes(const char *p)
  59 {
  60     size_t bytes;
  61
  62     for (bytes = 0; p[bytes]; bytes++)
  63         ;
  64
  65     return bytes;
  66 }
  67
  68 size_t krb5int_utf8_chars(const char *p)
  69 {
  70     /* could be optimized and could check for invalid sequences */
  71     size_t chars = 0;
  72
  73     for ( ; *p ; KRB5_UTF8_INCR(p))
  74         chars++;
  75
  76     return chars;
  77 }
  78
  79 size_t krb5int_utf8c_chars(const char *p, size_t length)
  80 {
  81     /* could be optimized and could check for invalid sequences */
  82     size_t chars = 0;
  83     const char *end = p + length;
  84
  85     for ( ; p < end; KRB5_UTF8_INCR(p))
  86         chars++;
  87
  88     return chars;
  89 }
  90
  91 /* return offset to next character */
  92 int krb5int_utf8_offset(const char *p)
  93 {
  94     return KRB5_UTF8_NEXT(p) - p;
  95 }
  96
  97 /*
  98  * Returns length indicated by first byte.
  99  */
 100 const char krb5int_utf8_lentab[] = {
 101     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 102     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 103     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 104     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 105     0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 106     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 107     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 108     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
 109
 110 int krb5int_utf8_charlen(const char *p)
 111 {
 112     if (!(*p & 0x80))
 113         return 1;
 114
 115     return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80];
 116 }
 117
 118 /*
 119  * Make sure the UTF-8 char used the shortest possible encoding
 120  * returns charlen if valid, 0 if not.
 121  *
 122  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
 123  * The table is slightly modified from that of the RFC.
 124  *
 125  * UCS-4 range (hex)      UTF-8 sequence (binary)
 126  * 0000 0000-0000 007F   0.......
 127  * 0000 0080-0000 07FF   110++++. 10......
 128  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
 129  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
 130  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
 131  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
 132  *
 133  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 134  * at least one of the '+' bits must be set, otherwise the character
 135  * should have been encoded in fewer octets. Note that in the two-octet
 136  * case, only the first octet needs to be validated, and this is done
 137  * in the krb5int_utf8_lentab[] above.
 138  */
 139
 140 /* mask of required bits in second octet */
 141 #undef c
 142 #define c const char
 143 c krb5int_utf8_mintab[] = {
 144     (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 145     (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 146     (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 147     (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
 148 #undef c
 149
 150 int krb5int_utf8_charlen2(const char *p)
 151 {
 152     int i = KRB5_UTF8_CHARLEN(p);
 153
 154     if (i > 2) {
 155         if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
 156             i = 0;
 157     }
 158
 159     return i;
 160 }
 161
 162 /*
 163  * Convert a UTF8 character to a UCS4 character.  Return 0 on success,
 164  * -1 on failure.
 165  */
 166 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
 167 {
 168     const unsigned char *c = (const unsigned char *) p;
 169     krb5_ucs4 ch;
 170     int len, i;
 171     static unsigned char mask[] = {
 172         0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 173
 174     *out = 0;
 175     len = KRB5_UTF8_CHARLEN2(p, len);
 176
 177     if (len == 0)
 178         return -1;
 179
 180     ch = c[0] & mask[len];
 181
 182     for (i = 1; i < len; i++) {
 183         if ((c[i] & 0xc0) != 0x80)
 184             return -1;
 185
 186         ch <<= 6;
 187         ch |= c[i] & 0x3f;
 188     }
 189
 190     *out = ch;
 191     return 0;
 192 }
 193
 194 int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
 195 {
 196     krb5_ucs4 ch;
 197
 198     *out = 0;
 199     if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF)
 200         return -1;
 201     *out = (krb5_ucs2) ch;
 202     return 0;
 203 }
 204
 205 /* conv UCS-2 to UTF-8, not used */
 206 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
 207 {
 208     size_t len = 0;
 209     unsigned char *p = (unsigned char *) buf;
 210
 211     /* not a valid Unicode character */
 212     if (c < 0)
 213         return 0;
 214
 215     /* Just return length, don't convert */
 216     if (buf == NULL) {
 217         if (c < 0x80) return 1;
 218         else if (c < 0x800) return 2;
 219         else if (c < 0x10000) return 3;
 220         else if (c < 0x200000) return 4;
 221         else if (c < 0x4000000) return 5;
 222         else return 6;
 223     }
 224
 225     if (c < 0x80) {
 226         p[len++] = c;
 227     } else if (c < 0x800) {
 228         p[len++] = 0xc0 | ( c >> 6 );
 229         p[len++] = 0x80 | ( c & 0x3f );
 230     } else if (c < 0x10000) {
 231         p[len++] = 0xe0 | ( c >> 12 );
 232         p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 233         p[len++] = 0x80 | ( c & 0x3f );
 234     } else if (c < 0x200000) {
 235         p[len++] = 0xf0 | ( c >> 18 );
 236         p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 237         p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 238         p[len++] = 0x80 | ( c & 0x3f );
 239     } else if (c < 0x4000000) {
 240         p[len++] = 0xf8 | ( c >> 24 );
 241         p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 242         p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 243         p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 244         p[len++] = 0x80 | ( c & 0x3f );
 245     } else /* if( c < 0x80000000 ) */ {
 246         p[len++] = 0xfc | ( c >> 30 );
 247         p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 248         p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 249         p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 250         p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 251         p[len++] = 0x80 | ( c & 0x3f );
 252     }
 253
 254     return len;
 255 }
 256
 257 size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
 258 {
 259     return krb5int_ucs4_to_utf8((krb5_ucs4)c, buf);
 260 }
 261
 262 #define KRB5_UCS_UTF8LEN(c)     \
 263     c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \
 264     (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6)))))
 265
 266 /*
 267  * Advance to the next UTF-8 character
 268  *
 269  * Ignores length of multibyte character, instead rely on
 270  * continuation markers to find start of next character.
 271  * This allows for "resyncing" of when invalid characters
 272  * are provided provided the start of the next character
 273  * is appears within the 6 bytes examined.
 274  */
 275 char *krb5int_utf8_next(const char *p)
 276 {
 277     int i;
 278     const unsigned char *u = (const unsigned char *) p;
 279
 280     if (KRB5_UTF8_ISASCII(u)) {
 281         return (char *) &p[1];
 282     }
 283
 284     for (i = 1; i < 6; i++) {
 285         if ((u[i] & 0xc0) != 0x80) {
 286             return (char *) &p[i];
 287         }
 288     }
 289
 290     return (char *) &p[i];
 291 }
 292
 293 /*
 294  * Advance to the previous UTF-8 character
 295  *
 296  * Ignores length of multibyte character, instead rely on
 297  * continuation markers to find start of next character.
 298  * This allows for "resyncing" of when invalid characters
 299  * are provided provided the start of the next character
 300  * is appears within the 6 bytes examined.
 301  */
 302 char *krb5int_utf8_prev(const char *p)
 303 {
 304     int i;
 305     const unsigned char *u = (const unsigned char *) p;
 306
 307     for (i = -1; i>-6 ; i--) {
 308         if ((u[i] & 0xc0 ) != 0x80) {
 309             return (char *) &p[i];
 310         }
 311     }
 312
 313     return (char *) &p[i];
 314 }
 315
 316 /*
 317  * Copy one UTF-8 character from src to dst returning
 318  * number of bytes copied.
 319  *
 320  * Ignores length of multibyte character, instead rely on
 321  * continuation markers to find start of next character.
 322  * This allows for "resyncing" of when invalid characters
 323  * are provided provided the start of the next character
 324  * is appears within the 6 bytes examined.
 325  */
 326 int krb5int_utf8_copy(char* dst, const char *src)
 327 {
 328     int i;
 329     const unsigned char *u = (const unsigned char *) src;
 330
 331     dst[0] = src[0];
 332
 333     if (KRB5_UTF8_ISASCII(u)) {
 334         return 1;
 335     }
 336
 337     for (i=1; i<6; i++) {
 338         if ((u[i] & 0xc0) != 0x80) {
 339             return i;
 340         }
 341         dst[i] = src[i];
 342     }
 343
 344     return i;
 345 }
 346
 347 #ifndef UTF8_ALPHA_CTYPE
 348 /*
 349  * UTF-8 ctype routines
 350  * Only deals with characters < 0x80 (ie: US-ASCII)
 351  */
 352
 353 int krb5int_utf8_isascii(const char * p)
 354 {
 355     unsigned c = * (const unsigned char *) p;
 356
 357     return KRB5_ASCII(c);
 358 }
 359
 360 int krb5int_utf8_isdigit(const char * p)
 361 {
 362     unsigned c = * (const unsigned char *) p;
 363
 364     if (!KRB5_ASCII(c))
 365         return 0;
 366
 367     return KRB5_DIGIT( c );
 368 }
 369
 370 int krb5int_utf8_isxdigit(const char * p)
 371 {
 372     unsigned c = * (const unsigned char *) p;
 373
 374     if (!KRB5_ASCII(c))
 375         return 0;
 376
 377     return KRB5_HEX(c);
 378 }
 379
 380 int krb5int_utf8_isspace(const char * p)
 381 {
 382     unsigned c = * (const unsigned char *) p;
 383
 384     if (!KRB5_ASCII(c))
 385         return 0;
 386
 387     switch(c) {
 388     case ' ':
 389     case '\t':
 390     case '\n':
 391     case '\r':
 392     case '\v':
 393     case '\f':
 394         return 1;
 395     }
 396
 397     return 0;
 398 }
 399
 400 /*
 401  * These are not needed by the C SDK and are
 402  * not "good enough" for general use.
 403  */
 404 int krb5int_utf8_isalpha(const char * p)
 405 {
 406     unsigned c = * (const unsigned char *) p;
 407
 408     if (!KRB5_ASCII(c))
 409         return 0;
 410
 411     return KRB5_ALPHA(c);
 412 }
 413
 414 int krb5int_utf8_isalnum(const char * p)
 415 {
 416     unsigned c = * (const unsigned char *) p;
 417
 418     if (!KRB5_ASCII(c))
 419         return 0;
 420
 421     return KRB5_ALNUM(c);
 422 }
 423
 424 #if 0
 425 int krb5int_utf8_islower(const char * p)
 426 {
 427     unsigned c = * (const unsigned char *) p;
 428
 429     if (!KRB5_ASCII(c))
 430         return 0;
 431
 432     return KRB5_LOWER(c);
 433 }
 434
 435 int krb5int_utf8_isupper(const char * p)
 436 {
 437     unsigned c = * (const unsigned char *) p;
 438
 439     if (!KRB5_ASCII(c))
 440         return 0;
 441
 442     return KRB5_UPPER(c);
 443 }
 444 #endif
 445 #endif
 446
 447
 448 /*
 449  * UTF-8 string routines
 450  */
 451
 452 /* like strchr() */
 453 char *krb5int_utf8_strchr(const char *str, const char *chr)
 454 {
 455     krb5_ucs4 chs, ch;
 456
 457     if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
 458         return NULL;
 459     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
 460         if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
 461             return (char *)str;
 462     }
 463
 464     return NULL;
 465 }
 466
 467 /* like strcspn() but returns number of bytes, not characters */
 468 size_t krb5int_utf8_strcspn(const char *str, const char *set)
 469 {
 470     const char *cstr, *cset;
 471     krb5_ucs4 chstr, chset;
 472
 473     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
 474         for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
 475             if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
 476                 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
 477                 return cstr - str;
 478         }
 479     }
 480
 481     return cstr - str;
 482 }
 483
 484 /* like strspn() but returns number of bytes, not characters */
 485 size_t krb5int_utf8_strspn(const char *str, const char *set)
 486 {
 487     const char *cstr, *cset;
 488     krb5_ucs4 chstr, chset;
 489
 490     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
 491         for (cset = set; ; KRB5_UTF8_INCR(cset)) {
 492             if (*cset == '\0')
 493                 return cstr - str;
 494             if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
 495                 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
 496                 break;
 497         }
 498     }
 499
 500     return cstr - str;
 501 }
 502
 503 /* like strpbrk(), replaces strchr() as well */
 504 char *krb5int_utf8_strpbrk(const char *str, const char *set)
 505 {
 506     const char *cset;
 507     krb5_ucs4 chstr, chset;
 508
 509     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
 510         for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
 511             if (krb5int_utf8_to_ucs4(str, &chstr) == 0
 512                 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
 513                 return (char *)str;
 514         }
 515     }
 516
 517     return NULL;
 518 }
 519
 520 /* like strtok_r(), not strtok() */
 521 char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
 522 {
 523     char *begin;
 524     char *end;
 525
 526     if (last == NULL)
 527         return NULL;
 528
 529     begin = str ? str : *last;
 530
 531     begin += krb5int_utf8_strspn(begin, sep);
 532
 533     if (*begin == '\0') {
 534         *last = NULL;
 535         return NULL;
 536     }
 537
 538     end = &begin[krb5int_utf8_strcspn(begin, sep)];
 539
 540     if (*end != '\0') {
 541         char *next = KRB5_UTF8_NEXT(end);
 542         *end = '\0';
 543         end = next;
 544     }
 545
 546     *last = end;
 547
 548     return begin;
 549 }