external/bsd/openldap/dist/libraries/libldap/utf-8.c

   1 /* utf-8.c -- Basic UTF-8 routines */
   2 /* $OpenLDAP: pkg/ldap/libraries/libldap/utf-8.c,v 1.36.2.3 2008/02/11 23:26:41 kurt Exp $ */
   3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
   4  *
   5  * Copyright 1998-2008 The OpenLDAP Foundation.
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted only as authorized by the OpenLDAP
  10  * Public License.
  11  *
  12  * A copy of this license is available in the file LICENSE in the
  13  * top-level directory of the distribution or, alternatively, at
  14  * <http://www.OpenLDAP.org/license.html>.
  15  */
  16 /* Basic UTF-8 routines
  17  *
  18  * These routines are "dumb".  Though they understand UTF-8,
  19  * they don't grok Unicode.  That is, they can push bits,
  20  * but don't have a clue what the bits represent.  That's
  21  * good enough for use with the LDAP Client SDK.
  22  *
  23  * These routines are not optimized.
  24  */
  25
  26 #include "portable.h"
  27
  28 #include <stdio.h>
  29
  30 #include <ac/stdlib.h>
  31
  32 #include <ac/socket.h>
  33 #include <ac/string.h>
  34 #include <ac/time.h>
  35
  36 #include "ldap_utf8.h"
  37
  38 #include "ldap-int.h"
  39 #include "ldap_defaults.h"
  40
  41 /*
  42  * return the number of bytes required to hold the
  43  * NULL-terminated UTF-8 string NOT INCLUDING the
  44  * termination.
  45  */
  46 ber_len_t ldap_utf8_bytes( const char * p )
  47 {
  48         ber_len_t bytes;
  49
  50         for( bytes=0; p[bytes]; bytes++ ) {
  51                 /* EMPTY */ ;
  52         }
  53
  54         return bytes;
  55 }
  56
  57 ber_len_t ldap_utf8_chars( const char * p )
  58 {
  59         /* could be optimized and could check for invalid sequences */
  60         ber_len_t chars=0;
  61
  62         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  63                 chars++;
  64         }
  65
  66         return chars;
  67 }
  68
  69 /* return offset to next character */
  70 int ldap_utf8_offset( const char * p )
  71 {
  72         return LDAP_UTF8_NEXT(p) - p;
  73 }
  74
  75 /*
  76  * Returns length indicated by first byte.
  77  */
  78 const char ldap_utf8_lentab[] = {
  79         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  80         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  81         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  82         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  84         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  85         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  86         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
  87
  88 int ldap_utf8_charlen( const char * p )
  89 {
  90         if (!(*p & 0x80))
  91                 return 1;
  92
  93         return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
  94 }
  95
  96 /*
  97  * Make sure the UTF-8 char used the shortest possible encoding
  98  * returns charlen if valid, 0 if not.
  99  *
 100  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
 101  * The table is slightly modified from that of the RFC.
 102  *
 103  * UCS-4 range (hex)      UTF-8 sequence (binary)
 104  * 0000 0000-0000 007F   0.......
 105  * 0000 0080-0000 07FF   110++++. 10......
 106  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
 107  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
 108  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
 109  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
 110  *
 111  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 112  * at least one of the '+' bits must be set, otherwise the character
 113  * should have been encoded in fewer octets. Note that in the two-octet
 114  * case, only the first octet needs to be validated, and this is done
 115  * in the ldap_utf8_lentab[] above.
 116  */
 117
 118 /* mask of required bits in second octet */
 119 #undef c
 120 #define c const char
 121 c ldap_utf8_mintab[] = {
 122         (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 123         (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 124         (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 125         (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
 126 #undef c
 127
 128 int ldap_utf8_charlen2( const char * p )
 129 {
 130         int i = LDAP_UTF8_CHARLEN( p );
 131
 132         if ( i > 2 ) {
 133                 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
 134                         i = 0;
 135         }
 136         return i;
 137 }
 138
 139 /* conv UTF-8 to UCS-4, useful for comparisons */
 140 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
 141 {
 142     const unsigned char *c = (const unsigned char *) p;
 143     ldap_ucs4_t ch;
 144         int len, i;
 145         static unsigned char mask[] = {
 146                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 147
 148         len = LDAP_UTF8_CHARLEN2(p, len);
 149
 150         if( len == 0 ) return LDAP_UCS4_INVALID;
 151
 152         ch = c[0] & mask[len];
 153
 154         for(i=1; i < len; i++) {
 155                 if ((c[i] & 0xc0) != 0x80) {
 156                         return LDAP_UCS4_INVALID;
 157                 }
 158
 159                 ch <<= 6;
 160                 ch |= c[i] & 0x3f;
 161         }
 162
 163         return ch;
 164 }
 165
 166 /* conv UCS-4 to UTF-8, not used */
 167 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
 168 {
 169         int len=0;
 170         unsigned char* p = (unsigned char *) buf;
 171
 172         /* not a valid Unicode character */
 173         if ( c < 0 ) return 0;
 174
 175         /* Just return length, don't convert */
 176         if(buf == NULL) {
 177                 if( c < 0x80 ) return 1;
 178                 else if( c < 0x800 ) return 2;
 179                 else if( c < 0x10000 ) return 3;
 180                 else if( c < 0x200000 ) return 4;
 181                 else if( c < 0x4000000 ) return 5;
 182                 else return 6;
 183         }
 184
 185         if( c < 0x80 ) {
 186                 p[len++] = c;
 187
 188         } else if( c < 0x800 ) {
 189                 p[len++] = 0xc0 | ( c >> 6 );
 190                 p[len++] = 0x80 | ( c & 0x3f );
 191
 192         } else if( c < 0x10000 ) {
 193                 p[len++] = 0xe0 | ( c >> 12 );
 194                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 195                 p[len++] = 0x80 | ( c & 0x3f );
 196
 197         } else if( c < 0x200000 ) {
 198                 p[len++] = 0xf0 | ( c >> 18 );
 199                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 200                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 201                 p[len++] = 0x80 | ( c & 0x3f );
 202
 203         } else if( c < 0x4000000 ) {
 204                 p[len++] = 0xf8 | ( c >> 24 );
 205                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 206                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 207                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 208                 p[len++] = 0x80 | ( c & 0x3f );
 209
 210         } else /* if( c < 0x80000000 ) */ {
 211                 p[len++] = 0xfc | ( c >> 30 );
 212                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 213                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 214                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 215                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 216                 p[len++] = 0x80 | ( c & 0x3f );
 217         }
 218
 219         return len;
 220 }
 221
 222 #define LDAP_UCS_UTF8LEN(c)     \
 223         c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \
 224         (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6)))))
 225
 226 /* Convert a string to UTF-8 format. The input string is expected to
 227  * have characters of 1, 2, or 4 octets (in network byte order)
 228  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
 229  * types respectively. (Here T61STRING just means that there is one
 230  * octet per character and characters may use the high bit of the octet.
 231  * The characters are assumed to use ISO mappings, no provision is made
 232  * for converting from T.61 coding rules to Unicode.)
 233  */
 234
 235 int
 236 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
 237 {
 238         unsigned char *in, *end;
 239         char *ptr;
 240         ldap_ucs4_t u;
 241         int i, l = 0;
 242
 243         utf8s->bv_val = NULL;
 244         utf8s->bv_len = 0;
 245
 246         in = (unsigned char *)ucs->bv_val;
 247
 248         /* Make sure we stop at an even multiple of csize */
 249         end = in + ( ucs->bv_len & ~(csize-1) );
 250
 251         for (; in < end; ) {
 252                 u = *in++;
 253                 if (csize > 1) {
 254                         u <<= 8;
 255                         u |= *in++;
 256                 }
 257                 if (csize > 2) {
 258                         u <<= 8;
 259                         u |= *in++;
 260                         u <<= 8;
 261                         u |= *in++;
 262                 }
 263                 i = LDAP_UCS_UTF8LEN(u);
 264                 if (i == 0)
 265                         return LDAP_INVALID_SYNTAX;
 266                 l += i;
 267         }
 268
 269         utf8s->bv_val = LDAP_MALLOC( l+1 );
 270         if (utf8s->bv_val == NULL)
 271                 return LDAP_NO_MEMORY;
 272         utf8s->bv_len = l;
 273
 274         ptr = utf8s->bv_val;
 275         for (in = (unsigned char *)ucs->bv_val; in < end; ) {
 276                 u = *in++;
 277                 if (csize > 1) {
 278                         u <<= 8;
 279                         u |= *in++;
 280                 }
 281                 if (csize > 2) {
 282                         u <<= 8;
 283                         u |= *in++;
 284                         u <<= 8;
 285                         u |= *in++;
 286                 }
 287                 ptr += ldap_x_ucs4_to_utf8(u, ptr);
 288         }
 289         *ptr = '\0';
 290         return LDAP_SUCCESS;
 291 }
 292
 293 /*
 294  * Advance to the next UTF-8 character
 295  *
 296  * Ignores length of multibyte character, instead rely on
 297  * continuation markers to find start of next character.
 298  * This allows for "resyncing" of when invalid characters
 299  * are provided provided the start of the next character
 300  * is appears within the 6 bytes examined.
 301  */
 302 char* ldap_utf8_next( const char * p )
 303 {
 304         int i;
 305         const unsigned char *u = (const unsigned char *) p;
 306
 307         if( LDAP_UTF8_ISASCII(u) ) {
 308                 return (char *) &p[1];
 309         }
 310
 311         for( i=1; i<6; i++ ) {
 312                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 313                         return (char *) &p[i];
 314                 }
 315         }
 316
 317         return (char *) &p[i];
 318 }
 319
 320 /*
 321  * Advance to the previous UTF-8 character
 322  *
 323  * Ignores length of multibyte character, instead rely on
 324  * continuation markers to find start of next character.
 325  * This allows for "resyncing" of when invalid characters
 326  * are provided provided the start of the next character
 327  * is appears within the 6 bytes examined.
 328  */
 329 char* ldap_utf8_prev( const char * p )
 330 {
 331         int i;
 332         const unsigned char *u = (const unsigned char *) p;
 333
 334         for( i=-1; i>-6 ; i-- ) {
 335                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 336                         return (char *) &p[i];
 337                 }
 338         }
 339
 340         return (char *) &p[i];
 341 }
 342
 343 /*
 344  * Copy one UTF-8 character from src to dst returning
 345  * number of bytes copied.
 346  *
 347  * Ignores length of multibyte character, instead rely on
 348  * continuation markers to find start of next character.
 349  * This allows for "resyncing" of when invalid characters
 350  * are provided provided the start of the next character
 351  * is appears within the 6 bytes examined.
 352  */
 353 int ldap_utf8_copy( char* dst, const char *src )
 354 {
 355         int i;
 356         const unsigned char *u = (const unsigned char *) src;
 357
 358         dst[0] = src[0];
 359
 360         if( LDAP_UTF8_ISASCII(u) ) {
 361                 return 1;
 362         }
 363
 364         for( i=1; i<6; i++ ) {
 365                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 366                         return i;
 367                 }
 368                 dst[i] = src[i];
 369         }
 370
 371         return i;
 372 }
 373
 374 #ifndef UTF8_ALPHA_CTYPE
 375 /*
 376  * UTF-8 ctype routines
 377  * Only deals with characters < 0x80 (ie: US-ASCII)
 378  */
 379
 380 int ldap_utf8_isascii( const char * p )
 381 {
 382         unsigned c = * (const unsigned char *) p;
 383         return LDAP_ASCII(c);
 384 }
 385
 386 int ldap_utf8_isdigit( const char * p )
 387 {
 388         unsigned c = * (const unsigned char *) p;
 389
 390         if(!LDAP_ASCII(c)) return 0;
 391
 392         return LDAP_DIGIT( c );
 393 }
 394
 395 int ldap_utf8_isxdigit( const char * p )
 396 {
 397         unsigned c = * (const unsigned char *) p;
 398
 399         if(!LDAP_ASCII(c)) return 0;
 400
 401         return LDAP_HEX(c);
 402 }
 403
 404 int ldap_utf8_isspace( const char * p )
 405 {
 406         unsigned c = * (const unsigned char *) p;
 407
 408         if(!LDAP_ASCII(c)) return 0;
 409
 410         switch(c) {
 411         case ' ':
 412         case '\t':
 413         case '\n':
 414         case '\r':
 415         case '\v':
 416         case '\f':
 417                 return 1;
 418         }
 419
 420         return 0;
 421 }
 422
 423 /*
 424  * These are not needed by the C SDK and are
 425  * not "good enough" for general use.
 426  */
 427 int ldap_utf8_isalpha( const char * p )
 428 {
 429         unsigned c = * (const unsigned char *) p;
 430
 431         if(!LDAP_ASCII(c)) return 0;
 432
 433         return LDAP_ALPHA(c);
 434 }
 435
 436 int ldap_utf8_isalnum( const char * p )
 437 {
 438         unsigned c = * (const unsigned char *) p;
 439
 440         if(!LDAP_ASCII(c)) return 0;
 441
 442         return LDAP_ALNUM(c);
 443 }
 444
 445 int ldap_utf8_islower( const char * p )
 446 {
 447         unsigned c = * (const unsigned char *) p;
 448
 449         if(!LDAP_ASCII(c)) return 0;
 450
 451         return LDAP_LOWER(c);
 452 }
 453
 454 int ldap_utf8_isupper( const char * p )
 455 {
 456         unsigned c = * (const unsigned char *) p;
 457
 458         if(!LDAP_ASCII(c)) return 0;
 459
 460         return LDAP_UPPER(c);
 461 }
 462 #endif
 463
 464
 465 /*
 466  * UTF-8 string routines
 467  */
 468
 469 /* like strchr() */
 470 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 471 {
 472         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 473                 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
 474                         return (char *) str;
 475                 }
 476         }
 477
 478         return NULL;
 479 }
 480
 481 /* like strcspn() but returns number of bytes, not characters */
 482 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 483 {
 484         const char *cstr;
 485         const char *cset;
 486
 487         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 488                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 489                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 490                                 return cstr - str;
 491                         }
 492                 }
 493         }
 494
 495         return cstr - str;
 496 }
 497
 498 /* like strspn() but returns number of bytes, not characters */
 499 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 500 {
 501         const char *cstr;
 502         const char *cset;
 503
 504         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 505                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 506                         if( *cset == '\0' ) {
 507                                 return cstr - str;
 508                         }
 509
 510                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 511                                 break;
 512                         }
 513                 }
 514         }
 515
 516         return cstr - str;
 517 }
 518
 519 /* like strpbrk(), replaces strchr() as well */
 520 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 521 {
 522         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 523                 const char *cset;
 524
 525                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 526                         if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
 527                                 return (char *) str;
 528                         }
 529                 }
 530         }
 531
 532         return NULL;
 533 }
 534
 535 /* like strtok_r(), not strtok() */
 536 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 537 {
 538         char *begin;
 539         char *end;
 540
 541         if( last == NULL ) return NULL;
 542
 543         begin = str ? str : *last;
 544
 545         begin += ldap_utf8_strspn( begin, sep );
 546
 547         if( *begin == '\0' ) {
 548                 *last = NULL;
 549                 return NULL;
 550         }
 551
 552         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 553
 554         if( *end != '\0' ) {
 555                 char *next = LDAP_UTF8_NEXT( end );
 556                 *end = '\0';
 557                 end = next;
 558         }
 559
 560         *last = end;
 561         return begin;
 562 }