external/bsd/openldap/dist/libraries/libldap/utf-8-conv.c

   1 /* $OpenLDAP: pkg/ldap/libraries/libldap/utf-8-conv.c,v 1.16.2.3 2008/02/11 23:26:41 kurt Exp $ */
   2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
   3  *
   4  * Copyright 1998-2008 The OpenLDAP Foundation.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted only as authorized by the OpenLDAP
   9  * Public License.
  10  *
  11  * A copy of this license is available in the file LICENSE in the
  12  * top-level directory of the distribution or, alternatively, at
  13  * <http://www.OpenLDAP.org/license.html>.
  14  */
  15 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
  16  *
  17  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
  18  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
  19  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
  20  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
  21  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
  22  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
  23  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
  24  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  25  *---
  26  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
  27  * can be found in the file "build/LICENSE-2.0.1" in this distribution
  28  * of OpenLDAP Software.
  29  */
  30
  31 /*
  32  * UTF-8 Conversion Routines
  33  *
  34  * These routines convert between Wide Character and UTF-8,
  35  * or between MultiByte and UTF-8 encodings.
  36  *
  37  * Both single character and string versions of the functions are provided.
  38  * All functions return -1 if the character or string cannot be converted.
  39  */
  40
  41 #include "portable.h"
  42
  43 #if SIZEOF_WCHAR_T >= 4
  44 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
  45
#include <limits.h>		/* For MB_LEN_MAX */
#include <stdio.h>
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */
  50
  51 #include "ldap-int.h"
  52
  53 #include <ldap_utf8.h>
  54
  55 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  56
  57
  58 /*-----------------------------------------------------------------------------
  59                                         UTF-8 Format Summary
  60
  61 ASCII chars                                             7 bits
  62     0xxxxxxx
  63
  64 2-character UTF-8 sequence:        11 bits
  65     110xxxxx  10xxxxxx
  66
  67 3-character UTF-8                  16 bits
  68     1110xxxx  10xxxxxx  10xxxxxx
  69
  70 4-char UTF-8                       21 bits
  71     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
  72
  73 5-char UTF-8                       26 bits
  74     111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
  75
  76 6-char UTF-8                       31 bits
  77     1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
  78
  79 Unicode address space   (0 - 0x10FFFF)    21 bits
  80 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
  81
  82 Note: This code does not prevent UTF-8 sequences which are longer than
  83       necessary from being decoded.
  84 */
  85
  86 /*-----------------------------------------------------------------------------
  87    Convert a UTF-8 character to a wide char.
  88    Return the length of the UTF-8 input character in bytes.
  89 */
  90 int
  91 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
  92 {
  93         int utflen, i;
  94         wchar_t ch;
  95
  96         if (utf8char == NULL) return -1;
  97
  98         /* Get UTF-8 sequence length from 1st byte */
  99         utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
 100
 101         if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
 102
 103         /* First byte minus length tag */
 104         ch = (wchar_t)(utf8char[0] & mask[utflen]);
 105
 106         for(i=1; i < utflen; i++) {
 107                 /* Subsequent bytes must start with 10 */
 108                 if ((utf8char[i] & 0xc0) != 0x80) return -1;
 109
 110                 ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 111                 ch |= (wchar_t)(utf8char[i] & 0x3f);
 112         }
 113
 114         if (wchar) *wchar = ch;
 115
 116         return utflen;
 117 }
 118
 119 /*-----------------------------------------------------------------------------
 120    Convert a UTF-8 string to a wide char string.
 121    No more than 'count' wide chars will be written to the output buffer.
 122    Return the size of the converted string in wide chars, excl null terminator.
 123 */
 124 int
 125 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
 126 {
 127         size_t wclen = 0;
 128         int utflen, i;
 129         wchar_t ch;
 130
 131
 132         /* If input ptr is NULL or empty... */
 133         if (utf8str == NULL || !*utf8str) {
 134                 if ( wcstr )
 135                         *wcstr = 0;
 136                 return 0;
 137         }
 138
 139         /* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
 140         while ( *utf8str && (wcstr==NULL || wclen<count) ) {
 141                 /* Get UTF-8 sequence length from 1st byte */
 142                 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
 143
 144                 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
 145
 146                 /* First byte minus length tag */
 147                 ch = (wchar_t)(utf8str[0] & mask[utflen]);
 148
 149                 for(i=1; i < utflen; i++) {
 150                         /* Subsequent bytes must start with 10 */
 151                         if ((utf8str[i] & 0xc0) != 0x80) return -1;
 152
 153                         ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 154                         ch |= (wchar_t)(utf8str[i] & 0x3f);
 155                 }
 156
 157                 if (wcstr) wcstr[wclen] = ch;
 158
 159                 utf8str += utflen;      /* Move to next UTF-8 character */
 160                 wclen++;                        /* Count number of wide chars stored/required */
 161         }
 162
 163         /* Add null terminator if there's room in the buffer. */
 164         if (wcstr && wclen < count) wcstr[wclen] = 0;
 165
 166         return wclen;
 167 }
 168
 169
 170 /*-----------------------------------------------------------------------------
 171    Convert one wide char to a UTF-8 character.
 172    Return the length of the converted UTF-8 character in bytes.
 173    No more than 'count' bytes will be written to the output buffer.
 174 */
 175 int
 176 ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
 177 {
 178         int len=0;
 179
 180         if (utf8char == NULL)   /* Just determine the required UTF-8 char length. */
 181         {                                               /* Ignore count */
 182                 if( wchar < 0 )
 183                         return -1;
 184                 if( wchar < 0x80 )
 185                         return 1;
 186                 if( wchar < 0x800 )
 187                         return 2;
 188                 if( wchar < 0x10000 )
 189                         return 3;
 190                 if( wchar < 0x200000 )
 191                         return 4;
 192                 if( wchar < 0x4000000 )
 193                         return 5;
 194                 if( wchar < 0x80000000 )
 195                         return 6;
 196                 return -1;
 197         }
 198
 199
 200         if ( wchar < 0 ) {                              /* Invalid wide character */
 201                 len = -1;
 202
 203         } else if( wchar < 0x80 ) {
 204                 if (count >= 1) {
 205                         utf8char[len++] = (char)wchar;
 206                 }
 207
 208         } else if( wchar < 0x800 ) {
 209                 if (count >=2) {
 210                         utf8char[len++] = 0xc0 | ( wchar >> 6 );
 211                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 212                 }
 213
 214         } else if( wchar < 0x10000 ) {
 215                 if (count >= 3) {
 216                         utf8char[len++] = 0xe0 | ( wchar >> 12 );
 217                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 218                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 219                 }
 220
 221         } else if( wchar < 0x200000 ) {
 222                 if (count >= 4) {
 223                         utf8char[len++] = 0xf0 | ( wchar >> 18 );
 224                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 225                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 226                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 227                 }
 228
 229         } else if( wchar < 0x4000000 ) {
 230                 if (count >= 5) {
 231                         utf8char[len++] = 0xf8 | ( wchar >> 24 );
 232                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 233                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 234                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 235                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 236                 }
 237
 238         } else if( wchar < 0x80000000 ) {
 239                 if (count >= 6) {
 240                         utf8char[len++] = 0xfc | ( wchar >> 30 );
 241                         utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f );
 242                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 243                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 244                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 245                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 246                 }
 247
 248         } else
 249                 len = -1;
 250
 251         return len;
 252
 253 }
 254
 255
 256 /*-----------------------------------------------------------------------------
 257    Convert a wide char string to a UTF-8 string.
 258    No more than 'count' bytes will be written to the output buffer.
 259    Return the # of bytes written to the output buffer, excl null terminator.
 260 */
 261 int
 262 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
 263 {
 264         int len = 0;
 265         int n;
 266         char *p = utf8str;
 267         wchar_t empty = 0;              /* To avoid use of L"" construct */
 268
 269         if (wcstr == NULL)              /* Treat input ptr NULL as an empty string */
 270                 wcstr = &empty;
 271
 272         if (utf8str == NULL)    /* Just compute size of output, excl null */
 273         {
 274                 while (*wcstr)
 275                 {
 276                         /* Get UTF-8 size of next wide char */
 277                         n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
 278                         if (n == -1)
 279                                 return -1;
 280                         len += n;
 281                 }
 282
 283                 return len;
 284         }
 285
 286
 287         /* Do the actual conversion. */
 288
 289         n = 1;                                  /* In case of empty wcstr */
 290         while (*wcstr)
 291         {
 292                 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
 293
 294                 if (n <= 0)             /* If encoding error (-1) or won't fit (0), quit */
 295                         break;
 296
 297                 p += n;
 298                 count -= n;                     /* Space left in output buffer */
 299         }
 300
 301         /* If not enough room for last character, pad remainder with null
 302            so that return value = original count, indicating buffer full. */
 303         if (n == 0)
 304         {
 305                 while (count--)
 306                         *p++ = 0;
 307         }
 308
 309         /* Add a null terminator if there's room. */
 310         else if (count)
 311                 *p = 0;
 312
 313         if (n == -1)                    /* Conversion encountered invalid wide char. */
 314                 return -1;
 315
 316         /* Return the number of bytes written to output buffer, excl null. */
 317         return (p - utf8str);
 318 }
 319
 320
 321 /*-----------------------------------------------------------------------------
 322    Convert a UTF-8 character to a MultiByte character.
 323    Return the size of the converted character in bytes.
 324 */
 325 int
 326 ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
 327                 int (*f_wctomb)(char *mbchar, wchar_t wchar) )
 328 {
 329         wchar_t wchar;
 330         int n;
 331         char tmp[6];                            /* Large enough for biggest multibyte char */
 332
 333         if (f_wctomb == NULL)           /* If no conversion function was given... */
 334                 f_wctomb = wctomb;              /*    use the local ANSI C function */
 335
 336         /* First convert UTF-8 char to a wide char */
 337         n = ldap_x_utf8_to_wc( &wchar, utf8char);
 338
 339         if (n == -1)
 340                 return -1;              /* Invalid UTF-8 character */
 341
 342         if (mbchar == NULL)
 343                 n = f_wctomb( tmp, wchar );
 344         else
 345                 n = f_wctomb( mbchar, wchar);
 346
 347         return n;
 348 }
 349
 350 /*-----------------------------------------------------------------------------
 351    Convert a UTF-8 string to a MultiByte string.
 352    No more than 'count' bytes will be written to the output buffer.
 353    Return the size of the converted string in bytes, excl null terminator.
 354 */
 355 int
 356 ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
 357                 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
 358 {
 359         wchar_t *wcs;
 360         size_t wcsize;
 361     int n;
 362
 363         if (f_wcstombs == NULL)         /* If no conversion function was given... */
 364                 f_wcstombs = wcstombs;  /*    use the local ANSI C function */
 365
 366         if (utf8str == NULL || *utf8str == 0)   /* NULL or empty input string */
 367         {
 368                 if (mbstr)
 369                         *mbstr = 0;
 370                 return 0;
 371         }
 372
 373 /* Allocate memory for the maximum size wchar string that we could get. */
 374         wcsize = strlen(utf8str) + 1;
 375         wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
 376         if (wcs == NULL)
 377                 return -1;                              /* Memory allocation failure. */
 378
 379         /* First convert the UTF-8 string to a wide char string */
 380         n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);
 381
 382         /* Then convert wide char string to multi-byte string */
 383         if (n != -1)
 384         {
 385                 n = f_wcstombs(mbstr, wcs, count);
 386         }
 387
 388         LDAP_FREE(wcs);
 389
 390         return n;
 391 }
 392
 393 /*-----------------------------------------------------------------------------
 394    Convert a MultiByte character to a UTF-8 character.
 395    'mbsize' indicates the number of bytes of 'mbchar' to check.
 396    Returns the number of bytes written to the output character.
 397 */
 398 int
 399 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
 400                 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
 401 {
 402     wchar_t wchar;
 403     int n;
 404
 405         if (f_mbtowc == NULL)           /* If no conversion function was given... */
 406                 f_mbtowc = mbtowc;              /*    use the local ANSI C function */
 407
 408     if (mbsize == 0)                            /* 0 is not valid. */
 409         return -1;
 410
 411     if (mbchar == NULL || *mbchar == 0)
 412     {
 413         if (utf8char)
 414             *utf8char = 0;
 415         return 1;
 416     }
 417
 418         /* First convert the MB char to a Wide Char */
 419         n = f_mbtowc( &wchar, mbchar, mbsize);
 420
 421         if (n == -1)
 422                 return -1;
 423
 424         /* Convert the Wide Char to a UTF-8 character. */
 425         n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
 426
 427         return n;
 428 }
 429
 430
 431 /*-----------------------------------------------------------------------------
 432    Convert a MultiByte string to a UTF-8 string.
 433    No more than 'count' bytes will be written to the output buffer.
 434    Return the size of the converted string in bytes, excl null terminator.
 435 */
 436 int
 437 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
 438                 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
 439 {
 440         wchar_t *wcs;
 441         int n;
 442         size_t wcsize;
 443
 444         if (mbstr == NULL)                 /* Treat NULL input string as an empty string */
 445                 mbstr = "";
 446
 447         if (f_mbstowcs == NULL)         /* If no conversion function was given... */
 448                 f_mbstowcs = mbstowcs;  /*    use the local ANSI C function */
 449
 450         /* Allocate memory for the maximum size wchar string that we could get. */
 451         wcsize = strlen(mbstr) + 1;
 452         wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
 453         if (wcs == NULL)
 454                 return -1;
 455
 456         /* First convert multi-byte string to a wide char string */
 457         n = f_mbstowcs(wcs, mbstr, wcsize);
 458
 459         /* Convert wide char string to UTF-8 string */
 460         if (n != -1)
 461         {
 462                 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
 463         }
 464
 465         LDAP_FREE(wcs);
 466
 467         return n;
 468 }
 469
 470 #endif