crypto/external/bsd/heimdal/dist/lib/wind/utf8.c

   1 /*      $NetBSD: utf8.c,v 1.1.1.2 2014/04/24 12:45:56 pettai Exp $      */
   2
   3 /*
   4  * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
   5  * (Royal Institute of Technology, Stockholm, Sweden).
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  *
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  *
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * 3. Neither the name of the Institute nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35
  36 #include <config.h>
  37 #include "windlocl.h"
  38
  39 static int
  40 utf8toutf32(const unsigned char **pp, uint32_t *out)
  41 {
  42     const unsigned char *p = *pp;
  43     unsigned c = *p;
  44
  45     if (c & 0x80) {
  46         if ((c & 0xE0) == 0xC0) {
  47             const unsigned c2 = *++p;
  48             if ((c2 & 0xC0) == 0x80) {
  49                 *out =  ((c  & 0x1F) << 6)
  50                     | (c2 & 0x3F);
  51             } else {
  52                 return WIND_ERR_INVALID_UTF8;
  53             }
  54         } else if ((c & 0xF0) == 0xE0) {
  55             const unsigned c2 = *++p;
  56             if ((c2 & 0xC0) == 0x80) {
  57                 const unsigned c3 = *++p;
  58                 if ((c3 & 0xC0) == 0x80) {
  59                     *out =   ((c  & 0x0F) << 12)
  60                         | ((c2 & 0x3F) << 6)
  61                         |  (c3 & 0x3F);
  62                 } else {
  63                     return WIND_ERR_INVALID_UTF8;
  64                 }
  65             } else {
  66                 return WIND_ERR_INVALID_UTF8;
  67             }
  68         } else if ((c & 0xF8) == 0xF0) {
  69             const unsigned c2 = *++p;
  70             if ((c2 & 0xC0) == 0x80) {
  71                 const unsigned c3 = *++p;
  72                 if ((c3 & 0xC0) == 0x80) {
  73                     const unsigned c4 = *++p;
  74                     if ((c4 & 0xC0) == 0x80) {
  75                         *out =   ((c  & 0x07) << 18)
  76                             | ((c2 & 0x3F) << 12)
  77                             | ((c3 & 0x3F) <<  6)
  78                             |  (c4 & 0x3F);
  79                     } else {
  80                         return WIND_ERR_INVALID_UTF8;
  81                     }
  82                 } else {
  83                     return WIND_ERR_INVALID_UTF8;
  84                 }
  85             } else {
  86                 return WIND_ERR_INVALID_UTF8;
  87             }
  88         } else {
  89             return WIND_ERR_INVALID_UTF8;
  90         }
  91     } else {
  92         *out = c;
  93     }
  94
  95     *pp = p;
  96
  97     return 0;
  98 }
  99
 100 /**
 101  * Convert an UTF-8 string to an UCS4 string.
 102  *
 103  * @param in an UTF-8 string to convert.
 104  * @param out the resulting UCS4 strint, must be at least
 105  * wind_utf8ucs4_length() long.  If out is NULL, the function will
 106  * calculate the needed space for the out variable (just like
 107  * wind_utf8ucs4_length()).
 108  * @param out_len before processing out_len should be the length of
 109  * the out variable, after processing it will be the length of the out
 110  * string.
 111  *
 112  * @return returns 0 on success, an wind error code otherwise
 113  * @ingroup wind
 114  */
 115
 116 int
 117 wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
 118 {
 119     const unsigned char *p;
 120     size_t o = 0;
 121     int ret;
 122
 123     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
 124         uint32_t u;
 125
 126         ret = utf8toutf32(&p, &u);
 127         if (ret)
 128             return ret;
 129
 130         if (out) {
 131             if (o >= *out_len)
 132                 return WIND_ERR_OVERRUN;
 133             out[o] = u;
 134         }
 135         o++;
 136     }
 137     *out_len = o;
 138     return 0;
 139 }
 140
 141 /**
 142  * Calculate the length of from converting a UTF-8 string to a UCS4
 143  * string.
 144  *
 145  * @param in an UTF-8 string to convert.
 146  * @param out_len the length of the resulting UCS4 string.
 147  *
 148  * @return returns 0 on success, an wind error code otherwise
 149  * @ingroup wind
 150  */
 151
 152 int
 153 wind_utf8ucs4_length(const char *in, size_t *out_len)
 154 {
 155     return wind_utf8ucs4(in, NULL, out_len);
 156 }
 157
 158 static const char first_char[4] =
 159     { 0x00, 0xC0, 0xE0, 0xF0 };
 160
 161 /**
 162  * Convert an UCS4 string to a UTF-8 string.
 163  *
 164  * @param in an UCS4 string to convert.
 165  * @param in_len the length input array.
 166
 167  * @param out the resulting UTF-8 strint, must be at least
 168  * wind_ucs4utf8_length() + 1 long (the extra char for the NUL).  If
 169  * out is NULL, the function will calculate the needed space for the
 170  * out variable (just like wind_ucs4utf8_length()).
 171
 172  * @param out_len before processing out_len should be the length of
 173  * the out variable, after processing it will be the length of the out
 174  * string.
 175  *
 176  * @return returns 0 on success, an wind error code otherwise
 177  * @ingroup wind
 178  */
 179
 180 int
 181 wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
 182 {
 183     uint32_t ch;
 184     size_t i, len, o;
 185
 186     for (o = 0, i = 0; i < in_len; i++) {
 187         ch = in[i];
 188
 189         if (ch < 0x80) {
 190             len = 1;
 191         } else if (ch < 0x800) {
 192             len = 2;
 193         } else if (ch < 0x10000) {
 194             len = 3;
 195         } else if (ch <= 0x10FFFF) {
 196             len = 4;
 197         } else
 198             return WIND_ERR_INVALID_UTF32;
 199
 200         o += len;
 201
 202         if (out) {
 203             if (o >= *out_len)
 204                 return WIND_ERR_OVERRUN;
 205
 206             switch(len) {
 207             case 4:
 208                 out[3] = (ch | 0x80) & 0xbf;
 209                 ch = ch >> 6;
 210             case 3:
 211                 out[2] = (ch | 0x80) & 0xbf;
 212                 ch = ch >> 6;
 213             case 2:
 214                 out[1] = (ch | 0x80) & 0xbf;
 215                 ch = ch >> 6;
 216             case 1:
 217                 out[0] = ch | first_char[len - 1];
 218             }
 219         }
 220         out += len;
 221     }
 222     if (out) {
 223         if (o + 1 >= *out_len)
 224             return WIND_ERR_OVERRUN;
 225         *out = '\0';
 226     }
 227     *out_len = o;
 228     return 0;
 229 }
 230
 231 /**
 232  * Calculate the length of from converting a UCS4 string to an UTF-8 string.
 233  *
 234  * @param in an UCS4 string to convert.
 235  * @param in_len the length of UCS4 string to convert.
 236  * @param out_len the length of the resulting UTF-8 string.
 237  *
 238  * @return returns 0 on success, an wind error code otherwise
 239  * @ingroup wind
 240  */
 241
 242 int
 243 wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
 244 {
 245     return wind_ucs4utf8(in, in_len, NULL, out_len);
 246 }
 247
 248 /**
 249  * Read in an UCS2 from a buffer.
 250  *
 251  * @param ptr The input buffer to read from.
 252  * @param len the length of the input buffer.
 253  * @param flags Flags to control the behavior of the function.
 254  * @param out the output UCS2, the array must be at least out/2 long.
 255  * @param out_len the output length
 256  *
 257  * @return returns 0 on success, an wind error code otherwise.
 258  * @ingroup wind
 259  */
 260
 261 int
 262 wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
 263               uint16_t *out, size_t *out_len)
 264 {
 265     const unsigned char *p = ptr;
 266     int little = ((*flags) & WIND_RW_LE);
 267     size_t olen = *out_len;
 268
 269     /** if len is zero, flags are unchanged */
 270     if (len == 0) {
 271         *out_len = 0;
 272         return 0;
 273     }
 274
 275     /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
 276     if (len & 1)
 277         return WIND_ERR_LENGTH_NOT_MOD2;
 278
 279     /**
 280      * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is
 281      * found, check is LE/BE flag is already and use that otherwise
 282      * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and
 283      * the LE/BE flag and set the resulting LE/BE flag.
 284      */
 285     if ((*flags) & WIND_RW_BOM) {
 286         uint16_t bom = (p[0] << 8) + p[1];
 287         if (bom == 0xfffe || bom == 0xfeff) {
 288             little = (bom == 0xfffe);
 289             p += 2;
 290             len -= 2;
 291         } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
 292             /* little already set */
 293         } else
 294             return WIND_ERR_NO_BOM;
 295         *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
 296         *flags |= little ? WIND_RW_LE : WIND_RW_BE;
 297     }
 298
 299     while (len) {
 300         if (olen < 1)
 301             return WIND_ERR_OVERRUN;
 302         if (little)
 303             *out = (p[1] << 8) + p[0];
 304         else
 305             *out = (p[0] << 8) + p[1];
 306         out++; p += 2; len -= 2; olen--;
 307     }
 308     *out_len -= olen;
 309     return 0;
 310 }
 311
 312 /**
 313  * Write an UCS2 string to a buffer.
 314  *
 315  * @param in The input UCS2 string.
 316  * @param in_len the length of the input buffer.
 317  * @param flags Flags to control the behavior of the function.
 318  * @param ptr The input buffer to write to, the array must be at least
 319  * (in + 1) * 2 bytes long.
 320  * @param out_len the output length
 321  *
 322  * @return returns 0 on success, an wind error code otherwise.
 323  * @ingroup wind
 324  */
 325
 326 int
 327 wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
 328                void *ptr, size_t *out_len)
 329 {
 330     unsigned char *p = ptr;
 331     size_t len = *out_len;
 332
 333     /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/
 334     if (len & 1)
 335         return WIND_ERR_LENGTH_NOT_MOD2;
 336
 337     /** On zero input length, flags are preserved */
 338     if (in_len == 0) {
 339         *out_len = 0;
 340         return 0;
 341     }
 342     /** If flags have WIND_RW_BOM set, the byte order mark is written
 343      * first to the output data */
 344     if ((*flags) & WIND_RW_BOM) {
 345         uint16_t bom = 0xfffe;
 346
 347         if (len < 2)
 348             return WIND_ERR_OVERRUN;
 349
 350         if ((*flags) & WIND_RW_LE) {
 351             p[0] = (bom     ) & 0xff;
 352             p[1] = (bom >> 8) & 0xff;
 353         } else {
 354             p[1] = (bom     ) & 0xff;
 355             p[0] = (bom >> 8) & 0xff;
 356         }
 357         len -= 2;
 358     }
 359
 360     while (in_len) {
 361         /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
 362         if (len < 2)
 363             return WIND_ERR_OVERRUN;
 364         if ((*flags) & WIND_RW_LE) {
 365             p[0] = (in[0]     ) & 0xff;
 366             p[1] = (in[0] >> 8) & 0xff;
 367         } else {
 368             p[1] = (in[0]     ) & 0xff;
 369             p[0] = (in[0] >> 8) & 0xff;
 370         }
 371         len -= 2;
 372         in_len--;
 373         p += 2;
 374         in++;
 375     }
 376     *out_len -= len;
 377     return 0;
 378 }
 379
 380
 381 /**
 382  * Convert an UTF-8 string to an UCS2 string.
 383  *
 384  * @param in an UTF-8 string to convert.
 385  * @param out the resulting UCS2 strint, must be at least
 386  * wind_utf8ucs2_length() long.  If out is NULL, the function will
 387  * calculate the needed space for the out variable (just like
 388  * wind_utf8ucs2_length()).
 389  * @param out_len before processing out_len should be the length of
 390  * the out variable, after processing it will be the length of the out
 391  * string.
 392  *
 393  * @return returns 0 on success, an wind error code otherwise
 394  * @ingroup wind
 395  */
 396
 397 int
 398 wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
 399 {
 400     const unsigned char *p;
 401     size_t o = 0;
 402     int ret;
 403
 404     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
 405         uint32_t u;
 406
 407         ret = utf8toutf32(&p, &u);
 408         if (ret)
 409             return ret;
 410
 411         if (u & 0xffff0000)
 412             return WIND_ERR_NOT_UTF16;
 413
 414         if (out) {
 415             if (o >= *out_len)
 416                 return WIND_ERR_OVERRUN;
 417             out[o] = u;
 418         }
 419         o++;
 420     }
 421     *out_len = o;
 422     return 0;
 423 }
 424
 425 /**
 426  * Calculate the length of from converting a UTF-8 string to a UCS2
 427  * string.
 428  *
 429  * @param in an UTF-8 string to convert.
 430  * @param out_len the length of the resulting UCS4 string.
 431  *
 432  * @return returns 0 on success, an wind error code otherwise
 433  * @ingroup wind
 434  */
 435
 436 int
 437 wind_utf8ucs2_length(const char *in, size_t *out_len)
 438 {
 439     return wind_utf8ucs2(in, NULL, out_len);
 440 }
 441
 442 /**
 443  * Convert an UCS2 string to a UTF-8 string.
 444  *
 445  * @param in an UCS2 string to convert.
 446  * @param in_len the length of the in UCS2 string.
 447  * @param out the resulting UTF-8 strint, must be at least
 448  * wind_ucs2utf8_length() long.  If out is NULL, the function will
 449  * calculate the needed space for the out variable (just like
 450  * wind_ucs2utf8_length()).
 451  * @param out_len before processing out_len should be the length of
 452  * the out variable, after processing it will be the length of the out
 453  * string.
 454  *
 455  * @return returns 0 on success, an wind error code otherwise
 456  * @ingroup wind
 457  */
 458
 459 int
 460 wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
 461 {
 462     uint16_t ch;
 463     size_t i, len, o;
 464
 465     for (o = 0, i = 0; i < in_len; i++) {
 466         ch = in[i];
 467
 468         if (ch < 0x80) {
 469             len = 1;
 470         } else if (ch < 0x800) {
 471             len = 2;
 472         } else
 473             len = 3;
 474
 475         o += len;
 476
 477         if (out) {
 478             if (o >= *out_len)
 479                 return WIND_ERR_OVERRUN;
 480
 481             switch(len) {
 482             case 3:
 483                 out[2] = (ch | 0x80) & 0xbf;
 484                 ch = ch >> 6;
 485             case 2:
 486                 out[1] = (ch | 0x80) & 0xbf;
 487                 ch = ch >> 6;
 488             case 1:
 489                 out[0] = ch | first_char[len - 1];
 490             }
 491             out += len;
 492         }
 493     }
 494     if (out) {
 495         if (o >= *out_len)
 496             return WIND_ERR_OVERRUN;
 497         *out = '\0';
 498     }
 499     *out_len = o;
 500     return 0;
 501 }
 502
 503 /**
 504  * Calculate the length of from converting a UCS2 string to an UTF-8 string.
 505  *
 506  * @param in an UCS2 string to convert.
 507  * @param in_len an UCS2 string length to convert.
 508  * @param out_len the length of the resulting UTF-8 string.
 509  *
 510  * @return returns 0 on success, an wind error code otherwise
 511  * @ingroup wind
 512  */
 513
 514 int
 515 wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
 516 {
 517     return wind_ucs2utf8(in, in_len, NULL, out_len);
 518 }