external/bsd/bind/dist/contrib/idn/idnkit-1.0-src/lib/ucs4.c

   1 /*      $NetBSD: ucs4.c,v 1.4 2014/12/10 04:37:55 christos Exp $        */
   2
   3 #ifndef lint
   4 static char *rcsid = "Id: ucs4.c,v 1.1 2003/06/04 00:26:14 marka Exp ";
   5 #endif
   6
   7 /*
   8  * Copyright (c) 2001 Japan Network Information Center.  All rights reserved.
   9  *
  10  * By using this file, you agree to the terms and conditions set forth bellow.
  11  *
  12  *                      LICENSE TERMS AND CONDITIONS
  13  *
  14  * The following License Terms and Conditions apply, unless a different
  15  * license is obtained from Japan Network Information Center ("JPNIC"),
  16  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
  17  * Chiyoda-ku, Tokyo 101-0047, Japan.
  18  *
  19  * 1. Use, Modification and Redistribution (including distribution of any
  20  *    modified or derived work) in source and/or binary forms is permitted
  21  *    under this License Terms and Conditions.
  22  *
  23  * 2. Redistribution of source code must retain the copyright notices as they
  24  *    appear in each source code file, this License Terms and Conditions.
  25  *
  26  * 3. Redistribution in binary form must reproduce the Copyright Notice,
  27  *    this License Terms and Conditions, in the documentation and/or other
  28  *    materials provided with the distribution.  For the purposes of binary
  29  *    distribution the "Copyright Notice" refers to the following language:
  30  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
  31  *
  32  * 4. The name of JPNIC may not be used to endorse or promote products
  33  *    derived from this Software without specific prior written approval of
  34  *    JPNIC.
  35  *
  36  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
  37  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  38  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  39  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
  40  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  41  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  42  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  43  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  44  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  45  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  46  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
  47  */
  48
  49 #include <config.h>
  50
  51 #include <stddef.h>
  52 #include <stdlib.h>
  53 #include <string.h>
  54
  55 #include <idn/assert.h>
  56 #include <idn/result.h>
  57 #include <idn/logmacro.h>
  58 #include <idn/util.h>
  59 #include <idn/ucs4.h>
  60 #include <idn/debug.h>
  61
  62 /*
  63  * Unicode surrogate pair.
  64  */
  65 #define IS_SURROGATE_HIGH(v)    (0xd800 <= (v) && (v) <= 0xdbff)
  66 #define IS_SURROGATE_LOW(v)     (0xdc00 <= (v) && (v) <= 0xdfff)
  67 #define SURROGATE_HIGH(v)       (SURROGATE_H_OFF + (((v) - 0x10000) >> 10))
  68 #define SURROGATE_LOW(v)        (SURROGATE_L_OFF + ((v) & 0x3ff))
  69 #define SURROGATE_BASE          0x10000
  70 #define SURROGATE_H_OFF         0xd800
  71 #define SURROGATE_L_OFF         0xdc00
  72 #define COMBINE_SURROGATE(h, l) \
  73         (SURROGATE_BASE + (((h)-SURROGATE_H_OFF)<<10) + ((l)-SURROGATE_L_OFF))
  74
  75 /*
  76  * ASCII ctype macros.
  77  * Note that these macros evaluate the argument multiple times.  Be careful.
  78  */
  79 #define ASCII_TOUPPER(c) \
  80         (('a' <= (c) && (c) <= 'z') ? ((c) - 'a' + 'A') : (c))
  81 #define ASCII_TOLOWER(c) \
  82         (('A' <= (c) && (c) <= 'Z') ? ((c) - 'A' + 'a') : (c))
  83
  84 idn_result_t
  85 idn_ucs4_ucs4toutf16(const unsigned long *ucs4, unsigned short *utf16,
  86                      size_t tolen) {
  87         unsigned short *utf16p = utf16;
  88         unsigned long v;
  89         idn_result_t r;
  90
  91         TRACE(("idn_ucs4_ucs4toutf16(ucs4=\"%s\", tolen=%d)\n",
  92                idn__debug_ucs4xstring(ucs4, 50), (int)tolen));
  93
  94         while (*ucs4 != '\0') {
  95                 v = *ucs4++;
  96
  97                 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
  98                         WARNING(("idn_ucs4_ucs4toutf16: UCS4 string contains "
  99                                  "surrogate pair\n"));
 100                         r = idn_invalid_encoding;
 101                         goto ret;
 102                 } else if (v > 0xffff) {
 103                         /* Convert to surrogate pair */
 104                         if (v >= 0x110000) {
 105                                 r = idn_invalid_encoding;
 106                                 goto ret;
 107                         }
 108                         if (tolen < 2) {
 109                                 r = idn_buffer_overflow;
 110                                 goto ret;
 111                         }
 112                         *utf16p++ = SURROGATE_HIGH(v);
 113                         *utf16p++ = SURROGATE_LOW(v);
 114                         tolen -= 2;
 115                 } else {
 116                         if (tolen < 1) {
 117                                 r = idn_buffer_overflow;
 118                                 goto ret;
 119                         }
 120                         *utf16p++ = v;
 121                         tolen--;
 122                 }
 123         }
 124
 125         if (tolen < 1) {
 126                 r = idn_buffer_overflow;
 127                 goto ret;
 128         }
 129         *utf16p = '\0';
 130
 131         r = idn_success;
 132 ret:
 133         if (r == idn_success) {
 134                 TRACE(("idn_ucs4_ucs4toutf16(): success (utf16=\"%s\")\n",
 135                        idn__debug_utf16xstring(utf16, 50)));
 136         } else {
 137                 TRACE(("idn_ucs4_ucs4toutf16(): %s\n",
 138                        idn_result_tostring(r)));
 139         }
 140         return (r);
 141 }
 142
 143 idn_result_t
 144 idn_ucs4_utf16toucs4(const unsigned short *utf16, unsigned long *ucs4,
 145                      size_t tolen) {
 146         unsigned long *ucs4p = ucs4;
 147         unsigned short v0, v1;
 148         idn_result_t r;
 149
 150         TRACE(("idn_ucs4_utf16toucs4(utf16=\"%s\", tolen=%d)\n",
 151                idn__debug_utf16xstring(utf16, 50), (int)tolen));
 152
 153         while (*utf16 != '\0') {
 154                 v0 = *utf16;
 155
 156                 if (tolen < 1) {
 157                         r = idn_buffer_overflow;
 158                         goto ret;
 159                 }
 160
 161                 if (IS_SURROGATE_HIGH(v0)) {
 162                         v1 = *(utf16 + 1);
 163                         if (!IS_SURROGATE_LOW(v1)) {
 164                                 WARNING(("idn_ucs4_utf16toucs4: "
 165                                          "corrupted surrogate pair\n"));
 166                                 r = idn_invalid_encoding;
 167                                 goto ret;
 168                         }
 169                         *ucs4p++ = COMBINE_SURROGATE(v0, v1);
 170                         tolen--;
 171                         utf16 += 2;
 172
 173                 } else {
 174                         *ucs4p++ = v0;
 175                         tolen--;
 176                         utf16++;
 177
 178                 }
 179         }
 180
 181         if (tolen < 1) {
 182                 r = idn_buffer_overflow;
 183                 goto ret;
 184         }
 185         *ucs4p = '\0';
 186
 187         r = idn_success;
 188 ret:
 189         if (r == idn_success) {
 190                 TRACE(("idn_ucs4_utf16toucs4(): success (ucs4=\"%s\")\n",
 191                        idn__debug_ucs4xstring(ucs4, 50)));
 192         } else {
 193                 TRACE(("idn_ucs4_utf16toucs4(): %s\n",
 194                        idn_result_tostring(r)));
 195         }
 196         return (r);
 197 }
 198
 199 idn_result_t
 200 idn_ucs4_utf8toucs4(const char *utf8, unsigned long *ucs4, size_t tolen) {
 201         const unsigned char *utf8p = (const unsigned char *)utf8;
 202         unsigned long *ucs4p = ucs4;
 203         unsigned long v, min;
 204         unsigned char c;
 205         int width;
 206         int i;
 207         idn_result_t r;
 208
 209         TRACE(("idn_ucs4_utf8toucs4(utf8=\"%s\", tolen=%d)\n",
 210                idn__debug_xstring(utf8, 50), (int)tolen));
 211
 212         while(*utf8p != '\0') {
 213                 c = *utf8p++;
 214                 if (c < 0x80) {
 215                         v = c;
 216                         min = 0;
 217                         width = 1;
 218                 } else if (c < 0xc0) {
 219                         WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
 220                         r = idn_invalid_encoding;
 221                         goto ret;
 222                 } else if (c < 0xe0) {
 223                         v = c & 0x1f;
 224                         min = 0x80;
 225                         width = 2;
 226                 } else if (c < 0xf0) {
 227                         v = c & 0x0f;
 228                         min = 0x800;
 229                         width = 3;
 230                 } else if (c < 0xf8) {
 231                         v = c & 0x07;
 232                         min = 0x10000;
 233                         width = 4;
 234                 } else if (c < 0xfc) {
 235                         v = c & 0x03;
 236                         min = 0x200000;
 237                         width = 5;
 238                 } else if (c < 0xfe) {
 239                         v = c & 0x01;
 240                         min = 0x4000000;
 241                         width = 6;
 242                 } else {
 243                         WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
 244                         r = idn_invalid_encoding;
 245                         goto ret;
 246                 }
 247
 248                 for (i = width - 1; i > 0; i--) {
 249                         c = *utf8p++;
 250                         if (c < 0x80 || 0xc0 <= c) {
 251                                 WARNING(("idn_ucs4_utf8toucs4: "
 252                                          "invalid character\n"));
 253                                 r = idn_invalid_encoding;
 254                                 goto ret;
 255                         }
 256                         v = (v << 6) | (c & 0x3f);
 257                 }
 258
 259                 if (v < min) {
 260                         WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
 261                         r = idn_invalid_encoding;
 262                         goto ret;
 263                 }
 264                 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
 265                         WARNING(("idn_ucs4_utf8toucs4: UTF-8 string contains "
 266                                  "surrogate pair\n"));
 267                         r = idn_invalid_encoding;
 268                         goto ret;
 269                 }
 270                 if (tolen < 1) {
 271                         r = idn_buffer_overflow;
 272                         goto ret;
 273                 }
 274                 tolen--;
 275                 *ucs4p++ = v;
 276         }
 277
 278         if (tolen < 1) {
 279                 r = idn_buffer_overflow;
 280                 goto ret;
 281         }
 282         *ucs4p = '\0';
 283
 284         r = idn_success;
 285 ret:
 286         if (r == idn_success) {
 287                 TRACE(("idn_ucs4_utf8toucs4(): success (ucs4=\"%s\")\n",
 288                        idn__debug_ucs4xstring(ucs4, 50)));
 289         } else {
 290                 TRACE(("idn_ucs4_utf8toucs4(): %s\n",
 291                        idn_result_tostring(r)));
 292         }
 293         return (r);
 294 }
 295
 296 idn_result_t
 297 idn_ucs4_ucs4toutf8(const unsigned long *ucs4, char *utf8, size_t tolen) {
 298         unsigned char *utf8p = (unsigned char *)utf8;
 299         unsigned long v;
 300         int width;
 301         int mask;
 302         int offset;
 303         idn_result_t r;
 304
 305         TRACE(("idn_ucs4_ucs4toutf8(ucs4=\"%s\", tolen=%d)\n",
 306                idn__debug_ucs4xstring(ucs4, 50), (int)tolen));
 307
 308         while (*ucs4 != '\0') {
 309                 v = *ucs4++;
 310                 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
 311                         WARNING(("idn_ucs4_ucs4toutf8: UCS4 string contains "
 312                                  "surrogate pair\n"));
 313                         r = idn_invalid_encoding;
 314                         goto ret;
 315                 }
 316                 if (v < 0x80) {
 317                         mask = 0;
 318                         width = 1;
 319                 } else if (v < 0x800) {
 320                         mask = 0xc0;
 321                         width = 2;
 322                 } else if (v < 0x10000) {
 323                         mask = 0xe0;
 324                         width = 3;
 325                 } else if (v < 0x200000) {
 326                         mask = 0xf0;
 327                         width = 4;
 328                 } else if (v < 0x4000000) {
 329                         mask = 0xf8;
 330                         width = 5;
 331                 } else if (v < 0x80000000) {
 332                         mask = 0xfc;
 333                         width = 6;
 334                 } else {
 335                         WARNING(("idn_ucs4_ucs4toutf8: invalid character\n"));
 336                         r = idn_invalid_encoding;
 337                         goto ret;
 338                 }
 339
 340                 if (tolen < width) {
 341                         r = idn_buffer_overflow;
 342                         goto ret;
 343                 }
 344                 offset = 6 * (width - 1);
 345                 *utf8p++ = (v >> offset) | mask;
 346                 mask = 0x80;
 347                 while (offset > 0) {
 348                         offset -= 6;
 349                         *utf8p++ = ((v >> offset) & 0x3f) | mask;
 350                 }
 351                 tolen -= width;
 352         }
 353
 354         if (tolen < 1) {
 355                 r = idn_buffer_overflow;
 356                 goto ret;
 357         }
 358         *utf8p = '\0';
 359
 360         r = idn_success;
 361 ret:
 362         if (r == idn_success) {
 363                 TRACE(("idn_ucs4_ucs4toutf8(): success (utf8=\"%s\")\n",
 364                        idn__debug_xstring(utf8, 50)));
 365         } else {
 366                 TRACE(("idn_ucs4_ucs4toutf8(): %s\n",
 367                        idn_result_tostring(r)));
 368         }
 369         return (r);
 370 }
 371
 372 size_t
 373 idn_ucs4_strlen(const unsigned long *ucs4) {
 374         size_t len;
 375
 376         for (len = 0; *ucs4 != '\0'; ucs4++, len++)
 377                 /* nothing to do */ ;
 378
 379         return (len);
 380 }
 381
 382 unsigned long *
 383 idn_ucs4_strcpy(unsigned long *to, const unsigned long *from) {
 384         unsigned long *result = to;
 385
 386         while (*from != '\0')
 387                 *to++ = *from++;
 388         *to = '\0';
 389
 390         return (result);
 391 }
 392
 393 unsigned long *
 394 idn_ucs4_strcat(unsigned long *to, const unsigned long *from) {
 395         unsigned long *result = to;
 396
 397         while (*to != '\0')
 398                 to++;
 399
 400         while (*from != '\0')
 401                 *to++ = *from++;
 402         *to = '\0';
 403
 404         return (result);
 405 }
 406
 407 int
 408 idn_ucs4_strcmp(const unsigned long *str1, const unsigned long *str2) {
 409         while (*str1 != '\0') {
 410                 if (*str1 > *str2)
 411                         return (1);
 412                 else if (*str1 < *str2)
 413                         return (-1);
 414                 str1++;
 415                 str2++;
 416         }
 417
 418         if (*str1 > *str2)
 419                 return (1);
 420         else if (*str1 < *str2)
 421                 return (-1);
 422
 423         return (0);
 424 }
 425
 426 int
 427 idn_ucs4_strcasecmp(const unsigned long *str1, const unsigned long *str2) {
 428         unsigned long c1, c2;
 429
 430         while (*str1 != '\0') {
 431                 c1 = ASCII_TOLOWER(*str1);
 432                 c2 = ASCII_TOLOWER(*str2);
 433                 if (c1 > c2)
 434                         return (1);
 435                 else if (c1 < c2)
 436                         return (-1);
 437                 str1++;
 438                 str2++;
 439         }
 440
 441         c1 = ASCII_TOLOWER(*str1);
 442         c2 = ASCII_TOLOWER(*str2);
 443         if (c1 > c2)
 444                 return (1);
 445         else if (c1 < c2)
 446                 return (-1);
 447
 448         return (0);
 449 }
 450
 451
 452 unsigned long *
 453 idn_ucs4_strdup(const unsigned long *str) {
 454         size_t length = idn_ucs4_strlen(str);
 455         unsigned long *dupstr;
 456
 457         dupstr = (unsigned long *)malloc(sizeof(*str) * (length + 1));
 458         if (dupstr == NULL)
 459                 return NULL;
 460         memcpy(dupstr, str, sizeof(*str) * (length + 1));
 461
 462         return dupstr;
 463 }