gnu/dist/gettext/gettext-tools/libuniname/uniname.c

   1 /* Association between Unicode characters and their names.
   2    Copyright (C) 2000-2002 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 #if HAVE_CONFIG_H
  19 # include <config.h>
  20 #endif
  21
  22 /* Specification.  */
  23 #include "uniname.h"
  24
  25 #include <sys/types.h>
  26 #include <assert.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <string.h>
  30
  31 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  32
  33
  34 /* Table of Unicode character names, derived from UnicodeData.txt.  */
  35 #include "uninames.h"
  36 /* It contains:
  37   static const char unicode_name_words[26496] = ...;
  38   #define UNICODE_CHARNAME_NUM_WORDS 4725
  39   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  40   #define UNICODE_CHARNAME_WORD_HANGUL 3030
  41   #define UNICODE_CHARNAME_WORD_SYLLABLE 3891
  42   #define UNICODE_CHARNAME_WORD_CJK 367
  43   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 4585
  44   static const uint16_t unicode_names[53315] = ...;
  45   static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[12886] = ...;
  46   static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[12886] = ...;
  47   #define UNICODE_CHARNAME_MAX_LENGTH 83
  48   #define UNICODE_CHARNAME_MAX_WORDS 13
  49 */
  50
  51 /* Returns the word with a given index.  */
  52 static const char *
  53 unicode_name_word (unsigned int index, unsigned int *lengthp)
  54 {
  55   unsigned int i1;
  56   unsigned int i2;
  57   unsigned int i;
  58
  59   assert (index < UNICODE_CHARNAME_NUM_WORDS);
  60
  61   /* Binary search for i with
  62        unicode_name_by_length[i].ind_offset <= index
  63      and
  64        index < unicode_name_by_length[i+1].ind_offset
  65    */
  66
  67   i1 = 0;
  68   i2 = SIZEOF (unicode_name_by_length) - 1;
  69   while (i2 - i1 > 1)
  70     {
  71       unsigned int i = (i1 + i2) >> 1;
  72       if (unicode_name_by_length[i].ind_offset <= index)
  73         i1 = i;
  74       else
  75         i2 = i;
  76     }
  77   i = i1;
  78   assert (unicode_name_by_length[i].ind_offset <= index
  79           && index < unicode_name_by_length[i+1].ind_offset);
  80   *lengthp = i;
  81   return &unicode_name_words[unicode_name_by_length[i].extra_offset
  82                              + (index-unicode_name_by_length[i].ind_offset)*i];
  83 }
  84
  85 /* Looks up the index of a word.  */
  86 static int
  87 unicode_name_word_lookup (const char *word, unsigned int length)
  88 {
  89   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
  90     {
  91       /* Binary search among the words of given length.  */
  92       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
  93       unsigned int i0 = unicode_name_by_length[length].ind_offset;
  94       unsigned int i1 = i0;
  95       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
  96       while (i2 - i1 > 0)
  97         {
  98           unsigned int i = (i1 + i2) >> 1;
  99           const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
 100           const char *w = word;
 101           unsigned int n = length;
 102           for (;;)
 103             {
 104               if (*p < *w)
 105                 {
 106                   if (i1 == i)
 107                     return -1;
 108                   /* Note here: i1 < i < i2.  */
 109                   i1 = i;
 110                   break;
 111                 }
 112               if (*p > *w)
 113                 {
 114                   /* Note here: i1 <= i < i2.  */
 115                   i2 = i;
 116                   break;
 117                 }
 118               p++; w++; n--;
 119               if (n == 0)
 120                 return i;
 121             }
 122         }
 123     }
 124   return -1;
 125 }
 126
 127 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 128    sections 3.11 and 4.4.  */
 129 static const char jamo_initial_short_name[19][3] =
 130 {
 131   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
 132   "C", "K", "T", "P", "H"
 133 };
 134 static const char jamo_medial_short_name[21][4] =
 135 {
 136   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
 137   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
 138 };
 139 static const char jamo_final_short_name[28][3] =
 140 {
 141   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
 142   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
 143 };
 144
 145 /* Looks up the name of a Unicode character, in uppercase ASCII.
 146    Returns the filled buf, or NULL if the character does not have a name.  */
 147 char *
 148 unicode_character_name (unsigned int c, char *buf)
 149 {
 150   if (c >= 0xAC00 && c <= 0xD7A3)
 151     {
 152       /* Special case for Hangul syllables. Keeps the tables small.  */
 153       char *ptr;
 154       unsigned int tmp;
 155       unsigned int index1;
 156       unsigned int index2;
 157       unsigned int index3;
 158       const char *q;
 159
 160       /* buf needs to have at least 16 + 7 bytes here.  */
 161       memcpy (buf, "HANGUL SYLLABLE ", 16);
 162       ptr = buf + 16;
 163
 164       tmp = c - 0xAC00;
 165       index3 = tmp % 28; tmp = tmp / 28;
 166       index2 = tmp % 21; tmp = tmp / 21;
 167       index1 = tmp;
 168
 169       q = jamo_initial_short_name[index1];
 170       while (*q != '\0')
 171         *ptr++ = *q++;
 172       q = jamo_medial_short_name[index2];
 173       while (*q != '\0')
 174         *ptr++ = *q++;
 175       q = jamo_final_short_name[index3];
 176       while (*q != '\0')
 177         *ptr++ = *q++;
 178       *ptr = '\0';
 179       return buf;
 180     }
 181   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
 182            || (c >= 0x2F800 && c <= 0x2FA1D))
 183     {
 184       /* Special case for CJK compatibility ideographs. Keeps the tables
 185          small.  */
 186       char *ptr;
 187       int i;
 188
 189       /* buf needs to have at least 28 + 5 bytes here.  */
 190       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
 191       ptr = buf + 28;
 192
 193       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
 194         {
 195           unsigned int x = (c >> i) & 0xf;
 196           *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
 197         }
 198       *ptr = '\0';
 199       return buf;
 200     }
 201   else
 202     {
 203       const uint16_t *words;
 204
 205       /* Transform the code so that it fits in 16 bits.  */
 206       switch (c >> 12)
 207         {
 208         case 0x00: case 0x01: case 0x02: case 0x03:
 209           break;
 210         case 0x0A:
 211           c -= 0x06000;
 212           break;
 213         case 0x0F:
 214           c -= 0x0A000;
 215           break;
 216         case 0x10:
 217           c -= 0x0A000;
 218           break;
 219         case 0x1D:
 220           c -= 0x16000;
 221           break;
 222         case 0x2F:
 223           c -= 0x27000;
 224           break;
 225         case 0xE0:
 226           c -= 0xD7000;
 227           break;
 228         default:
 229           return NULL;
 230         }
 231
 232       {
 233         /* Binary search in unicode_code_to_name.  */
 234         unsigned int i1 = 0;
 235         unsigned int i2 = SIZEOF (unicode_code_to_name);
 236         for (;;)
 237           {
 238             unsigned int i = (i1 + i2) >> 1;
 239             if (unicode_code_to_name[i].code == c)
 240               {
 241                 words = &unicode_names[unicode_code_to_name[i].name];
 242                 break;
 243               }
 244             else if (unicode_code_to_name[i].code < c)
 245               {
 246                 if (i1 == i)
 247                   {
 248                     words = NULL;
 249                     break;
 250                   }
 251                 /* Note here: i1 < i < i2.  */
 252                 i1 = i;
 253               }
 254             else if (unicode_code_to_name[i].code > c)
 255               {
 256                 if (i2 == i)
 257                   {
 258                     words = NULL;
 259                     break;
 260                   }
 261                 /* Note here: i1 <= i < i2.  */
 262                 i2 = i;
 263               }
 264           }
 265       }
 266       if (words != NULL)
 267         {
 268           /* Found it in unicode_code_to_name. Now concatenate the words.  */
 269           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
 270           char *ptr = buf;
 271           for (;;)
 272             {
 273               unsigned int wordlen;
 274               const char *word = unicode_name_word (*words>>1, &wordlen);
 275               do
 276                 *ptr++ = *word++;
 277               while (--wordlen > 0);
 278               if ((*words & 1) == 0)
 279                 break;
 280               *ptr++ = ' ';
 281               words++;
 282             }
 283           *ptr = '\0';
 284           return buf;
 285         }
 286       return NULL;
 287     }
 288 }
 289
 290 /* Looks up the Unicode character with a given name, in upper- or lowercase
 291    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
 292 unsigned int
 293 unicode_name_character (const char *name)
 294 {
 295   unsigned int len = strlen (name);
 296   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
 297     {
 298       /* Test for "word1 word2 ..." syntax.  */
 299       char buf[UNICODE_CHARNAME_MAX_LENGTH];
 300       char *ptr = buf;
 301       for (;;)
 302         {
 303           char c = *name++;
 304           if (!(c >= ' ' && c <= '~'))
 305             break;
 306           *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
 307           if (--len == 0)
 308             goto filled_buf;
 309         }
 310       if (false)
 311       filled_buf:
 312         {
 313           /* Convert the constituents to uint16_t words.  */
 314           uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
 315           uint16_t *wordptr = words;
 316           {
 317             const char *p1 = buf;
 318             for (;;)
 319               {
 320                 {
 321                   int word;
 322                   const char *p2 = p1;
 323                   while (p2 < ptr && *p2 != ' ')
 324                     p2++;
 325                   word = unicode_name_word_lookup (p1, p2 - p1);
 326                   if (word < 0)
 327                     break;
 328                   if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
 329                     break;
 330                   *wordptr++ = word;
 331                   if (p2 == ptr)
 332                     goto filled_words;
 333                   p1 = p2 + 1;
 334                 }
 335                 /* Special case for Hangul syllables. Keeps the tables small. */
 336                 if (wordptr == &words[2]
 337                     && words[0] == UNICODE_CHARNAME_WORD_HANGUL
 338                     && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
 339                   {
 340                     /* Split the last word [p1..ptr) into three parts:
 341                          1) [BCDGHJKMNPRST]
 342                          2) [AEIOUWY]
 343                          3) [BCDGHIJKLMNPST]
 344                      */
 345                     const char *p2;
 346                     const char *p3;
 347                     const char *p4;
 348
 349                     p2 = p1;
 350                     while (p2 < ptr
 351                            && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
 352                                || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
 353                                || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
 354                                || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
 355                                || *p2 == 'T'))
 356                       p2++;
 357                     p3 = p2;
 358                     while (p3 < ptr
 359                            && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
 360                                || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
 361                                || *p3 == 'Y'))
 362                       p3++;
 363                     p4 = p3;
 364                     while (p4 < ptr
 365                            && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
 366                                || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
 367                                || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
 368                                || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
 369                                || *p4 == 'S' || *p4 == 'T'))
 370                       p4++;
 371                     if (p4 == ptr)
 372                       {
 373                         unsigned int n1 = p2 - p1;
 374                         unsigned int n2 = p3 - p2;
 375                         unsigned int n3 = p4 - p3;
 376
 377                         if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
 378                           {
 379                             unsigned int index1;
 380
 381                             for (index1 = 0; index1 < 19; index1++)
 382                               if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
 383                                   && jamo_initial_short_name[index1][n1] == '\0')
 384                                 {
 385                                   unsigned int index2;
 386
 387                                   for (index2 = 0; index2 < 21; index2++)
 388                                     if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
 389                                         && jamo_medial_short_name[index2][n2] == '\0')
 390                                       {
 391                                         unsigned int index3;
 392
 393                                         for (index3 = 0; index3 < 28; index3++)
 394                                           if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
 395                                               && jamo_final_short_name[index3][n3] == '\0')
 396                                             {
 397                                               return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
 398                                             }
 399                                         break;
 400                                       }
 401                                   break;
 402                                 }
 403                           }
 404                       }
 405                   }
 406                 /* Special case for CJK compatibility ideographs. Keeps the
 407                    tables small.  */
 408                 if (wordptr == &words[2]
 409                     && words[0] == UNICODE_CHARNAME_WORD_CJK
 410                     && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
 411                     && p1 + 14 <= ptr
 412                     && p1 + 15 >= ptr
 413                     && memcmp (p1, "IDEOGRAPH-", 10) == 0)
 414                   {
 415                     const char *p2 = p1 + 10;
 416
 417                     if (*p2 != '0')
 418                       {
 419                         unsigned int c = 0;
 420
 421                         for (;;)
 422                           {
 423                             if (*p2 >= '0' && *p2 <= '9')
 424                               c += (*p2 - '0');
 425                             else if (*p2 >= 'A' && *p2 <= 'F')
 426                               c += (*p2 - 'A' + 10);
 427                             else
 428                               break;
 429                             p2++;
 430                             if (p2 == ptr)
 431                               {
 432                                 if ((c >= 0xF900 && c <= 0xFA2D)
 433                                     || (c >= 0xFA30 && c <= 0xFA6A)
 434                                     || (c >= 0x2F800 && c <= 0x2FA1D))
 435                                   return c;
 436                                 else
 437                                   break;
 438                               }
 439                             c = c << 4;
 440                           }
 441                       }
 442                   }
 443               }
 444           }
 445           if (false)
 446           filled_words:
 447             {
 448               /* Multiply by 2, to simplify later comparisons.  */
 449               unsigned int words_length = wordptr - words;
 450               {
 451                 int i = words_length - 1;
 452                 words[i] = 2 * words[i];
 453                 for (; --i >= 0; )
 454                   words[i] = 2 * words[i] + 1;
 455               }
 456               /* Binary search in unicode_name_to_code.  */
 457               {
 458                 unsigned int i1 = 0;
 459                 unsigned int i2 = SIZEOF (unicode_name_to_code);
 460                 for (;;)
 461                   {
 462                     unsigned int i = (i1 + i2) >> 1;
 463                     const uint16_t *w = words;
 464                     const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
 465                     unsigned int n = words_length;
 466                     for (;;)
 467                       {
 468                         if (*p < *w)
 469                           {
 470                             if (i1 == i)
 471                               goto name_not_found;
 472                             /* Note here: i1 < i < i2.  */
 473                             i1 = i;
 474                             break;
 475                           }
 476                         else if (*p > *w)
 477                           {
 478                             if (i2 == i)
 479                               goto name_not_found;
 480                             /* Note here: i1 <= i < i2.  */
 481                             i2 = i;
 482                             break;
 483                           }
 484                         p++; w++; n--;
 485                         if (n == 0)
 486                           {
 487                             unsigned int c = unicode_name_to_code[i].code;
 488
 489                             /* Undo the transformation to 16-bit space.  */
 490                             static const unsigned int offset[10] =
 491                               {
 492                                 0x00000, 0x00000, 0x00000, 0x00000, 0x06000,
 493                                 0x0A000, 0x0A000, 0x16000, 0x27000, 0xD7000
 494                               };
 495                             return c + offset[c >> 12];
 496                           }
 497                       }
 498                   }
 499               }
 500             name_not_found: ;
 501             }
 502         }
 503     }
 504   return UNINAME_INVALID;
 505 }