Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / libuniname / uniname.c
blobce8b43e3b6e24615608d7cfe8efe695d06157e60
1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 #if HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 /* Specification. */
23 #include "uniname.h"
25 #include <sys/types.h>
26 #include <assert.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <string.h>
/* Number of elements of the array A.  A must be an actual array object,
   not a pointer; the argument is parenthesized so that any array-valued
   expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
34 /* Table of Unicode character names, derived from UnicodeData.txt. */
35 #include "uninames.h"
/* It contains:
  static const char unicode_name_words[26496] = ...;
  #define UNICODE_CHARNAME_NUM_WORDS 4725
  static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  #define UNICODE_CHARNAME_WORD_HANGUL 3030
  #define UNICODE_CHARNAME_WORD_SYLLABLE 3891
  #define UNICODE_CHARNAME_WORD_CJK 367
  #define UNICODE_CHARNAME_WORD_COMPATIBILITY 4585
  static const uint16_t unicode_names[53315] = ...;
  static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[12886] = ...;
  static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[12886] = ...;
  #define UNICODE_CHARNAME_MAX_LENGTH 83
  #define UNICODE_CHARNAME_MAX_WORDS 13
*/
51 /* Returns the word with a given index. */
52 static const char *
53 unicode_name_word (unsigned int index, unsigned int *lengthp)
55 unsigned int i1;
56 unsigned int i2;
57 unsigned int i;
59 assert (index < UNICODE_CHARNAME_NUM_WORDS);
61 /* Binary search for i with
62 unicode_name_by_length[i].ind_offset <= index
63 and
64 index < unicode_name_by_length[i+1].ind_offset
67 i1 = 0;
68 i2 = SIZEOF (unicode_name_by_length) - 1;
69 while (i2 - i1 > 1)
71 unsigned int i = (i1 + i2) >> 1;
72 if (unicode_name_by_length[i].ind_offset <= index)
73 i1 = i;
74 else
75 i2 = i;
77 i = i1;
78 assert (unicode_name_by_length[i].ind_offset <= index
79 && index < unicode_name_by_length[i+1].ind_offset);
80 *lengthp = i;
81 return &unicode_name_words[unicode_name_by_length[i].extra_offset
82 + (index-unicode_name_by_length[i].ind_offset)*i];
85 /* Looks up the index of a word. */
86 static int
87 unicode_name_word_lookup (const char *word, unsigned int length)
89 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
91 /* Binary search among the words of given length. */
92 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
93 unsigned int i0 = unicode_name_by_length[length].ind_offset;
94 unsigned int i1 = i0;
95 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
96 while (i2 - i1 > 0)
98 unsigned int i = (i1 + i2) >> 1;
99 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
100 const char *w = word;
101 unsigned int n = length;
102 for (;;)
104 if (*p < *w)
106 if (i1 == i)
107 return -1;
108 /* Note here: i1 < i < i2. */
109 i1 = i;
110 break;
112 if (*p > *w)
114 /* Note here: i1 <= i < i2. */
115 i2 = i;
116 break;
118 p++; w++; n--;
119 if (n == 0)
120 return i;
124 return -1;
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.  */

/* Short names of the 19 initial jamo, indexed by the initial's position
   within the syllable block.  The entry at index 11 is the empty string:
   that initial contributes no letters to the syllable name.  */
static const char jamo_initial_short_name[19][3] =
{
  /*  0 ..  4 */ "G",  "GG", "N",  "D",  "DD",
  /*  5 ..  9 */ "R",  "M",  "B",  "BB", "S",
  /* 10 .. 14 */ "SS", "",   "J",  "JJ", "C",
  /* 15 .. 18 */ "K",  "T",  "P",  "H"
};
/* Short names of the 21 medial jamo (vowels), indexed by the medial's
   position within the syllable block.  Each name has 1 to 3 letters.  */
static const char jamo_medial_short_name[21][4] =
{
  /*  0 ..  4 */ "A",   "AE",  "YA",  "YAE", "EO",
  /*  5 ..  9 */ "E",   "YEO", "YE",  "O",   "WA",
  /* 10 .. 14 */ "WAE", "OE",  "YO",  "U",   "WEO",
  /* 15 .. 19 */ "WE",  "WI",  "YU",  "EU",  "YI",
  /* 20       */ "I"
};
/* Short names of the 28 final jamo, indexed by the final's position within
   the syllable block.  Index 0 is the empty string and stands for a
   syllable without a final consonant.
   Index 5 is "NJ" (U+11AC HANGUL JONGSEONG NIEUN-CIEUC), as listed in the
   Unicode Character Database file Jamo.txt; for example U+AC05 is named
   HANGUL SYLLABLE GANJ.  */
static const char jamo_final_short_name[28][3] =
{
  /*  0 ..  4 */ "",   "G",  "GG", "GS", "N",
  /*  5 ..  9 */ "NJ", "NH", "D",  "L",  "LG",
  /* 10 .. 14 */ "LM", "LB", "LS", "LT", "LP",
  /* 15 .. 19 */ "LH", "M",  "B",  "BS", "S",
  /* 20 .. 24 */ "SS", "NG", "J",  "C",  "K",
  /* 25 .. 27 */ "T",  "P",  "H"
};
145 /* Looks up the name of a Unicode character, in uppercase ASCII.
146 Returns the filled buf, or NULL if the character does not have a name. */
147 char *
148 unicode_character_name (unsigned int c, char *buf)
150 if (c >= 0xAC00 && c <= 0xD7A3)
152 /* Special case for Hangul syllables. Keeps the tables small. */
153 char *ptr;
154 unsigned int tmp;
155 unsigned int index1;
156 unsigned int index2;
157 unsigned int index3;
158 const char *q;
160 /* buf needs to have at least 16 + 7 bytes here. */
161 memcpy (buf, "HANGUL SYLLABLE ", 16);
162 ptr = buf + 16;
164 tmp = c - 0xAC00;
165 index3 = tmp % 28; tmp = tmp / 28;
166 index2 = tmp % 21; tmp = tmp / 21;
167 index1 = tmp;
169 q = jamo_initial_short_name[index1];
170 while (*q != '\0')
171 *ptr++ = *q++;
172 q = jamo_medial_short_name[index2];
173 while (*q != '\0')
174 *ptr++ = *q++;
175 q = jamo_final_short_name[index3];
176 while (*q != '\0')
177 *ptr++ = *q++;
178 *ptr = '\0';
179 return buf;
181 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
182 || (c >= 0x2F800 && c <= 0x2FA1D))
184 /* Special case for CJK compatibility ideographs. Keeps the tables
185 small. */
186 char *ptr;
187 int i;
189 /* buf needs to have at least 28 + 5 bytes here. */
190 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
191 ptr = buf + 28;
193 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
195 unsigned int x = (c >> i) & 0xf;
196 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
198 *ptr = '\0';
199 return buf;
201 else
203 const uint16_t *words;
205 /* Transform the code so that it fits in 16 bits. */
206 switch (c >> 12)
208 case 0x00: case 0x01: case 0x02: case 0x03:
209 break;
210 case 0x0A:
211 c -= 0x06000;
212 break;
213 case 0x0F:
214 c -= 0x0A000;
215 break;
216 case 0x10:
217 c -= 0x0A000;
218 break;
219 case 0x1D:
220 c -= 0x16000;
221 break;
222 case 0x2F:
223 c -= 0x27000;
224 break;
225 case 0xE0:
226 c -= 0xD7000;
227 break;
228 default:
229 return NULL;
233 /* Binary search in unicode_code_to_name. */
234 unsigned int i1 = 0;
235 unsigned int i2 = SIZEOF (unicode_code_to_name);
236 for (;;)
238 unsigned int i = (i1 + i2) >> 1;
239 if (unicode_code_to_name[i].code == c)
241 words = &unicode_names[unicode_code_to_name[i].name];
242 break;
244 else if (unicode_code_to_name[i].code < c)
246 if (i1 == i)
248 words = NULL;
249 break;
251 /* Note here: i1 < i < i2. */
252 i1 = i;
254 else if (unicode_code_to_name[i].code > c)
256 if (i2 == i)
258 words = NULL;
259 break;
261 /* Note here: i1 <= i < i2. */
262 i2 = i;
266 if (words != NULL)
268 /* Found it in unicode_code_to_name. Now concatenate the words. */
269 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
270 char *ptr = buf;
271 for (;;)
273 unsigned int wordlen;
274 const char *word = unicode_name_word (*words>>1, &wordlen);
276 *ptr++ = *word++;
277 while (--wordlen > 0);
278 if ((*words & 1) == 0)
279 break;
280 *ptr++ = ' ';
281 words++;
283 *ptr = '\0';
284 return buf;
286 return NULL;
290 /* Looks up the Unicode character with a given name, in upper- or lowercase
291 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
292 unsigned int
293 unicode_name_character (const char *name)
295 unsigned int len = strlen (name);
296 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
298 /* Test for "word1 word2 ..." syntax. */
299 char buf[UNICODE_CHARNAME_MAX_LENGTH];
300 char *ptr = buf;
301 for (;;)
303 char c = *name++;
304 if (!(c >= ' ' && c <= '~'))
305 break;
306 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
307 if (--len == 0)
308 goto filled_buf;
310 if (false)
311 filled_buf:
313 /* Convert the constituents to uint16_t words. */
314 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
315 uint16_t *wordptr = words;
317 const char *p1 = buf;
318 for (;;)
321 int word;
322 const char *p2 = p1;
323 while (p2 < ptr && *p2 != ' ')
324 p2++;
325 word = unicode_name_word_lookup (p1, p2 - p1);
326 if (word < 0)
327 break;
328 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
329 break;
330 *wordptr++ = word;
331 if (p2 == ptr)
332 goto filled_words;
333 p1 = p2 + 1;
335 /* Special case for Hangul syllables. Keeps the tables small. */
336 if (wordptr == &words[2]
337 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
338 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
340 /* Split the last word [p1..ptr) into three parts:
341 1) [BCDGHJKMNPRST]
342 2) [AEIOUWY]
343 3) [BCDGHIJKLMNPST]
345 const char *p2;
346 const char *p3;
347 const char *p4;
349 p2 = p1;
350 while (p2 < ptr
351 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
352 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
353 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
354 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
355 || *p2 == 'T'))
356 p2++;
357 p3 = p2;
358 while (p3 < ptr
359 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
360 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
361 || *p3 == 'Y'))
362 p3++;
363 p4 = p3;
364 while (p4 < ptr
365 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
366 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
367 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
368 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
369 || *p4 == 'S' || *p4 == 'T'))
370 p4++;
371 if (p4 == ptr)
373 unsigned int n1 = p2 - p1;
374 unsigned int n2 = p3 - p2;
375 unsigned int n3 = p4 - p3;
377 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
379 unsigned int index1;
381 for (index1 = 0; index1 < 19; index1++)
382 if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
383 && jamo_initial_short_name[index1][n1] == '\0')
385 unsigned int index2;
387 for (index2 = 0; index2 < 21; index2++)
388 if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
389 && jamo_medial_short_name[index2][n2] == '\0')
391 unsigned int index3;
393 for (index3 = 0; index3 < 28; index3++)
394 if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
395 && jamo_final_short_name[index3][n3] == '\0')
397 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
399 break;
401 break;
406 /* Special case for CJK compatibility ideographs. Keeps the
407 tables small. */
408 if (wordptr == &words[2]
409 && words[0] == UNICODE_CHARNAME_WORD_CJK
410 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
411 && p1 + 14 <= ptr
412 && p1 + 15 >= ptr
413 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
415 const char *p2 = p1 + 10;
417 if (*p2 != '0')
419 unsigned int c = 0;
421 for (;;)
423 if (*p2 >= '0' && *p2 <= '9')
424 c += (*p2 - '0');
425 else if (*p2 >= 'A' && *p2 <= 'F')
426 c += (*p2 - 'A' + 10);
427 else
428 break;
429 p2++;
430 if (p2 == ptr)
432 if ((c >= 0xF900 && c <= 0xFA2D)
433 || (c >= 0xFA30 && c <= 0xFA6A)
434 || (c >= 0x2F800 && c <= 0x2FA1D))
435 return c;
436 else
437 break;
439 c = c << 4;
445 if (false)
446 filled_words:
448 /* Multiply by 2, to simplify later comparisons. */
449 unsigned int words_length = wordptr - words;
451 int i = words_length - 1;
452 words[i] = 2 * words[i];
453 for (; --i >= 0; )
454 words[i] = 2 * words[i] + 1;
456 /* Binary search in unicode_name_to_code. */
458 unsigned int i1 = 0;
459 unsigned int i2 = SIZEOF (unicode_name_to_code);
460 for (;;)
462 unsigned int i = (i1 + i2) >> 1;
463 const uint16_t *w = words;
464 const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
465 unsigned int n = words_length;
466 for (;;)
468 if (*p < *w)
470 if (i1 == i)
471 goto name_not_found;
472 /* Note here: i1 < i < i2. */
473 i1 = i;
474 break;
476 else if (*p > *w)
478 if (i2 == i)
479 goto name_not_found;
480 /* Note here: i1 <= i < i2. */
481 i2 = i;
482 break;
484 p++; w++; n--;
485 if (n == 0)
487 unsigned int c = unicode_name_to_code[i].code;
489 /* Undo the transformation to 16-bit space. */
490 static const unsigned int offset[10] =
492 0x00000, 0x00000, 0x00000, 0x00000, 0x06000,
493 0x0A000, 0x0A000, 0x16000, 0x27000, 0xD7000
495 return c + offset[c >> 12];
500 name_not_found: ;
504 return UNINAME_INVALID;