1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
25 #include <sys/types.h>
/* Number of elements in the array A.  A must be a real array object, not a
   pointer.  The argument is parenthesized before it is subscripted so that
   any array-valued expression can be passed safely. */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
34 /* Table of Unicode character names, derived from UnicodeData.txt. */
/* Character storage for all name words.  unicode_name_word() returns a
   pointer into this array at extra_offset + (index - ind_offset) * length,
   so words of equal length are stored back to back with that length as the
   stride. */
37 static const char unicode_name_words[26496] = ...;
/* Exclusive upper bound on a word index; unicode_name_word() asserts that
   its index argument is below this value. */
38 #define UNICODE_CHARNAME_NUM_WORDS 4725
/* Indexed by word length.  ind_offset is compared against word indices to
   find the length of a word; extra_offset is the matching base position in
   unicode_name_words. */
39 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
/* Word indices that unicode_name_character() compares against the first two
   words of a name to recognize the Hangul syllable and CJK compatibility
   ideograph special cases. */
40 #define UNICODE_CHARNAME_WORD_HANGUL 3030
41 #define UNICODE_CHARNAME_WORD_SYLLABLE 3891
42 #define UNICODE_CHARNAME_WORD_CJK 367
43 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 4585
/* Names stored as sequences of 16-bit elements.  Readers take the word index
   with ">> 1" and test bit 0 separately; presumably bit 0 marks whether
   another word follows -- TODO confirm against the table generator. */
44 static const uint16_t unicode_names[53315] = ...;
/* Two tables over the same kind of record, each used by a binary search:
   unicode_name_to_code when looking up by name, unicode_code_to_name when
   looking up by code.  'name' is an index into unicode_names; 'code' is the
   character code after the transformation that makes it fit in 16 bits. */
45 static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[12886] = ...;
46 static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[12886] = ...;
/* Size of the name buffer in unicode_name_character(), and the longest name
   length it accepts. */
47 #define UNICODE_CHARNAME_MAX_LENGTH 83
/* Size of the word array that unicode_name_character() fills while splitting
   a name into words. */
48 #define UNICODE_CHARNAME_MAX_WORDS 13
51 /* Returns the word with a given index. */
53 unicode_name_word (unsigned int index
, unsigned int *lengthp
)
59 assert (index
< UNICODE_CHARNAME_NUM_WORDS
);
61 /* Binary search for i with
62 unicode_name_by_length[i].ind_offset <= index
64 index < unicode_name_by_length[i+1].ind_offset
68 i2
= SIZEOF (unicode_name_by_length
) - 1;
71 unsigned int i
= (i1
+ i2
) >> 1;
72 if (unicode_name_by_length
[i
].ind_offset
<= index
)
78 assert (unicode_name_by_length
[i
].ind_offset
<= index
79 && index
< unicode_name_by_length
[i
+1].ind_offset
);
81 return &unicode_name_words
[unicode_name_by_length
[i
].extra_offset
82 + (index
-unicode_name_by_length
[i
].ind_offset
)*i
];
85 /* Looks up the index of a word. */
87 unicode_name_word_lookup (const char *word
, unsigned int length
)
89 if (length
> 0 && length
< SIZEOF (unicode_name_by_length
) - 1)
91 /* Binary search among the words of given length. */
92 unsigned int extra_offset
= unicode_name_by_length
[length
].extra_offset
;
93 unsigned int i0
= unicode_name_by_length
[length
].ind_offset
;
95 unsigned int i2
= unicode_name_by_length
[length
+1].ind_offset
;
98 unsigned int i
= (i1
+ i2
) >> 1;
99 const char *p
= &unicode_name_words
[extra_offset
+ (i
-i0
)*length
];
100 const char *w
= word
;
101 unsigned int n
= length
;
108 /* Note here: i1 < i < i2. */
114 /* Note here: i1 <= i < i2. */
127 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
128 sections 3.11 and 4.4. */
/* Short names of the 19 initial jamo, indexed by the initial-consonant index
   of a Hangul syllable (the index1 of 0xAC00 + (index1 * 21 + index2) * 28
   + index3).  The entry at index 11 is the empty string.  Each entry holds
   at most two letters plus the terminating NUL. */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
  "C", "K", "T", "P", "H"
};
/* Short names of the 21 medial jamo, indexed by the vowel index of a Hangul
   syllable (the index2 of 0xAC00 + (index1 * 21 + index2) * 28 + index3).
   Each entry holds at most three letters plus the terminating NUL. */
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};
/* Short names of the 28 final-position values, indexed by the final index of
   a Hangul syllable (the index3 of 0xAC00 + (index1 * 21 + index2) * 28
   + index3).  Index 0 is the empty string (no final consonant).  Index 5 is
   "NJ", the short name Unicode's Jamo.txt assigns to U+11AC; with it
   U+B09D is named "HANGUL SYLLABLE NANJ".  Each entry holds at most two
   letters plus the terminating NUL. */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
145 /* Looks up the name of a Unicode character, in uppercase ASCII.
146 Returns the filled buf, or NULL if the character does not have a name. */
148 unicode_character_name (unsigned int c
, char *buf
)
150 if (c
>= 0xAC00 && c
<= 0xD7A3)
152 /* Special case for Hangul syllables. Keeps the tables small. */
160 /* buf needs to have at least 16 + 7 bytes here. */
161 memcpy (buf
, "HANGUL SYLLABLE ", 16);
165 index3
= tmp
% 28; tmp
= tmp
/ 28;
166 index2
= tmp
% 21; tmp
= tmp
/ 21;
169 q
= jamo_initial_short_name
[index1
];
172 q
= jamo_medial_short_name
[index2
];
175 q
= jamo_final_short_name
[index3
];
181 else if ((c
>= 0xF900 && c
<= 0xFA2D) || (c
>= 0xFA30 && c
<= 0xFA6A)
182 || (c
>= 0x2F800 && c
<= 0x2FA1D))
184 /* Special case for CJK compatibility ideographs. Keeps the tables
189 /* buf needs to have at least 28 + 5 bytes here. */
190 memcpy (buf
, "CJK COMPATIBILITY IDEOGRAPH-", 28);
193 for (i
= (c
< 0x10000 ? 12 : 16); i
>= 0; i
-= 4)
195 unsigned int x
= (c
>> i
) & 0xf;
196 *ptr
++ = (x
< 10 ? '0' : 'A' - 10) + x
;
203 const uint16_t *words
;
205 /* Transform the code so that it fits in 16 bits. */
208 case 0x00: case 0x01: case 0x02: case 0x03:
233 /* Binary search in unicode_code_to_name. */
235 unsigned int i2
= SIZEOF (unicode_code_to_name
);
238 unsigned int i
= (i1
+ i2
) >> 1;
239 if (unicode_code_to_name
[i
].code
== c
)
241 words
= &unicode_names
[unicode_code_to_name
[i
].name
];
244 else if (unicode_code_to_name
[i
].code
< c
)
251 /* Note here: i1 < i < i2. */
254 else if (unicode_code_to_name
[i
].code
> c
)
261 /* Note here: i1 <= i < i2. */
268 /* Found it in unicode_code_to_name. Now concatenate the words. */
269 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
273 unsigned int wordlen
;
274 const char *word
= unicode_name_word (*words
>>1, &wordlen
);
277 while (--wordlen
> 0);
278 if ((*words
& 1) == 0)
290 /* Looks up the Unicode character with a given name, in upper- or lowercase
291 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
293 unicode_name_character (const char *name
)
295 unsigned int len
= strlen (name
);
296 if (len
> 1 && len
<= UNICODE_CHARNAME_MAX_LENGTH
)
298 /* Test for "word1 word2 ..." syntax. */
299 char buf
[UNICODE_CHARNAME_MAX_LENGTH
];
304 if (!(c
>= ' ' && c
<= '~'))
306 *ptr
++ = (c
>= 'a' && c
<= 'z' ? c
- 'a' + 'A' : c
);
313 /* Convert the constituents to uint16_t words. */
314 uint16_t words
[UNICODE_CHARNAME_MAX_WORDS
];
315 uint16_t *wordptr
= words
;
317 const char *p1
= buf
;
323 while (p2
< ptr
&& *p2
!= ' ')
325 word
= unicode_name_word_lookup (p1
, p2
- p1
);
328 if (wordptr
== &words
[UNICODE_CHARNAME_MAX_WORDS
])
335 /* Special case for Hangul syllables. Keeps the tables small. */
336 if (wordptr
== &words
[2]
337 && words
[0] == UNICODE_CHARNAME_WORD_HANGUL
338 && words
[1] == UNICODE_CHARNAME_WORD_SYLLABLE
)
340 /* Split the last word [p1..ptr) into three parts:
351 && (*p2
== 'B' || *p2
== 'C' || *p2
== 'D'
352 || *p2
== 'G' || *p2
== 'H' || *p2
== 'J'
353 || *p2
== 'K' || *p2
== 'M' || *p2
== 'N'
354 || *p2
== 'P' || *p2
== 'R' || *p2
== 'S'
359 && (*p3
== 'A' || *p3
== 'E' || *p3
== 'I'
360 || *p3
== 'O' || *p3
== 'U' || *p3
== 'W'
365 && (*p4
== 'B' || *p4
== 'C' || *p4
== 'D'
366 || *p4
== 'G' || *p4
== 'H' || *p4
== 'I'
367 || *p4
== 'J' || *p4
== 'K' || *p4
== 'L'
368 || *p4
== 'M' || *p4
== 'N' || *p4
== 'P'
369 || *p4
== 'S' || *p4
== 'T'))
373 unsigned int n1
= p2
- p1
;
374 unsigned int n2
= p3
- p2
;
375 unsigned int n3
= p4
- p3
;
377 if (n1
<= 2 && (n2
>= 1 && n2
<= 3) && n3
<= 2)
381 for (index1
= 0; index1
< 19; index1
++)
382 if (memcmp(jamo_initial_short_name
[index1
], p1
, n1
) == 0
383 && jamo_initial_short_name
[index1
][n1
] == '\0')
387 for (index2
= 0; index2
< 21; index2
++)
388 if (memcmp(jamo_medial_short_name
[index2
], p2
, n2
) == 0
389 && jamo_medial_short_name
[index2
][n2
] == '\0')
393 for (index3
= 0; index3
< 28; index3
++)
394 if (memcmp(jamo_final_short_name
[index3
], p3
, n3
) == 0
395 && jamo_final_short_name
[index3
][n3
] == '\0')
397 return 0xAC00 + (index1
* 21 + index2
) * 28 + index3
;
406 /* Special case for CJK compatibility ideographs. Keeps the
408 if (wordptr
== &words
[2]
409 && words
[0] == UNICODE_CHARNAME_WORD_CJK
410 && words
[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
413 && memcmp (p1
, "IDEOGRAPH-", 10) == 0)
415 const char *p2
= p1
+ 10;
423 if (*p2
>= '0' && *p2
<= '9')
425 else if (*p2
>= 'A' && *p2
<= 'F')
426 c
+= (*p2
- 'A' + 10);
432 if ((c
>= 0xF900 && c
<= 0xFA2D)
433 || (c
>= 0xFA30 && c
<= 0xFA6A)
434 || (c
>= 0x2F800 && c
<= 0x2FA1D))
448 /* Multiply by 2, to simplify later comparisons. */
449 unsigned int words_length
= wordptr
- words
;
451 int i
= words_length
- 1;
452 words
[i
] = 2 * words
[i
];
454 words
[i
] = 2 * words
[i
] + 1;
456 /* Binary search in unicode_name_to_code. */
459 unsigned int i2
= SIZEOF (unicode_name_to_code
);
462 unsigned int i
= (i1
+ i2
) >> 1;
463 const uint16_t *w
= words
;
464 const uint16_t *p
= &unicode_names
[unicode_name_to_code
[i
].name
];
465 unsigned int n
= words_length
;
472 /* Note here: i1 < i < i2. */
480 /* Note here: i1 <= i < i2. */
487 unsigned int c
= unicode_name_to_code
[i
].code
;
489 /* Undo the transformation to 16-bit space. */
490 static const unsigned int offset
[10] =
492 0x00000, 0x00000, 0x00000, 0x00000, 0x06000,
493 0x0A000, 0x0A000, 0x16000, 0x27000, 0xD7000
495 return c
+ offset
[c
>> 12];
504 return UNINAME_INVALID
;