1 // -*- coding: viscii tab-width:2 mode: c++ -*-
5 #include "dictionary.h"
11 <U1EB2> /x02 \x02 LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE
12 <U1EB4> /x05 \x05 LATIN CAPITAL LETTER A WITH BREVE AND TILDE
13 <U1EAA> /x06 \x06 LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE
14 <U1EF6> /x14 \x14 LATIN CAPITAL LETTER Y WITH HOOK ABOVE
15 <U1EF8> /x19 \x19 LATIN CAPITAL LETTER Y WITH TILDE
16 <U1EF4> /x1e \x1e LATIN CAPITAL LETTER Y WITH DOT BELOW
17 <U1EA0> /x80 € LATIN CAPITAL LETTER A WITH DOT BELOW
18 <U1EAE> /x81 � LATIN CAPITAL LETTER A WITH BREVE AND ACUTE
19 <U1EB0> /x82 ‚ LATIN CAPITAL LETTER A WITH BREVE AND GRAVE
20 <U1EB6> /x83 ƒ LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW
21 <U1EA4> /x84 „ LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE
22 <U1EA6> /x85 … LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE
23 <U1EA8> /x86 † LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE
24 <U1EAC> /x87 ‡ LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW
25 <U1EBC> /x88 ˆ LATIN CAPITAL LETTER E WITH TILDE
26 <U1EB8> /x89 ‰ LATIN CAPITAL LETTER E WITH DOT BELOW
27 <U1EBE> /x8a Š LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE
28 <U1EC0> /x8b ‹ LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE
29 <U1EC2> /x8c Œ LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE
30 <U1EC4> /x8d � LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE
31 <U1EC6> /x8e Ž LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW
32 <U1ED0> /x8f � LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE
33 <U1ED2> /x90 � LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE
34 <U1ED4> /x91 ‘ LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE
35 <U1ED6> /x92 ’ LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE
36 <U1ED8> /x93 “ LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW
37 <U1EE2> /x94 ” LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW
38 <U1EDA> /x95 • LATIN CAPITAL LETTER O WITH HORN AND ACUTE
39 <U1EDC> /x96 – LATIN CAPITAL LETTER O WITH HORN AND GRAVE
40 <U1EDE> /x97 — LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE
41 <U1ECA> /x98 ˜ LATIN CAPITAL LETTER I WITH DOT BELOW
42 <U1ECE> /x99 ™ LATIN CAPITAL LETTER O WITH HOOK ABOVE
43 <U1ECC> /x9a š LATIN CAPITAL LETTER O WITH DOT BELOW
44 <U1EC8> /x9b › LATIN CAPITAL LETTER I WITH HOOK ABOVE
45 <U1EE6> /x9c œ LATIN CAPITAL LETTER U WITH HOOK ABOVE
46 <U0168> /x9d � LATIN CAPITAL LETTER U WITH TILDE
47 <U1EE4> /x9e ž LATIN CAPITAL LETTER U WITH DOT BELOW
48 <U1EF2> /x9f Ÿ LATIN CAPITAL LETTER Y WITH GRAVE
49 <U00D5> /xa0 LATIN CAPITAL LETTER O WITH TILDE
50 <U1EAF> /xa1 ¡ LATIN SMALL LETTER A WITH BREVE AND ACUTE
51 <U1EB1> /xa2 ¢ LATIN SMALL LETTER A WITH BREVE AND GRAVE
52 <U1EB7> /xa3 £ LATIN SMALL LETTER A WITH BREVE AND DOT BELOW
53 <U1EA5> /xa4 ¤ LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE
54 <U1EA7> /xa5 ¥ LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE
55 <U1EA9> /xa6 ¦ LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE
56 <U1EAD> /xa7 § LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW
57 <U1EBD> /xa8 ¨ LATIN SMALL LETTER E WITH TILDE
58 <U1EB9> /xa9 © LATIN SMALL LETTER E WITH DOT BELOW
59 <U1EBF> /xaa ª LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE
60 <U1EC1> /xab « LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE
61 <U1EC3> /xac ¬ LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE
62 <U1EC5> /xad LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE
63 <U1EC7> /xae ® LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW
64 <U1ED1> /xaf ¯ LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE
65 <U1ED3> /xb0 ° LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE
66 <U1ED5> /xb1 ± LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE
67 <U1ED7> /xb2 ² LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE
68 <U1EE0> /xb3 ³ LATIN CAPITAL LETTER O WITH HORN AND TILDE
69 <U01A0> /xb4 ´ LATIN CAPITAL LETTER O WITH HORN
70 <U1ED9> /xb5 µ LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW
71 <U1EDD> /xb6 ¶ LATIN SMALL LETTER O WITH HORN AND GRAVE
72 <U1EDF> /xb7 · LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE
73 <U1ECB> /xb8 ¸ LATIN SMALL LETTER I WITH DOT BELOW
74 <U1EF0> /xb9 ¹ LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW
75 <U1EE8> /xba º LATIN CAPITAL LETTER U WITH HORN AND ACUTE
76 <U1EEA> /xbb » LATIN CAPITAL LETTER U WITH HORN AND GRAVE
77 <U1EEC> /xbc ¼ LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE
78 <U01A1> /xbd ½ LATIN SMALL LETTER O WITH HORN
79 <U1EDB> /xbe ¾ LATIN SMALL LETTER O WITH HORN AND ACUTE
80 <U01AF> /xbf ¿ LATIN CAPITAL LETTER U WITH HORN
81 <U00C0> /xc0 À LATIN CAPITAL LETTER A WITH GRAVE
82 <U00C1> /xc1 Á LATIN CAPITAL LETTER A WITH ACUTE
83 <U00C2> /xc2 Â LATIN CAPITAL LETTER A WITH CIRCUMFLEX
84 <U00C3> /xc3 Ã LATIN CAPITAL LETTER A WITH TILDE
85 <U1EA2> /xc4 Ä LATIN CAPITAL LETTER A WITH HOOK ABOVE
86 <U0102> /xc5 Å LATIN CAPITAL LETTER A WITH BREVE
87 <U1EB3> /xc6 Æ LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE
88 <U1EB5> /xc7 Ç LATIN SMALL LETTER A WITH BREVE AND TILDE
89 <U00C8> /xc8 È LATIN CAPITAL LETTER E WITH GRAVE
90 <U00C9> /xc9 É LATIN CAPITAL LETTER E WITH ACUTE
91 <U00CA> /xca Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
92 <U1EBA> /xcb Ë LATIN CAPITAL LETTER E WITH HOOK ABOVE
93 <U00CC> /xcc Ì LATIN CAPITAL LETTER I WITH GRAVE
94 <U00CD> /xcd Í LATIN CAPITAL LETTER I WITH ACUTE
95 <U0128> /xce Î LATIN CAPITAL LETTER I WITH TILDE
96 <U1EF3> /xcf Ï LATIN SMALL LETTER Y WITH GRAVE
97 <U0110> /xd0 Ð LATIN CAPITAL LETTER D WITH STROKE
98 <U1EE9> /xd1 Ñ LATIN SMALL LETTER U WITH HORN AND ACUTE
99 <U00D2> /xd2 Ò LATIN CAPITAL LETTER O WITH GRAVE
100 <U00D3> /xd3 Ó LATIN CAPITAL LETTER O WITH ACUTE
101 <U00D4> /xd4 Ô LATIN CAPITAL LETTER O WITH CIRCUMFLEX
102 <U1EA1> /xd5 Õ LATIN SMALL LETTER A WITH DOT BELOW
103 <U1EF7> /xd6 Ö LATIN SMALL LETTER Y WITH HOOK ABOVE
104 <U1EEB> /xd7 × LATIN SMALL LETTER U WITH HORN AND GRAVE
105 <U1EED> /xd8 Ø LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE
106 <U00D9> /xd9 Ù LATIN CAPITAL LETTER U WITH GRAVE
107 <U00DA> /xda Ú LATIN CAPITAL LETTER U WITH ACUTE
108 <U1EF9> /xdb Û LATIN SMALL LETTER Y WITH TILDE
109 <U1EF5> /xdc Ü LATIN SMALL LETTER Y WITH DOT BELOW
110 <U00DD> /xdd Ý LATIN CAPITAL LETTER Y WITH ACUTE
111 <U1EE1> /xde Þ LATIN SMALL LETTER O WITH HORN AND TILDE
112 <U01B0> /xdf ß LATIN SMALL LETTER U WITH HORN
113 <U00E0> /xe0 à LATIN SMALL LETTER A WITH GRAVE
114 <U00E1> /xe1 á LATIN SMALL LETTER A WITH ACUTE
115 <U00E2> /xe2 â LATIN SMALL LETTER A WITH CIRCUMFLEX
116 <U00E3> /xe3 ã LATIN SMALL LETTER A WITH TILDE
117 <U1EA3> /xe4 ä LATIN SMALL LETTER A WITH HOOK ABOVE
118 <U0103> /xe5 å LATIN SMALL LETTER A WITH BREVE
119 <U1EEF> /xe6 æ LATIN SMALL LETTER U WITH HORN AND TILDE
120 <U1EAB> /xe7 ç LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE
121 <U00E8> /xe8 è LATIN SMALL LETTER E WITH GRAVE
122 <U00E9> /xe9 é LATIN SMALL LETTER E WITH ACUTE
123 <U00EA> /xea ê LATIN SMALL LETTER E WITH CIRCUMFLEX
124 <U1EBB> /xeb ë LATIN SMALL LETTER E WITH HOOK ABOVE
125 <U00EC> /xec ì LATIN SMALL LETTER I WITH GRAVE
126 <U00ED> /xed í LATIN SMALL LETTER I WITH ACUTE
127 <U0129> /xee î LATIN SMALL LETTER I WITH TILDE
128 <U1EC9> /xef ï LATIN SMALL LETTER I WITH HOOK ABOVE
129 <U0111> /xf0 ð LATIN SMALL LETTER D WITH STROKE
130 <U1EF1> /xf1 ñ LATIN SMALL LETTER U WITH HORN AND DOT BELOW
131 <U00F2> /xf2 ò LATIN SMALL LETTER O WITH GRAVE
132 <U00F3> /xf3 ó LATIN SMALL LETTER O WITH ACUTE
133 <U00F4> /xf4 ô LATIN SMALL LETTER O WITH CIRCUMFLEX
134 <U00F5> /xf5 õ LATIN SMALL LETTER O WITH TILDE
135 <U1ECF> /xf6 ö LATIN SMALL LETTER O WITH HOOK ABOVE
136 <U1ECD> /xf7 ÷ LATIN SMALL LETTER O WITH DOT BELOW
137 <U1EE5> /xf8 ø LATIN SMALL LETTER U WITH DOT BELOW
138 <U00F9> /xf9 ù LATIN SMALL LETTER U WITH GRAVE
139 <U00FA> /xfa ú LATIN SMALL LETTER U WITH ACUTE
140 <U0169> /xfb û LATIN SMALL LETTER U WITH TILDE
141 <U1EE7> /xfc ü LATIN SMALL LETTER U WITH HOOK ABOVE
142 <U00FD> /xfd ý LATIN SMALL LETTER Y WITH ACUTE
143 <U1EE3> /xfe þ LATIN SMALL LETTER O WITH HORN AND DOT BELOW
144 <U1EEE> /xff ÿ LATIN CAPITAL LETTER U WITH HORN AND TILDE
149 namespace Dictionary {
151 static char *vowels
[] = {
161 static char *first_consonants
[] = { // longest first
191 static char *last_consonants
[] = { // longest first
192 "nh","ng","ch",/*"gh","ph","th","tr","gi","kh",*/
193 "c","m","n","p","t",/*"b","k","q","d","ð","g","h","l","r","s","v","x",*/
194 // these are semivowels, not consonants.
199 static char *padding_vowels
[] = {
204 static char *diacritic_table
[6] = {
213 static char *case_table
[2] = {
214 "áàäãÕ⤥¦ç§å¡¢ÆÇ£éè먩ꪫ¬®íìïî¸óòöõ÷ô¯°±²µ½¾¶·ÞþúùüûøßÑ×ØæñýÏÖÛÜð",
215 "ÁÀÄÀ„…†\x06‡Å�‚\x02\x05ƒÉÈˈ‰ÊŠ‹Œ�ŽÍ̛ΘÓÒ™ šÔ��‘’“´•–—³”ÚÙœ�ž¿º»¼ÿ¹ÝŸ\x14\x19\x1eÐ",
// Per-byte lookup tables, one slot for each of the 256 possible char values.
// full_case_table: row 0 is read by viet_tolower and row 1 by viet_toupper.
218 static char full_case_table
[2][256];
// cat_table: CAT_* flag bits for each byte, tested by the viet_is*() helpers.
219 static char cat_table
[256]; // numeric,alpha...
// full_diacritic_table: for each byte, the (row, column) where it occurs in
// diacritic_table; entries are preset to (-1,0) before being filled in.
220 static pair
<char,unsigned char> full_diacritic_table
[256];
225 #define CAT_XDIGIT 16
// Sentinel arguments for the Syllable component constructor. They are
// recognized there by pointer identity (==), not by their text: passing
// syll_exist records a component as -2 and syll_empty records it as -3.
227 static const char *syll_empty
= "Empty";
228 static const char *syll_exist
= "Exist";
230 static vector
<confusion_set
> confusion_sets
;
231 std::vector
<confusion_set
>& get_confusion_sets()
233 return confusion_sets
;
237 string
viet_tolower(const string
&str
); // hack
241 int i
,len
= strlen(case_table
[0]);
243 for (i
= 0;i
< 256;i
++) {
244 if (i
< 128 && i
> 32) {
245 full_case_table
[0][i
] = tolower(i
);
246 full_case_table
[1][i
] = toupper(i
);
248 full_case_table
[0][i
] = i
;
249 full_case_table
[1][i
] = i
;
252 if (isalpha(i
)) cat_table
[i
] |= CAT_ALPHA
;
253 if (isspace(i
)) cat_table
[i
] |= CAT_SPACE
;
254 if (isdigit(i
)) cat_table
[i
] |= CAT_DIGIT
;
255 if (isxdigit(i
)) cat_table
[i
] |= CAT_XDIGIT
;
256 if (ispunct(i
)) cat_table
[i
] |= CAT_PUNCT
;
257 full_diacritic_table
[i
] = make_pair(-1,0);
259 for (i
= 0;i
< len
; i
++) {
260 full_case_table
[0][(unsigned char)case_table
[1][i
]] = case_table
[0][i
];
261 full_case_table
[1][(unsigned char)case_table
[0][i
]] = case_table
[1][i
];
262 cat_table
[(unsigned char)case_table
[0][i
]] = CAT_ALPHA
;
263 cat_table
[(unsigned char)case_table
[1][i
]] = CAT_ALPHA
;
266 for (i
= 0;i
< 6;i
++) {
267 int j
,n
= strlen(diacritic_table
[i
]);
268 for (j
= 0;j
< n
;j
++)
269 full_diacritic_table
[(unsigned char)diacritic_table
[i
][j
]] = make_pair(i
,j
);
272 confusion_sets
.push_back(confusion_set());
273 confusion_sets
.back().push_back(Syllable("c"));
274 confusion_sets
.back().push_back(Syllable("k"));
276 confusion_sets
.push_back(confusion_set());
277 confusion_sets
.back().push_back("g");
278 confusion_sets
.back().push_back("gh");
280 confusion_sets
.push_back(confusion_set());
281 confusion_sets
.back().push_back("ng");
282 confusion_sets
.back().push_back("ngh");
284 confusion_sets
.push_back(confusion_set());
285 confusion_sets
.back().push_back("ch");
286 confusion_sets
.back().push_back("tr");
288 confusion_sets
.push_back(confusion_set());
289 confusion_sets
.back().push_back("s");
290 confusion_sets
.back().push_back("x");
292 confusion_sets
.push_back(confusion_set());
293 confusion_sets
.back().push_back("v");
294 confusion_sets
.back().push_back("d");
295 confusion_sets
.back().push_back("gi");
296 confusion_sets
.back().push_back("r");
298 confusion_sets
.push_back(confusion_set());
299 confusion_sets
.back().push_back(Syllable(syll_empty
,syll_exist
));
300 confusion_sets
.back().push_back(Syllable("h",syll_exist
));
301 confusion_sets
.back().push_back(Syllable("ng",syll_exist
));
302 confusion_sets
.back().push_back(Syllable("q","u"));
304 confusion_sets
.push_back(confusion_set());
305 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"a","i"));
306 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"a","y"));
307 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"â","y"));
309 confusion_sets
.push_back(confusion_set());
310 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"a","u"));
311 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"a","o"));
312 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"â","u"));
314 confusion_sets
.push_back(confusion_set());
315 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"å","m"));
316 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"â","m"));
318 confusion_sets
.push_back(confusion_set());
319 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"å","p"));
320 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"â","p"));
322 confusion_sets
.push_back(confusion_set());
323 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"i","u"));
324 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"iê","u"));
325 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ê","u"));
327 confusion_sets
.push_back(confusion_set());
328 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"i","m"));
329 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"iê","m"));
330 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ê","m"));
331 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"e","m"));
333 confusion_sets
.push_back(confusion_set());
334 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"i","p"));
335 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"iê","p"));
336 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ê","p"));
337 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"e","p"));
339 confusion_sets
.push_back(confusion_set());
340 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"o","i"));
341 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ô","i"));
342 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"½","i"));
344 confusion_sets
.push_back(confusion_set());
345 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"o","m"));
346 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ô","m"));
347 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"½","m"));
349 confusion_sets
.push_back(confusion_set());
350 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"o","p"));
351 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ô","p"));
352 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"½","p"));
354 confusion_sets
.push_back(confusion_set());
355 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"o","ng"));
356 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ô","ng"));
358 confusion_sets
.push_back(confusion_set());
359 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"o","c"));
360 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ô","c"));
362 confusion_sets
.push_back(confusion_set());
363 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"u","i"));
364 confusion_sets
.back().push_back(Syllable(NULL
,"u","ô","i"));
366 confusion_sets
.push_back(confusion_set());
367 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"u","m"));
368 confusion_sets
.back().push_back(Syllable(NULL
,"u","ô","m"));
370 confusion_sets
.push_back(confusion_set());
371 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"u","p"));
372 confusion_sets
.back().push_back(Syllable(NULL
,"u","ô","p"));
374 confusion_sets
.push_back(confusion_set());
375 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ß","i"));
376 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"߽","i"));
378 confusion_sets
.push_back(confusion_set());
379 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ß","u"));
380 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"߽","u"));
382 confusion_sets
.push_back(confusion_set());
383 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ß","m"));
384 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"߽","m"));
386 confusion_sets
.push_back(confusion_set());
387 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ß","p"));
388 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"߽","p"));
390 confusion_sets
.push_back(confusion_set());
391 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"a","n"));
392 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"a","ng"));
394 confusion_sets
.push_back(confusion_set());
395 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"a","t"));
396 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"a","c"));
398 confusion_sets
.push_back(confusion_set());
399 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"å","n"));
400 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"å","ng"));
402 confusion_sets
.push_back(confusion_set());
403 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"å","t"));
404 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"å","c"));
406 confusion_sets
.push_back(confusion_set());
407 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"â","n"));
408 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"â","ng"));
410 confusion_sets
.push_back(confusion_set());
411 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"â","t"));
412 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"â","c"));
414 confusion_sets
.push_back(confusion_set());
415 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"e","n"));
416 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"e","ng"));
418 confusion_sets
.push_back(confusion_set());
419 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"e","t"));
420 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"e","c"));
422 confusion_sets
.push_back(confusion_set());
423 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ê","n"));
424 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ê","nh"));
426 confusion_sets
.push_back(confusion_set());
427 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ê","t"));
428 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ê","c"));
430 confusion_sets
.push_back(confusion_set());
431 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"i","n"));
432 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"i","nh"));
434 confusion_sets
.push_back(confusion_set());
435 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"i","t"));
436 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"i","ch"));
438 confusion_sets
.push_back(confusion_set());
439 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"iê","n"));
440 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"iê","ng"));
442 confusion_sets
.push_back(confusion_set());
443 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"iê","t"));
444 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"iê","c"));
446 confusion_sets
.push_back(confusion_set());
447 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"½","n"));
448 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"½","ng"));
450 confusion_sets
.push_back(confusion_set());
451 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"¾","n"));
452 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"¾","ng"));
454 confusion_sets
.push_back(confusion_set());
455 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"u","n"));
456 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"u","ng"));
458 confusion_sets
.push_back(confusion_set());
459 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"u","t"));
460 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"u","c"));
462 confusion_sets
.push_back(confusion_set());
463 confusion_sets
.back().push_back(Syllable(NULL
,"u","ô","n"));
464 confusion_sets
.back().push_back(Syllable(NULL
,"u","ô","ng"));
466 confusion_sets
.push_back(confusion_set());
467 confusion_sets
.back().push_back(Syllable(NULL
,"u","ô","t"));
468 confusion_sets
.back().push_back(Syllable(NULL
,"u","ô","c"));
470 confusion_sets
.push_back(confusion_set());
471 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ß","n"));
472 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ß","ng"));
474 confusion_sets
.push_back(confusion_set());
475 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ß","t"));
476 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"ß","c"));
478 confusion_sets
.push_back(confusion_set());
479 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"߽","n"));
480 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"߽","ng"));
482 confusion_sets
.push_back(confusion_set());
483 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"߽","t"));
484 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,"߽","c"));
486 confusion_sets
.push_back(confusion_set());
487 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,NULL
,NULL
,Syllable::Hook
));
488 confusion_sets
.back().push_back(Syllable(NULL
,NULL
,NULL
,NULL
,Syllable::Tilde
));
489 //confusion_sets.back().push_back(Syllable(NULL,NULL,NULL,NULL,Syllable::Dot));
493 Syllable::Syllable(const Syllable
&sy
)
495 for (int i
= 0;i
< 5;i
++) {
496 components
[i
] = sy
.components
[i
];
497 scomponents
[i
] = sy
.scomponents
[i
];
501 Syllable::Syllable(const char* _first_consonant
,
502 const char* _padding_vowel
,
504 const char* _last_consonant
,
507 int __first_consonant
= -1;
508 int __padding_vowel
= -1;
510 int __last_consonant
= -1;
511 //int __diacritic = -1;
514 if (_first_consonant
== syll_exist
)
515 __first_consonant
= -2;
516 else if (_first_consonant
== syll_empty
)
517 __first_consonant
= -3;
518 else if (_first_consonant
!= NULL
)
519 for (i
= 0;first_consonants
[i
] != NULL
;i
++)
520 if (!strcmp(first_consonants
[i
],_first_consonant
)) {
521 __first_consonant
= i
;
525 if (_padding_vowel
== syll_exist
)
526 __padding_vowel
= -2;
527 else if (_padding_vowel
== syll_empty
)
528 __padding_vowel
= -3;
529 else if (_padding_vowel
!= NULL
)
530 for (i
= 0;padding_vowels
[i
] != NULL
;i
++) {
531 if (!strcmp(padding_vowels
[i
],_padding_vowel
)) {
537 if (_vowel
== syll_exist
)
539 else if (_vowel
== syll_empty
)
541 else if (_vowel
!= NULL
)
542 for (i
= 0;vowels
[i
] != NULL
;i
++)
543 if (!strcmp(vowels
[i
],_vowel
)) {
548 if (_last_consonant
== syll_exist
)
549 __last_consonant
= -2;
550 else if (_last_consonant
== syll_empty
)
551 __last_consonant
= -3;
552 else if (_last_consonant
!= NULL
)
553 for (i
= 0;last_consonants
[i
] != NULL
;i
++)
554 if (!strcmp(last_consonants
[i
],_last_consonant
)) {
555 __last_consonant
= i
;
559 components
[First_Consonant
] = __first_consonant
;
560 if (__first_consonant
>= 0)
561 scomponents
[First_Consonant
] = first_consonants
[__first_consonant
];
562 components
[Padding_Vowel
] = __padding_vowel
;
563 if (__padding_vowel
>= 0)
564 scomponents
[Padding_Vowel
] = padding_vowels
[__padding_vowel
];
565 components
[Vowel
] = __vowel
;
567 scomponents
[Vowel
] = vowels
[__vowel
];
568 components
[Last_Consonant
] = __last_consonant
;
569 if (__last_consonant
>= 0)
570 scomponents
[Last_Consonant
] = last_consonants
[__last_consonant
];
571 components
[Diacritic
] = d
;
574 bool Syllable::match(const Syllable
&sample
)
576 for (int i
= 0;i
< 5;i
++) {
577 switch (components
[i
]) {
578 case -1:break; // it's alright
580 if (sample
.components
[i
] == -1)
584 if (sample
.components
[i
] != -1)
587 if (components
[i
] != sample
.components
[i
])
594 void Syllable::apply(const Syllable
&sample
,vector
<Syllable
> &output
)
604 for (i
= 0;i
< 5;i
++) {
607 switch (components
[i
]) {
611 case First_Consonant
: p
= first_consonants
; break;
612 case Last_Consonant
: p
= last_consonants
; break;
613 case Padding_Vowel
: p
= padding_vowels
; break;
614 case Vowel
: p
= vowels
; break;
616 for (j
= 0;p
[j
] != NULL
;j
++);
623 s
.components
[i
] = -1;
624 s
.scomponents
[i
] = "";
627 s
.components
[i
] = components
[i
];
628 s
.scomponents
[i
] = scomponents
[i
];
641 if (i
< n
-1 && iter
[i
] < limit
[i
]) {
644 if (i
== n
-1 && iter
[i
] < limit
[i
]) {
645 for (k
= 0;k
< n
;k
++) {
647 for (int kk
= 0;kk
< 5;kk
++)
648 if (components
[kk
] == -2) {
649 output
.back().components
[kk
] = iter
[k
];
650 if (kk
!= Diacritic
) {
652 case First_Consonant
: p
= first_consonants
; break;
653 case Last_Consonant
: p
= last_consonants
; break;
654 case Padding_Vowel
: p
= padding_vowels
; break;
655 case Vowel
: p
= vowels
; break;
657 output
.back().scomponents
[kk
] = p
[iter
[k
]];
664 while (i
>= 0 && iter
[i
] == limit
[i
])
677 // we assume str is a valid syllable ;)
678 bool Syllable::parse(const char *str
)
680 // Rule: there is always vowel. Others can be omitted.
681 // [first_consonant] [padding_vowel] vowel [last_consonant]
686 string
syllable(viet_tolower(string(str
)));
687 string
ssyllable(str
);
689 // first of all, extract the diacritic.
690 // because the syllable has been standardized, just extract the diacritic.
691 if (syllable
[0] >= '0' && syllable
[0] <= '5') {
692 components
[Diacritic
] = syllable
[0] - '0';
694 ssyllable
.erase(0,1);
696 components
[Diacritic
] = None
;
697 len
= syllable
.size();
698 for (k
= 0;k
< len
;k
++) {
699 // look up into diacritic_table
700 for (j
= 1;j
< 6;j
++) {
701 char *pos
= strchr(diacritic_table
[j
],syllable
[k
]);
703 int ipos
= pos
- diacritic_table
[j
];
704 syllable
[k
] = diacritic_table
[0][ipos
]; // remove diacritic
705 if (viet_toupper(ssyllable
[k
]) == ssyllable
[k
])
706 ssyllable
[k
] = viet_toupper(diacritic_table
[0][ipos
]); // remove diacritic
708 ssyllable
[k
] = diacritic_table
[0][ipos
]; // remove diacritic
709 components
[Diacritic
] = j
;
716 // there are 8 cases:
718 { 0,-1, 0, 0}, // F_VL
719 {-1,-1, 0, 0}, // __VL
720 { 0,-1, 0,-1}, // F_V_
721 {-1,-1, 0,-1}, // __V_
722 { 0, 0, 0, 0}, // FPVL
723 {-1, 0, 0, 0}, // _PVL
724 { 0, 0, 0,-1}, // FPV_
725 {-1, 0, 0,-1}, // _PV_
728 string saved_syllable
= syllable
;
729 string saved_ssyllable
= ssyllable
;
731 for (unsigned z
= 0;z
< 8;z
++) {
733 syllable
= saved_syllable
;
734 ssyllable
= saved_ssyllable
;
736 // parse from the pattern cases[z]
737 for (unsigned zz
= 0;ok
&& zz
< 4;zz
++) { // component
739 scomponents
[zz
] = "";
740 if (ok
&& cases
[z
][zz
] == 0) {
741 // get the first_consonant
743 len
= syllable
.size();
746 case First_Consonant
: p
= first_consonants
; break;
747 case Last_Consonant
: p
= last_consonants
; break;
748 case Padding_Vowel
: p
= padding_vowels
; break;
749 case Vowel
: p
= vowels
; break;
751 for (i
= 0;p
[i
] != 0; i
++) {
752 char *pattern
= p
[i
];
753 int pattern_len
= strlen(pattern
);
755 if (len
>= pattern_len
&& // equal is possible
756 syllable
.substr(0,pattern_len
) == pattern
) {
757 //cerr << "Comp " << zz << " is <" << pattern << ">" << endl;
759 scomponents
[zz
] = ssyllable
.substr(0,pattern_len
);
760 syllable
.erase(0,pattern_len
);
761 ssyllable
.erase(0,pattern_len
);
769 // some rules to prevent errors
770 // the last consonant "i" is only followed by u+, u+o+, o+, a^, a, a(, u, uo^, o^, o
771 // the last consonant "u" is only followed by i, ie^, e^, e, u+, u+o+, o+, a^, a, a(
772 // padding vowels don't precede u, uo^, o^, o --> vowel is higher priority than padding vowel
774 // The following cases need care: uo^, ua, ui, uy, oi, qu.
775 // uo^, ua, ui, oi are resolved by the order of the entries in cases.
776 if (ok
&& syllable
.empty()) {
777 // "q" always precedes 'u' (padding vowel)
778 if (components
[First_Consonant
] != -1 &&
779 !strcmp(first_consonants
[components
[First_Consonant
]],"q") && // first consonant is 'q'
780 (components
[Padding_Vowel
] == -1 ||
781 strcmp(padding_vowels
[components
[Padding_Vowel
]],"u"))) // padding not exist or not 'u'
783 else if (components
[Vowel
] != -1 &&
784 !strcmp(vowels
[components
[Vowel
]],"u") &&
785 components
[Last_Consonant
] != -1 &&
786 !strcmp(last_consonants
[components
[Last_Consonant
]],"y"))
790 if (ok
&& syllable
.empty())
793 //cerr << "Case " << z << " failed" << endl;
799 std::ostream
& operator << (std::ostream
&os
,const Syllable
&sy
)
802 char *diacritics
[] = {"_","'","`","?","~","."};
803 for (int i
= 0;i
< 5;i
++) {
804 if (sy
.components
[i
] < 0)
808 case Syllable::First_Consonant
: p
= first_consonants
; break;
809 case Syllable::Last_Consonant
: p
= last_consonants
; break;
810 case Syllable::Padding_Vowel
: p
= padding_vowels
; break;
811 case Syllable::Vowel
: p
= vowels
; break;
812 case Syllable::Diacritic
: p
= diacritics
; break;
814 if (i
!= Syllable::Diacritic
)
815 os
<< sy
.scomponents
[i
];
817 os
<< p
[sy
.components
[i
]];
824 string
Syllable::to_str() const
828 for (int i
= 0;i
< 4;i
++) {
829 if (components
[i
] >= 0) {
831 case First_Consonant
: p
= first_consonants
; break;
832 case Last_Consonant
: p
= last_consonants
; break;
833 case Padding_Vowel
: p
= padding_vowels
; break;
834 case Vowel
: p
= vowels
; break;
836 s
+= scomponents
[i
]; // no diacritic because i=0..3
837 if (i
== Vowel
&& components
[Diacritic
] != None
) {
839 if (components
[Last_Consonant
] == -1)
840 last
= s
.size() - strlen(scomponents
[Vowel
].c_str());
843 int j
= strchr(diacritic_table
[0],viet_tolower(s
[last
])) - diacritic_table
[0];
844 if (viet_toupper(s
[last
]) == s
[last
])
845 s
[last
] = viet_toupper(diacritic_table
[components
[Diacritic
]][j
]);
847 s
[last
] = diacritic_table
[components
[Diacritic
]][j
];
854 string
Syllable::to_std_str() const
859 s
[0] = '0'+components
[Diacritic
];
861 for (int i
= 0;i
< 4;i
++) {
862 if (components
[i
] >= 0) {
864 case First_Consonant
: p
= first_consonants
; break;
865 case Last_Consonant
: p
= last_consonants
; break;
866 case Padding_Vowel
: p
= padding_vowels
; break;
867 case Vowel
: p
= vowels
; break;
869 s
+= scomponents
[i
]; // no diacritic because i=0..3
876 strid
Syllable::to_id() const
878 return get_sarch()[to_str()];
881 strid
Syllable::to_std_id() const
883 return get_sarch()[to_std_str()];
886 std::string
Syllable::get_component(int i
)
889 case First_Consonant
: return first_consonants
[i
];
890 case Padding_Vowel
: return padding_vowels
[i
];
891 case Vowel
: return vowels
[i
];
892 case Last_Consonant
: return last_consonants
[i
];
897 bool Syllable::set_component(int pos
,const char *s
)
901 case First_Consonant
: p
= first_consonants
;
902 case Padding_Vowel
: p
= padding_vowels
;
903 case Vowel
: p
= vowels
;
904 case Last_Consonant
: p
= last_consonants
;
905 default: return false;
908 for (int i
= 0;p
[i
];i
++)
909 if (get_lowercased_syllable(s
) == p
[i
]) {
911 scomponents
[pos
] = s
;
919 void Syllable::standardize(std::string syllable)
921 // We just need to eliminate "òa", "òe", "ùa"
923 // first, cut the first consonant off
925 int len = syllable.size();
926 for (i = 0;first_consonants[i] != 0; i++) {
927 char *pattern = *first_consonants[i];
928 int pattern_len = strlen(pattern);
930 if (len > pattern_len &&
931 syllable.compare(0,pattern_len,pattern) == 0) {
940 int viet_toupper(int ch
) // must be sure ch is a character
942 return full_case_table
[1][(unsigned char)(char)ch
];
945 int viet_tolower(int ch
) // must be sure ch is a character
947 return full_case_table
[0][(unsigned char)(char)ch
];
950 string
viet_tolower(const string
&str
)
954 for (int i
= 0;i
< n
;i
++)
955 s
[i
] = viet_tolower(s
[i
]);
959 bool viet_isupper(int ch
)
961 return viet_isalpha(ch
) && full_case_table
[1][ch
] == ch
;
964 bool viet_islower(int ch
)
966 return viet_isalpha(ch
) && full_case_table
[0][ch
] == ch
;
969 bool viet_isalpha(int ch
)
971 return cat_table
[ch
] & CAT_ALPHA
;
974 bool viet_isdigit(int ch
)
976 return cat_table
[ch
] & CAT_DIGIT
;
979 bool viet_isxdigit(int ch
)
981 return cat_table
[ch
] & CAT_XDIGIT
;
984 bool viet_isspace(int ch
)
986 return cat_table
[ch
] & CAT_SPACE
;
989 bool viet_ispunct(int ch
)
991 return cat_table
[ch
] & CAT_PUNCT
;
994 string
get_std_syllable(const string
&str
)
996 uint i
,n
= str
.size();
999 if (sy
.parse(str
.c_str())) {
1001 if (sy.parse(s.c_str()) && // 'y' canonicalization
1002 sy.components[Syllable::Padding_Vowel] == -1 &&
1003 sy.get_component(Syllable::Vowel) == "y")
1004 sy.set_component(Syllable::Vowel,s[s.size()-1] == 'Y' ? "I" : "i");
1006 return sy
.to_std_str();
1008 return string("0")+str
;
1011 string
get_unstd_syllable(const string
&str
)
1013 uint i
,n
= str
.size();
1016 if (sy
.parse(str
.c_str()))
1019 return (str
[0] >= '0' && str
[0] <= '5') ? str
.substr(1) : str
;
1022 string
get_lowercased_syllable(const string
&str
)
1024 return viet_tolower(str
);
1027 bool operator < (const Syllable
&s1
,const Syllable
&s2
)
1030 for (i
= 0;i
< 4;i
++) {
1031 if (s1
.scomponents
[i
] == s2
.scomponents
[i
])
1033 if (s1
.scomponents
[i
] > s2
.scomponents
[i
])
1035 if (s1
.scomponents
[i
] < s2
.scomponents
[i
])
1039 if (s1
.components
[i
] == s2
.components
[i
])
1041 if (s1
.components
[i
] > s2
.components
[i
])
1043 if (s1
.components
[i
] < s2
.components
[i
])
1047 bool operator == (const Syllable
&s1
,const Syllable
&s2
)
1049 for (int i
= 0;i
< 4;i
++) {
1050 if (s1
.scomponents
[i
] == s2
.scomponents
[i
])
1054 return s1
.components
[4] == s2
.components
[4];