fixed bugs in Text::penalty2_construct(), Penalty2DAG::set_syllable_weights()
[vspell.git] / libvspell / syllable.cpp
blob81dfea963ee1310bb3e6ad1d8e10d031ef128ff0
1 // -*- coding: viscii tab-width:2 mode: c++ -*-
2 #include <string>
3 #include <iostream>
4 #include <string.h>
5 #include "dictionary.h"
6 #include "syllable.h"
7 #include "spell.h"
9 /*
10 VISCII character map.
11 <U1EB2> /x02 \x02 LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE
12 <U1EB4> /x05 \x05 LATIN CAPITAL LETTER A WITH BREVE AND TILDE
13 <U1EAA> /x06 \x06 LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE
14 <U1EF6> /x14 \x14 LATIN CAPITAL LETTER Y WITH HOOK ABOVE
15 <U1EF8> /x19 \x19 LATIN CAPITAL LETTER Y WITH TILDE
16 <U1EF4> /x1e \x1e LATIN CAPITAL LETTER Y WITH DOT BELOW
17 <U1EA0> /x80 € LATIN CAPITAL LETTER A WITH DOT BELOW
18 <U1EAE> /x81 � LATIN CAPITAL LETTER A WITH BREVE AND ACUTE
19 <U1EB0> /x82 ‚ LATIN CAPITAL LETTER A WITH BREVE AND GRAVE
20 <U1EB6> /x83 ƒ LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW
21 <U1EA4> /x84 „ LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE
22 <U1EA6> /x85 … LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE
23 <U1EA8> /x86 † LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE
24 <U1EAC> /x87 ‡ LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW
25 <U1EBC> /x88 ˆ LATIN CAPITAL LETTER E WITH TILDE
26 <U1EB8> /x89 ‰ LATIN CAPITAL LETTER E WITH DOT BELOW
27 <U1EBE> /x8a Š LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE
28 <U1EC0> /x8b ‹ LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE
29 <U1EC2> /x8c Œ LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE
30 <U1EC4> /x8d � LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE
31 <U1EC6> /x8e Ž LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW
32 <U1ED0> /x8f � LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE
33 <U1ED2> /x90 � LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE
34 <U1ED4> /x91 ‘ LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE
35 <U1ED6> /x92 ’ LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE
36 <U1ED8> /x93 “ LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW
37 <U1EE2> /x94 ” LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW
38 <U1EDA> /x95 • LATIN CAPITAL LETTER O WITH HORN AND ACUTE
39 <U1EDC> /x96 – LATIN CAPITAL LETTER O WITH HORN AND GRAVE
40 <U1EDE> /x97 — LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE
41 <U1ECA> /x98 ˜ LATIN CAPITAL LETTER I WITH DOT BELOW
42 <U1ECE> /x99 ™ LATIN CAPITAL LETTER O WITH HOOK ABOVE
43 <U1ECC> /x9a š LATIN CAPITAL LETTER O WITH DOT BELOW
44 <U1EC8> /x9b › LATIN CAPITAL LETTER I WITH HOOK ABOVE
45 <U1EE6> /x9c œ LATIN CAPITAL LETTER U WITH HOOK ABOVE
46 <U0168> /x9d � LATIN CAPITAL LETTER U WITH TILDE
47 <U1EE4> /x9e ž LATIN CAPITAL LETTER U WITH DOT BELOW
48 <U1EF2> /x9f Ÿ LATIN CAPITAL LETTER Y WITH GRAVE
49 <U00D5> /xa0   LATIN CAPITAL LETTER O WITH TILDE
50 <U1EAF> /xa1 ¡ LATIN SMALL LETTER A WITH BREVE AND ACUTE
51 <U1EB1> /xa2 ¢ LATIN SMALL LETTER A WITH BREVE AND GRAVE
52 <U1EB7> /xa3 £ LATIN SMALL LETTER A WITH BREVE AND DOT BELOW
53 <U1EA5> /xa4 ¤ LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE
54 <U1EA7> /xa5 ¥ LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE
55 <U1EA9> /xa6 ¦ LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE
56 <U1EAD> /xa7 § LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW
57 <U1EBD> /xa8 ¨ LATIN SMALL LETTER E WITH TILDE
58 <U1EB9> /xa9 © LATIN SMALL LETTER E WITH DOT BELOW
59 <U1EBF> /xaa ª LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE
60 <U1EC1> /xab « LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE
61 <U1EC3> /xac ¬ LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE
62 <U1EC5> /xad ­ LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE
63 <U1EC7> /xae ® LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW
64 <U1ED1> /xaf ¯ LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE
65 <U1ED3> /xb0 ° LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE
66 <U1ED5> /xb1 ± LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE
67 <U1ED7> /xb2 ² LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE
68 <U1EE0> /xb3 ³ LATIN CAPITAL LETTER O WITH HORN AND TILDE
69 <U01A0> /xb4 ´ LATIN CAPITAL LETTER O WITH HORN
70 <U1ED9> /xb5 µ LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW
71 <U1EDD> /xb6 ¶ LATIN SMALL LETTER O WITH HORN AND GRAVE
72 <U1EDF> /xb7 · LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE
73 <U1ECB> /xb8 ¸ LATIN SMALL LETTER I WITH DOT BELOW
74 <U1EF0> /xb9 ¹ LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW
75 <U1EE8> /xba º LATIN CAPITAL LETTER U WITH HORN AND ACUTE
76 <U1EEA> /xbb » LATIN CAPITAL LETTER U WITH HORN AND GRAVE
77 <U1EEC> /xbc ¼ LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE
78 <U01A1> /xbd ½ LATIN SMALL LETTER O WITH HORN
79 <U1EDB> /xbe ¾ LATIN SMALL LETTER O WITH HORN AND ACUTE
80 <U01AF> /xbf ¿ LATIN CAPITAL LETTER U WITH HORN
81 <U00C0> /xc0 À LATIN CAPITAL LETTER A WITH GRAVE
82 <U00C1> /xc1 Á LATIN CAPITAL LETTER A WITH ACUTE
83 <U00C2> /xc2 Â LATIN CAPITAL LETTER A WITH CIRCUMFLEX
84 <U00C3> /xc3 Ã LATIN CAPITAL LETTER A WITH TILDE
85 <U1EA2> /xc4 Ä LATIN CAPITAL LETTER A WITH HOOK ABOVE
86 <U0102> /xc5 Å LATIN CAPITAL LETTER A WITH BREVE
87 <U1EB3> /xc6 Æ LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE
88 <U1EB5> /xc7 Ç LATIN SMALL LETTER A WITH BREVE AND TILDE
89 <U00C8> /xc8 È LATIN CAPITAL LETTER E WITH GRAVE
90 <U00C9> /xc9 É LATIN CAPITAL LETTER E WITH ACUTE
91 <U00CA> /xca Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
92 <U1EBA> /xcb Ë LATIN CAPITAL LETTER E WITH HOOK ABOVE
93 <U00CC> /xcc Ì LATIN CAPITAL LETTER I WITH GRAVE
94 <U00CD> /xcd Í LATIN CAPITAL LETTER I WITH ACUTE
95 <U0128> /xce Î LATIN CAPITAL LETTER I WITH TILDE
96 <U1EF3> /xcf Ï LATIN SMALL LETTER Y WITH GRAVE
97 <U0110> /xd0 Ð LATIN CAPITAL LETTER D WITH STROKE
98 <U1EE9> /xd1 Ñ LATIN SMALL LETTER U WITH HORN AND ACUTE
99 <U00D2> /xd2 Ò LATIN CAPITAL LETTER O WITH GRAVE
100 <U00D3> /xd3 Ó LATIN CAPITAL LETTER O WITH ACUTE
101 <U00D4> /xd4 Ô LATIN CAPITAL LETTER O WITH CIRCUMFLEX
102 <U1EA1> /xd5 Õ LATIN SMALL LETTER A WITH DOT BELOW
103 <U1EF7> /xd6 Ö LATIN SMALL LETTER Y WITH HOOK ABOVE
104 <U1EEB> /xd7 × LATIN SMALL LETTER U WITH HORN AND GRAVE
105 <U1EED> /xd8 Ø LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE
106 <U00D9> /xd9 Ù LATIN CAPITAL LETTER U WITH GRAVE
107 <U00DA> /xda Ú LATIN CAPITAL LETTER U WITH ACUTE
108 <U1EF9> /xdb Û LATIN SMALL LETTER Y WITH TILDE
109 <U1EF5> /xdc Ü LATIN SMALL LETTER Y WITH DOT BELOW
110 <U00DD> /xdd Ý LATIN CAPITAL LETTER Y WITH ACUTE
111 <U1EE1> /xde Þ LATIN SMALL LETTER O WITH HORN AND TILDE
112 <U01B0> /xdf ß LATIN SMALL LETTER U WITH HORN
113 <U00E0> /xe0 à LATIN SMALL LETTER A WITH GRAVE
114 <U00E1> /xe1 á LATIN SMALL LETTER A WITH ACUTE
115 <U00E2> /xe2 â LATIN SMALL LETTER A WITH CIRCUMFLEX
116 <U00E3> /xe3 ã LATIN SMALL LETTER A WITH TILDE
117 <U1EA3> /xe4 ä LATIN SMALL LETTER A WITH HOOK ABOVE
118 <U0103> /xe5 å LATIN SMALL LETTER A WITH BREVE
119 <U1EEF> /xe6 æ LATIN SMALL LETTER U WITH HORN AND TILDE
120 <U1EAB> /xe7 ç LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE
121 <U00E8> /xe8 è LATIN SMALL LETTER E WITH GRAVE
122 <U00E9> /xe9 é LATIN SMALL LETTER E WITH ACUTE
123 <U00EA> /xea ê LATIN SMALL LETTER E WITH CIRCUMFLEX
124 <U1EBB> /xeb ë LATIN SMALL LETTER E WITH HOOK ABOVE
125 <U00EC> /xec ì LATIN SMALL LETTER I WITH GRAVE
126 <U00ED> /xed í LATIN SMALL LETTER I WITH ACUTE
127 <U0129> /xee î LATIN SMALL LETTER I WITH TILDE
128 <U1EC9> /xef ï LATIN SMALL LETTER I WITH HOOK ABOVE
129 <U0111> /xf0 ð LATIN SMALL LETTER D WITH STROKE
130 <U1EF1> /xf1 ñ LATIN SMALL LETTER U WITH HORN AND DOT BELOW
131 <U00F2> /xf2 ò LATIN SMALL LETTER O WITH GRAVE
132 <U00F3> /xf3 ó LATIN SMALL LETTER O WITH ACUTE
133 <U00F4> /xf4 ô LATIN SMALL LETTER O WITH CIRCUMFLEX
134 <U00F5> /xf5 õ LATIN SMALL LETTER O WITH TILDE
135 <U1ECF> /xf6 ö LATIN SMALL LETTER O WITH HOOK ABOVE
136 <U1ECD> /xf7 ÷ LATIN SMALL LETTER O WITH DOT BELOW
137 <U1EE5> /xf8 ø LATIN SMALL LETTER U WITH DOT BELOW
138 <U00F9> /xf9 ù LATIN SMALL LETTER U WITH GRAVE
139 <U00FA> /xfa ú LATIN SMALL LETTER U WITH ACUTE
140 <U0169> /xfb û LATIN SMALL LETTER U WITH TILDE
141 <U1EE7> /xfc ü LATIN SMALL LETTER U WITH HOOK ABOVE
142 <U00FD> /xfd ý LATIN SMALL LETTER Y WITH ACUTE
143 <U1EE3> /xfe þ LATIN SMALL LETTER O WITH HORN AND DOT BELOW
144 <U1EEE> /xff ÿ LATIN CAPITAL LETTER U WITH HORN AND TILDE
147 using namespace std;
149 namespace Dictionary {
// Vowel nuclei recognised by Syllable, VISCII-encoded (per the character map
// above, 0xDF is u-with-horn, 0xBD is o-with-horn, 0xE5 is a-with-breve).
// Syllable::parse() accepts the first entry that prefixes the remaining text,
// so the two-letter nuclei are listed before the single letters.
// The list is NULL-terminated; an entry's index is the value stored in
// components[Vowel].
151 static char *vowels[] = {
152 "iê","yê","ia",
153 "ß½","ßa","uô","ua",
154 "a","å","â",
155 "e","ê",
156 "o","ô","½",
157 "u","ß","i","y",
158 NULL
// Initial consonants, NULL-terminated. Ordered longest first because
// Syllable::parse() stops at the first entry that prefixes the input.
// The trailing numbers are the table indices stored in
// components[First_Consonant].
161 static char *first_consonants[] = { // longest first
162 "ngh", // 0
163 "nh", // 1
164 "ng", // 2
165 "ch", // 3
166 "gh", // 4
167 "ph", // 5
168 "th", // 6
169 "tr", // 7
170 "gi", // 8
171 "kh", // 9
172 "c", // 10
173 "m", // 11
174 "n"/*,"p"*/, // 12
175 "t", // 13
176 "b", // 14
177 "k", // 15
178 "q", // 16
179 "d", // 17
180 "ð", // 18
181 "g", // 19
182 "h", // 20
183 "l", // 21
184 "r", // 22
185 "s", // 23
186 "v", // 24
187 "x", // 25
188 NULL
// Syllable endings, NULL-terminated and ordered longest first (first prefix
// match wins in Syllable::parse()). The commented-out entries are initial
// consonants that are deliberately not accepted as endings.
191 static char *last_consonants[] = { // longest first
192 "nh","ng","ch",/*"gh","ph","th","tr","gi","kh",*/
193 "c","m","n","p","t",/*"b","k","q","d","ð","g","h","l","r","s","v","x",*/
194 // these are semivowels, not consonants.
195 "i","y","o","u",
196 NULL
// Padding (medial) vowels that may sit between the initial consonant and the
// vowel nucleus. NULL-terminated; index is stored in components[Padding_Vowel].
199 static char *padding_vowels[] = {
200 "o","u",
201 NULL
// Tone table. Row 0 holds the twelve toneless vowel letters; rows 1..5 hold
// the same letters carrying tone 1..5, column for column. Syllable::parse()
// maps a toned letter in row j back to row 0 and records j as the diacritic;
// Syllable::to_str() does the reverse.
// NOTE(review): some literals below look mis-decoded (VISCII bytes shown as
// unrelated characters) — verify each row against the VISCII map above and
// confirm every row has the same length as row 0.
204 static char *diacritic_table[6] = {
205 "aâåeêioô½ußy",
206 "ᤡéªíó¯¾úÑý",
207 "ॢè«ìò°¶ù×Ï",
208 "ä¦Æë¬ïö±·üØÖ",
209 "ãçǨ­îõ²ÞûæÛ",
210 "Õ§£©®¸÷µþøñÜ",
// Case pairs for the non-ASCII VISCII letters: case_table[0][k] is the
// lowercase form of case_table[1][k]. syllable_init() walks both rows with
// the length of row 0, so the two rows must have the same length.
// NOTE(review): row 1 contains characters that look mis-decoded — verify
// against the VISCII map above before relying on it.
213 static char *case_table[2] = {
214 "áàäãÕ⤥¦ç§å¡¢ÆÇ£éè먩ꪫ¬­®íìïî¸óòöõ÷ô¯°±²µ½¾¶·ÞþúùüûøßÑ×ØæñýÏÖÛÜð",
215 "ÁÀÄÀ„…†\x06‡Å�‚\x02\x05ƒÉÈˈ‰ÊŠ‹Œ�ŽÍ̛ΘÓÒ™ šÔ��‘’“´•–—³”ÚÙœ�ž¿º»¼ÿ¹ÝŸ\x14\x19\x1eÐ",
// Per-byte lookup tables, all filled by syllable_init().
// full_case_table[0][c] is the lowercase form of byte c, [1][c] the uppercase.
218 static char full_case_table[2][256];
// Bitmask of CAT_* flags for each byte value.
219 static char cat_table[256]; // numeric,alpha...
// For each byte: (row, column) of that byte in diacritic_table, or (-1,0)
// when the byte does not occur there.
220 static pair<char,unsigned char> full_diacritic_table[256];
// Character-category bit flags stored in cat_table.
221 #define CAT_ALPHA 1
222 #define CAT_DIGIT 2
223 #define CAT_SPACE 4
224 #define CAT_PUNCT 8
225 #define CAT_XDIGIT 16
// Sentinel arguments for the Syllable constructor. They are recognised by
// pointer identity, not by content: syll_empty requests "component must be
// absent" (-3) and syll_exist requests "component must be present" (-2).
227 static const char *syll_empty = "Empty";
228 static const char *syll_exist = "Exist";
// Confusion sets built by syllable_init() and exposed by get_confusion_sets().
230 static vector<confusion_set> confusion_sets;
231 std::vector<confusion_set>& get_confusion_sets()
233 return confusion_sets;
237 string viet_tolower(const string &str); // hack
// Builds every lookup table in this file and fills confusion_sets.
// Always returns true.
// NOTE(review): the confusion sets are appended with push_back and never
// cleared, so calling this function twice would duplicate them.
239 bool syllable_init()
241 int i,len = strlen(case_table[0]);
// Pass 1: defaults for all 256 byte values. Bytes 33..127 use the C library
// case mapping, every other byte maps to itself; categories come from
// <ctype.h>; no byte has a diacritic entry yet.
243 for (i = 0;i < 256;i ++) {
244 if (i < 128 && i > 32) {
245 full_case_table[0][i] = tolower(i);
246 full_case_table[1][i] = toupper(i);
247 } else {
248 full_case_table[0][i] = i;
249 full_case_table[1][i] = i;
251 cat_table[i] = 0;
252 if (isalpha(i)) cat_table[i] |= CAT_ALPHA;
253 if (isspace(i)) cat_table[i] |= CAT_SPACE;
254 if (isdigit(i)) cat_table[i] |= CAT_DIGIT;
255 if (isxdigit(i)) cat_table[i] |= CAT_XDIGIT;
256 if (ispunct(i)) cat_table[i] |= CAT_PUNCT;
257 full_diacritic_table[i] = make_pair(-1,0);
// Pass 2: overlay the VISCII letter pairs from case_table. Note the category
// is assigned (not OR-ed), so any flag set in pass 1 is replaced by CAT_ALPHA.
259 for (i = 0;i < len; i ++) {
260 full_case_table[0][(unsigned char)case_table[1][i]] = case_table[0][i];
261 full_case_table[1][(unsigned char)case_table[0][i]] = case_table[1][i];
262 cat_table[(unsigned char)case_table[0][i]] = CAT_ALPHA;
263 cat_table[(unsigned char)case_table[1][i]] = CAT_ALPHA;
// Pass 3: record, for every letter in diacritic_table, its (row, column).
266 for (i = 0;i < 6;i ++) {
267 int j,n = strlen(diacritic_table[i]);
268 for (j = 0;j < n;j ++)
269 full_diacritic_table[(unsigned char)diacritic_table[i][j]] = make_pair(i,j);
// Confusion sets. Each set is a list of Syllable patterns; a NULL argument
// leaves that component unconstrained (see the Syllable constructor).
// Presumably the members of one set are spellings that are easily mistaken
// for each other — confirm against the callers of get_confusion_sets().
// --- initial consonants ---
272 confusion_sets.push_back(confusion_set());
273 confusion_sets.back().push_back(Syllable("c"));
274 confusion_sets.back().push_back(Syllable("k"));
276 confusion_sets.push_back(confusion_set());
277 confusion_sets.back().push_back("g");
278 confusion_sets.back().push_back("gh");
280 confusion_sets.push_back(confusion_set());
281 confusion_sets.back().push_back("ng");
282 confusion_sets.back().push_back("ngh");
284 confusion_sets.push_back(confusion_set());
285 confusion_sets.back().push_back("ch");
286 confusion_sets.back().push_back("tr");
288 confusion_sets.push_back(confusion_set());
289 confusion_sets.back().push_back("s");
290 confusion_sets.back().push_back("x");
292 confusion_sets.push_back(confusion_set());
293 confusion_sets.back().push_back("v");
294 confusion_sets.back().push_back("d");
295 confusion_sets.back().push_back("gi");
296 confusion_sets.back().push_back("r");
// --- initial consonant combined with a constraint on the padding vowel ---
298 confusion_sets.push_back(confusion_set());
299 confusion_sets.back().push_back(Syllable(syll_empty,syll_exist));
300 confusion_sets.back().push_back(Syllable("h",syll_exist));
301 confusion_sets.back().push_back(Syllable("ng",syll_exist));
302 confusion_sets.back().push_back(Syllable("q","u"));
// --- vowel + ending pairs ---
304 confusion_sets.push_back(confusion_set());
305 confusion_sets.back().push_back(Syllable(NULL,NULL,"a","i"));
306 confusion_sets.back().push_back(Syllable(NULL,NULL,"a","y"));
307 confusion_sets.back().push_back(Syllable(NULL,NULL,"â","y"));
309 confusion_sets.push_back(confusion_set());
310 confusion_sets.back().push_back(Syllable(NULL,NULL,"a","u"));
311 confusion_sets.back().push_back(Syllable(NULL,NULL,"a","o"));
312 confusion_sets.back().push_back(Syllable(NULL,NULL,"â","u"));
314 confusion_sets.push_back(confusion_set());
315 confusion_sets.back().push_back(Syllable(NULL,NULL,"å","m"));
316 confusion_sets.back().push_back(Syllable(NULL,NULL,"â","m"));
318 confusion_sets.push_back(confusion_set());
319 confusion_sets.back().push_back(Syllable(NULL,NULL,"å","p"));
320 confusion_sets.back().push_back(Syllable(NULL,NULL,"â","p"));
322 confusion_sets.push_back(confusion_set());
323 confusion_sets.back().push_back(Syllable(NULL,NULL,"i","u"));
324 confusion_sets.back().push_back(Syllable(NULL,NULL,"iê","u"));
325 confusion_sets.back().push_back(Syllable(NULL,NULL,"ê","u"));
327 confusion_sets.push_back(confusion_set());
328 confusion_sets.back().push_back(Syllable(NULL,NULL,"i","m"));
329 confusion_sets.back().push_back(Syllable(NULL,NULL,"iê","m"));
330 confusion_sets.back().push_back(Syllable(NULL,NULL,"ê","m"));
331 confusion_sets.back().push_back(Syllable(NULL,NULL,"e","m"));
333 confusion_sets.push_back(confusion_set());
334 confusion_sets.back().push_back(Syllable(NULL,NULL,"i","p"));
335 confusion_sets.back().push_back(Syllable(NULL,NULL,"iê","p"));
336 confusion_sets.back().push_back(Syllable(NULL,NULL,"ê","p"));
337 confusion_sets.back().push_back(Syllable(NULL,NULL,"e","p"));
339 confusion_sets.push_back(confusion_set());
340 confusion_sets.back().push_back(Syllable(NULL,NULL,"o","i"));
341 confusion_sets.back().push_back(Syllable(NULL,NULL,"ô","i"));
342 confusion_sets.back().push_back(Syllable(NULL,NULL,"½","i"));
344 confusion_sets.push_back(confusion_set());
345 confusion_sets.back().push_back(Syllable(NULL,NULL,"o","m"));
346 confusion_sets.back().push_back(Syllable(NULL,NULL,"ô","m"));
347 confusion_sets.back().push_back(Syllable(NULL,NULL,"½","m"));
349 confusion_sets.push_back(confusion_set());
350 confusion_sets.back().push_back(Syllable(NULL,NULL,"o","p"));
351 confusion_sets.back().push_back(Syllable(NULL,NULL,"ô","p"));
352 confusion_sets.back().push_back(Syllable(NULL,NULL,"½","p"));
354 confusion_sets.push_back(confusion_set());
355 confusion_sets.back().push_back(Syllable(NULL,NULL,"o","ng"));
356 confusion_sets.back().push_back(Syllable(NULL,NULL,"ô","ng"));
358 confusion_sets.push_back(confusion_set());
359 confusion_sets.back().push_back(Syllable(NULL,NULL,"o","c"));
360 confusion_sets.back().push_back(Syllable(NULL,NULL,"ô","c"));
362 confusion_sets.push_back(confusion_set());
363 confusion_sets.back().push_back(Syllable(NULL,NULL,"u","i"));
364 confusion_sets.back().push_back(Syllable(NULL,"u","ô","i"));
366 confusion_sets.push_back(confusion_set());
367 confusion_sets.back().push_back(Syllable(NULL,NULL,"u","m"));
368 confusion_sets.back().push_back(Syllable(NULL,"u","ô","m"));
370 confusion_sets.push_back(confusion_set());
371 confusion_sets.back().push_back(Syllable(NULL,NULL,"u","p"));
372 confusion_sets.back().push_back(Syllable(NULL,"u","ô","p"));
374 confusion_sets.push_back(confusion_set());
375 confusion_sets.back().push_back(Syllable(NULL,NULL,"ß","i"));
376 confusion_sets.back().push_back(Syllable(NULL,NULL,"߽","i"));
378 confusion_sets.push_back(confusion_set());
379 confusion_sets.back().push_back(Syllable(NULL,NULL,"ß","u"));
380 confusion_sets.back().push_back(Syllable(NULL,NULL,"߽","u"));
382 confusion_sets.push_back(confusion_set());
383 confusion_sets.back().push_back(Syllable(NULL,NULL,"ß","m"));
384 confusion_sets.back().push_back(Syllable(NULL,NULL,"߽","m"));
386 confusion_sets.push_back(confusion_set());
387 confusion_sets.back().push_back(Syllable(NULL,NULL,"ß","p"));
388 confusion_sets.back().push_back(Syllable(NULL,NULL,"߽","p"));
// --- same vowel, confusable endings ---
390 confusion_sets.push_back(confusion_set());
391 confusion_sets.back().push_back(Syllable(NULL,NULL,"a","n"));
392 confusion_sets.back().push_back(Syllable(NULL,NULL,"a","ng"));
394 confusion_sets.push_back(confusion_set());
395 confusion_sets.back().push_back(Syllable(NULL,NULL,"a","t"));
396 confusion_sets.back().push_back(Syllable(NULL,NULL,"a","c"));
398 confusion_sets.push_back(confusion_set());
399 confusion_sets.back().push_back(Syllable(NULL,NULL,"å","n"));
400 confusion_sets.back().push_back(Syllable(NULL,NULL,"å","ng"));
402 confusion_sets.push_back(confusion_set());
403 confusion_sets.back().push_back(Syllable(NULL,NULL,"å","t"));
404 confusion_sets.back().push_back(Syllable(NULL,NULL,"å","c"));
406 confusion_sets.push_back(confusion_set());
407 confusion_sets.back().push_back(Syllable(NULL,NULL,"â","n"));
408 confusion_sets.back().push_back(Syllable(NULL,NULL,"â","ng"));
410 confusion_sets.push_back(confusion_set());
411 confusion_sets.back().push_back(Syllable(NULL,NULL,"â","t"));
412 confusion_sets.back().push_back(Syllable(NULL,NULL,"â","c"));
414 confusion_sets.push_back(confusion_set());
415 confusion_sets.back().push_back(Syllable(NULL,NULL,"e","n"));
416 confusion_sets.back().push_back(Syllable(NULL,NULL,"e","ng"));
418 confusion_sets.push_back(confusion_set());
419 confusion_sets.back().push_back(Syllable(NULL,NULL,"e","t"));
420 confusion_sets.back().push_back(Syllable(NULL,NULL,"e","c"));
422 confusion_sets.push_back(confusion_set());
423 confusion_sets.back().push_back(Syllable(NULL,NULL,"ê","n"));
424 confusion_sets.back().push_back(Syllable(NULL,NULL,"ê","nh"));
426 confusion_sets.push_back(confusion_set());
427 confusion_sets.back().push_back(Syllable(NULL,NULL,"ê","t"));
428 confusion_sets.back().push_back(Syllable(NULL,NULL,"ê","c"));
430 confusion_sets.push_back(confusion_set());
431 confusion_sets.back().push_back(Syllable(NULL,NULL,"i","n"));
432 confusion_sets.back().push_back(Syllable(NULL,NULL,"i","nh"));
434 confusion_sets.push_back(confusion_set());
435 confusion_sets.back().push_back(Syllable(NULL,NULL,"i","t"));
436 confusion_sets.back().push_back(Syllable(NULL,NULL,"i","ch"));
438 confusion_sets.push_back(confusion_set());
439 confusion_sets.back().push_back(Syllable(NULL,NULL,"iê","n"));
440 confusion_sets.back().push_back(Syllable(NULL,NULL,"iê","ng"));
442 confusion_sets.push_back(confusion_set());
443 confusion_sets.back().push_back(Syllable(NULL,NULL,"iê","t"));
444 confusion_sets.back().push_back(Syllable(NULL,NULL,"iê","c"));
446 confusion_sets.push_back(confusion_set());
447 confusion_sets.back().push_back(Syllable(NULL,NULL,"½","n"));
448 confusion_sets.back().push_back(Syllable(NULL,NULL,"½","ng"));
// NOTE(review): the vowel literal in the next set does not appear in the
// vowels table, so the constructor would leave the vowel unconstrained and
// this set would match any vowel followed by "n"/"ng" — confirm the intended
// vowel.
450 confusion_sets.push_back(confusion_set());
451 confusion_sets.back().push_back(Syllable(NULL,NULL,"¾","n"));
452 confusion_sets.back().push_back(Syllable(NULL,NULL,"¾","ng"));
454 confusion_sets.push_back(confusion_set());
455 confusion_sets.back().push_back(Syllable(NULL,NULL,"u","n"));
456 confusion_sets.back().push_back(Syllable(NULL,NULL,"u","ng"));
458 confusion_sets.push_back(confusion_set());
459 confusion_sets.back().push_back(Syllable(NULL,NULL,"u","t"));
460 confusion_sets.back().push_back(Syllable(NULL,NULL,"u","c"));
462 confusion_sets.push_back(confusion_set());
463 confusion_sets.back().push_back(Syllable(NULL,"u","ô","n"));
464 confusion_sets.back().push_back(Syllable(NULL,"u","ô","ng"));
466 confusion_sets.push_back(confusion_set());
467 confusion_sets.back().push_back(Syllable(NULL,"u","ô","t"));
468 confusion_sets.back().push_back(Syllable(NULL,"u","ô","c"));
470 confusion_sets.push_back(confusion_set());
471 confusion_sets.back().push_back(Syllable(NULL,NULL,"ß","n"));
472 confusion_sets.back().push_back(Syllable(NULL,NULL,"ß","ng"));
474 confusion_sets.push_back(confusion_set());
475 confusion_sets.back().push_back(Syllable(NULL,NULL,"ß","t"));
476 confusion_sets.back().push_back(Syllable(NULL,NULL,"ß","c"));
478 confusion_sets.push_back(confusion_set());
479 confusion_sets.back().push_back(Syllable(NULL,NULL,"߽","n"));
480 confusion_sets.back().push_back(Syllable(NULL,NULL,"߽","ng"));
482 confusion_sets.push_back(confusion_set());
483 confusion_sets.back().push_back(Syllable(NULL,NULL,"߽","t"));
484 confusion_sets.back().push_back(Syllable(NULL,NULL,"߽","c"));
// --- tone marks: hook vs. tilde (the dot variant is disabled) ---
486 confusion_sets.push_back(confusion_set());
487 confusion_sets.back().push_back(Syllable(NULL,NULL,NULL,NULL,Syllable::Hook));
488 confusion_sets.back().push_back(Syllable(NULL,NULL,NULL,NULL,Syllable::Tilde));
489 //confusion_sets.back().push_back(Syllable(NULL,NULL,NULL,NULL,Syllable::Dot));
490 return true;
493 Syllable::Syllable(const Syllable &sy)
495 for (int i = 0;i < 5;i ++) {
496 components[i] = sy.components[i];
497 scomponents[i] = sy.scomponents[i];
501 Syllable::Syllable(const char* _first_consonant,
502 const char* _padding_vowel,
503 const char* _vowel,
504 const char* _last_consonant,
505 int d)
507 int __first_consonant = -1;
508 int __padding_vowel = -1;
509 int __vowel = -1;
510 int __last_consonant = -1;
511 //int __diacritic = -1;
512 int i;
514 if (_first_consonant == syll_exist)
515 __first_consonant = -2;
516 else if (_first_consonant == syll_empty)
517 __first_consonant = -3;
518 else if (_first_consonant != NULL)
519 for (i = 0;first_consonants[i] != NULL;i ++)
520 if (!strcmp(first_consonants[i],_first_consonant)) {
521 __first_consonant = i;
522 break;
525 if (_padding_vowel == syll_exist)
526 __padding_vowel = -2;
527 else if (_padding_vowel == syll_empty)
528 __padding_vowel = -3;
529 else if (_padding_vowel != NULL)
530 for (i = 0;padding_vowels[i] != NULL;i ++) {
531 if (!strcmp(padding_vowels[i],_padding_vowel)) {
532 __padding_vowel = i;
533 break;
537 if (_vowel == syll_exist)
538 __vowel = -2;
539 else if (_vowel == syll_empty)
540 __vowel = -3;
541 else if (_vowel != NULL)
542 for (i = 0;vowels[i] != NULL;i ++)
543 if (!strcmp(vowels[i],_vowel)) {
544 __vowel = i;
545 break;
548 if (_last_consonant == syll_exist)
549 __last_consonant = -2;
550 else if (_last_consonant == syll_empty)
551 __last_consonant = -3;
552 else if (_last_consonant != NULL)
553 for (i = 0;last_consonants[i] != NULL;i ++)
554 if (!strcmp(last_consonants[i],_last_consonant)) {
555 __last_consonant = i;
556 break;
559 components[First_Consonant] = __first_consonant;
560 if (__first_consonant >= 0)
561 scomponents[First_Consonant] = first_consonants[__first_consonant];
562 components[Padding_Vowel] = __padding_vowel;
563 if (__padding_vowel >= 0)
564 scomponents[Padding_Vowel] = padding_vowels[__padding_vowel];
565 components[Vowel] = __vowel;
566 if (__vowel >= 0)
567 scomponents[Vowel] = vowels[__vowel];
568 components[Last_Consonant] = __last_consonant;
569 if (__last_consonant >= 0)
570 scomponents[Last_Consonant] = last_consonants[__last_consonant];
571 components[Diacritic] = d;
574 bool Syllable::match(const Syllable &sample)
576 for (int i = 0;i < 5;i ++) {
577 switch (components[i]) {
578 case -1:break; // it's alright
579 case -2: // exist
580 if (sample.components[i] == -1)
581 return false;
582 break;
583 case -3: // empty
584 if (sample.components[i] != -1)
585 return false;
586 default:
587 if (components[i] != sample.components[i])
588 return false;
591 return true;
// Uses this syllable as a pattern to rewrite `sample`, appending the resulting
// syllable(s) to `output`.
// Per component of the pattern:
//   -1     keep the sample's component
//   -2     "exist": enumerate the entries of that component's table
//   -3     "empty": clear the component in the result
//   other  overwrite the sample's component with the pattern's
594 void Syllable::apply(const Syllable &sample,vector<Syllable> &output)
596 int i,j;
// iter[] is the current position and limit[] the number of choices for each
// of the five components.
597 int iter[5];
598 int limit[5];
599 char **p;
600 Syllable s = sample;
601 bool run = false;
603 // init phase
604 for (i = 0;i < 5;i ++) {
605 limit[i] = 1;
606 iter[i] = 0;
607 switch (components[i]) {
608 case -1:break;
609 case -2:
// NOTE(review): there is no case for Diacritic here, so if the diacritic
// component is -2 `p` is read below without having been assigned — confirm
// that patterns never use "exist" for the diacritic.
610 switch (i) {
611 case First_Consonant: p = first_consonants; break;
612 case Last_Consonant: p = last_consonants; break;
613 case Padding_Vowel: p = padding_vowels; break;
614 case Vowel: p = vowels; break;
// Count the entries of the NULL-terminated table.
616 for (j = 0;p[j] != NULL;j ++);
617 if (j > 1) {
618 limit[i] = j;
619 run = true;
621 break;
622 case -3:
623 s.components[i] = -1;
624 s.scomponents[i] = "";
625 break;
626 default:
627 s.components[i] = components[i];
628 s.scomponents[i] = scomponents[i];
// Nothing to enumerate: the single rewritten syllable is the whole result.
632 if (!run) {
633 output.push_back(s);
634 return;
637 // generating phase
// Walks iter[] like an odometer: advance to the last position, emit, then
// carry back towards position 0 until every counter has reached its limit.
638 int k,n = 5;
639 i = 0;
640 while (run) {
641 if (i < n-1 && iter[i] < limit[i]) {
642 i ++;
643 } else {
644 if (i == n-1 && iter[i] < limit[i]) {
// NOTE(review): this pushes n syllables per combination and fills every
// "exist" component kk from iter[k] (the outer index) rather than iter[kk];
// it looks like a single push using iter[kk] was intended — confirm.
645 for (k = 0;k < n;k ++) {
646 output.push_back(s);
647 for (int kk = 0;kk < 5;kk ++)
648 if (components[kk] == -2) {
649 output.back().components[kk] = iter[k];
650 if (kk != Diacritic) {
651 switch (kk) {
652 case First_Consonant: p = first_consonants; break;
653 case Last_Consonant: p = last_consonants; break;
654 case Padding_Vowel: p = padding_vowels; break;
655 case Vowel: p = vowels; break;
657 output.back().scomponents[kk] = p[iter[k]];
661 iter[i] ++;
662 } else {
// Carry: step back past every exhausted counter; stop when none is left.
663 k = i;
664 while (i >= 0 && iter[i] == limit[i])
665 i --;
666 if (i < 0)
667 run = false;
668 else {
669 iter[i] ++;
670 iter[k] = 0;
// Splits `str` into first consonant, padding vowel, vowel, last consonant and
// diacritic, storing table indices in components[] and the original-case text
// in scomponents[]. Returns true when some layout consumes the whole string
// and passes the sanity rules below, false otherwise.
// NOTE(review): on failure components[]/scomponents[] may hold values left
// over from the last layout tried.
677 // we assume str is a valid syllable ;)
678 bool Syllable::parse(const char *str)
680 // Rule: there is always vowel. Others can be omitted.
681 // [first_consonant] [padding_vowel] vowel [last_consonant]
683 int i,j,k;
684 //char **pattern;
685 int len;
// `syllable` is the lowercased working copy used for table matching;
// `ssyllable` keeps the caller's letter case for scomponents[].
686 string syllable(viet_tolower(string(str)));
687 string ssyllable(str);
689 // first of all, extract the diacritic.
690 // because the syllable has been standardized, just extract the diacritic.
// Standardized form: a leading digit '0'..'5' is the diacritic code.
691 if (syllable[0] >= '0' && syllable[0] <= '5') {
692 components[Diacritic] = syllable[0] - '0';
693 syllable.erase(0,1);
694 ssyllable.erase(0,1);
695 } else {
// Otherwise look for toned letters: each one is replaced by its toneless
// form (case preserved in ssyllable) and its row number becomes the
// diacritic. If several letters carry a tone, the last one found wins.
696 components[Diacritic] = None;
697 len = syllable.size();
698 for (k = 0;k < len;k ++) {
699 // look up into diacritic_table
700 for (j = 1;j < 6;j ++) {
701 char *pos = strchr(diacritic_table[j],syllable[k]);
702 if (pos) {
703 int ipos = pos - diacritic_table[j];
704 syllable[k] = diacritic_table[0][ipos]; // remove diacritic
705 if (viet_toupper(ssyllable[k]) == ssyllable[k])
706 ssyllable[k] = viet_toupper(diacritic_table[0][ipos]); // remove diacritic
707 else
708 ssyllable[k] = diacritic_table[0][ipos]; // remove diacritic
709 components[Diacritic] = j;
710 break;
716 // there are 8 cases:
// Columns are first consonant, padding vowel, vowel, last consonant;
// 0 means the component is expected, -1 means it is absent in this layout.
// Layouts are tried in this order and the first one that succeeds is used.
717 int cases[8][4] = {
718 { 0,-1, 0, 0}, // F_VL
719 {-1,-1, 0, 0}, // __VL
720 { 0,-1, 0,-1}, // F_V_
721 {-1,-1, 0,-1}, // __V_
722 { 0, 0, 0, 0}, // FPVL
723 {-1, 0, 0, 0}, // _PVL
724 { 0, 0, 0,-1}, // FPV_
725 {-1, 0, 0,-1}, // _PV_
728 string saved_syllable = syllable;
729 string saved_ssyllable = ssyllable;
731 for (unsigned z = 0;z < 8;z ++) {
732 bool ok = true;
// Every layout starts again from the full toneless string.
733 syllable = saved_syllable;
734 ssyllable = saved_ssyllable;
736 // parse from the pattern cases[z]
737 for (unsigned zz = 0;ok && zz < 4;zz ++) { // component
738 components[zz] = -1;
739 scomponents[zz] = "";
740 if (ok && cases[z][zz] == 0) {
741 // consume this component from the front of the remaining text
742 ok = false;
743 len = syllable.size();
744 char **p;
745 switch (zz) {
746 case First_Consonant: p = first_consonants; break;
747 case Last_Consonant: p = last_consonants; break;
748 case Padding_Vowel: p = padding_vowels; break;
749 case Vowel: p = vowels; break;
// First table entry that prefixes the remaining text wins (tables are
// ordered longest first).
751 for (i = 0;p[i] != 0; i++) {
752 char *pattern = p[i];
753 int pattern_len = strlen(pattern);
755 if (len >= pattern_len && // equal is possible
756 syllable.substr(0,pattern_len) == pattern) {
757 //cerr << "Comp " << zz << " is <" << pattern << ">" << endl;
758 components[zz] = i;
759 scomponents[zz] = ssyllable.substr(0,pattern_len);
760 syllable.erase(0,pattern_len);
761 ssyllable.erase(0,pattern_len);
762 ok = true;
763 break;
769 // some rules to prevent errors
770 // the last consonant "i" is only followed by u+, u+o+, o+, a^, a, a(, u, uo^, o^, o
771 // the last consonant "u" is only followed by i, ie^, e^, e, u+, u+o+, o+, a^, a, a(
772 // padding vowels don't precede u, uo^, o^, o --> vowel is higher priority than padding vowel
773 // fix some errors
774 // The following cases exist: uô, ua, ui, uy, oi, qu.
775 // uô, ua, ui, oi are resolved by the order of cases.
776 if (ok && syllable.empty()) {
777 // "q" always precedes 'u' (padding vowel)
778 if (components[First_Consonant] != -1 &&
779 !strcmp(first_consonants[components[First_Consonant]],"q") && // first consonant is 'q'
780 (components[Padding_Vowel] == -1 ||
781 strcmp(padding_vowels[components[Padding_Vowel]],"u"))) // padding not exist or not 'u'
782 ok = false;
// Reject vowel "u" followed by ending "y", so that a later layout can read
// the "u" as a padding vowel instead.
783 else if (components[Vowel] != -1 &&
784 !strcmp(vowels[components[Vowel]],"u") &&
785 components[Last_Consonant] != -1 &&
786 !strcmp(last_consonants[components[Last_Consonant]],"y"))
787 ok = false;
// Success only when the layout matched and nothing is left over.
790 if (ok && syllable.empty())
791 return true;
792 //else
793 //cerr << "Case " << z << " failed" << endl;
796 return false;
799 std::ostream& operator << (std::ostream &os,const Syllable &sy)
801 char **p;
802 char *diacritics[] = {"_","'","`","?","~","."};
803 for (int i = 0;i < 5;i ++) {
804 if (sy.components[i] < 0)
805 os << "_";
806 else {
807 switch (i) {
808 case Syllable::First_Consonant: p = first_consonants; break;
809 case Syllable::Last_Consonant: p = last_consonants; break;
810 case Syllable::Padding_Vowel: p = padding_vowels; break;
811 case Syllable::Vowel: p = vowels; break;
812 case Syllable::Diacritic: p = diacritics; break;
814 if (i != Syllable::Diacritic)
815 os << sy.scomponents[i];
816 else
817 os << p[sy.components[i]];
819 os << " ";
821 return os;
824 string Syllable::to_str() const
826 string s;
827 char **p;
828 for (int i = 0;i < 4;i ++) {
829 if (components[i] >= 0) {
830 switch (i) {
831 case First_Consonant: p = first_consonants; break;
832 case Last_Consonant: p = last_consonants; break;
833 case Padding_Vowel: p = padding_vowels; break;
834 case Vowel: p = vowels; break;
836 s += scomponents[i]; // no diacritic because i=0..3
837 if (i == Vowel && components[Diacritic] != None) {
838 int last;
839 if (components[Last_Consonant] == -1)
840 last = s.size() - strlen(scomponents[Vowel].c_str());
841 else
842 last = s.size()-1;
843 int j = strchr(diacritic_table[0],viet_tolower(s[last])) - diacritic_table[0];
844 if (viet_toupper(s[last]) == s[last])
845 s[last] = viet_toupper(diacritic_table[components[Diacritic]][j]);
846 else
847 s[last] = diacritic_table[components[Diacritic]][j];
851 return s;
854 string Syllable::to_std_str() const
856 string s("0");
857 char **p;
859 s[0] = '0'+components[Diacritic];
861 for (int i = 0;i < 4;i ++) {
862 if (components[i] >= 0) {
863 switch (i) {
864 case First_Consonant: p = first_consonants; break;
865 case Last_Consonant: p = last_consonants; break;
866 case Padding_Vowel: p = padding_vowels; break;
867 case Vowel: p = vowels; break;
869 s += scomponents[i]; // no diacritic because i=0..3
872 return s;
876 strid Syllable::to_id() const
878 return get_sarch()[to_str()];
881 strid Syllable::to_std_id() const
883 return get_sarch()[to_std_str()];
886 std::string Syllable::get_component(int i)
888 switch (i) {
889 case First_Consonant: return first_consonants[i];
890 case Padding_Vowel: return padding_vowels[i];
891 case Vowel: return vowels[i];
892 case Last_Consonant: return last_consonants[i];
893 default: return "";
897 bool Syllable::set_component(int pos,const char *s)
899 char **p;
900 switch (pos) {
901 case First_Consonant: p = first_consonants;
902 case Padding_Vowel: p = padding_vowels;
903 case Vowel: p = vowels;
904 case Last_Consonant: p = last_consonants;
905 default: return false;
908 for (int i = 0;p[i];i ++)
909 if (get_lowercased_syllable(s) == p[i]) {
910 components[pos] = i;
911 scomponents[pos] = s;
912 return true;
915 return false;
919 void Syllable::standardize(std::string syllable)
921 // We just need to eliminate "òa", "òe", "ùa"
923 // first, cut the first consonant off
924 int start = 0;
925 int len = syllable.size();
926 for (i = 0;first_consonants[i] != 0; i++) {
927 char *pattern = *first_consonants[i];
928 int pattern_len = strlen(pattern);
930 if (len > pattern_len &&
931 syllable.compare(0,pattern_len,pattern) == 0) {
932 start = i;
933 break;
940 int viet_toupper(int ch) // must be sure ch is a character
942 return full_case_table[1][(unsigned char)(char)ch];
945 int viet_tolower(int ch) // must be sure ch is a character
947 return full_case_table[0][(unsigned char)(char)ch];
950 string viet_tolower(const string &str)
952 string s(str);
953 int n = str.size();
954 for (int i = 0;i < n;i ++)
955 s[i] = viet_tolower(s[i]);
956 return s;
959 bool viet_isupper(int ch)
961 return viet_isalpha(ch) && full_case_table[1][ch] == ch;
964 bool viet_islower(int ch)
966 return viet_isalpha(ch) && full_case_table[0][ch] == ch;
969 bool viet_isalpha(int ch)
971 return cat_table[ch] & CAT_ALPHA;
974 bool viet_isdigit(int ch)
976 return cat_table[ch] & CAT_DIGIT;
979 bool viet_isxdigit(int ch)
981 return cat_table[ch] & CAT_XDIGIT;
984 bool viet_isspace(int ch)
986 return cat_table[ch] & CAT_SPACE;
989 bool viet_ispunct(int ch)
991 return cat_table[ch] & CAT_PUNCT;
994 string get_std_syllable(const string &str)
996 uint i,n = str.size();
998 Syllable sy;
999 if (sy.parse(str.c_str())) {
1001 if (sy.parse(s.c_str()) && // 'y' canonicalization
1002 sy.components[Syllable::Padding_Vowel] == -1 &&
1003 sy.get_component(Syllable::Vowel) == "y")
1004 sy.set_component(Syllable::Vowel,s[s.size()-1] == 'Y' ? "I" : "i");
1006 return sy.to_std_str();
1007 } else
1008 return string("0")+str;
1011 string get_unstd_syllable(const string &str)
1013 uint i,n = str.size();
1015 Syllable sy;
1016 if (sy.parse(str.c_str()))
1017 return sy.to_str();
1018 else
1019 return (str[0] >= '0' && str[0] <= '5') ? str.substr(1) : str;
1022 string get_lowercased_syllable(const string &str)
1024 return viet_tolower(str);
1027 bool operator < (const Syllable &s1,const Syllable &s2)
1029 int i;
1030 for (i = 0;i < 4;i ++) {
1031 if (s1.scomponents[i] == s2.scomponents[i])
1032 continue;
1033 if (s1.scomponents[i] > s2.scomponents[i])
1034 return false;
1035 if (s1.scomponents[i] < s2.scomponents[i])
1036 return true;
1038 i = 4;
1039 if (s1.components[i] == s2.components[i])
1040 return false;
1041 if (s1.components[i] > s2.components[i])
1042 return false;
1043 if (s1.components[i] < s2.components[i])
1044 return true;
1047 bool operator == (const Syllable &s1,const Syllable &s2)
1049 for (int i = 0;i < 4;i ++) {
1050 if (s1.scomponents[i] == s2.scomponents[i])
1051 continue;
1052 return false;
1054 return s1.components[4] == s2.components[4];