last changes
[vspell.git] / libvspell / spell.cpp
blob28c68a80a030522c26f00cec35eb46bd6323301b
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include <iterator>
3 #include <algorithm>
4 #include <sstream>
5 #include <iostream>
6 #include <fstream>
7 #include "vspell.h"
8 #include "sentence.h"
9 #include "syllable.h"
10 #include "pfs.h"
11 #include <map>
12 #include "propername.h"
13 #include "keyboard.h"
14 #include "shuffle.h"
15 #include "penalty.h"
16 #include "bellman.h"
17 #include <math.h>
// Minimal stand-ins for the glib type names used by the UTF-8 helpers below,
// so that this file does not need the glib headers.
19 // stolen from glib
20 typedef unsigned int guint32;
21 typedef unsigned int guint;
22 typedef guint32 gunichar;
23 typedef char gchar;
24 typedef unsigned char guchar;
25 typedef long glong;
26 typedef signed int gssize;
// UTF8_COMPUTE(Char, Mask, Len): classify `Char`, the first byte of a UTF-8
// sequence. On success `Len` receives the sequence length in bytes (1..6) and
// `Mask` the bit mask that selects the payload bits of that first byte.
// When `Char` matches none of the lead-byte patterns, `Len` is set to -1 and
// `Mask` is left untouched.
27 #define UTF8_COMPUTE(Char, Mask, Len) \
28 if (Char < 128) \
29 { \
30 Len = 1; \
31 Mask = 0x7f; \
32 } \
33 else if ((Char & 0xe0) == 0xc0) \
34 { \
35 Len = 2; \
36 Mask = 0x1f; \
37 } \
38 else if ((Char & 0xf0) == 0xe0) \
39 { \
40 Len = 3; \
41 Mask = 0x0f; \
42 } \
43 else if ((Char & 0xf8) == 0xf0) \
44 { \
45 Len = 4; \
46 Mask = 0x07; \
47 } \
48 else if ((Char & 0xfc) == 0xf8) \
49 { \
50 Len = 5; \
51 Mask = 0x03; \
52 } \
53 else if ((Char & 0xfe) == 0xfc) \
54 { \
55 Len = 6; \
56 Mask = 0x01; \
57 } \
58 else \
59 Len = -1;
// UTF8_GET(Result, Chars, Count, Mask, Len): decode the `Len`-byte sequence at
// `Chars` into `Result`, using `Count` as the loop index. The first byte
// contributes its `Mask`ed bits; each following byte contributes its low six
// bits. If a following byte does not have the 10xxxxxx form, `Result` is set
// to (gunichar)-1 and decoding stops.
61 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
62 (Result) = (Chars)[0] & (Mask); \
63 for ((Count) = 1; (Count) < (Len); ++(Count)) \
64 { \
65 if (((Chars)[(Count)] & 0xc0) != 0x80) \
66 { \
67 (Result) = (gunichar)-1; \
68 break; \
69 } \
70 (Result) <<= 6; \
71 (Result) |= ((Chars)[(Count)] & 0x3f); \
// g_utf8_next_char(p): pointer to the byte that follows the UTF-8 sequence
// starting at `p`, as given by the skip table entry for the byte at `p`.
// The macro does not inspect the following bytes, so it can step over a NUL
// terminator when the sequence is truncated.
74 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
// Skip table indexed by byte value: 1 for 0x00-0xBF, 2 for 0xC0-0xDF,
// 3 for 0xE0-0xEF, 4 for 0xF0-0xF7, 5 for 0xF8-0xFB, 6 for 0xFC-0xFD,
// and 1 for 0xFE and 0xFF.
75 static const gchar utf8_skip_data[256] = {
76 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
77 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
78 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
79 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
80 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
81 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
82 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
83 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
// Pointer alias through which g_utf8_next_char reads the table.
86 static const gchar * const g_utf8_skip = utf8_skip_data;
87 static int
88 g_unichar_to_utf8 (gunichar c,
89 gchar *outbuf)
91 guint len = 0;
92 int first;
93 int i;
95 if (c < 0x80)
97 first = 0;
98 len = 1;
100 else if (c < 0x800)
102 first = 0xc0;
103 len = 2;
105 else if (c < 0x10000)
107 first = 0xe0;
108 len = 3;
110 else if (c < 0x200000)
112 first = 0xf0;
113 len = 4;
115 else if (c < 0x4000000)
117 first = 0xf8;
118 len = 5;
120 else
122 first = 0xfc;
123 len = 6;
126 if (outbuf)
128 for (i = len - 1; i > 0; --i)
130 outbuf[i] = (c & 0x3f) | 0x80;
131 c >>= 6;
133 outbuf[0] = c | first;
136 return len;
139 static gunichar
140 g_utf8_get_char (const gchar *p)
142 int i, mask = 0, len;
143 gunichar result;
144 unsigned char c = (unsigned char) *p;
146 UTF8_COMPUTE (c, mask, len);
147 if (len == -1)
148 return (gunichar)-1;
149 UTF8_GET (result, p, i, mask, len);
151 return result;
154 static glong
155 g_utf8_strlen (const gchar *p,
156 gssize max)
158 glong len = 0;
159 const gchar *start = p;
160 //g_return_val_if_fail (p != NULL || max == 0, 0);
161 if (!(p != NULL || max == 0))
162 return 0;
164 if (max < 0)
166 while (*p)
168 p = g_utf8_next_char (p);
169 ++len;
172 else
174 if (max == 0 || !*p)
175 return 0;
177 p = g_utf8_next_char (p);
179 while (p - start < max && *p)
181 ++len;
182 p = g_utf8_next_char (p);
185 /* only do the last len increment if we got a complete
186 * char (don't count partial chars)
188 if (p - start == max)
189 ++len;
192 return len;
195 using namespace std;
/**
	 The process:
	 1. Sentence segmentation. (sentences_split)
	 2. Separate "words" by spaces. (tokenize)
	 3. Punctuation separation. (tokenize/tokenize_punctuation)
	 4. Foreign/Abbreviation detection.
	 5. Proper name detection.
	 6. Generalization (into classes, e.g. number_class, foreign_class ...).
	    Try to generalize all capitalized words.
	 6* Syllable checking. (check1)
	 7. Find all possible (misspelled) words. (**) (get_all_words)
	 8. "Pre-separate" the sentence into phrases.
	 9. Word segmentation. (**)
	 10. Find the best segmentation. (segment_best)
	 10* Word checking. (check2)
*/
216 namespace Spell {
218 bool VSpell::init()
220 dic_init();
222 cerr << "Loading dictionary... ";
223 warch.load("wordlist");
224 cerr << "done" << endl;
225 cerr << "Loading ngram... ";
226 File f("ngram","rt");
227 if (!f.error()) {
228 get_ngram().read(f);
229 cerr << "done" << endl;
230 } else
231 cerr << "Ngram loading error. The result may be incorrect" << endl;
232 cerr << "Loading syngram... ";
233 File ff("syngram","rt");
234 if (!ff.error()) {
235 get_syngram().read(ff);
236 cerr << "done" << endl;
237 } else
238 cerr << "Syllable Ngram loading error. The result may be incorrect" << endl;
239 sarch.set_blocked(true);
240 return true;
243 bool VSpell::check(const char *utf8_pp)
245 utf8_text = utf8_pp;
247 syllables.clear();
249 bool run = false;
251 do {
252 string pp = viet_to_viscii_force(utf8_text.c_str());
253 vector<string> pps;
254 sentences_split(pp,pps);
255 unsigned pps_i,pps_len = pps.size();
256 unsigned offset = 0;
257 text = pp;
259 for (pps_i = 0;pps_i < pps_len;pps_i ++) {
260 Text *t = text_factory.create(this);
261 t->offset = offset;
262 t->length = pps[pps_i].size();
263 run = !t->sentence_check(pps[pps_i].c_str());
264 delete t;
265 if (run)
266 break;
267 offset += pps[pps_i].size();
269 } while (run);
270 return true;
273 void VSpell::replace(unsigned from,unsigned size,const char *s)
275 const char *p = utf8_text.c_str();
276 unsigned i;
277 for (i = 0;i < from && *p;i ++)
278 p = g_utf8_next_char(p);
279 unsigned from1 = p - utf8_text.c_str();
280 for (i = 0;i < size && *p;i ++)
281 p = g_utf8_next_char(p);
282 unsigned to1 = p - utf8_text.c_str();
283 utf8_text.replace(from1,to1-from1,s);
285 // remove separators in the range, adjust other seps
286 int n = separators.size();
287 if (n) {
288 int newsize = g_utf8_strlen(s,-1);
289 i = n;
290 do {
291 i --;
292 if (separators[i] >= from) {
293 if (separators[i] < from+size)
294 separators.erase(separators.begin()+i);
295 else
296 separators[i] += newsize-size;
298 } while (i > 0);
302 void VSpell::add_separators(const std::vector<unsigned> &seps)
304 copy(seps.begin(),seps.end(),back_inserter(separators));
307 void VSpell::add_word(const char *s)
309 istringstream is(s);
310 strid_string toks;
311 string ss;
312 while (is >> ss) {
313 toks += sarch[ss];
315 words.insert(toks);
318 bool Text::sentence_check(const char *pp)
320 // preprocess
321 st.set(pp);
322 st.standardize();
323 st.tokenize();
325 if (!st.get_syllable_count()) // nothing to do but crash ;)
326 return true;
328 // syllable checking
329 if (!syllable_check() && !ui_syllable_check())
330 return false;
332 //w.construct(st);
333 set<WordEntry> wes;
334 WordStateFactories factories;
335 ExactWordStateFactory exact;
336 LowerWordStateFactory lower;
337 UpperWordStateFactory upper;
338 FuzzyWordStateFactory fuzzy;
339 factories.push_back(&exact);
340 factories.push_back(&lower);
341 factories.push_back(&upper);
342 factories.push_back(&fuzzy);
343 w.pre_construct(st,wes,factories);
344 mark_proper_name(st,wes);
345 apply_separators(wes);
346 w.post_construct(wes);
347 //cerr << w << endl;
349 WordDAG dagw(&w);
350 DAG *dag = &dagw;
352 WordDAG2 *dagw2;
353 if (vspell->get_trigram()) {
354 dagw2 = new WordDAG2(&dagw);
355 dag = dagw2;
358 Penalty2DAG p2dag(dag,vspell->get_penalty2());
359 if (vspell->get_penalty2()) {
360 // with penalty2dag, we have to do non-fuzzy segmentation first
361 // to feed Penalty2DAG::syllable_weights
362 Segmentation p2seg;
363 penalty2_construct(p2seg);
364 p2dag.set_syllable_weights(p2seg);
365 dag = &p2dag;
368 PenaltyDAG pdag(dag,vspell->get_penalty());
369 if (vspell->get_penalty()) {
370 dag = &pdag;
373 Path path;
374 if (vspell->get_normalization()) {
375 Bellman pfs;
376 pfs.search(*dag,path);
377 } else {
378 PFS pfs;
379 pfs.search(*dag,path);
382 if (vspell->get_trigram()) {
383 dagw2->demangle(path);
384 delete dagw2;
387 seg.resize(path.size()-2);
388 // don't copy head/tail
389 copy(path.begin()+1,path.end()-1,seg.begin());
390 seg.we = w.we;
391 //cerr << seg << endl;
393 // word checking
394 if (!word_check() && !ui_word_check())
395 return false;
397 return true; // done
400 void Text::penalty2_construct(Segmentation &seg)
402 WordStateFactories factories;
403 ExactWordStateFactory exact;
404 LowerWordStateFactory lower;
405 UpperWordStateFactory upper;
406 //FuzzyWordStateFactory fuzzy;
408 factories.push_back(&exact);
409 factories.push_back(&lower);
410 factories.push_back(&upper);
411 //factories.push_back(&fuzzy);
412 Lattice lattice;
413 set<WordEntry> wes;
414 lattice.pre_construct(st,wes,factories);
415 mark_proper_name(st,wes);
416 //apply_separators(wes);
417 lattice.post_construct(wes);
419 WordDAG dagw(&lattice);
420 DAG *dag = &dagw;
422 WordDAG2 *dagw2;
423 if (vspell->get_trigram()) {
424 dagw2 = new WordDAG2(&dagw);
425 dag = dagw2;
428 Path path;
429 if (vspell->get_normalization()) {
430 Bellman pfs;
431 pfs.search(*dag,path);
432 } else {
433 PFS pfs;
434 pfs.search(*dag,path);
437 if (vspell->get_trigram()) {
438 dagw2->demangle(path);
439 delete dagw2;
442 seg.resize(path.size()-2);
443 // don't copy head/tail
444 copy(path.begin()+1,path.end()-1,seg.begin());
445 seg.we = lattice.we;
449 bool Text::syllable_check(int i)
451 if (vspell->in_dict(st[i].get_id()))
452 return true;
454 if (sarch.in_dict(st[i].get_cid())) {
455 Syllable syl; // diacritic check
456 if (syl.parse(sarch[st[i].get_cid()])) {
457 string s = get_lowercased_syllable(syl.to_str());
458 if (get_lowercased_syllable(sarch[st[i].get_id()]) == s)
459 return true;
462 return false;
465 bool Text::syllable_check()
467 int i,n = st.get_syllable_count();
469 suggestions.clear();
471 for (i = 0;i < n;i ++) {
472 if (syllable_check(i))
473 continue;
475 Suggestion _s;
476 _s.id = i;
477 suggestions.push_back(_s);
479 return suggestions.empty();
482 bool Text::word_check()
484 int i,n = seg.size();
486 suggestions.clear();
488 for (i = 0;i < n;i ++) {
489 vector<strid> sylls;
490 strid_string sylls2;
491 int ii,len = seg[i].node->get_syllable_count();
492 seg[i].node->get_syllables(sylls);
494 // case-sensitive comparation.
495 sylls2.resize(len);
496 bool subok = true;
497 for (ii = 0;ii < len;ii ++) {
498 sylls2[ii] = st[seg[i].pos+ii].get_id();
499 if (subok && st[seg[i].pos+ii].get_cid() != sylls[ii])
500 subok = false;
503 // in user dict
504 bool ok = vspell->in_dict(sylls2);
506 if (!ok) {
507 strid_string sylls3;
508 sylls3.resize(sylls.size());
509 for (ii = 0;ii < sylls3.size();ii ++)
510 sylls3[ii] = sarch[get_unstd_syllable(sarch[sylls[ii]])];
512 if (vspell->get_strict_word_checking()) {
513 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
514 if (!subok &&
515 (is_all_capitalized_word(sylls2) ||
516 (is_first_capitalized_word(sylls2) && is_lower_cased_word(sylls3))))
517 subok = true;
519 ok = subok;
520 } else {
521 string s;
522 BranchNode *branch = warch.get_root();
523 LeafNode* leaf;
524 for (ii = 0;ii < len && branch;ii ++)
525 branch = branch->get_branch(st[seg[i].pos+ii].get_cid());
527 if (branch && (leaf = branch->get_leaf(sarch["<mainleaf>"])) != NULL) {
528 sylls.clear();
529 leaf->get_syllables(sylls);
530 strid_string sylls3;
531 sylls3.resize(sylls.size());
532 for (ii = 0;ii < sylls3.size();ii ++)
533 sylls3[ii] = sarch[get_unstd_syllable(sarch[sylls[ii]])];
535 ok = sylls2 == sylls3;
537 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
538 if (!ok &&
539 (is_all_capitalized_word(sylls2) ||
540 (is_first_capitalized_word(sylls2) && is_lower_cased_word(sylls3))))
541 ok = true;
546 if (!ok) {
547 Suggestion _s;
548 _s.id = i;
549 suggestions.push_back(_s);
550 continue;
553 // all syllable are syntatically valid
555 return suggestions.empty();
558 unsigned Text::pos_from_syllable(const Suggestion &s)
560 return offset+st[s.id].start;
563 unsigned Text::pos_from_word(const Suggestion &s)
565 //return offset+st[seg[s.id]].start;
566 return 0;
569 void Text::replace(unsigned from,unsigned size,const char *s)
571 vspell->replace(from+offset,size,s);
574 int Text::utf8_pos(unsigned from)
576 const string &utf8_text = vspell->get_utf8_text();
577 const char *p = utf8_text.c_str();
578 const char *op = p;
579 from += offset;
580 for (int i = 0;i < from && *p;i ++)
581 p = g_utf8_next_char(p);
582 return (int)(p - op);
585 string Text::substr(unsigned from,unsigned size)
587 const string &utf8_text = vspell->get_utf8_text();
588 const char *p = utf8_text.c_str();
589 unsigned i;
590 from += offset;
591 for (i = 0;i < from && *p;i ++)
592 p = g_utf8_next_char(p);
593 unsigned from1 = p - utf8_text.c_str();
594 for (i = 0;i < size && *p;i ++)
595 p = g_utf8_next_char(p);
596 unsigned to1 = p - utf8_text.c_str();
597 return utf8_text.substr(from1,to1-from1);
600 void Text::apply_separators(set<WordEntry> &wes)
602 vector<unsigned> seps;
603 //set<unsigned> seps;
605 get_separators(seps);
606 sort(seps.begin(),seps.end());
607 //copy(seps1.begin(),seps1.end(),inserter(seps,seps.begin()));
608 int sep = 0;
609 int i,n = st.get_syllable_count();
611 for (i = 0;i < n-1 && sep < seps.size();i ++) {
612 int p = offset+st[i].start+strlen(sarch[st[i].get_id()]);
613 if (p <= seps[sep] && seps[sep] <= offset+st[i+1].start) {
614 apply_separator(wes,i);
615 sep ++;
620 void Text::get_separators(vector<unsigned> &v)
622 const vector<unsigned> &vv = vspell->get_separators();
624 int i,n = vv.size();
625 for (i = 0;i < n;i ++)
626 if (vv[i] >= offset && vv[i] < offset+length)
627 v.push_back(vv[i]);
// VISCII byte values of the accented Vietnamese letters. viet_init() pairs
// entry i of this table with entry i of unicode_str, and walks the table until
// it meets a zero entry.
// NOTE(review): the terminating entry and closing brace are not visible in
// this view of the file - confirm the table ends with 0.
633 static unsigned char viscii_str[] = {
634 0xe1,0xe0,0xe4,0xe3,0xd5,
635 0xe2,0xa4,0xa5,0xa6,0xe7,0xa7,
636 0xe5,0xa1,0xa2,0xc6,0xc7,0xa3,
637 0xe9,0xe8,0xeb,0xa8,0xa9,
638 0xea,0xaa,0xab,0xac,0xad,
639 0xae,0xed,0xec,0xef,0xee,0xb8,
640 0xf3,0xf2,0xf6,0xf5,0xf7,
641 0xf4,0xaf,0xb0,0xb1,0xb2,
642 0xb5,0xbd,0xbe,0xb6,0xb7,0xde,
643 0xfe,0xfa,0xf9,0xfc,0xfb,0xf8,
644 0xdf,0xd1,0xd7,0xd8,0xe6,
645 0xf1,0xfd,0xcf,0xd6,0xdb,0xdc,
646 0xf0,
647 0xc1,0xc0,0xc4,0xc3,0x80,
648 0xc2,0x84,0x85,0x86,0x06,0x87,
649 0xc5,0x81,0x82,0x02,0x05,0x83,
650 0xc9,0xc8,0xcb,0x88,0x89,
651 0xca,0x8a,0x8b,0x8c,0x8d,0x8e,
652 0xcd,0xcc,0x9b,0xce,0x98,
653 0xd3,0xd2,0x99,0xa0,0x9a,
654 0xd4,0x8f,0x90,0x91,0x92,
655 0x93,0xb4,0x95,0x96,0x97,0xb3,
656 0x94,0xda,0xd9,0x9c,0x9d,0x9e,
657 0xbf,0xba,0xbb,0xbc,0xff,0xb9,
658 0xdd,0x9f,0x14,0x19,0x1e,
659 0xd0,
// The line below is an ASCII transliteration of the letters; presumably it is
// in the same order as the table - verify before relying on it.
662 //"a'a`a?a~a.a^a^'a^`a^?a^~a^.a(a('a(`a(?a(~a(.e'e`e?e~e.e^e^'e^`e^?e^~e^.i'i`i?i~i.o'o`o?o~o.o^o^'o^`o^?o^~o^.o+o+'o+`o+?o+~o+.u'u`u?u~u.u+u+'u+`u+?u+~u+.y'y`y?y~y.ddA'A`A?A~A.A^A^'A^`A^?A^~A^.A(A('A(`A(?A(~A(.E'E`E?E~E.E^E^'E^`E^?E^~E^.I'I`I?I~I.O'O`O?O~O.O^O^'O^`O^?O^~O^.O+O+'O+`O+?O+~O+.U'U`U?U~U.U+U+'U+`U+?U+~U+.Y'Y`Y?Y~Y.DD";
// Unicode code points (decimal) of the accented Vietnamese letters.
// viet_init() pairs entry i of this table with entry i of viscii_str, so the
// two tables must list the letters in the same order.
// NOTE(review): the end of the initializer is not visible in this view of the
// file - confirm it is at least as long as viscii_str.
664 static gunichar unicode_str[] = {
665 225, 224,7843, 227,7841,
666 226,7845,7847,7849,7851,7853,
667 259,7855,7857,7859,7861,7863,
668 233, 232,7867,7869,7865,
669 234,7871,7873,7875,7877,
670 7879, 237, 236,7881, 297,7883,
671 243, 242,7887, 245,7885,
672 244,7889,7891,7893,7895,
673 7897, 417,7899,7901,7903,7905,
674 7907, 250, 249,7911, 361,7909,
675 432,7913,7915,7917,7919,
676 7921, 253,7923,7927,7929,7925,
677 273,
678 193, 192,7842, 195,7840,
679 194,7844,7846,7848,7850,7852,
680 258,7854,7856,7858,7860,7862,
681 201, 200,7866,7868,7864,
682 202,7870,7872,7874,7876,7878,
683 205, 204,7880, 296,7882,
684 211, 210,7886, 213,7884,
685 212,7888,7890,7892,7894,
686 7896, 416,7898,7900,7902,7904,
687 7906, 218, 217,7910, 360,7908,
688 431,7912,7914,7916,7918,7920,
689 221,7922,7926,7928,7924,
690 272,
// Lookup tables in both directions; filled by viet_init() and read by the
// viet_* conversion functions.
694 static map<unsigned char,gunichar> viscii_utf8_map;
695 static map<gunichar,unsigned char> utf8_viscii_map;
697 void viet_init()
699 for (unsigned i = 0;viscii_str[i];i ++) {
700 viscii_utf8_map[viscii_str[i]] = unicode_str[i];
701 utf8_viscii_map[unicode_str[i]] = viscii_str[i];
705 bool viet_utf8_to_viscii(const char *in,char *out) // pre-allocated
707 const gchar *p = in;
708 gunichar ch;
709 while ((ch = g_utf8_get_char(p)) != 0) {
710 p = g_utf8_next_char(p);
711 if (ch < 128) {
712 *out++ = ch;
713 continue;
716 map<gunichar,unsigned char>::iterator iter;
717 iter = utf8_viscii_map.find(ch);
718 if (iter != utf8_viscii_map.end())
719 *out++ = (char)iter->second;
720 else {
721 fprintf(stderr,"Warning: unexpected unicode character %d",ch);
722 return false;
725 *out = 0;
726 return true;
729 bool viet_utf8_to_viscii_force(const char *in,char *out)
731 const gchar *p = in;
732 gunichar ch;
733 while ((ch = g_utf8_get_char(p)) != 0) {
734 p = g_utf8_next_char(p);
735 if (ch < 128) {
736 *out++ = ch;
737 continue;
740 map<gunichar,unsigned char>::iterator iter;
741 iter = utf8_viscii_map.find(ch);
742 if (iter != utf8_viscii_map.end())
743 *out++ = (char)iter->second;
744 else {
745 fprintf(stderr,"Warning: unexpected unicode character %d",ch);
746 *out += 'z';
749 *out = 0;
750 return true;
753 void viet_viscii_to_utf8(const char *in,char *out) // pre-allocated
755 unsigned char *p = (unsigned char*)in;
756 unsigned char ch;
757 while ((ch = *p) != 0) {
758 p++;
759 if (ch < 128 && ch >= 32) {
760 *out++ = ch;
761 continue;
764 map<unsigned char,gunichar>::iterator iter;
765 iter = viscii_utf8_map.find(ch);
766 if (iter != viscii_utf8_map.end()) {
767 g_unichar_to_utf8(iter->second,out);
768 out = g_utf8_next_char(out);
769 } else {
770 *out++ = ch; // fall-back case
774 *out = 0;
777 static char buffer[6000];
778 char* viet_to_viscii(const char *in)
780 if (g_utf8_strlen(in,-1) >= 1000)
781 return "";
782 if (viet_utf8_to_viscii(in,buffer))
783 return buffer;
784 else
785 return NULL;
788 char* viet_to_viscii_force(const char *in)
790 if (g_utf8_strlen(in,-1) >= 1000)
791 return "";
792 viet_utf8_to_viscii_force(in,buffer);
793 return buffer;
796 char* viet_to_utf8(const char *in)
798 if (strlen(in) >= 1000)
799 return "";
800 viet_viscii_to_utf8(in,buffer);
801 return buffer;
// Copies __n elements of char_type from __s2 to __s1 with memcpy (so the
// ranges must not overlap) and returns the destination pointer.
// NOTE(review): `static` is not allowed on an out-of-class member function
// definition, and the lines directly before and after this definition are
// missing from this view - confirm whether this code is actually compiled or
// sits inside a disabled region.
805 static char_traits_strid::char_type*
806 char_traits_strid::copy(char_traits_strid::char_type* __s1,
807 const char_traits_strid::char_type* __s2,
808 size_t __n)
810 return static_cast<char_type*>(memcpy(__s1, __s2, __n*sizeof(char_type)));
815 bool get_case_syllable_candidates(const char *input,Candidates &output, float v)
817 // There are only two acceptable cases:
818 // 1. The first character is upper case, the rest is lower
819 // 2. All are either lower or upper
820 // if there is some upper case character without following one of these cases, then it's fault.
821 // also, if there is a uppercase word in dictionary, then add it.
823 uint i,n = strlen(input);
824 // check for upper-case chars
825 for (i = n-1;i >= 0;i --)
826 if (viet_toupper(input[i]) == input[i])
827 break;
829 if (i <= 0 || n < 2) // ignore if the only upper char is the first one.
830 return false;
832 string s;
834 s = input;
835 s[0] = viet_toupper(s[0]);
836 for (i = 1;i < n;i ++)
837 s[i] = viet_tolower(s[i]);
838 if (s != string(input))
839 output.insert(s,v);
841 s = input;
842 for (i = 0;i < n;i ++)
843 s[i] = viet_tolower(s[i]);
844 if (s != string(input))
845 output.insert(s,v);
847 s = input;
848 for (i = 0;i < n;i ++)
849 s[i] = viet_toupper(s[i]);
850 if (s != string(input))
851 output.insert(s,v);
852 return true;
855 void get_phonetic_syllable_candidates(const char *input,Candidates &output,float v)
857 vector<confusion_set>& confusion_sets = get_confusion_sets();
858 int i,j,m,n = confusion_sets.size();
859 bool ret = false;
860 set<Syllable> syllset,syllset2;
861 Syllable _syll;
863 _syll.parse(input);
864 syllset2.insert(_syll);
865 while (!syllset2.empty()) {
866 const Syllable sy = *syllset2.begin();
867 syllset2.erase(syllset2.begin());
869 if (syllset.find(sy) != syllset.end())
870 continue; // we already matched&applied this syllable
872 //cerr << sy << endl;
873 syllset.insert(sy);
875 vector<Syllable> sylls;
876 // match & apply
877 for (i = 0;i < n;i ++) {
878 m = confusion_sets[i].size();
879 for (j = 0;j < m;j ++)
880 if (confusion_sets[i][j].match(sy))
881 break;
882 if (j < m) {
883 for (j = 0;j < m;j ++)
884 confusion_sets[i][j].apply(sy,sylls);
887 copy(sylls.begin(),sylls.end(), inserter(syllset2,syllset2.begin()));
890 // move to _nodes
891 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
892 set<Syllable>::iterator iter;
893 for (iter = syllset.begin();iter != syllset.end(); ++ iter) {
894 string s = iter->to_std_str();
895 string ss = get_lowercased_syllable(s);
896 //cerr << s << endl;
897 if (sarch.in_dict(sarch[s]) ||
898 sarch.in_dict(sarch[ss]))
899 output.insert(iter->to_str(),v+1);
903 void get_syllable_candidates(const char *input,Candidates &output,float v)
905 Syllable syll;
906 string s,s2;
907 set<string> s3;
908 set<string>::iterator s3i;
911 // bo dau sai vi tri
912 if (syll.parse(input) &&
913 syll.to_str() != string(input))
914 output.insert(syll.to_str(),v+10);
916 get_phonetic_syllable_candidates(input,output,v);
918 KeyRecover keyr;
919 keyr.init(input);
920 while (keyr.step(s)) {
921 s2 = get_std_syllable(s);
922 if (s2 != s && syll.parse(s2.c_str()))
923 output.insert(syll.to_str(),v);
924 s3.clear();
925 im_recover(s.c_str(),s3);
926 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
927 s2 = get_std_syllable(*s3i);
928 if (s2 != *s3i && syll.parse(s2.c_str()))
929 output.insert(syll.to_str(),v);
932 keyr.done();
934 // SpaceInserter
935 uint i,n = strlen(input);
936 for (i = 1;i < n;i ++) {
937 s = string(input).substr(0,i);
938 s2 = string(input).substr(i);
939 if (syll.parse(s.c_str())) {
940 s = syll.to_str();
941 if (syll.parse(s2.c_str())) {
942 output.insert(s + string(" ") + syll.to_str(),v);
947 CharInserter inserter;
948 inserter.init(input);
949 while (inserter.step(s)) {
950 s2 = get_std_syllable(s);
951 if (s2 != s && syll.parse(s2.c_str()))
952 output.insert(syll.to_str(),v);
953 s3.clear();
954 im_recover(s.c_str(),s3);
955 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
956 s2 = get_std_syllable(*s3i);
957 if (s2 != *s3i && syll.parse(s2.c_str()))
958 output.insert(syll.to_str(),v);
961 inserter.done();
963 CharEraser eraser;
964 eraser.init(input);
965 while (eraser.step(s)) {
966 s2 = get_std_syllable(s);
967 if (s2 != s && syll.parse(s2.c_str()))
968 output.insert(syll.to_str(),v);
969 s3.clear();
970 im_recover(s.c_str(),s3);
971 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
972 s2 = get_std_syllable(*s3i);
973 if (s2 != *s3i && syll.parse(s2.c_str()))
974 output.insert(syll.to_str(),v);
977 eraser.done();
979 CharTransposer transposer;
980 transposer.init(input);
981 while (transposer.step(s)) {
982 s2 = get_std_syllable(s);
983 if (s2 != s && syll.parse(s2.c_str()))
984 output.insert(syll.to_str(),v);
985 s3.clear();
986 im_recover(s.c_str(),s3);
987 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
988 s2 = get_std_syllable(*s3i);
989 if (s2 != *s3i && syll.parse(s2.c_str()))
990 output.insert(syll.to_str(),v);
993 transposer.done();
996 void get_left_syllable_candidates(const char *input,const char *left,Candidates &output)
998 // merge
999 string s;
1000 s = string(left)+string(input);
1001 get_syllable_candidates(s.c_str(),output,10);
1003 // cut one char from input
1004 if (strlen(input) > 1) {
1005 s = string(input+1);
1006 get_syllable_candidates(s.c_str(),output,10);
1009 // insert one char from left to input
1010 if (strlen(left)) {
1011 s = string(" ") + string(input);
1012 s[0] = left[strlen(left)-1];
1013 get_syllable_candidates(s.c_str(),output,10);
1017 void get_right_syllable_candidates(const char *input,const char *right,Candidates &output)
1019 // merge
1020 string s;
1021 s = string(input)+string(right);
1022 get_syllable_candidates(s.c_str(),output,10);
1024 // cut one char from input
1025 if (strlen(input) > 1) {
1026 s = string(input);
1027 s.resize(strlen(input)-1);
1028 get_syllable_candidates(s.c_str(),output,10);
1031 // insert one char from right to input
1032 if (strlen(right)) {
1033 s = string(input)+string(" ");
1034 s[s.size()-1] = right[0];
1035 get_syllable_candidates(s.c_str(),output,10);
1041 void Candidates::insert(const std::string &s,float f)
1043 Candidate c;
1044 c.candidate = s;
1045 c.priority = f;
1046 set<Candidate>::iterator iter = candidates.find(c);
1047 if (iter != candidates.end()) {
1048 if (iter->priority < c.priority)
1049 candidates.erase(iter);
1050 else
1051 return;
1053 candidates.insert(c);
1056 bool Candidates::CandidateComparator::operator()(const std::string &s1,const std::string &s2)
1058 set<Candidate>::iterator i1,i2;
1059 Candidate c1,c2;
1060 c1.candidate = s1;
1061 c2.candidate = s2;
1062 i1 = c.candidates.find(c1);
1063 i2 = c.candidates.find(c2);
1064 if (i1->priority != i2->priority)
1065 return i1->priority > i2->priority;
1066 float f1,f2;
1067 VocabIndex v;
1068 v = Vocab_None;
1069 f1 = -get_syngram().wordProb(sarch[get_std_syllable(get_lowercased_syllable(s1))],&v);
1070 f2 = -get_syngram().wordProb(sarch[get_std_syllable(get_lowercased_syllable(s2))],&v);
1071 //cerr << f1 << "<>" << f2 << endl;
1072 return f1 > f2; // we want reverse order
1075 void Candidates::get_list(std::vector<std::string> &v)
1077 set<Candidate>::iterator iter;
1079 iter = candidates.begin();
1080 while (iter != candidates.end()) {
1081 if (!is_valid_word_form(iter->candidate.c_str())) {
1082 get_case_syllable_candidates(iter->candidate.c_str(),*this,iter->priority);
1083 candidates.erase(iter++);
1084 } else
1085 ++iter;
1088 v.resize(candidates.size());
1089 uint n = 0;
1090 for (iter = candidates.begin();iter != candidates.end(); ++iter)
1091 if (sarch.in_dict(get_std_syllable(get_lowercased_syllable(iter->candidate))))
1092 v[n++] = iter->candidate;
1093 v.resize(n);
1094 sort(v.begin(),v.end(),CandidateComparator(*this));