BranchNode uses map instead of map. different leaves have different leaf names (curre...
[vspell.git] / libvspell / spell.cpp
blob6f4ace29765ecfe80b3232c4d67068c4b0c97c80
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include <iterator>
3 #include <algorithm>
4 #include <sstream>
5 #include <iostream>
6 #include <fstream>
7 #include "vspell.h"
8 #include "sentence.h"
9 #include "syllable.h"
10 #include "pfs.h"
11 #include <map>
12 #include "propername.h"
13 #include "keyboard.h"
14 #include "shuffle.h"
15 #include "penalty.h"
16 #include "bellman.h"
17 #include <math.h>
19 // stolen from glib
20 typedef unsigned int guint32;
21 typedef unsigned int guint;
22 typedef guint32 gunichar;
23 typedef char gchar;
24 typedef unsigned char guchar;
25 typedef long glong;
26 typedef signed int gssize;
27 #define UTF8_COMPUTE(Char, Mask, Len) \
28 if (Char < 128) \
29 { \
30 Len = 1; \
31 Mask = 0x7f; \
32 } \
33 else if ((Char & 0xe0) == 0xc0) \
34 { \
35 Len = 2; \
36 Mask = 0x1f; \
37 } \
38 else if ((Char & 0xf0) == 0xe0) \
39 { \
40 Len = 3; \
41 Mask = 0x0f; \
42 } \
43 else if ((Char & 0xf8) == 0xf0) \
44 { \
45 Len = 4; \
46 Mask = 0x07; \
47 } \
48 else if ((Char & 0xfc) == 0xf8) \
49 { \
50 Len = 5; \
51 Mask = 0x03; \
52 } \
53 else if ((Char & 0xfe) == 0xfc) \
54 { \
55 Len = 6; \
56 Mask = 0x01; \
57 } \
58 else \
59 Len = -1;
61 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
62 (Result) = (Chars)[0] & (Mask); \
63 for ((Count) = 1; (Count) < (Len); ++(Count)) \
64 { \
65 if (((Chars)[(Count)] & 0xc0) != 0x80) \
66 { \
67 (Result) = (gunichar)-1; \
68 break; \
69 } \
70 (Result) <<= 6; \
71 (Result) |= ((Chars)[(Count)] & 0x3f); \
74 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
75 static const gchar utf8_skip_data[256] = {
76 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
77 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
78 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
79 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
80 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
81 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
82 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
83 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
86 static const gchar * const g_utf8_skip = utf8_skip_data;
87 static int
88 g_unichar_to_utf8 (gunichar c,
89 gchar *outbuf)
91 guint len = 0;
92 int first;
93 int i;
95 if (c < 0x80)
97 first = 0;
98 len = 1;
100 else if (c < 0x800)
102 first = 0xc0;
103 len = 2;
105 else if (c < 0x10000)
107 first = 0xe0;
108 len = 3;
110 else if (c < 0x200000)
112 first = 0xf0;
113 len = 4;
115 else if (c < 0x4000000)
117 first = 0xf8;
118 len = 5;
120 else
122 first = 0xfc;
123 len = 6;
126 if (outbuf)
128 for (i = len - 1; i > 0; --i)
130 outbuf[i] = (c & 0x3f) | 0x80;
131 c >>= 6;
133 outbuf[0] = c | first;
136 return len;
139 static gunichar
140 g_utf8_get_char (const gchar *p)
142 int i, mask = 0, len;
143 gunichar result;
144 unsigned char c = (unsigned char) *p;
146 UTF8_COMPUTE (c, mask, len);
147 if (len == -1)
148 return (gunichar)-1;
149 UTF8_GET (result, p, i, mask, len);
151 return result;
154 static glong
155 g_utf8_strlen (const gchar *p,
156 gssize max)
158 glong len = 0;
159 const gchar *start = p;
160 //g_return_val_if_fail (p != NULL || max == 0, 0);
161 if (!(p != NULL || max == 0))
162 return 0;
164 if (max < 0)
166 while (*p)
168 p = g_utf8_next_char (p);
169 ++len;
172 else
174 if (max == 0 || !*p)
175 return 0;
177 p = g_utf8_next_char (p);
179 while (p - start < max && *p)
181 ++len;
182 p = g_utf8_next_char (p);
185 /* only do the last len increment if we got a complete
186 * char (don't count partial chars)
188 if (p - start == max)
189 ++len;
192 return len;
195 using namespace std;
199 The process:
200 1. Sentence segmentation. (sentences_split)
201 2. Separate "words" by spaces. (tokenize)
202 3. Punctuation separation. (tokenize/tokenize_punctuation)
203 4. Foreign/Abbreviation detection.
204 5. Proper name detection.
205 6. Generalization (into class e.g. number_class, foreign_class ...). Try to
206 generalize all capitalized words.
207 6* Syllable checking. (check1)
208 7. Find all possible (misspelled) words. (**) (get_all_words)
209 8. "pre-separate" sentence into phrases.
210 9. Word segmentation. (**)
211 10. Find the best segmentation. (segment_best)
212 10* Word checking. (check2)
216 namespace Spell {
218 bool VSpell::init()
220 dic_init();
222 cerr << "Loading dictionary... ";
223 warch.load("wordlist");
224 cerr << "done" << endl;
225 cerr << "Loading ngram... ";
226 File f("ngram","rt");
227 if (!f.error()) {
228 get_ngram().read(f);
229 cerr << "done" << endl;
230 } else
231 cerr << "Ngram loading error. The result may be incorrect" << endl;
232 cerr << "Loading syngram... ";
233 File ff("syngram","rt");
234 if (!ff.error()) {
235 get_syngram().read(ff);
236 cerr << "done" << endl;
237 } else
238 cerr << "Syllable Ngram loading error. The result may be incorrect" << endl;
239 sarch.set_blocked(true);
240 return true;
243 bool VSpell::check(const char *utf8_pp)
245 utf8_text = utf8_pp;
247 syllables.clear();
249 bool run = false;
251 do {
252 string pp = viet_to_viscii_force(utf8_text.c_str());
253 vector<string> pps;
254 sentences_split(pp,pps);
255 unsigned pps_i,pps_len = pps.size();
256 unsigned offset = 0;
257 text = pp;
259 for (pps_i = 0;pps_i < pps_len;pps_i ++) {
260 Text *t = text_factory.create(this);
261 t->offset = offset;
262 t->length = pps[pps_i].size();
263 run = !t->sentence_check(pps[pps_i].c_str());
264 delete t;
265 if (run)
266 break;
267 offset += pps[pps_i].size();
269 } while (run);
270 return true;
273 void VSpell::replace(unsigned from,unsigned size,const char *s)
275 const char *p = utf8_text.c_str();
276 unsigned i;
277 for (i = 0;i < from && *p;i ++)
278 p = g_utf8_next_char(p);
279 unsigned from1 = p - utf8_text.c_str();
280 for (i = 0;i < size && *p;i ++)
281 p = g_utf8_next_char(p);
282 unsigned to1 = p - utf8_text.c_str();
283 utf8_text.replace(from1,to1-from1,s);
285 // remove separators in the range, adjust other seps
286 int n = separators.size();
287 if (n) {
288 int newsize = g_utf8_strlen(s,-1);
289 i = n;
290 do {
291 i --;
292 if (separators[i] >= from) {
293 if (separators[i] < from+size)
294 separators.erase(separators.begin()+i);
295 else
296 separators[i] += newsize-size;
298 } while (i > 0);
302 void VSpell::add_separators(const std::vector<unsigned> &seps)
304 copy(seps.begin(),seps.end(),back_inserter(separators));
307 void VSpell::add_word(const char *s)
309 istringstream is(s);
310 strid_string toks;
311 string ss;
312 while (is >> ss) {
313 toks += sarch[ss];
315 words.insert(toks);
318 bool Text::sentence_check(const char *pp)
320 // preprocess
321 st.set(pp);
322 st.standardize();
323 st.tokenize();
325 if (!st.get_syllable_count()) // nothing to do but crash ;)
326 return true;
328 // syllable checking
329 if (!syllable_check() && !ui_syllable_check())
330 return false;
332 //w.construct(st);
333 set<WordEntry> wes;
334 WordStateFactories factories;
335 ExactWordStateFactory exact;
336 LowerWordStateFactory lower;
337 UpperWordStateFactory upper;
338 FuzzyWordStateFactory fuzzy;
339 factories.push_back(&exact);
340 factories.push_back(&lower);
341 factories.push_back(&upper);
342 factories.push_back(&fuzzy);
343 w.pre_construct(st,wes,factories);
344 mark_proper_name(st,wes);
345 apply_separators(wes);
346 w.post_construct(wes);
347 //cerr << w << endl;
349 Path path;
350 WordDAG dagw(&w);
351 WordDAG2 *dagw2;
352 DAG *dag = &dagw;
353 if (vspell->get_trigram()) {
354 dagw2 = new WordDAG2(&dagw);
355 dag = dagw2;
357 PenaltyDAG pdag(dag,vspell->get_penalty());
358 if (vspell->get_penalty())
359 dag = &pdag;
361 if (vspell->get_normalization()) {
362 Bellman pfs;
363 pfs.search(*dag,path);
364 } else {
365 PFS pfs;
366 pfs.search(*dag,path);
368 if (vspell->get_trigram()) {
369 dagw2->demangle(path);
370 delete dagw2;
372 seg.resize(path.size()-2);
373 // don't copy head/tail
374 copy(path.begin()+1,path.end()-1,seg.begin());
375 seg.we = w.we;
376 //cerr << seg << endl;
378 // word checking
379 if (!word_check() && !ui_word_check())
380 return false;
382 return true; // done
385 bool Text::syllable_check(int i)
387 if (vspell->in_dict(st[i].get_id()))
388 return true;
390 if (sarch.in_dict(st[i].get_cid())) {
391 Syllable syl; // diacritic check
392 if (syl.parse(sarch[st[i].get_cid()])) {
393 string s = get_lowercased_syllable(syl.to_str());
394 if (get_lowercased_syllable(sarch[st[i].get_id()]) == s)
395 return true;
398 return false;
401 bool Text::syllable_check()
403 int i,n = st.get_syllable_count();
405 suggestions.clear();
407 for (i = 0;i < n;i ++) {
408 if (syllable_check(i))
409 continue;
411 Suggestion _s;
412 _s.id = i;
413 suggestions.push_back(_s);
415 return suggestions.empty();
418 bool Text::word_check()
420 int i,n = seg.size();
422 suggestions.clear();
424 for (i = 0;i < n;i ++) {
425 vector<strid> sylls;
426 strid_string sylls2;
427 int ii,len = seg[i].node->get_syllable_count();
428 seg[i].node->get_syllables(sylls);
430 // case-sensitive comparation.
431 sylls2.resize(len);
432 bool subok = true;
433 for (ii = 0;ii < len;ii ++) {
434 sylls2[ii] = st[seg[i].pos+ii].get_id();
435 if (subok && st[seg[i].pos+ii].get_cid() != sylls[ii])
436 subok = false;
439 // in user dict
440 bool ok = vspell->in_dict(sylls2);
442 if (!ok) {
443 strid_string sylls3;
444 sylls3.resize(sylls.size());
445 for (ii = 0;ii < sylls3.size();ii ++)
446 sylls3[ii] = sarch[get_unstd_syllable(sarch[sylls[ii]])];
448 if (vspell->get_strict_word_checking()) {
449 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
450 if (!subok &&
451 (is_all_capitalized_word(sylls2) ||
452 (is_first_capitalized_word(sylls2) && is_lower_cased_word(sylls3))))
453 subok = true;
455 ok = subok;
456 } else {
457 string s;
458 BranchNode *branch = warch.get_root();
459 for (ii = 0;ii < len && branch;ii ++)
460 branch = branch->get_branch(st[seg[i].pos+ii].get_cid());
462 if (branch) {
463 LeafNode* leaf;
464 leaf = branch->get_leaf(sarch["<mainleaf>"]);
465 sylls.clear();
466 leaf->get_syllables(sylls);
467 strid_string sylls3;
468 sylls3.resize(sylls.size());
469 for (ii = 0;ii < sylls3.size();ii ++)
470 sylls3[ii] = sarch[get_unstd_syllable(sarch[sylls[ii]])];
472 ok = sylls2 == sylls3;
474 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
475 if (!ok &&
476 (is_all_capitalized_word(sylls2) ||
477 (is_first_capitalized_word(sylls2) && is_lower_cased_word(sylls3))))
478 ok = true;
483 if (!ok) {
484 Suggestion _s;
485 _s.id = i;
486 suggestions.push_back(_s);
487 continue;
490 // all syllable are syntatically valid
492 return suggestions.empty();
495 unsigned Text::pos_from_syllable(const Suggestion &s)
497 return offset+st[s.id].start;
500 unsigned Text::pos_from_word(const Suggestion &s)
502 //return offset+st[seg[s.id]].start;
503 return 0;
506 void Text::replace(unsigned from,unsigned size,const char *s)
508 vspell->replace(from+offset,size,s);
511 string Text::substr(unsigned from,unsigned size)
513 const string &utf8_text = vspell->get_utf8_text();
514 const char *p = utf8_text.c_str();
515 unsigned i;
516 from += offset;
517 for (i = 0;i < from && *p;i ++)
518 p = g_utf8_next_char(p);
519 unsigned from1 = p - utf8_text.c_str();
520 for (i = 0;i < size && *p;i ++)
521 p = g_utf8_next_char(p);
522 unsigned to1 = p - utf8_text.c_str();
523 return utf8_text.substr(from1,to1-from1);
526 void Text::apply_separators(set<WordEntry> &wes)
528 vector<unsigned> seps;
529 //set<unsigned> seps;
531 get_separators(seps);
532 sort(seps.begin(),seps.end());
533 //copy(seps1.begin(),seps1.end(),inserter(seps,seps.begin()));
534 int sep = 0;
535 int i,n = st.get_syllable_count();
537 for (i = 0;i < n-1 && sep < seps.size();i ++) {
538 int p = offset+st[i].start+strlen(sarch[st[i].get_id()]);
539 if (p <= seps[sep] && seps[sep] <= offset+st[i+1].start) {
540 apply_separator(wes,i);
541 sep ++;
546 void Text::get_separators(vector<unsigned> &v)
548 const vector<unsigned> &vv = vspell->get_separators();
550 int i,n = vv.size();
551 for (i = 0;i < n;i ++)
552 if (vv[i] >= offset && vv[i] < offset+length)
553 v.push_back(vv[i]);
559 static unsigned char viscii_str[] = {
560 0xe1,0xe0,0xe4,0xe3,0xd5,
561 0xe2,0xa4,0xa5,0xa6,0xe7,0xa7,
562 0xe5,0xa1,0xa2,0xc6,0xc7,0xa3,
563 0xe9,0xe8,0xeb,0xa8,0xa9,
564 0xea,0xaa,0xab,0xac,0xad,
565 0xae,0xed,0xec,0xef,0xee,0xb8,
566 0xf3,0xf2,0xf6,0xf5,0xf7,
567 0xf4,0xaf,0xb0,0xb1,0xb2,
568 0xb5,0xbd,0xbe,0xb6,0xb7,0xde,
569 0xfe,0xfa,0xf9,0xfc,0xfb,0xf8,
570 0xdf,0xd1,0xd7,0xd8,0xe6,
571 0xf1,0xfd,0xcf,0xd6,0xdb,0xdc,
572 0xf0,
573 0xc1,0xc0,0xc4,0xc3,0x80,
574 0xc2,0x84,0x85,0x86,0x06,0x87,
575 0xc5,0x81,0x82,0x02,0x05,0x83,
576 0xc9,0xc8,0xcb,0x88,0x89,
577 0xca,0x8a,0x8b,0x8c,0x8d,0x8e,
578 0xcd,0xcc,0x9b,0xce,0x98,
579 0xd3,0xd2,0x99,0xa0,0x9a,
580 0xd4,0x8f,0x90,0x91,0x92,
581 0x93,0xb4,0x95,0x96,0x97,0xb3,
582 0x94,0xda,0xd9,0x9c,0x9d,0x9e,
583 0xbf,0xba,0xbb,0xbc,0xff,0xb9,
584 0xdd,0x9f,0x14,0x19,0x1e,
585 0xd0,
588 //"a'a`a?a~a.a^a^'a^`a^?a^~a^.a(a('a(`a(?a(~a(.e'e`e?e~e.e^e^'e^`e^?e^~e^.i'i`i?i~i.o'o`o?o~o.o^o^'o^`o^?o^~o^.o+o+'o+`o+?o+~o+.u'u`u?u~u.u+u+'u+`u+?u+~u+.y'y`y?y~y.ddA'A`A?A~A.A^A^'A^`A^?A^~A^.A(A('A(`A(?A(~A(.E'E`E?E~E.E^E^'E^`E^?E^~E^.I'I`I?I~I.O'O`O?O~O.O^O^'O^`O^?O^~O^.O+O+'O+`O+?O+~O+.U'U`U?U~U.U+U+'U+`U+?U+~U+.Y'Y`Y?Y~Y.DD";
590 static gunichar unicode_str[] = {
591 225, 224,7843, 227,7841,
592 226,7845,7847,7849,7851,7853,
593 259,7855,7857,7859,7861,7863,
594 233, 232,7867,7869,7865,
595 234,7871,7873,7875,7877,
596 7879, 237, 236,7881, 297,7883,
597 243, 242,7887, 245,7885,
598 244,7889,7891,7893,7895,
599 7897, 417,7899,7901,7903,7905,
600 7907, 250, 249,7911, 361,7909,
601 432,7913,7915,7917,7919,
602 7921, 253,7923,7927,7929,7925,
603 273,
604 193, 192,7842, 195,7840,
605 194,7844,7846,7848,7850,7852,
606 258,7854,7856,7858,7860,7862,
607 201, 200,7866,7868,7864,
608 202,7870,7872,7874,7876,7878,
609 205, 204,7880, 296,7882,
610 211, 210,7886, 213,7884,
611 212,7888,7890,7892,7894,
612 7896, 416,7898,7900,7902,7904,
613 7906, 218, 217,7910, 360,7908,
614 431,7912,7914,7916,7918,7920,
615 221,7922,7926,7928,7924,
616 272,
620 static map<unsigned char,gunichar> viscii_utf8_map;
621 static map<gunichar,unsigned char> utf8_viscii_map;
623 void viet_init()
625 for (unsigned i = 0;viscii_str[i];i ++) {
626 viscii_utf8_map[viscii_str[i]] = unicode_str[i];
627 utf8_viscii_map[unicode_str[i]] = viscii_str[i];
631 bool viet_utf8_to_viscii(const char *in,char *out) // pre-allocated
633 const gchar *p = in;
634 gunichar ch;
635 while ((ch = g_utf8_get_char(p)) != 0) {
636 p = g_utf8_next_char(p);
637 if (ch < 128) {
638 *out++ = ch;
639 continue;
642 map<gunichar,unsigned char>::iterator iter;
643 iter = utf8_viscii_map.find(ch);
644 if (iter != utf8_viscii_map.end())
645 *out++ = (char)iter->second;
646 else {
647 fprintf(stderr,"Warning: unexpected unicode character %d",ch);
648 return false;
651 *out = 0;
652 return true;
655 bool viet_utf8_to_viscii_force(const char *in,char *out)
657 const gchar *p = in;
658 gunichar ch;
659 while ((ch = g_utf8_get_char(p)) != 0) {
660 p = g_utf8_next_char(p);
661 if (ch < 128) {
662 *out++ = ch;
663 continue;
666 map<gunichar,unsigned char>::iterator iter;
667 iter = utf8_viscii_map.find(ch);
668 if (iter != utf8_viscii_map.end())
669 *out++ = (char)iter->second;
670 else {
671 fprintf(stderr,"Warning: unexpected unicode character %d",ch);
672 *out += 'z';
675 *out = 0;
676 return true;
679 void viet_viscii_to_utf8(const char *in,char *out) // pre-allocated
681 unsigned char *p = (unsigned char*)in;
682 unsigned char ch;
683 while ((ch = *p) != 0) {
684 p++;
685 if (ch < 128 && ch >= 32) {
686 *out++ = ch;
687 continue;
690 map<unsigned char,gunichar>::iterator iter;
691 iter = viscii_utf8_map.find(ch);
692 if (iter != viscii_utf8_map.end()) {
693 g_unichar_to_utf8(iter->second,out);
694 out = g_utf8_next_char(out);
695 } else {
696 *out++ = ch; // fall-back case
700 *out = 0;
703 static char buffer[6000];
704 char* viet_to_viscii(const char *in)
706 if (g_utf8_strlen(in,-1) >= 1000)
707 return "";
708 if (viet_utf8_to_viscii(in,buffer))
709 return buffer;
710 else
711 return NULL;
714 char* viet_to_viscii_force(const char *in)
716 if (g_utf8_strlen(in,-1) >= 1000)
717 return "";
718 viet_utf8_to_viscii_force(in,buffer);
719 return buffer;
722 char* viet_to_utf8(const char *in)
724 if (strlen(in) >= 1000)
725 return "";
726 viet_viscii_to_utf8(in,buffer);
727 return buffer;
731 static char_traits_strid::char_type*
732 char_traits_strid::copy(char_traits_strid::char_type* __s1,
733 const char_traits_strid::char_type* __s2,
734 size_t __n)
736 return static_cast<char_type*>(memcpy(__s1, __s2, __n*sizeof(char_type)));
741 bool get_case_syllable_candidates(const char *input,Candidates &output, float v)
743 // There are only two acceptable cases:
744 // 1. The first character is upper case, the rest is lower
745 // 2. All are either lower or upper
746 // if there is some upper case character without following one of these cases, then it's fault.
747 // also, if there is a uppercase word in dictionary, then add it.
749 uint i,n = strlen(input);
750 // check for upper-case chars
751 for (i = n-1;i >= 0;i --)
752 if (viet_toupper(input[i]) == input[i])
753 break;
755 if (i <= 0 || n < 2) // ignore if the only upper char is the first one.
756 return false;
758 string s;
760 s = input;
761 s[0] = viet_toupper(s[0]);
762 for (i = 1;i < n;i ++)
763 s[i] = viet_tolower(s[i]);
764 if (s != string(input))
765 output.insert(s,v);
767 s = input;
768 for (i = 0;i < n;i ++)
769 s[i] = viet_tolower(s[i]);
770 if (s != string(input))
771 output.insert(s,v);
773 s = input;
774 for (i = 0;i < n;i ++)
775 s[i] = viet_toupper(s[i]);
776 if (s != string(input))
777 output.insert(s,v);
778 return true;
781 void get_phonetic_syllable_candidates(const char *input,Candidates &output,float v)
783 vector<confusion_set>& confusion_sets = get_confusion_sets();
784 int i,j,m,n = confusion_sets.size();
785 bool ret = false;
786 set<Syllable> syllset,syllset2;
787 Syllable _syll;
789 _syll.parse(input);
790 syllset2.insert(_syll);
791 while (!syllset2.empty()) {
792 const Syllable sy = *syllset2.begin();
793 syllset2.erase(syllset2.begin());
795 if (syllset.find(sy) != syllset.end())
796 continue; // we already matched&applied this syllable
798 //cerr << sy << endl;
799 syllset.insert(sy);
801 vector<Syllable> sylls;
802 // match & apply
803 for (i = 0;i < n;i ++) {
804 m = confusion_sets[i].size();
805 for (j = 0;j < m;j ++)
806 if (confusion_sets[i][j].match(sy))
807 break;
808 if (j < m) {
809 for (j = 0;j < m;j ++)
810 confusion_sets[i][j].apply(sy,sylls);
813 copy(sylls.begin(),sylls.end(), inserter(syllset2,syllset2.begin()));
816 // move to _nodes
817 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
818 set<Syllable>::iterator iter;
819 for (iter = syllset.begin();iter != syllset.end(); ++ iter) {
820 string s = iter->to_std_str();
821 string ss = get_lowercased_syllable(s);
822 //cerr << s << endl;
823 if (sarch.in_dict(sarch[s]) ||
824 sarch.in_dict(sarch[ss]))
825 output.insert(iter->to_str(),v+1);
829 void get_syllable_candidates(const char *input,Candidates &output,float v)
831 Syllable syll;
832 string s,s2;
833 set<string> s3;
834 set<string>::iterator s3i;
836 get_phonetic_syllable_candidates(input,output,v);
838 KeyRecover keyr;
839 keyr.init(input);
840 while (keyr.step(s)) {
841 s2 = get_std_syllable(s);
842 if (s2 != s && syll.parse(s2.c_str()))
843 output.insert(syll.to_str(),v);
844 s3.clear();
845 im_recover(s.c_str(),s3);
846 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
847 s2 = get_std_syllable(*s3i);
848 if (s2 != *s3i && syll.parse(s2.c_str()))
849 output.insert(syll.to_str(),v);
852 keyr.done();
854 // SpaceInserter
855 uint i,n = strlen(input);
856 for (i = 1;i < n;i ++) {
857 s = string(input).substr(0,i);
858 s2 = string(input).substr(i);
859 if (syll.parse(s.c_str())) {
860 s = syll.to_str();
861 if (syll.parse(s2.c_str())) {
862 output.insert(s + string(" ") + syll.to_str(),v);
867 CharInserter inserter;
868 inserter.init(input);
869 while (inserter.step(s)) {
870 s2 = get_std_syllable(s);
871 if (s2 != s && syll.parse(s2.c_str()))
872 output.insert(syll.to_str(),v);
873 s3.clear();
874 im_recover(s.c_str(),s3);
875 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
876 s2 = get_std_syllable(*s3i);
877 if (s2 != *s3i && syll.parse(s2.c_str()))
878 output.insert(syll.to_str(),v);
881 inserter.done();
883 CharEraser eraser;
884 eraser.init(input);
885 while (eraser.step(s)) {
886 s2 = get_std_syllable(s);
887 if (s2 != s && syll.parse(s2.c_str()))
888 output.insert(syll.to_str(),v);
889 s3.clear();
890 im_recover(s.c_str(),s3);
891 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
892 s2 = get_std_syllable(*s3i);
893 if (s2 != *s3i && syll.parse(s2.c_str()))
894 output.insert(syll.to_str(),v);
897 eraser.done();
899 CharTransposer transposer;
900 transposer.init(input);
901 while (transposer.step(s)) {
902 s2 = get_std_syllable(s);
903 if (s2 != s && syll.parse(s2.c_str()))
904 output.insert(syll.to_str(),v);
905 s3.clear();
906 im_recover(s.c_str(),s3);
907 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
908 s2 = get_std_syllable(*s3i);
909 if (s2 != *s3i && syll.parse(s2.c_str()))
910 output.insert(syll.to_str(),v);
913 transposer.done();
916 void get_left_syllable_candidates(const char *input,const char *left,Candidates &output)
918 // merge
919 string s;
920 s = string(left)+string(input);
921 get_syllable_candidates(s.c_str(),output,10);
923 // cut one char from input
924 if (strlen(input) > 1) {
925 s = string(input+1);
926 get_syllable_candidates(s.c_str(),output,10);
929 // insert one char from left to input
930 if (strlen(left)) {
931 s = string(" ") + string(input);
932 s[0] = left[strlen(left)-1];
933 get_syllable_candidates(s.c_str(),output,10);
937 void get_right_syllable_candidates(const char *input,const char *right,Candidates &output)
939 // merge
940 string s;
941 s = string(input)+string(right);
942 get_syllable_candidates(s.c_str(),output,10);
944 // cut one char from input
945 if (strlen(input) > 1) {
946 s = string(input);
947 s.resize(strlen(input)-1);
948 get_syllable_candidates(s.c_str(),output,10);
951 // insert one char from right to input
952 if (strlen(right)) {
953 s = string(input)+string(" ");
954 s[s.size()-1] = right[0];
955 get_syllable_candidates(s.c_str(),output,10);
961 void Candidates::insert(const std::string &s,float f)
963 Candidate c;
964 c.candidate = s;
965 c.priority = f;
966 set<Candidate>::iterator iter = candidates.find(c);
967 if (iter != candidates.end()) {
968 if (iter->priority < c.priority)
969 candidates.erase(iter);
970 else
971 return;
973 candidates.insert(c);
976 bool Candidates::CandidateComparator::operator()(const std::string &s1,const std::string &s2)
978 set<Candidate>::iterator i1,i2;
979 Candidate c1,c2;
980 c1.candidate = s1;
981 c2.candidate = s2;
982 i1 = c.candidates.find(c1);
983 i2 = c.candidates.find(c2);
984 if (i1->priority != i2->priority)
985 return i1->priority > i2->priority;
986 float f1,f2;
987 VocabIndex v;
988 v = Vocab_None;
989 f1 = -get_syngram().wordProb(sarch[get_std_syllable(get_lowercased_syllable(s1))],&v);
990 f2 = -get_syngram().wordProb(sarch[get_std_syllable(get_lowercased_syllable(s2))],&v);
991 //cerr << f1 << "<>" << f2 << endl;
992 return f1 > f2; // we want reverse order
995 void Candidates::get_list(std::vector<std::string> &v)
997 uint i,n = candidates.size();
998 v.resize(n);
999 set<Candidate>::iterator iter;
1000 n = 0;
1001 for (iter = candidates.begin();iter != candidates.end();++iter)
1002 if (!get_case_syllable_candidates(iter->candidate.c_str(),*this,iter->priority)) {
1003 if (sarch.in_dict(get_std_syllable(get_lowercased_syllable(iter->candidate)))) {
1004 v[n++] = iter->candidate;
1007 v.resize(n);
1008 sort(v.begin(),v.end(),CandidateComparator(*this));