1 #include "config.h" // -*- tab-width: 2 -*-
12 #include "propername.h"
// g-prefixed integer aliases used by the UTF-8 helpers in this file.
20 typedef unsigned int guint32
;
21 typedef unsigned int guint
;
// gunichar is the type of the code point handed to g_unichar_to_utf8 and of
// the (gunichar)-1 error marker produced by UTF8_GET.
22 typedef guint32 gunichar
;
24 typedef unsigned char guchar
;
26 typedef signed int gssize
;
// UTF8_COMPUTE(Char, Mask, Len): classifies the lead byte Char by its high
// bits. The visible tests match 110xxxxx, 1110xxxx, 11110xxx, 111110xx and
// 1111110x in that order.
// NOTE(review): the branch bodies that assign Mask and Len are not visible
// here -- presumably the payload mask and the sequence length; confirm.
27 #define UTF8_COMPUTE(Char, Mask, Len) \
33 else if ((Char & 0xe0) == 0xc0) \
38 else if ((Char & 0xf0) == 0xe0) \
43 else if ((Char & 0xf8) == 0xf0) \
48 else if ((Char & 0xfc) == 0xf8) \
53 else if ((Char & 0xfe) == 0xfc) \
// UTF8_GET(Result, Chars, Count, Mask, Len): assembles a code point from the
// Len bytes at Chars. Result starts as the first byte masked with Mask; for
// each following byte (Count = 1 .. Len-1) the byte must look like 10xxxxxx,
// otherwise Result becomes (gunichar)-1; the low six bits of a valid byte are
// OR'ed into Result.
// NOTE(review): the step that shifts Result before each OR, and the exit from
// the loop on error, are not visible here -- confirm.
61 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
62 (Result) = (Chars)[0] & (Mask); \
63 for ((Count) = 1; (Count) < (Len); ++(Count)) \
65 if (((Chars)[(Count)] & 0xc0) != 0x80) \
67 (Result) = (gunichar)-1; \
71 (Result) |= ((Chars)[(Count)] & 0x3f); \
// g_utf8_next_char(p): advances p by the number of bytes that the table
// g_utf8_skip assigns to the byte p points at.
74 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
// Skip table indexed by byte value:
//   0x00-0xbf -> 1, 0xc0-0xdf -> 2, 0xe0-0xef -> 3, 0xf0-0xf7 -> 4,
//   0xf8-0xfb -> 5, 0xfc-0xfd -> 6, 0xfe-0xff -> 1.
// Stray continuation bytes (0x80-0xbf) and 0xfe/0xff advance by one byte.
75 static const gchar utf8_skip_data
[256] = {
76 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
77 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
78 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
79 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
80 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
81 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
82 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
83 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
// Pointer through which g_utf8_next_char reads the table above.
86 static const gchar
* const g_utf8_skip
= utf8_skip_data
;
// Encodes the code point c as UTF-8 into outbuf.
// Visible parts: the encoded length is chosen by comparing c against
// 0x10000, 0x200000 and 0x4000000; bytes len-1 down to 1 are written as
// 10xxxxxx using the low six bits of c; byte 0 is c OR'ed with `first`.
// NOTE(review): the lower thresholds, the values given to `first`/`len`, the
// shift of c between iterations and the return value are not visible -- confirm.
88 g_unichar_to_utf8 (gunichar c
,
105 else if (c
< 0x10000)
110 else if (c
< 0x200000)
115 else if (c
< 0x4000000)
128 for (i
= len
- 1; i
> 0; --i
)
130 outbuf
[i
] = (c
& 0x3f) | 0x80;
133 outbuf
[0] = c
| first
;
// Decodes the UTF-8 sequence starting at p.
// UTF8_COMPUTE derives mask and len from the first byte, then UTF8_GET
// assembles `result` from len bytes; UTF8_GET sets result to (gunichar)-1
// when a continuation byte is malformed.
// NOTE(review): the declaration of `result`, any handling between the two
// macros and the return statement are not visible -- confirm.
140 g_utf8_get_char (const gchar
*p
)
142 int i
, mask
= 0, len
;
144 unsigned char c
= (unsigned char) *p
;
146 UTF8_COMPUTE (c
, mask
, len
);
149 UTF8_GET (result
, p
, i
, mask
, len
);
// Walks the string p one UTF-8 character at a time using g_utf8_next_char.
// A guard rejects the combination p == NULL with max != 0. One visible loop
// runs while fewer than max bytes have been consumed and the current byte is
// not the terminator; afterwards p - start == max is tested so that a
// partially consumed character is not counted (per the comment at the end).
// NOTE(review): the second parameter's declaration, the counter that is
// returned and the handling of a negative max (callers pass -1) are not
// visible -- confirm.
155 g_utf8_strlen (const gchar
*p
,
159 const gchar
*start
= p
;
160 //g_return_val_if_fail (p != NULL || max == 0, 0);
161 if (!(p
!= NULL
|| max
== 0))
168 p
= g_utf8_next_char (p
);
177 p
= g_utf8_next_char (p
);
179 while (p
- start
< max
&& *p
)
182 p
= g_utf8_next_char (p
);
185 /* only do the last len increment if we got a complete
186 * char (don't count partial chars)
188 if (p
- start
== max
)
200 1. Sentence segmentation. (sentences_split)
201 2. Separate "words" by spaces. (tokenize)
202 3. Punctuation separation. (tokenize/tokenize_punctuation)
203 4. Foreign/Abbreviation detection.
204 5. Proper name detection.
205 6. Generalization (into class e.g. number_class, foreign_class ...). Try to
206 generalize all capitalized words.
207 6* Syllable checking. (check1)
208 7. Find all possible (misspelled) words. (**) (get_all_words)
209 8. "pre-separate" sentence into phrases.
210 9. Word segmentation. (**)
211 10. Find the best segmentation. (segment_best)
212 10* Word checking. (check2)
// Start-up data loading (enclosing function header is not visible here).
// Progress is reported on cerr. Steps shown: load "wordlist" into warch,
// open "ngram" and "syngram" as File objects in "rt" mode, read the syllable
// n-gram via get_syngram().read(ff), then put sarch into blocked mode.
// NOTE(review): the conditions selecting the "done" vs. "loading error"
// messages and the read of the "ngram" file are not visible -- confirm.
222 cerr
<< "Loading dictionary... ";
223 warch
.load("wordlist");
224 cerr
<< "done" << endl
;
225 cerr
<< "Loading ngram... ";
226 File
f("ngram","rt");
229 cerr
<< "done" << endl
;
231 cerr
<< "Ngram loading error. The result may be incorrect" << endl
;
232 cerr
<< "Loading syngram... ";
233 File
ff("syngram","rt");
235 get_syngram().read(ff
);
236 cerr
<< "done" << endl
;
238 cerr
<< "Syllable Ngram loading error. The result may be incorrect" << endl
;
239 sarch
.set_blocked(true);
// Spell-checks a UTF-8 text.
// Visible flow: utf8_text is converted to VISCII with viet_to_viscii_force,
// split into sentences by sentences_split, and each sentence is checked by a
// Text object obtained from text_factory. `run` is set to the negation of
// sentence_check's result, and `offset` advances by the sentence's size.
// NOTE(review): viet_to_viscii_force has a 1000-character length check; what
// it returns in that case is not visible -- confirm it is safe to build a
// std::string from it. How utf8_pp becomes utf8_text, how `run` ends the
// loop and the return value are also not visible.
243 bool VSpell::check(const char *utf8_pp
)
252 string pp
= viet_to_viscii_force(utf8_text
.c_str());
254 sentences_split(pp
,pps
);
255 unsigned pps_i
,pps_len
= pps
.size();
259 for (pps_i
= 0;pps_i
< pps_len
;pps_i
++) {
260 Text
*t
= text_factory
.create(this);
262 t
->length
= pps
[pps_i
].size();
263 run
= !t
->sentence_check(pps
[pps_i
].c_str());
267 offset
+= pps
[pps_i
].size();
// Replaces `size` characters of utf8_text, starting at character index
// `from`, with the UTF-8 string s.
// Character indexes are turned into byte offsets (from1, to1) by stepping
// through the text with g_utf8_next_char; both walks stop early at the
// terminator. The stored separators are then adjusted: for separators at or
// after `from`, those below from+size are erased and the visible shift is
// newsize-size, where newsize is the character length of s.
// NOTE(review): the loop over separators and the else between the erase and
// the shift are not visible -- confirm that the index is not advanced past
// an element after an erase.
273 void VSpell::replace(unsigned from
,unsigned size
,const char *s
)
275 const char *p
= utf8_text
.c_str();
277 for (i
= 0;i
< from
&& *p
;i
++)
278 p
= g_utf8_next_char(p
);
279 unsigned from1
= p
- utf8_text
.c_str();
280 for (i
= 0;i
< size
&& *p
;i
++)
281 p
= g_utf8_next_char(p
);
282 unsigned to1
= p
- utf8_text
.c_str();
283 utf8_text
.replace(from1
,to1
-from1
,s
);
285 // remove separators in the range, adjust other seps
286 int n
= separators
.size();
288 int newsize
= g_utf8_strlen(s
,-1);
292 if (separators
[i
] >= from
) {
293 if (separators
[i
] < from
+size
)
294 separators
.erase(separators
.begin()+i
);
296 separators
[i
] += newsize
-size
;
// Appends every position in seps to the `separators` container, keeping the
// existing entries and the order of seps.
302 void VSpell::add_separators(const std::vector
<unsigned> &seps
)
304 copy(seps
.begin(),seps
.end(),back_inserter(separators
));
// Takes a word as a C string.
// NOTE(review): the body is not visible here -- confirm which word store it
// updates and which encoding s is expected in.
307 void VSpell::add_word(const char *s
)
// Runs the checking pipeline on one sentence pp. Visible stages:
//   1. stop when the sentence has no syllables;
//   2. syllable check, automatic first and then through the UI;
//   3. build the word lattice `w` with the exact, lower, upper and fuzzy
//      word-state factories, mark proper names, apply separators;
//   4. optionally wrap the DAG: WordDAG2 when trigrams are enabled,
//      Penalty2DAG and PenaltyDAG when the matching vspell settings are set;
//   5. search the DAG with pfs, demangle the path for trigrams, and copy the
//      path without its head and tail nodes into seg;
//   6. word check, automatic first and then through the UI.
// NOTE(review): dagw2 is allocated with new and no matching delete is visible
// here -- confirm it is released. Return values and the declarations of st,
// w, dag, pfs and path are not visible either.
318 bool Text::sentence_check(const char *pp
)
325 if (!st
.get_syllable_count()) // nothing to do but crash ;)
329 if (!syllable_check() && !ui_syllable_check())
334 WordStateFactories factories
;
335 ExactWordStateFactory exact
;
336 LowerWordStateFactory lower
;
337 UpperWordStateFactory upper
;
338 FuzzyWordStateFactory fuzzy
;
339 factories
.push_back(&exact
);
340 factories
.push_back(&lower
);
341 factories
.push_back(&upper
);
342 factories
.push_back(&fuzzy
);
343 w
.pre_construct(st
,wes
,factories
);
344 mark_proper_name(st
,wes
);
345 apply_separators(wes
);
346 w
.post_construct(wes
);
353 if (vspell
->get_trigram()) {
354 dagw2
= new WordDAG2(&dagw
);
358 Penalty2DAG
p2dag(dag
,vspell
->get_penalty2());
359 if (vspell
->get_penalty2()) {
360 // with penalty2dag, we have to do non-fuzzy segmentation first
361 // to feed Penalty2DAG::syllable_weights
363 penalty2_construct(p2seg
);
364 p2dag
.set_syllable_weights(p2seg
);
368 PenaltyDAG
pdag(dag
,vspell
->get_penalty());
369 if (vspell
->get_penalty()) {
374 if (vspell
->get_normalization()) {
376 pfs
.search(*dag
,path
);
379 pfs
.search(*dag
,path
);
382 if (vspell
->get_trigram()) {
383 dagw2
->demangle(path
);
387 seg
.resize(path
.size()-2);
388 // don't copy head/tail
389 copy(path
.begin()+1,path
.end()-1,seg
.begin());
391 //cerr << seg << endl;
394 if (!word_check() && !ui_word_check())
// Produces a non-fuzzy segmentation into seg (sentence_check feeds it to
// Penalty2DAG::set_syllable_weights).
// Same construction as sentence_check but with only the exact, lower and
// upper factories, without apply_separators, and on a local lattice. The
// best path is searched with pfs, demangled when trigrams are enabled, and
// copied without its head and tail nodes into seg.
// NOTE(review): dagw2 is allocated with new and no matching delete is visible
// here -- confirm it is released. path.size()-2 assumes the path has at
// least the head and tail nodes -- confirm.
400 void Text::penalty2_construct(Segmentation
&seg
)
402 WordStateFactories factories
;
403 ExactWordStateFactory exact
;
404 LowerWordStateFactory lower
;
405 UpperWordStateFactory upper
;
406 //FuzzyWordStateFactory fuzzy;
408 factories
.push_back(&exact
);
409 factories
.push_back(&lower
);
410 factories
.push_back(&upper
);
411 //factories.push_back(&fuzzy);
414 lattice
.pre_construct(st
,wes
,factories
);
415 mark_proper_name(st
,wes
);
416 //apply_separators(wes);
417 lattice
.post_construct(wes
);
419 WordDAG
dagw(&lattice
);
423 if (vspell
->get_trigram()) {
424 dagw2
= new WordDAG2(&dagw
);
429 if (vspell
->get_normalization()) {
431 pfs
.search(*dag
,path
);
434 pfs
.search(*dag
,path
);
437 if (vspell
->get_trigram()) {
438 dagw2
->demangle(path
);
442 seg
.resize(path
.size()-2);
443 // don't copy head/tail
444 copy(path
.begin()+1,path
.end()-1,seg
.begin());
// Checks the syllable at index i of st.
// Visible tests: the syllable id is looked up in vspell's dictionary; then
// its cid is looked up in sarch, and when found the cid string is parsed
// into a Syllable whose lowercased string form is compared with the
// lowercased string of the original id (the diacritic check).
// NOTE(review): the value returned on each branch is not visible -- confirm.
449 bool Text::syllable_check(int i
)
451 if (vspell
->in_dict(st
[i
].get_id()))
454 if (sarch
.in_dict(st
[i
].get_cid())) {
455 Syllable syl
; // diacritic check
456 if (syl
.parse(sarch
[st
[i
].get_cid()])) {
457 string s
= get_lowercased_syllable(syl
.to_str());
458 if (get_lowercased_syllable(sarch
[st
[i
].get_id()]) == s
)
// Checks every syllable of the sentence with syllable_check(i) and collects
// entries in `suggestions`; returns true when no suggestion was recorded.
// NOTE(review): the construction of _s and the branch that skips syllables
// passing the per-syllable check are not visible -- confirm.
465 bool Text::syllable_check()
467 int i
,n
= st
.get_syllable_count();
471 for (i
= 0;i
< n
;i
++) {
472 if (syllable_check(i
))
477 suggestions
.push_back(_s
);
479 return suggestions
.empty();
// Checks every word of the segmentation seg and collects entries in
// `suggestions`; returns true when no suggestion was recorded.
// For each word: sylls holds the syllables of the matched dictionary node,
// sylls2 the ids of the syllables as written in the sentence, and sylls3 the
// result of get_unstd_syllable applied to each entry of sylls. The word is
// first looked up as written (vspell->in_dict(sylls2)); the capitalisation
// tests let an upper-cased original pass against a lower-cased dictionary
// word. A second lookup walks warch from its root by cid and compares the
// "<mainleaf>" leaf's syllables with the original.
// NOTE(review): the conditions joining these steps, the use of `subok` and
// the construction of _s are not visible -- confirm.
482 bool Text::word_check()
484 int i
,n
= seg
.size();
488 for (i
= 0;i
< n
;i
++) {
491 int ii
,len
= seg
[i
].node
->get_syllable_count();
492 seg
[i
].node
->get_syllables(sylls
);
494 // case-sensitive comparison.
497 for (ii
= 0;ii
< len
;ii
++) {
498 sylls2
[ii
] = st
[seg
[i
].pos
+ii
].get_id();
499 if (subok
&& st
[seg
[i
].pos
+ii
].get_cid() != sylls
[ii
])
504 bool ok
= vspell
->in_dict(sylls2
);
508 sylls3
.resize(sylls
.size());
509 for (ii
= 0;ii
< sylls3
.size();ii
++)
510 sylls3
[ii
] = sarch
[get_unstd_syllable(sarch
[sylls
[ii
]])];
512 if (vspell
->get_strict_word_checking()) {
513 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
515 (is_all_capitalized_word(sylls2
) ||
516 (is_first_capitalized_word(sylls2
) && is_lower_cased_word(sylls3
))))
522 BranchNode
*branch
= warch
.get_root();
524 for (ii
= 0;ii
< len
&& branch
;ii
++)
525 branch
= branch
->get_branch(st
[seg
[i
].pos
+ii
].get_cid());
527 if (branch
&& (leaf
= branch
->get_leaf(sarch
["<mainleaf>"])) != NULL
) {
529 leaf
->get_syllables(sylls
);
531 sylls3
.resize(sylls
.size());
532 for (ii
= 0;ii
< sylls3
.size();ii
++)
533 sylls3
[ii
] = sarch
[get_unstd_syllable(sarch
[sylls
[ii
]])];
535 ok
= sylls2
== sylls3
;
537 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
539 (is_all_capitalized_word(sylls2
) ||
540 (is_first_capitalized_word(sylls2
) && is_lower_cased_word(sylls3
))))
549 suggestions
.push_back(_s
);
553 // all syllables are syntactically valid
555 return suggestions
.empty();
// Position of the syllable a suggestion refers to: this Text's `offset` plus
// the `start` of syllable s.id in st.
558 unsigned Text::pos_from_syllable(const Suggestion
&s
)
560 return offset
+st
[s
.id
].start
;
// Position of the word a suggestion refers to.
// NOTE(review): only a commented-out earlier formula is visible here; the
// active return statement is not shown -- confirm.
563 unsigned Text::pos_from_word(const Suggestion
&s
)
565 //return offset+st[seg[s.id]].start;
// Forwards to VSpell::replace after shifting `from` by this Text's `offset`;
// size and s are passed through unchanged.
569 void Text::replace(unsigned from
,unsigned size
,const char *s
)
571 vspell
->replace(from
+offset
,size
,s
);
// Converts a character index into a byte distance within vspell's UTF-8
// text by stepping `from` characters with g_utf8_next_char (stopping early
// at the terminator) and returning p - op.
// NOTE(review): the definition of `op` is not visible -- presumably the start
// of the text; confirm. The loop compares a signed int index with the
// unsigned `from`.
574 int Text::utf8_pos(unsigned from
)
576 const string
&utf8_text
= vspell
->get_utf8_text();
577 const char *p
= utf8_text
.c_str();
580 for (int i
= 0;i
< from
&& *p
;i
++)
581 p
= g_utf8_next_char(p
);
582 return (int)(p
- op
);
// Returns `size` characters of vspell's UTF-8 text starting at character
// index `from`. Both bounds are converted to byte offsets (from1, to1) by
// stepping with g_utf8_next_char; each walk stops early at the terminator,
// so an out-of-range request yields a shorter (possibly empty) string.
585 string
Text::substr(unsigned from
,unsigned size
)
587 const string
&utf8_text
= vspell
->get_utf8_text();
588 const char *p
= utf8_text
.c_str();
591 for (i
= 0;i
< from
&& *p
;i
++)
592 p
= g_utf8_next_char(p
);
593 unsigned from1
= p
- utf8_text
.c_str();
594 for (i
= 0;i
< size
&& *p
;i
++)
595 p
= g_utf8_next_char(p
);
596 unsigned to1
= p
- utf8_text
.c_str();
597 return utf8_text
.substr(from1
,to1
-from1
);
// Applies the separators that fall inside this sentence to the word entries.
// The separators are fetched with get_separators and sorted. For each pair
// of neighbouring syllables (i, i+1), p is the end of syllable i (offset +
// start + length of its string); apply_separator(wes, i) is called when the
// current separator lies between p and the start of syllable i+1, inclusive.
// NOTE(review): the declaration and advancement of `sep` are not visible --
// confirm that separators before the first gap cannot stall the scan.
600 void Text::apply_separators(set
<WordEntry
> &wes
)
602 vector
<unsigned> seps
;
603 //set<unsigned> seps;
605 get_separators(seps
);
606 sort(seps
.begin(),seps
.end());
607 //copy(seps1.begin(),seps1.end(),inserter(seps,seps.begin()));
609 int i
,n
= st
.get_syllable_count();
611 for (i
= 0;i
< n
-1 && sep
< seps
.size();i
++) {
612 int p
= offset
+st
[i
].start
+strlen(sarch
[st
[i
].get_id()]);
613 if (p
<= seps
[sep
] && seps
[sep
] <= offset
+st
[i
+1].start
) {
614 apply_separator(wes
,i
);
// Selects, from vspell's separators, those inside this sentence, i.e. in the
// half-open range [offset, offset+length).
// NOTE(review): what is stored into v for a selected separator, and the
// declarations of i and n, are not visible -- confirm.
620 void Text::get_separators(vector
<unsigned> &v
)
622 const vector
<unsigned> &vv
= vspell
->get_separators();
625 for (i
= 0;i
< n
;i
++)
626 if (vv
[i
] >= offset
&& vv
[i
] < offset
+length
)
// VISCII byte of each Vietnamese accented letter. Entry i pairs with
// unicode_str[i]; the map-initialisation loop reads both arrays with the same
// index. The commented string at the end spells out the letters in the same
// order. Several entries are below 0x20 (0x02, 0x05, 0x06, 0x14, 0x19, 0x1e).
// NOTE(review): the initialisation loop stops at the first zero byte; the
// terminating 0 is not visible here -- confirm it is present.
633 static unsigned char viscii_str
[] = {
634 0xe1,0xe0,0xe4,0xe3,0xd5,
635 0xe2,0xa4,0xa5,0xa6,0xe7,0xa7,
636 0xe5,0xa1,0xa2,0xc6,0xc7,0xa3,
637 0xe9,0xe8,0xeb,0xa8,0xa9,
638 0xea,0xaa,0xab,0xac,0xad,
639 0xae,0xed,0xec,0xef,0xee,0xb8,
640 0xf3,0xf2,0xf6,0xf5,0xf7,
641 0xf4,0xaf,0xb0,0xb1,0xb2,
642 0xb5,0xbd,0xbe,0xb6,0xb7,0xde,
643 0xfe,0xfa,0xf9,0xfc,0xfb,0xf8,
644 0xdf,0xd1,0xd7,0xd8,0xe6,
645 0xf1,0xfd,0xcf,0xd6,0xdb,0xdc,
647 0xc1,0xc0,0xc4,0xc3,0x80,
648 0xc2,0x84,0x85,0x86,0x06,0x87,
649 0xc5,0x81,0x82,0x02,0x05,0x83,
650 0xc9,0xc8,0xcb,0x88,0x89,
651 0xca,0x8a,0x8b,0x8c,0x8d,0x8e,
652 0xcd,0xcc,0x9b,0xce,0x98,
653 0xd3,0xd2,0x99,0xa0,0x9a,
654 0xd4,0x8f,0x90,0x91,0x92,
655 0x93,0xb4,0x95,0x96,0x97,0xb3,
656 0x94,0xda,0xd9,0x9c,0x9d,0x9e,
657 0xbf,0xba,0xbb,0xbc,0xff,0xb9,
658 0xdd,0x9f,0x14,0x19,0x1e,
662 //"a'a`a?a~a.a^a^'a^`a^?a^~a^.a(a('a(`a(?a(~a(.e'e`e?e~e.e^e^'e^`e^?e^~e^.i'i`i?i~i.o'o`o?o~o.o^o^'o^`o^?o^~o^.o+o+'o+`o+?o+~o+.u'u`u?u~u.u+u+'u+`u+?u+~u+.y'y`y?y~y.ddA'A`A?A~A.A^A^'A^`A^?A^~A^.A(A('A(`A(?A(~A(.E'E`E?E~E.E^E^'E^`E^?E^~E^.I'I`I?I~I.O'O`O?O~O.O^O^'O^`O^?O^~O^.O+O+'O+`O+?O+~O+.U'U`U?U~U.U+U+'U+`U+?U+~U+.Y'Y`Y?Y~Y.DD";
// Unicode code point (decimal) of each Vietnamese accented letter, in the
// same order as viscii_str: entry i here pairs with viscii_str[i].
// NOTE(review): the two tables must stay the same length and order; the
// entries for lowercase and uppercase "dd" are not visible in this listing
// -- confirm against the full file.
664 static gunichar unicode_str
[] = {
665 225, 224,7843, 227,7841,
666 226,7845,7847,7849,7851,7853,
667 259,7855,7857,7859,7861,7863,
668 233, 232,7867,7869,7865,
669 234,7871,7873,7875,7877,
670 7879, 237, 236,7881, 297,7883,
671 243, 242,7887, 245,7885,
672 244,7889,7891,7893,7895,
673 7897, 417,7899,7901,7903,7905,
674 7907, 250, 249,7911, 361,7909,
675 432,7913,7915,7917,7919,
676 7921, 253,7923,7927,7929,7925,
678 193, 192,7842, 195,7840,
679 194,7844,7846,7848,7850,7852,
680 258,7854,7856,7858,7860,7862,
681 201, 200,7866,7868,7864,
682 202,7870,7872,7874,7876,7878,
683 205, 204,7880, 296,7882,
684 211, 210,7886, 213,7884,
685 212,7888,7890,7892,7894,
686 7896, 416,7898,7900,7902,7904,
687 7906, 218, 217,7910, 360,7908,
688 431,7912,7914,7916,7918,7920,
689 221,7922,7926,7928,7924,
// Lookup tables in both directions between VISCII bytes and Unicode code
// points. The loop fills them by pairing viscii_str[i] with unicode_str[i]
// and stops at the first zero byte in viscii_str.
// NOTE(review): the header of the function containing the loop is not
// visible -- confirm it runs before any conversion helper is called.
694 static map
<unsigned char,gunichar
> viscii_utf8_map
;
695 static map
<gunichar
,unsigned char> utf8_viscii_map
;
699 for (unsigned i
= 0;viscii_str
[i
];i
++) {
700 viscii_utf8_map
[viscii_str
[i
]] = unicode_str
[i
];
701 utf8_viscii_map
[unicode_str
[i
]] = viscii_str
[i
];
// Converts the UTF-8 string `in` to VISCII, writing into the caller-provided
// buffer `out`. Code points are decoded one at a time with g_utf8_get_char
// until a zero code point; a code point found in utf8_viscii_map is emitted
// as its VISCII byte, and a warning is printed to stderr on a branch whose
// condition is not visible.
// NOTE(review): handling of plain ASCII, the value returned on each path and
// output termination are not visible -- confirm. `out` is written with no
// bounds check in the visible code; callers must size it.
705 bool viet_utf8_to_viscii(const char *in
,char *out
) // pre-allocated
709 while ((ch
= g_utf8_get_char(p
)) != 0) {
710 p
= g_utf8_next_char(p
);
716 map
<gunichar
,unsigned char>::iterator iter
;
717 iter
= utf8_viscii_map
.find(ch
);
718 if (iter
!= utf8_viscii_map
.end())
719 *out
++ = (char)iter
->second
;
721 fprintf(stderr
,"Warning: unexpected unicode character %d",ch
);
// "Force" variant of viet_utf8_to_viscii: the visible statements are the
// same -- decode each code point with g_utf8_get_char, emit the VISCII byte
// when utf8_viscii_map has one, print a warning to stderr otherwise.
// NOTE(review): what this variant does differently for unmapped code points
// (presumably it continues where the other one fails) is not visible --
// confirm. `out` is caller-provided and written with no visible bounds check.
729 bool viet_utf8_to_viscii_force(const char *in
,char *out
)
733 while ((ch
= g_utf8_get_char(p
)) != 0) {
734 p
= g_utf8_next_char(p
);
740 map
<gunichar
,unsigned char>::iterator iter
;
741 iter
= utf8_viscii_map
.find(ch
);
742 if (iter
!= utf8_viscii_map
.end())
743 *out
++ = (char)iter
->second
;
745 fprintf(stderr
,"Warning: unexpected unicode character %d",ch
);
// Converts the VISCII string `in` to UTF-8, writing into the caller-provided
// buffer `out`. Bytes in [32, 128) take a separate branch; other bytes are
// looked up in viscii_utf8_map and, when found, encoded with
// g_unichar_to_utf8, after which `out` is advanced past the bytes just
// written with g_utf8_next_char. Unmapped bytes are copied through unchanged.
// NOTE(review): the body of the [32, 128) branch, the advance of p and output
// termination are not visible -- confirm.
753 void viet_viscii_to_utf8(const char *in
,char *out
) // pre-allocated
755 unsigned char *p
= (unsigned char*)in
;
757 while ((ch
= *p
) != 0) {
759 if (ch
< 128 && ch
>= 32) {
764 map
<unsigned char,gunichar
>::iterator iter
;
765 iter
= viscii_utf8_map
.find(ch
);
766 if (iter
!= viscii_utf8_map
.end()) {
767 g_unichar_to_utf8(iter
->second
,out
);
768 out
= g_utf8_next_char(out
);
770 *out
++ = ch
; // fall-back case
// Scratch buffer shared by viet_to_viscii, viet_to_viscii_force and
// viet_to_utf8: each writes its conversion result here, so a later call
// overwrites the output of an earlier one.
777 static char buffer
[6000];
// Converts UTF-8 `in` to VISCII in the shared buffer. Inputs of 1000 or more
// characters take a separate branch.
// NOTE(review): the value returned for over-long input and for a failed
// conversion is not visible -- confirm callers handle it.
778 char* viet_to_viscii(const char *in
)
780 if (g_utf8_strlen(in
,-1) >= 1000)
782 if (viet_utf8_to_viscii(in
,buffer
))
// Converts UTF-8 `in` to VISCII in the shared buffer using the "force"
// converter, whose result is not tested here. Inputs of 1000 or more
// characters take a separate branch.
// NOTE(review): the value returned for over-long input is not visible;
// VSpell::check builds a std::string from the result -- confirm it is never
// a null pointer.
788 char* viet_to_viscii_force(const char *in
)
790 if (g_utf8_strlen(in
,-1) >= 1000)
792 viet_utf8_to_viscii_force(in
,buffer
);
// Converts VISCII `in` to UTF-8 in the shared buffer. The length limit is
// counted in bytes (strlen); inputs of 1000 or more bytes take a separate
// branch.
// NOTE(review): the value returned for over-long input is not visible --
// confirm.
796 char* viet_to_utf8(const char *in
)
798 if (strlen(in
) >= 1000)
800 viet_viscii_to_utf8(in
,buffer
);
// Copies __n elements of char_type from __s2 to __s1 with memcpy (so the
// ranges must not overlap) and returns the destination pointer.
// NOTE(review): the declaration of __n is not visible here.
805 static char_traits_strid::char_type*
806 char_traits_strid::copy(char_traits_strid::char_type* __s1,
807 const char_traits_strid::char_type* __s2,
810 return static_cast<char_type*>(memcpy(__s1, __s2, __n*sizeof(char_type)));
// Proposes case-corrected variants of `input`. Three variants are built in
// `s` and each is compared with the input: first letter upper-cased with the
// rest lower-cased, all lower-cased, and all upper-cased.
// NOTE(review): i and n are declared as uint. If that is an unsigned type,
// `i >= 0` is always true, so the backward scan only ends on a match and i
// wraps around when no character matches; n-1 also wraps for an empty input.
// Confirm and consider a signed index.
// NOTE(review): what is inserted into `output` (and with which priority
// derived from v) and the return value are not visible -- confirm.
815 bool get_case_syllable_candidates(const char *input
,Candidates
&output
, float v
)
817 // There are only two acceptable cases:
818 // 1. The first character is upper case, the rest is lower
819 // 2. All are either lower or upper
820 // If an upper-case character appears without following one of these cases, the input is faulty.
821 // Also, if there is an uppercase word in the dictionary, then add it.
823 uint i
,n
= strlen(input
);
824 // check for upper-case chars
825 for (i
= n
-1;i
>= 0;i
--)
826 if (viet_toupper(input
[i
]) == input
[i
])
829 if (i
<= 0 || n
< 2) // ignore if the only upper char is the first one.
835 s
[0] = viet_toupper(s
[0]);
836 for (i
= 1;i
< n
;i
++)
837 s
[i
] = viet_tolower(s
[i
]);
838 if (s
!= string(input
))
842 for (i
= 0;i
< n
;i
++)
843 s
[i
] = viet_tolower(s
[i
]);
844 if (s
!= string(input
))
848 for (i
= 0;i
< n
;i
++)
849 s
[i
] = viet_toupper(s
[i
]);
850 if (s
!= string(input
))
// Proposes syllables that the confusion sets relate to `input`.
// Work-list expansion: syllset2 holds syllables still to expand, syllset the
// ones already handled. Each syllable taken from syllset2 is skipped if
// already in syllset; otherwise, for every confusion set with a member
// matching it, all members of that set are applied to it and the results are
// queued in syllset2. Finally every syllable in syllset whose standard form
// (or its lowercased form) is in the sarch dictionary is inserted into
// `output` with priority v+1.
// NOTE(review): the parsing of `input` into _syll and the insertion of sy
// into syllset are not visible -- confirm.
855 void get_phonetic_syllable_candidates(const char *input
,Candidates
&output
,float v
)
857 vector
<confusion_set
>& confusion_sets
= get_confusion_sets();
858 int i
,j
,m
,n
= confusion_sets
.size();
860 set
<Syllable
> syllset
,syllset2
;
864 syllset2
.insert(_syll
);
865 while (!syllset2
.empty()) {
866 const Syllable sy
= *syllset2
.begin();
867 syllset2
.erase(syllset2
.begin());
869 if (syllset
.find(sy
) != syllset
.end())
870 continue; // we already matched&applied this syllable
872 //cerr << sy << endl;
875 vector
<Syllable
> sylls
;
877 for (i
= 0;i
< n
;i
++) {
878 m
= confusion_sets
[i
].size();
879 for (j
= 0;j
< m
;j
++)
880 if (confusion_sets
[i
][j
].match(sy
))
883 for (j
= 0;j
< m
;j
++)
884 confusion_sets
[i
][j
].apply(sy
,sylls
);
887 copy(sylls
.begin(),sylls
.end(), inserter(syllset2
,syllset2
.begin()));
891 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
892 set
<Syllable
>::iterator iter
;
893 for (iter
= syllset
.begin();iter
!= syllset
.end(); ++ iter
) {
894 string s
= iter
->to_std_str();
895 string ss
= get_lowercased_syllable(s
);
897 if (sarch
.in_dict(sarch
[s
]) ||
898 sarch
.in_dict(sarch
[ss
]))
899 output
.insert(iter
->to_str(),v
+1);
// Collects correction candidates for the syllable `input` into `output`.
// Visible sources of candidates:
//   - the parsed form of the input itself when it differs from the input
//     (priority v+10);
//   - phonetic confusions (get_phonetic_syllable_candidates);
//   - strings produced by the stepping generators keyr, inserter
//     (CharInserter), eraser and transposer (CharTransposer), plus the
//     strings im_recover derives from each of them (priority v);
//   - every split of the input into two parts that both parse as syllables,
//     joined with a space (priority v).
// A generated string is kept only when its standard form (get_std_syllable)
// differs from the string and parses as a syllable.
// NOTE(review): the declarations and init calls of keyr and eraser, and of
// syll, s, s2 and s3, are not visible -- confirm. Whether s3 is cleared
// between im_recover calls is also not visible.
903 void get_syllable_candidates(const char *input
,Candidates
&output
,float v
)
908 set
<string
>::iterator s3i
;
912 if (syll
.parse(input
) &&
913 syll
.to_str() != string(input
))
914 output
.insert(syll
.to_str(),v
+10);
916 get_phonetic_syllable_candidates(input
,output
,v
);
920 while (keyr
.step(s
)) {
921 s2
= get_std_syllable(s
);
922 if (s2
!= s
&& syll
.parse(s2
.c_str()))
923 output
.insert(syll
.to_str(),v
);
925 im_recover(s
.c_str(),s3
);
926 for (s3i
= s3
.begin(); s3i
!= s3
.end(); ++ s3i
) {
927 s2
= get_std_syllable(*s3i
);
928 if (s2
!= *s3i
&& syll
.parse(s2
.c_str()))
929 output
.insert(syll
.to_str(),v
);
935 uint i
,n
= strlen(input
);
936 for (i
= 1;i
< n
;i
++) {
937 s
= string(input
).substr(0,i
);
938 s2
= string(input
).substr(i
);
939 if (syll
.parse(s
.c_str())) {
941 if (syll
.parse(s2
.c_str())) {
942 output
.insert(s
+ string(" ") + syll
.to_str(),v
);
947 CharInserter inserter
;
948 inserter
.init(input
);
949 while (inserter
.step(s
)) {
950 s2
= get_std_syllable(s
);
951 if (s2
!= s
&& syll
.parse(s2
.c_str()))
952 output
.insert(syll
.to_str(),v
);
954 im_recover(s
.c_str(),s3
);
955 for (s3i
= s3
.begin(); s3i
!= s3
.end(); ++ s3i
) {
956 s2
= get_std_syllable(*s3i
);
957 if (s2
!= *s3i
&& syll
.parse(s2
.c_str()))
958 output
.insert(syll
.to_str(),v
);
965 while (eraser
.step(s
)) {
966 s2
= get_std_syllable(s
);
967 if (s2
!= s
&& syll
.parse(s2
.c_str()))
968 output
.insert(syll
.to_str(),v
);
970 im_recover(s
.c_str(),s3
);
971 for (s3i
= s3
.begin(); s3i
!= s3
.end(); ++ s3i
) {
972 s2
= get_std_syllable(*s3i
);
973 if (s2
!= *s3i
&& syll
.parse(s2
.c_str()))
974 output
.insert(syll
.to_str(),v
);
979 CharTransposer transposer
;
980 transposer
.init(input
);
981 while (transposer
.step(s
)) {
982 s2
= get_std_syllable(s
);
983 if (s2
!= s
&& syll
.parse(s2
.c_str()))
984 output
.insert(syll
.to_str(),v
);
986 im_recover(s
.c_str(),s3
);
987 for (s3i
= s3
.begin(); s3i
!= s3
.end(); ++ s3i
) {
988 s2
= get_std_syllable(*s3i
);
989 if (s2
!= *s3i
&& syll
.parse(s2
.c_str()))
990 output
.insert(syll
.to_str(),v
);
// Candidates for `input` when the word boundary with its left neighbour may
// be wrong. get_syllable_candidates is called with base priority 10 on three
// strings: left+input joined, input without its first character (only when
// input has more than one character), and input with the last character of
// `left` moved in front of it.
// NOTE(review): left[strlen(left)-1] is read; the guard against an empty
// `left` is not visible here (the right-hand variant has one) -- confirm.
996 void get_left_syllable_candidates(const char *input
,const char *left
,Candidates
&output
)
1000 s
= string(left
)+string(input
);
1001 get_syllable_candidates(s
.c_str(),output
,10);
1003 // cut one char from input
1004 if (strlen(input
) > 1) {
1005 s
= string(input
+1);
1006 get_syllable_candidates(s
.c_str(),output
,10);
1009 // insert one char from left to input
1011 s
= string(" ") + string(input
);
1012 s
[0] = left
[strlen(left
)-1];
1013 get_syllable_candidates(s
.c_str(),output
,10);
// Candidates for `input` when the word boundary with its right neighbour may
// be wrong. get_syllable_candidates is called with base priority 10 on three
// strings: input+right joined, input without its last character (only when
// input has more than one character), and input with the first character of
// `right` appended (only when `right` is non-empty).
// NOTE(review): the assignment to s before s.resize(strlen(input)-1) is not
// visible -- confirm s holds `input` at that point and not input+right.
1017 void get_right_syllable_candidates(const char *input
,const char *right
,Candidates
&output
)
1021 s
= string(input
)+string(right
);
1022 get_syllable_candidates(s
.c_str(),output
,10);
1024 // cut one char from input
1025 if (strlen(input
) > 1) {
1027 s
.resize(strlen(input
)-1);
1028 get_syllable_candidates(s
.c_str(),output
,10);
1031 // insert one char from right to input
1032 if (strlen(right
)) {
1033 s
= string(input
)+string(" ");
1034 s
[s
.size()-1] = right
[0];
1035 get_syllable_candidates(s
.c_str(),output
,10);
// Adds candidate string s with priority f. If an equal candidate is already
// stored and its priority is lower than the new one, the stored entry is
// erased; the new candidate c is then inserted.
// NOTE(review): how c is built from s and f, and what happens when the stored
// priority is not lower (presumably the old entry is kept), are not visible
// -- confirm.
1041 void Candidates::insert(const std::string
&s
,float f
)
1046 set
<Candidate
>::iterator iter
= candidates
.find(c
);
1047 if (iter
!= candidates
.end()) {
1048 if (iter
->priority
< c
.priority
)
1049 candidates
.erase(iter
);
1053 candidates
.insert(c
);
// Ordering used to sort candidate strings. The candidate with the higher
// stored priority comes first; on equal priority the two are ordered by
// f1 > f2, where each value is the negated get_syngram().wordProb of the
// candidate's lowercased standard syllable.
// NOTE(review): i1 and i2 are dereferenced right after find(); no check
// against end() is visible here -- confirm both strings are always present
// in c.candidates. The construction of c1, c2 and v is not visible.
1056 bool Candidates::CandidateComparator::operator()(const std::string
&s1
,const std::string
&s2
)
1058 set
<Candidate
>::iterator i1
,i2
;
1062 i1
= c
.candidates
.find(c1
);
1063 i2
= c
.candidates
.find(c2
);
1064 if (i1
->priority
!= i2
->priority
)
1065 return i1
->priority
> i2
->priority
;
1069 f1
= -get_syngram().wordProb(sarch
[get_std_syllable(get_lowercased_syllable(s1
))],&v
);
1070 f2
= -get_syngram().wordProb(sarch
[get_std_syllable(get_lowercased_syllable(s2
))],&v
);
1071 //cerr << f1 << "<>" << f2 << endl;
1072 return f1
> f2
; // we want reverse order
// Produces the final candidate list in v, sorted with CandidateComparator.
// First pass: each candidate that is not a valid word form is replaced --
// case-corrected variants are generated into this same container with the
// candidate's priority, then the candidate is erased (the iterator is
// advanced before the erase takes effect).
// Second pass: candidates whose lowercased standard syllable is in the sarch
// dictionary are copied into v.
// NOTE(review): v is resized to candidates.size() but only in-dictionary
// entries are stored; the statement that shrinks v to n before the sort is
// not visible -- confirm, since empty strings would otherwise reach the
// comparator. The iterator advance for valid candidates and the
// initialisation of n are not visible either.
1075 void Candidates::get_list(std::vector
<std::string
> &v
)
1077 set
<Candidate
>::iterator iter
;
1079 iter
= candidates
.begin();
1080 while (iter
!= candidates
.end()) {
1081 if (!is_valid_word_form(iter
->candidate
.c_str())) {
1082 get_case_syllable_candidates(iter
->candidate
.c_str(),*this,iter
->priority
);
1083 candidates
.erase(iter
++);
1088 v
.resize(candidates
.size());
1090 for (iter
= candidates
.begin();iter
!= candidates
.end(); ++iter
)
1091 if (sarch
.in_dict(get_std_syllable(get_lowercased_syllable(iter
->candidate
))))
1092 v
[n
++] = iter
->candidate
;
1094 sort(v
.begin(),v
.end(),CandidateComparator(*this));