1 #include "config.h" // -*- tab-width: 2 -*-
12 #include "propername.h"
20 typedef unsigned int guint32
;
21 typedef unsigned int guint
;
22 typedef guint32 gunichar
;
24 typedef unsigned char guchar
;
26 typedef signed int gssize
;
27 #define UTF8_COMPUTE(Char, Mask, Len) \
33 else if ((Char & 0xe0) == 0xc0) \
38 else if ((Char & 0xf0) == 0xe0) \
43 else if ((Char & 0xf8) == 0xf0) \
48 else if ((Char & 0xfc) == 0xf8) \
53 else if ((Char & 0xfe) == 0xfc) \
61 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
62 (Result) = (Chars)[0] & (Mask); \
63 for ((Count) = 1; (Count) < (Len); ++(Count)) \
65 if (((Chars)[(Count)] & 0xc0) != 0x80) \
67 (Result) = (gunichar)-1; \
71 (Result) |= ((Chars)[(Count)] & 0x3f); \
74 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
75 static const gchar utf8_skip_data
[256] = {
76 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
77 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
78 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
79 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
80 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
81 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
82 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
83 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
86 static const gchar
* const g_utf8_skip
= utf8_skip_data
;
88 g_unichar_to_utf8 (gunichar c
,
105 else if (c
< 0x10000)
110 else if (c
< 0x200000)
115 else if (c
< 0x4000000)
128 for (i
= len
- 1; i
> 0; --i
)
130 outbuf
[i
] = (c
& 0x3f) | 0x80;
133 outbuf
[0] = c
| first
;
140 g_utf8_get_char (const gchar
*p
)
142 int i
, mask
= 0, len
;
144 unsigned char c
= (unsigned char) *p
;
146 UTF8_COMPUTE (c
, mask
, len
);
149 UTF8_GET (result
, p
, i
, mask
, len
);
155 g_utf8_strlen (const gchar
*p
,
159 const gchar
*start
= p
;
160 //g_return_val_if_fail (p != NULL || max == 0, 0);
161 if (!(p
!= NULL
|| max
== 0))
168 p
= g_utf8_next_char (p
);
177 p
= g_utf8_next_char (p
);
179 while (p
- start
< max
&& *p
)
182 p
= g_utf8_next_char (p
);
185 /* only do the last len increment if we got a complete
186 * char (don't count partial chars)
188 if (p
- start
== max
)
200 1. Sentence segmentation. (sentences_split)
201 2. Separate "words" by spaces. (tokenize)
202 3. Punctuation separation. (tokenize/tokenize_punctuation)
203 4. Foreign/Abbreviation detection.
204 5. Proper name detection.
205 6. Generalization (into class e.g. number_class, foreign_class ...). Try to
206 generalize all capitalized words.
207 6* Syllable checking. (check1)
208 7. Find all possible (misspelled) words. (**) (get_all_words)
209 8. "pre-separate" sentence into phrases.
210 9. Word segmentation. (**)
211 10. Find the best segmentation. (segment_best)
212 10* Word checking. (check2)
222 cerr
<< "Loading dictionary... ";
223 warch
.load("wordlist");
224 cerr
<< "done" << endl
;
225 cerr
<< "Loading ngram... ";
226 File
f("ngram","rt");
229 cerr
<< "done" << endl
;
231 cerr
<< "Ngram loading error. The result may be incorrect" << endl
;
232 cerr
<< "Loading syngram... ";
233 File
ff("syngram","rt");
235 get_syngram().read(ff
);
236 cerr
<< "done" << endl
;
238 cerr
<< "Syllable Ngram loading error. The result may be incorrect" << endl
;
239 sarch
.set_blocked(true);
243 bool VSpell::check(const char *utf8_pp
)
252 string pp
= viet_to_viscii_force(utf8_text
.c_str());
254 sentences_split(pp
,pps
);
255 unsigned pps_i
,pps_len
= pps
.size();
259 for (pps_i
= 0;pps_i
< pps_len
;pps_i
++) {
260 Text
*t
= text_factory
.create(this);
262 t
->length
= pps
[pps_i
].size();
263 run
= !t
->sentence_check(pps
[pps_i
].c_str());
267 offset
+= pps
[pps_i
].size();
273 void VSpell::replace(unsigned from
,unsigned size
,const char *s
)
275 const char *p
= utf8_text
.c_str();
277 for (i
= 0;i
< from
&& *p
;i
++)
278 p
= g_utf8_next_char(p
);
279 unsigned from1
= p
- utf8_text
.c_str();
280 for (i
= 0;i
< size
&& *p
;i
++)
281 p
= g_utf8_next_char(p
);
282 unsigned to1
= p
- utf8_text
.c_str();
283 utf8_text
.replace(from1
,to1
-from1
,s
);
285 // remove separators in the range, adjust other seps
286 int n
= separators
.size();
288 int newsize
= g_utf8_strlen(s
,-1);
292 if (separators
[i
] >= from
) {
293 if (separators
[i
] < from
+size
)
294 separators
.erase(separators
.begin()+i
);
296 separators
[i
] += newsize
-size
;
302 void VSpell::add_separators(const std::vector
<unsigned> &seps
)
304 copy(seps
.begin(),seps
.end(),back_inserter(separators
));
307 void VSpell::add_word(const char *s
)
318 bool Text::sentence_check(const char *pp
)
325 if (!st
.get_syllable_count()) // nothing to do but crash ;)
329 if (!syllable_check() && !ui_syllable_check())
334 WordStateFactories factories
;
335 ExactWordStateFactory exact
;
336 LowerWordStateFactory lower
;
337 UpperWordStateFactory upper
;
338 FuzzyWordStateFactory fuzzy
;
339 factories
.push_back(&exact
);
340 factories
.push_back(&lower
);
341 factories
.push_back(&upper
);
342 factories
.push_back(&fuzzy
);
343 w
.pre_construct(st
,wes
,factories
);
344 mark_proper_name(st
,wes
);
345 apply_separators(wes
);
346 w
.post_construct(wes
);
353 if (vspell
->get_trigram()) {
354 dagw2
= new WordDAG2(&dagw
);
357 PenaltyDAG
pdag(dag
,vspell
->get_penalty());
358 if (vspell
->get_penalty())
361 if (vspell
->get_normalization()) {
363 pfs
.search(*dag
,path
);
366 pfs
.search(*dag
,path
);
368 if (vspell
->get_trigram()) {
369 dagw2
->demangle(path
);
372 seg
.resize(path
.size()-2);
373 // don't copy head/tail
374 copy(path
.begin()+1,path
.end()-1,seg
.begin());
376 //cerr << seg << endl;
379 if (!word_check() && !ui_word_check())
385 bool Text::syllable_check(int i
)
387 if (vspell
->in_dict(st
[i
].get_id()))
390 if (sarch
.in_dict(st
[i
].get_cid())) {
391 Syllable syl
; // diacritic check
392 if (syl
.parse(sarch
[st
[i
].get_cid()])) {
393 string s
= get_lowercased_syllable(syl
.to_str());
394 if (get_lowercased_syllable(sarch
[st
[i
].get_id()]) == s
)
401 bool Text::syllable_check()
403 int i
,n
= st
.get_syllable_count();
407 for (i
= 0;i
< n
;i
++) {
408 if (syllable_check(i
))
413 suggestions
.push_back(_s
);
415 return suggestions
.empty();
418 bool Text::word_check()
420 int i
,n
= seg
.size();
424 for (i
= 0;i
< n
;i
++) {
427 int ii
,len
= seg
[i
].node
->get_syllable_count();
428 seg
[i
].node
->get_syllables(sylls
);
430 // case-sensitive comparation.
433 for (ii
= 0;ii
< len
;ii
++) {
434 sylls2
[ii
] = st
[seg
[i
].pos
+ii
].get_id();
435 if (subok
&& st
[seg
[i
].pos
+ii
].get_cid() != sylls
[ii
])
440 bool ok
= vspell
->in_dict(sylls2
);
444 sylls3
.resize(sylls
.size());
445 for (ii
= 0;ii
< sylls3
.size();ii
++)
446 sylls3
[ii
] = sarch
[get_unstd_syllable(sarch
[sylls
[ii
]])];
448 if (vspell
->get_strict_word_checking()) {
449 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
451 (is_all_capitalized_word(sylls2
) ||
452 (is_first_capitalized_word(sylls2
) && is_lower_cased_word(sylls3
))))
458 BranchNode
*branch
= warch
.get_root();
459 for (ii
= 0;ii
< len
&& branch
;ii
++)
460 branch
= branch
->get_branch(st
[seg
[i
].pos
+ii
].get_cid());
464 leaf
= branch
->get_leaf(sarch
["<mainleaf>"]);
466 leaf
->get_syllables(sylls
);
468 sylls3
.resize(sylls
.size());
469 for (ii
= 0;ii
< sylls3
.size();ii
++)
470 sylls3
[ii
] = sarch
[get_unstd_syllable(sarch
[sylls
[ii
]])];
472 ok
= sylls2
== sylls3
;
474 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
476 (is_all_capitalized_word(sylls2
) ||
477 (is_first_capitalized_word(sylls2
) && is_lower_cased_word(sylls3
))))
486 suggestions
.push_back(_s
);
490 // all syllable are syntatically valid
492 return suggestions
.empty();
495 unsigned Text::pos_from_syllable(const Suggestion
&s
)
497 return offset
+st
[s
.id
].start
;
500 unsigned Text::pos_from_word(const Suggestion
&s
)
502 //return offset+st[seg[s.id]].start;
506 void Text::replace(unsigned from
,unsigned size
,const char *s
)
508 vspell
->replace(from
+offset
,size
,s
);
511 string
Text::substr(unsigned from
,unsigned size
)
513 const string
&utf8_text
= vspell
->get_utf8_text();
514 const char *p
= utf8_text
.c_str();
517 for (i
= 0;i
< from
&& *p
;i
++)
518 p
= g_utf8_next_char(p
);
519 unsigned from1
= p
- utf8_text
.c_str();
520 for (i
= 0;i
< size
&& *p
;i
++)
521 p
= g_utf8_next_char(p
);
522 unsigned to1
= p
- utf8_text
.c_str();
523 return utf8_text
.substr(from1
,to1
-from1
);
526 void Text::apply_separators(set
<WordEntry
> &wes
)
528 vector
<unsigned> seps
;
529 //set<unsigned> seps;
531 get_separators(seps
);
532 sort(seps
.begin(),seps
.end());
533 //copy(seps1.begin(),seps1.end(),inserter(seps,seps.begin()));
535 int i
,n
= st
.get_syllable_count();
537 for (i
= 0;i
< n
-1 && sep
< seps
.size();i
++) {
538 int p
= offset
+st
[i
].start
+strlen(sarch
[st
[i
].get_id()]);
539 if (p
<= seps
[sep
] && seps
[sep
] <= offset
+st
[i
+1].start
) {
540 apply_separator(wes
,i
);
546 void Text::get_separators(vector
<unsigned> &v
)
548 const vector
<unsigned> &vv
= vspell
->get_separators();
551 for (i
= 0;i
< n
;i
++)
552 if (vv
[i
] >= offset
&& vv
[i
] < offset
+length
)
559 static unsigned char viscii_str
[] = {
560 0xe1,0xe0,0xe4,0xe3,0xd5,
561 0xe2,0xa4,0xa5,0xa6,0xe7,0xa7,
562 0xe5,0xa1,0xa2,0xc6,0xc7,0xa3,
563 0xe9,0xe8,0xeb,0xa8,0xa9,
564 0xea,0xaa,0xab,0xac,0xad,
565 0xae,0xed,0xec,0xef,0xee,0xb8,
566 0xf3,0xf2,0xf6,0xf5,0xf7,
567 0xf4,0xaf,0xb0,0xb1,0xb2,
568 0xb5,0xbd,0xbe,0xb6,0xb7,0xde,
569 0xfe,0xfa,0xf9,0xfc,0xfb,0xf8,
570 0xdf,0xd1,0xd7,0xd8,0xe6,
571 0xf1,0xfd,0xcf,0xd6,0xdb,0xdc,
573 0xc1,0xc0,0xc4,0xc3,0x80,
574 0xc2,0x84,0x85,0x86,0x06,0x87,
575 0xc5,0x81,0x82,0x02,0x05,0x83,
576 0xc9,0xc8,0xcb,0x88,0x89,
577 0xca,0x8a,0x8b,0x8c,0x8d,0x8e,
578 0xcd,0xcc,0x9b,0xce,0x98,
579 0xd3,0xd2,0x99,0xa0,0x9a,
580 0xd4,0x8f,0x90,0x91,0x92,
581 0x93,0xb4,0x95,0x96,0x97,0xb3,
582 0x94,0xda,0xd9,0x9c,0x9d,0x9e,
583 0xbf,0xba,0xbb,0xbc,0xff,0xb9,
584 0xdd,0x9f,0x14,0x19,0x1e,
588 //"a'a`a?a~a.a^a^'a^`a^?a^~a^.a(a('a(`a(?a(~a(.e'e`e?e~e.e^e^'e^`e^?e^~e^.i'i`i?i~i.o'o`o?o~o.o^o^'o^`o^?o^~o^.o+o+'o+`o+?o+~o+.u'u`u?u~u.u+u+'u+`u+?u+~u+.y'y`y?y~y.ddA'A`A?A~A.A^A^'A^`A^?A^~A^.A(A('A(`A(?A(~A(.E'E`E?E~E.E^E^'E^`E^?E^~E^.I'I`I?I~I.O'O`O?O~O.O^O^'O^`O^?O^~O^.O+O+'O+`O+?O+~O+.U'U`U?U~U.U+U+'U+`U+?U+~U+.Y'Y`Y?Y~Y.DD";
590 static gunichar unicode_str
[] = {
591 225, 224,7843, 227,7841,
592 226,7845,7847,7849,7851,7853,
593 259,7855,7857,7859,7861,7863,
594 233, 232,7867,7869,7865,
595 234,7871,7873,7875,7877,
596 7879, 237, 236,7881, 297,7883,
597 243, 242,7887, 245,7885,
598 244,7889,7891,7893,7895,
599 7897, 417,7899,7901,7903,7905,
600 7907, 250, 249,7911, 361,7909,
601 432,7913,7915,7917,7919,
602 7921, 253,7923,7927,7929,7925,
604 193, 192,7842, 195,7840,
605 194,7844,7846,7848,7850,7852,
606 258,7854,7856,7858,7860,7862,
607 201, 200,7866,7868,7864,
608 202,7870,7872,7874,7876,7878,
609 205, 204,7880, 296,7882,
610 211, 210,7886, 213,7884,
611 212,7888,7890,7892,7894,
612 7896, 416,7898,7900,7902,7904,
613 7906, 218, 217,7910, 360,7908,
614 431,7912,7914,7916,7918,7920,
615 221,7922,7926,7928,7924,
620 static map
<unsigned char,gunichar
> viscii_utf8_map
;
621 static map
<gunichar
,unsigned char> utf8_viscii_map
;
625 for (unsigned i
= 0;viscii_str
[i
];i
++) {
626 viscii_utf8_map
[viscii_str
[i
]] = unicode_str
[i
];
627 utf8_viscii_map
[unicode_str
[i
]] = viscii_str
[i
];
631 bool viet_utf8_to_viscii(const char *in
,char *out
) // pre-allocated
635 while ((ch
= g_utf8_get_char(p
)) != 0) {
636 p
= g_utf8_next_char(p
);
642 map
<gunichar
,unsigned char>::iterator iter
;
643 iter
= utf8_viscii_map
.find(ch
);
644 if (iter
!= utf8_viscii_map
.end())
645 *out
++ = (char)iter
->second
;
647 fprintf(stderr
,"Warning: unexpected unicode character %d",ch
);
655 bool viet_utf8_to_viscii_force(const char *in
,char *out
)
659 while ((ch
= g_utf8_get_char(p
)) != 0) {
660 p
= g_utf8_next_char(p
);
666 map
<gunichar
,unsigned char>::iterator iter
;
667 iter
= utf8_viscii_map
.find(ch
);
668 if (iter
!= utf8_viscii_map
.end())
669 *out
++ = (char)iter
->second
;
671 fprintf(stderr
,"Warning: unexpected unicode character %d",ch
);
679 void viet_viscii_to_utf8(const char *in
,char *out
) // pre-allocated
681 unsigned char *p
= (unsigned char*)in
;
683 while ((ch
= *p
) != 0) {
685 if (ch
< 128 && ch
>= 32) {
690 map
<unsigned char,gunichar
>::iterator iter
;
691 iter
= viscii_utf8_map
.find(ch
);
692 if (iter
!= viscii_utf8_map
.end()) {
693 g_unichar_to_utf8(iter
->second
,out
);
694 out
= g_utf8_next_char(out
);
696 *out
++ = ch
; // fall-back case
703 static char buffer
[6000];
704 char* viet_to_viscii(const char *in
)
706 if (g_utf8_strlen(in
,-1) >= 1000)
708 if (viet_utf8_to_viscii(in
,buffer
))
714 char* viet_to_viscii_force(const char *in
)
716 if (g_utf8_strlen(in
,-1) >= 1000)
718 viet_utf8_to_viscii_force(in
,buffer
);
722 char* viet_to_utf8(const char *in
)
724 if (strlen(in
) >= 1000)
726 viet_viscii_to_utf8(in
,buffer
);
731 static char_traits_strid::char_type*
732 char_traits_strid::copy(char_traits_strid::char_type* __s1,
733 const char_traits_strid::char_type* __s2,
736 return static_cast<char_type*>(memcpy(__s1, __s2, __n*sizeof(char_type)));
741 bool get_case_syllable_candidates(const char *input
,Candidates
&output
, float v
)
743 // There are only two acceptable cases:
744 // 1. The first character is upper case, the rest is lower
745 // 2. All are either lower or upper
746 // if there is some upper case character without following one of these cases, then it's fault.
747 // also, if there is a uppercase word in dictionary, then add it.
749 uint i
,n
= strlen(input
);
750 // check for upper-case chars
751 for (i
= n
-1;i
>= 0;i
--)
752 if (viet_toupper(input
[i
]) == input
[i
])
755 if (i
<= 0 || n
< 2) // ignore if the only upper char is the first one.
761 s
[0] = viet_toupper(s
[0]);
762 for (i
= 1;i
< n
;i
++)
763 s
[i
] = viet_tolower(s
[i
]);
764 if (s
!= string(input
))
768 for (i
= 0;i
< n
;i
++)
769 s
[i
] = viet_tolower(s
[i
]);
770 if (s
!= string(input
))
774 for (i
= 0;i
< n
;i
++)
775 s
[i
] = viet_toupper(s
[i
]);
776 if (s
!= string(input
))
781 void get_phonetic_syllable_candidates(const char *input
,Candidates
&output
,float v
)
783 vector
<confusion_set
>& confusion_sets
= get_confusion_sets();
784 int i
,j
,m
,n
= confusion_sets
.size();
786 set
<Syllable
> syllset
,syllset2
;
790 syllset2
.insert(_syll
);
791 while (!syllset2
.empty()) {
792 const Syllable sy
= *syllset2
.begin();
793 syllset2
.erase(syllset2
.begin());
795 if (syllset
.find(sy
) != syllset
.end())
796 continue; // we already matched&applied this syllable
798 //cerr << sy << endl;
801 vector
<Syllable
> sylls
;
803 for (i
= 0;i
< n
;i
++) {
804 m
= confusion_sets
[i
].size();
805 for (j
= 0;j
< m
;j
++)
806 if (confusion_sets
[i
][j
].match(sy
))
809 for (j
= 0;j
< m
;j
++)
810 confusion_sets
[i
][j
].apply(sy
,sylls
);
813 copy(sylls
.begin(),sylls
.end(), inserter(syllset2
,syllset2
.begin()));
817 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
818 set
<Syllable
>::iterator iter
;
819 for (iter
= syllset
.begin();iter
!= syllset
.end(); ++ iter
) {
820 string s
= iter
->to_std_str();
821 string ss
= get_lowercased_syllable(s
);
823 if (sarch
.in_dict(sarch
[s
]) ||
824 sarch
.in_dict(sarch
[ss
]))
825 output
.insert(iter
->to_str(),v
+1);
829 void get_syllable_candidates(const char *input
,Candidates
&output
,float v
)
834 set
<string
>::iterator s3i
;
836 get_phonetic_syllable_candidates(input
,output
,v
);
840 while (keyr
.step(s
)) {
841 s2
= get_std_syllable(s
);
842 if (s2
!= s
&& syll
.parse(s2
.c_str()))
843 output
.insert(syll
.to_str(),v
);
845 im_recover(s
.c_str(),s3
);
846 for (s3i
= s3
.begin(); s3i
!= s3
.end(); ++ s3i
) {
847 s2
= get_std_syllable(*s3i
);
848 if (s2
!= *s3i
&& syll
.parse(s2
.c_str()))
849 output
.insert(syll
.to_str(),v
);
855 uint i
,n
= strlen(input
);
856 for (i
= 1;i
< n
;i
++) {
857 s
= string(input
).substr(0,i
);
858 s2
= string(input
).substr(i
);
859 if (syll
.parse(s
.c_str())) {
861 if (syll
.parse(s2
.c_str())) {
862 output
.insert(s
+ string(" ") + syll
.to_str(),v
);
867 CharInserter inserter
;
868 inserter
.init(input
);
869 while (inserter
.step(s
)) {
870 s2
= get_std_syllable(s
);
871 if (s2
!= s
&& syll
.parse(s2
.c_str()))
872 output
.insert(syll
.to_str(),v
);
874 im_recover(s
.c_str(),s3
);
875 for (s3i
= s3
.begin(); s3i
!= s3
.end(); ++ s3i
) {
876 s2
= get_std_syllable(*s3i
);
877 if (s2
!= *s3i
&& syll
.parse(s2
.c_str()))
878 output
.insert(syll
.to_str(),v
);
885 while (eraser
.step(s
)) {
886 s2
= get_std_syllable(s
);
887 if (s2
!= s
&& syll
.parse(s2
.c_str()))
888 output
.insert(syll
.to_str(),v
);
890 im_recover(s
.c_str(),s3
);
891 for (s3i
= s3
.begin(); s3i
!= s3
.end(); ++ s3i
) {
892 s2
= get_std_syllable(*s3i
);
893 if (s2
!= *s3i
&& syll
.parse(s2
.c_str()))
894 output
.insert(syll
.to_str(),v
);
899 CharTransposer transposer
;
900 transposer
.init(input
);
901 while (transposer
.step(s
)) {
902 s2
= get_std_syllable(s
);
903 if (s2
!= s
&& syll
.parse(s2
.c_str()))
904 output
.insert(syll
.to_str(),v
);
906 im_recover(s
.c_str(),s3
);
907 for (s3i
= s3
.begin(); s3i
!= s3
.end(); ++ s3i
) {
908 s2
= get_std_syllable(*s3i
);
909 if (s2
!= *s3i
&& syll
.parse(s2
.c_str()))
910 output
.insert(syll
.to_str(),v
);
916 void get_left_syllable_candidates(const char *input
,const char *left
,Candidates
&output
)
920 s
= string(left
)+string(input
);
921 get_syllable_candidates(s
.c_str(),output
,10);
923 // cut one char from input
924 if (strlen(input
) > 1) {
926 get_syllable_candidates(s
.c_str(),output
,10);
929 // insert one char from left to input
931 s
= string(" ") + string(input
);
932 s
[0] = left
[strlen(left
)-1];
933 get_syllable_candidates(s
.c_str(),output
,10);
937 void get_right_syllable_candidates(const char *input
,const char *right
,Candidates
&output
)
941 s
= string(input
)+string(right
);
942 get_syllable_candidates(s
.c_str(),output
,10);
944 // cut one char from input
945 if (strlen(input
) > 1) {
947 s
.resize(strlen(input
)-1);
948 get_syllable_candidates(s
.c_str(),output
,10);
951 // insert one char from right to input
953 s
= string(input
)+string(" ");
954 s
[s
.size()-1] = right
[0];
955 get_syllable_candidates(s
.c_str(),output
,10);
961 void Candidates::insert(const std::string
&s
,float f
)
966 set
<Candidate
>::iterator iter
= candidates
.find(c
);
967 if (iter
!= candidates
.end()) {
968 if (iter
->priority
< c
.priority
)
969 candidates
.erase(iter
);
973 candidates
.insert(c
);
976 bool Candidates::CandidateComparator::operator()(const std::string
&s1
,const std::string
&s2
)
978 set
<Candidate
>::iterator i1
,i2
;
982 i1
= c
.candidates
.find(c1
);
983 i2
= c
.candidates
.find(c2
);
984 if (i1
->priority
!= i2
->priority
)
985 return i1
->priority
> i2
->priority
;
989 f1
= -get_syngram().wordProb(sarch
[get_std_syllable(get_lowercased_syllable(s1
))],&v
);
990 f2
= -get_syngram().wordProb(sarch
[get_std_syllable(get_lowercased_syllable(s2
))],&v
);
991 //cerr << f1 << "<>" << f2 << endl;
992 return f1
> f2
; // we want reverse order
995 void Candidates::get_list(std::vector
<std::string
> &v
)
997 uint i
,n
= candidates
.size();
999 set
<Candidate
>::iterator iter
;
1001 for (iter
= candidates
.begin();iter
!= candidates
.end();++iter
)
1002 if (!get_case_syllable_candidates(iter
->candidate
.c_str(),*this,iter
->priority
)) {
1003 if (sarch
.in_dict(get_std_syllable(get_lowercased_syllable(iter
->candidate
)))) {
1004 v
[n
++] = iter
->candidate
;
1008 sort(v
.begin(),v
.end(),CandidateComparator(*this));