Material throbber: use in tabstrip
[chromium-blink-merge.git] / third_party / hunspell_new / src / hunspell / suggestmgr.cxx
blobf60007bedfd35098785719c61a56f224eb85069e
1 #include "license.hunspell"
2 #include "license.myspell"
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdio.h>
7 #include <ctype.h>
9 #include "suggestmgr.hxx"
10 #include "htypes.hxx"
11 #include "csutil.hxx"
13 const w_char W_VLINE = { '\0', '|' };
15 #ifdef HUNSPELL_CHROME_CLIENT
16 namespace {
17 // A simple class which creates temporary hentry objects which are available
18 // only in a scope. To conceal memory operations from SuggestMgr functions,
19 // this object automatically deletes all hentry objects created through
20 // CreateScopedHashEntry() calls in its destructor. So, the following snippet
21 // raises a memory error.
23 // hentry* bad_copy = NULL;
24 // {
25 // ScopedHashEntryFactory factory;
26 // hentry* scoped_copy = factory.CreateScopedHashEntry(0, source);
27 // ...
28 // bad_copy = scoped_copy;
29 // }
30 // if (bad_copy->word[0]) // memory for scoped_copy has been deleted!
32 // As listed in the above snippet, it is simple to use this class.
33 // 1. Declare an instance of this ScopedHashEntryFactory, and;
34 // 2. Call its CreateHashEntry() member instead of using 'new hentry' or
35 // 'operator='.
37 class ScopedHashEntryFactory {
38 public:
39 ScopedHashEntryFactory();
40 ~ScopedHashEntryFactory();
42 // Creates a temporary copy of the given hentry struct.
43 // The returned copy is available only while this object is available.
44 // NOTE: this function just calls memcpy() in creating a copy of the given
45 // hentry struct, i.e. it does NOT copy objects referred by pointers of the
46 // given hentry struct.
47 hentry* CreateScopedHashEntry(int index, const hentry* source);
49 private:
50 // A struct which encapsulates the new hentry struct introduced in hunspell
51 // 1.2.8. For a pointer to an hentry struct 'h', hunspell 1.2.8 stores a word
52 // (including a NUL character) into 'h->word[0]',...,'h->word[h->blen]' even
53 // though arraysize(h->word[]) is 1. Also, it changed 'astr' to a pointer so
54 // it can store affix flags into 'h->astr[0]',...,'h->astr[alen-1]'. To handle
55 // this new hentry struct, we define a struct which combines three values: an
56 // hentry struct 'hentry'; a char array 'word[kMaxWordLen]', and; an unsigned
57 // short array 'astr' so a hentry struct 'h' returned from
58 // CreateScopedHashEntry() satisfies the following equations:
59 // hentry* h = factory.CreateScopedHashEntry(0, source);
60 // h->word[0] == ((HashEntryItem*)h)->entry.word[0].
61 // h->word[1] == ((HashEntryItem*)h)->word[0].
62 // ...
63 // h->word[h->blen] == ((HashEntryItem*)h)->word[h->blen-1].
64 // h->astr[0] == ((HashEntryItem*)h)->astr[0].
65 // h->astr[1] == ((HashEntryItem*)h)->astr[1].
66 // ...
67 // h->astr[h->alen-1] == ((HashEntryItem*)h)->astr[h->alen-1].
68 enum {
69 kMaxWordLen = 128,
70 kMaxAffixLen = 8,
72 struct HashEntryItem {
73 hentry entry;
74 char word[kMaxWordLen];
75 unsigned short astr[kMaxAffixLen];
78 HashEntryItem hash_items_[MAX_ROOTS];
81 ScopedHashEntryFactory::ScopedHashEntryFactory() {
82 memset(&hash_items_[0], 0, sizeof(hash_items_));
85 ScopedHashEntryFactory::~ScopedHashEntryFactory() {
88 hentry* ScopedHashEntryFactory::CreateScopedHashEntry(int index,
89 const hentry* source) {
90 if (index >= MAX_ROOTS || source->blen >= kMaxWordLen)
91 return NULL;
93 // Retrieve a HashEntryItem struct from our spool, initialize it, and
94 // returns the address of its 'hentry' member.
95 size_t source_size = sizeof(hentry) + source->blen + 1;
96 HashEntryItem* hash_item = &hash_items_[index];
97 memcpy(&hash_item->entry, source, source_size);
98 if (source->astr) {
99 hash_item->entry.alen = source->alen;
100 if (hash_item->entry.alen > kMaxAffixLen)
101 hash_item->entry.alen = kMaxAffixLen;
102 memcpy(hash_item->astr, source->astr, hash_item->entry.alen * sizeof(hash_item->astr[0]));
103 hash_item->entry.astr = &hash_item->astr[0];
105 return &hash_item->entry;
108 } // namespace
109 #endif
112 #ifdef HUNSPELL_CHROME_CLIENT
113 SuggestMgr::SuggestMgr(hunspell::BDictReader* reader,
114 const char * tryme, int maxn,
115 AffixMgr * aptr)
117 bdict_reader = reader;
118 #else
119 SuggestMgr::SuggestMgr(const char * tryme, int maxn,
120 AffixMgr * aptr)
122 #endif
124 // register affix manager and check in string of chars to
125 // try when building candidate suggestions
126 pAMgr = aptr;
128 csconv = NULL;
130 ckeyl = 0;
131 ckey = NULL;
132 ckey_utf = NULL;
134 ctryl = 0;
135 ctry = NULL;
136 ctry_utf = NULL;
138 utf8 = 0;
139 langnum = 0;
140 complexprefixes = 0;
142 maxSug = maxn;
143 nosplitsugs = 0;
144 maxngramsugs = MAXNGRAMSUGS;
145 maxcpdsugs = MAXCOMPOUNDSUGS;
147 if (pAMgr) {
148 langnum = pAMgr->get_langnum();
149 ckey = pAMgr->get_key_string();
150 nosplitsugs = pAMgr->get_nosplitsugs();
151 if (pAMgr->get_maxngramsugs() >= 0)
152 maxngramsugs = pAMgr->get_maxngramsugs();
153 utf8 = pAMgr->get_utf8();
154 if (pAMgr->get_maxcpdsugs() >= 0)
155 maxcpdsugs = pAMgr->get_maxcpdsugs();
156 if (!utf8)
158 char * enc = pAMgr->get_encoding();
159 csconv = get_current_cs(enc);
160 free(enc);
162 complexprefixes = pAMgr->get_complexprefixes();
165 if (ckey) {
166 if (utf8) {
167 w_char t[MAXSWL];
168 ckeyl = u8_u16(t, MAXSWL, ckey);
169 ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char));
170 if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char));
171 else ckeyl = 0;
172 } else {
173 ckeyl = strlen(ckey);
177 if (tryme) {
178 ctry = mystrdup(tryme);
179 if (ctry) ctryl = strlen(ctry);
180 if (ctry && utf8) {
181 w_char t[MAXSWL];
182 ctryl = u8_u16(t, MAXSWL, tryme);
183 ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char));
184 if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char));
185 else ctryl = 0;
191 SuggestMgr::~SuggestMgr()
193 pAMgr = NULL;
194 if (ckey) free(ckey);
195 ckey = NULL;
196 if (ckey_utf) free(ckey_utf);
197 ckey_utf = NULL;
198 ckeyl = 0;
199 if (ctry) free(ctry);
200 ctry = NULL;
201 if (ctry_utf) free(ctry_utf);
202 ctry_utf = NULL;
203 ctryl = 0;
204 maxSug = 0;
205 #ifdef MOZILLA_CLIENT
206 delete [] csconv;
207 #endif
210 int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest,
211 int * timer, clock_t * timelimit) {
212 int cwrd = 1;
213 if (ns == maxSug) return maxSug;
214 for (int k=0; k < ns; k++) {
215 if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
217 if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
218 wlst[ns] = mystrdup(candidate);
219 if (wlst[ns] == NULL) {
220 for (int j=0; j<ns; j++) free(wlst[j]);
221 return -1;
223 ns++;
225 return ns;
228 // generate suggestions for a misspelled word
229 // pass in address of array of char * pointers
230 // onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
232 int SuggestMgr::suggest(char*** slst, const char * w, int nsug,
233 int * onlycompoundsug)
235 int nocompoundtwowords = 0;
236 char ** wlst;
237 w_char word_utf[MAXSWL];
238 int wl = 0;
239 int nsugorig = nsug;
240 char w2[MAXWORDUTF8LEN];
241 const char * word = w;
242 int oldSug = 0;
244 // word reversing wrapper for complex prefixes
245 if (complexprefixes) {
246 strcpy(w2, w);
247 if (utf8) reverseword_utf(w2); else reverseword(w2);
248 word = w2;
251 if (*slst) {
252 wlst = *slst;
253 } else {
254 wlst = (char **) malloc(maxSug * sizeof(char *));
255 if (wlst == NULL) return -1;
256 for (int i = 0; i < maxSug; i++) {
257 wlst[i] = NULL;
261 if (utf8) {
262 wl = u8_u16(word_utf, MAXSWL, word);
263 if (wl == -1) {
264 *slst = wlst;
265 return nsug;
269 for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
271 // limit compound suggestion
272 if (cpdsuggest > 0) oldSug = nsug;
274 // suggestions for an uppercase word (html -> HTML)
275 if ((nsug < maxSug) && (nsug > -1)) {
276 nsug = (utf8) ? capchars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
277 capchars(wlst, word, nsug, cpdsuggest);
280 // perhaps we made a typical fault of spelling
281 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
282 nsug = replchars(wlst, word, nsug, cpdsuggest);
285 // perhaps we made chose the wrong char from a related set
286 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
287 nsug = mapchars(wlst, word, nsug, cpdsuggest);
290 // only suggest compound words when no other suggestion
291 if ((cpdsuggest == 0) && (nsug > nsugorig)) nocompoundtwowords=1;
293 // did we swap the order of chars by mistake
294 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
295 nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
296 swapchar(wlst, word, nsug, cpdsuggest);
299 // did we swap the order of non adjacent chars by mistake
300 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
301 nsug = (utf8) ? longswapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
302 longswapchar(wlst, word, nsug, cpdsuggest);
305 // did we just hit the wrong key in place of a good char (case and keyboard)
306 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
307 nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
308 badcharkey(wlst, word, nsug, cpdsuggest);
311 // did we add a char that should not be there
312 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
313 nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
314 extrachar(wlst, word, nsug, cpdsuggest);
318 // did we forgot a char
319 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
320 nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
321 forgotchar(wlst, word, nsug, cpdsuggest);
324 // did we move a char
325 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
326 nsug = (utf8) ? movechar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
327 movechar(wlst, word, nsug, cpdsuggest);
330 // did we just hit the wrong key in place of a good char
331 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
332 nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
333 badchar(wlst, word, nsug, cpdsuggest);
336 // did we double two characters
337 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
338 nsug = (utf8) ? doubletwochars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
339 doubletwochars(wlst, word, nsug, cpdsuggest);
342 // perhaps we forgot to hit space and two words ran together
343 if (!nosplitsugs && (nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
344 nsug = twowords(wlst, word, nsug, cpdsuggest);
347 } // repeating ``for'' statement compounding support
349 if (nsug < 0) {
350 // we ran out of memory - we should free up as much as possible
351 for (int i = 0; i < maxSug; i++)
352 if (wlst[i] != NULL) free(wlst[i]);
353 free(wlst);
354 wlst = NULL;
357 if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1;
359 *slst = wlst;
360 return nsug;
363 // generate suggestions for a word with typical mistake
364 // pass in address of array of char * pointers
365 #ifdef HUNSPELL_EXPERIMENTAL
366 int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug)
368 int nocompoundtwowords = 0;
369 char ** wlst;
370 int oldSug;
372 char w2[MAXWORDUTF8LEN];
373 const char * word = w;
375 // word reversing wrapper for complex prefixes
376 if (complexprefixes) {
377 strcpy(w2, w);
378 if (utf8) reverseword_utf(w2); else reverseword(w2);
379 word = w2;
382 if (*slst) {
383 wlst = *slst;
384 } else {
385 wlst = (char **) malloc(maxSug * sizeof(char *));
386 if (wlst == NULL) return -1;
389 for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
391 // limit compound suggestion
392 if (cpdsuggest > 0) oldSug = nsug;
394 // perhaps we made a typical fault of spelling
395 if ((nsug < maxSug) && (nsug > -1))
396 nsug = replchars(wlst, word, nsug, cpdsuggest);
398 // perhaps we made chose the wrong char from a related set
399 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs)))
400 nsug = mapchars(wlst, word, nsug, cpdsuggest);
402 if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
404 // perhaps we forgot to hit space and two words ran together
406 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs)) && check_forbidden(word, strlen(word))) {
407 nsug = twowords(wlst, word, nsug, cpdsuggest);
410 } // repeating ``for'' statement compounding support
412 if (nsug < 0) {
413 for (int i=0;i<maxSug; i++)
414 if (wlst[i] != NULL) free(wlst[i]);
415 free(wlst);
416 return -1;
419 *slst = wlst;
420 return nsug;
422 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
424 // suggestions for an uppercase word (html -> HTML)
425 int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
427 char candidate[MAXSWUTF8L];
428 w_char candidate_utf[MAXSWL];
429 memcpy(candidate_utf, word, wl * sizeof(w_char));
430 mkallcap_utf(candidate_utf, wl, langnum);
431 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
432 return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
435 // suggestions for an uppercase word (html -> HTML)
436 int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest)
438 char candidate[MAXSWUTF8L];
439 strcpy(candidate, word);
440 mkallcap(candidate, csconv);
441 return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
444 // suggestions for when chose the wrong char out of a related set
445 int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)
447 char candidate[MAXSWUTF8L];
448 clock_t timelimit;
449 int timer;
450 candidate[0] = '\0';
452 int wl = strlen(word);
453 if (wl < 2 || ! pAMgr) return ns;
455 int nummap = pAMgr->get_nummap();
456 struct mapentry* maptable = pAMgr->get_maptable();
457 if (maptable==NULL) return ns;
459 timelimit = clock();
460 timer = MINTIMER;
461 return map_related(word, (char *) &candidate, 0, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit);
464 int SuggestMgr::map_related(const char * word, char * candidate, int wn, int cn,
465 char** wlst, int cpdsuggest, int ns,
466 const mapentry* maptable, int nummap, int * timer, clock_t * timelimit)
468 if (*(word + wn) == '\0') {
469 int cwrd = 1;
470 *(candidate + cn) = '\0';
471 int wl = strlen(candidate);
472 for (int m=0; m < ns; m++)
473 if (strcmp(candidate, wlst[m]) == 0) cwrd = 0;
474 if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
475 if (ns < maxSug) {
476 wlst[ns] = mystrdup(candidate);
477 if (wlst[ns] == NULL) return -1;
478 ns++;
481 return ns;
483 int in_map = 0;
484 for (int j = 0; j < nummap; j++) {
485 for (int k = 0; k < maptable[j].len; k++) {
486 int len = strlen(maptable[j].set[k]);
487 if (strncmp(maptable[j].set[k], word + wn, len) == 0) {
488 in_map = 1;
489 for (int l = 0; l < maptable[j].len; l++) {
490 strcpy(candidate + cn, maptable[j].set[l]);
491 ns = map_related(word, candidate, wn + len, strlen(candidate), wlst,
492 cpdsuggest, ns, maptable, nummap, timer, timelimit);
493 if (!(*timer)) return ns;
498 if (!in_map) {
499 *(candidate + cn) = *(word + wn);
500 ns = map_related(word, candidate, wn + 1, cn + 1, wlst, cpdsuggest,
501 ns, maptable, nummap, timer, timelimit);
503 return ns;
506 // suggestions for a typical fault of spelling, that
507 // differs with more, than 1 letter from the right form.
508 int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest)
510 char candidate[MAXSWUTF8L];
511 const char * r;
512 int lenr, lenp;
513 int wl = strlen(word);
514 if (wl < 2 || ! pAMgr) return ns;
516 #ifdef HUNSPELL_CHROME_CLIENT
517 const char *pattern, *pattern2;
518 hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator();
519 while (iterator.GetNext(&pattern, &pattern2)) {
520 r = word;
521 lenr = strlen(pattern2);
522 lenp = strlen(pattern);
524 // search every occurence of the pattern in the word
525 while ((r=strstr(r, pattern)) != NULL) {
526 strcpy(candidate, word);
527 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
528 strcpy(candidate+(r-word), pattern2);
529 strcpy(candidate+(r-word)+lenr, r+lenp);
530 ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL);
531 if (ns == -1) return -1;
532 // check REP suggestions with space
533 char * sp = strchr(candidate, ' ');
534 if (sp) {
535 char * prev = candidate;
536 while (sp) {
537 *sp = '\0';
538 if (checkword(prev, strlen(prev), 0, NULL, NULL)) {
539 int oldns = ns;
540 *sp = ' ';
541 ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);
542 if (ns == -1) return -1;
543 if (oldns < ns) {
544 free(wlst[ns - 1]);
545 wlst[ns - 1] = mystrdup(candidate);
546 if (!wlst[ns - 1]) return -1;
549 *sp = ' ';
550 prev = sp + 1;
551 sp = strchr(prev, ' ');
554 r++; // search for the next letter
557 #else
558 int numrep = pAMgr->get_numrep();
559 struct replentry* reptable = pAMgr->get_reptable();
560 if (reptable==NULL) return ns;
561 for (int i=0; i < numrep; i++ ) {
562 r = word;
563 lenr = strlen(reptable[i].pattern2);
564 lenp = strlen(reptable[i].pattern);
565 // search every occurence of the pattern in the word
566 while ((r=strstr(r, reptable[i].pattern)) != NULL && (!reptable[i].end || strlen(r) == strlen(reptable[i].pattern)) &&
567 (!reptable[i].start || r == word)) {
568 strcpy(candidate, word);
569 if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break;
570 strcpy(candidate+(r-word),reptable[i].pattern2);
571 strcpy(candidate+(r-word)+lenr, r+lenp);
572 ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL);
573 if (ns == -1) return -1;
574 // check REP suggestions with space
575 char * sp = strchr(candidate, ' ');
576 if (sp) {
577 char * prev = candidate;
578 while (sp) {
579 *sp = '\0';
580 if (checkword(prev, strlen(prev), 0, NULL, NULL)) {
581 int oldns = ns;
582 *sp = ' ';
583 ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);
584 if (ns == -1) return -1;
585 if (oldns < ns) {
586 free(wlst[ns - 1]);
587 wlst[ns - 1] = mystrdup(candidate);
588 if (!wlst[ns - 1]) return -1;
591 *sp = ' ';
592 prev = sp + 1;
593 sp = strchr(prev, ' ');
596 r++; // search for the next letter
599 #endif
600 return ns;
603 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
604 int SuggestMgr::doubletwochars(char** wlst, const char * word, int ns, int cpdsuggest)
606 char candidate[MAXSWUTF8L];
607 int state=0;
608 int wl = strlen(word);
609 if (wl < 5 || ! pAMgr) return ns;
610 for (int i=2; i < wl; i++ ) {
611 if (word[i]==word[i-2]) {
612 state++;
613 if (state==3) {
614 strcpy(candidate,word);
615 strcpy(candidate+i-1,word+i+1);
616 ns = testsug(wlst, candidate, wl-2, ns, cpdsuggest, NULL, NULL);
617 if (ns == -1) return -1;
618 state=0;
620 } else {
621 state=0;
624 return ns;
627 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
628 int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
630 w_char candidate_utf[MAXSWL];
631 char candidate[MAXSWUTF8L];
632 int state=0;
633 if (wl < 5 || ! pAMgr) return ns;
634 for (int i=2; i < wl; i++) {
635 if (w_char_eq(word[i], word[i-2])) {
636 state++;
637 if (state==3) {
638 memcpy(candidate_utf, word, (i - 1) * sizeof(w_char));
639 memcpy(candidate_utf+i-1, word+i+1, (wl-i-1) * sizeof(w_char));
640 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl-2);
641 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
642 if (ns == -1) return -1;
643 state=0;
645 } else {
646 state=0;
649 return ns;
652 // error is wrong char in place of correct one (case and keyboard related version)
653 int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest)
655 char tmpc;
656 char candidate[MAXSWUTF8L];
657 int wl = strlen(word);
658 strcpy(candidate, word);
659 // swap out each char one by one and try uppercase and neighbor
660 // keyboard chars in its place to see if that makes a good word
662 for (int i=0; i < wl; i++) {
663 tmpc = candidate[i];
664 // check with uppercase letters
665 candidate[i] = csconv[((unsigned char)tmpc)].cupper;
666 if (tmpc != candidate[i]) {
667 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
668 if (ns == -1) return -1;
669 candidate[i] = tmpc;
671 // check neighbor characters in keyboard string
672 if (!ckey) continue;
673 char * loc = strchr(ckey, tmpc);
674 while (loc) {
675 if ((loc > ckey) && (*(loc - 1) != '|')) {
676 candidate[i] = *(loc - 1);
677 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
678 if (ns == -1) return -1;
680 if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) {
681 candidate[i] = *(loc + 1);
682 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
683 if (ns == -1) return -1;
685 loc = strchr(loc + 1, tmpc);
687 candidate[i] = tmpc;
689 return ns;
692 // error is wrong char in place of correct one (case and keyboard related version)
693 int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
695 w_char tmpc;
696 w_char candidate_utf[MAXSWL];
697 char candidate[MAXSWUTF8L];
698 memcpy(candidate_utf, word, wl * sizeof(w_char));
699 // swap out each char one by one and try all the tryme
700 // chars in its place to see if that makes a good word
701 for (int i=0; i < wl; i++) {
702 tmpc = candidate_utf[i];
703 // check with uppercase letters
704 mkallcap_utf(candidate_utf + i, 1, langnum);
705 if (!w_char_eq(tmpc, candidate_utf[i])) {
706 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
707 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
708 if (ns == -1) return -1;
709 candidate_utf[i] = tmpc;
711 // check neighbor characters in keyboard string
712 if (!ckey) continue;
713 w_char * loc = ckey_utf;
714 while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++;
715 while (loc < (ckey_utf + ckeyl)) {
716 if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) {
717 candidate_utf[i] = *(loc - 1);
718 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
719 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
720 if (ns == -1) return -1;
722 if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) {
723 candidate_utf[i] = *(loc + 1);
724 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
725 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
726 if (ns == -1) return -1;
728 do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc));
730 candidate_utf[i] = tmpc;
732 return ns;
735 // error is wrong char in place of correct one
736 int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)
738 char tmpc;
739 char candidate[MAXSWUTF8L];
740 clock_t timelimit = clock();
741 int timer = MINTIMER;
742 int wl = strlen(word);
743 strcpy(candidate, word);
744 // swap out each char one by one and try all the tryme
745 // chars in its place to see if that makes a good word
746 for (int j=0; j < ctryl; j++) {
747 for (int i=wl-1; i >= 0; i--) {
748 tmpc = candidate[i];
749 if (ctry[j] == tmpc) continue;
750 candidate[i] = ctry[j];
751 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit);
752 if (ns == -1) return -1;
753 if (!timer) return ns;
754 candidate[i] = tmpc;
757 return ns;
760 // error is wrong char in place of correct one
761 int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
763 w_char tmpc;
764 w_char candidate_utf[MAXSWL];
765 char candidate[MAXSWUTF8L];
766 clock_t timelimit = clock();
767 int timer = MINTIMER;
768 memcpy(candidate_utf, word, wl * sizeof(w_char));
769 // swap out each char one by one and try all the tryme
770 // chars in its place to see if that makes a good word
771 for (int j=0; j < ctryl; j++) {
772 for (int i=wl-1; i >= 0; i--) {
773 tmpc = candidate_utf[i];
774 if (w_char_eq(tmpc, ctry_utf[j])) continue;
775 candidate_utf[i] = ctry_utf[j];
776 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
777 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
778 if (ns == -1) return -1;
779 if (!timer) return ns;
780 candidate_utf[i] = tmpc;
783 return ns;
786 // error is word has an extra letter it does not need
787 int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
789 char candidate[MAXSWUTF8L];
790 w_char candidate_utf[MAXSWL];
791 w_char * p;
792 w_char tmpc = W_VLINE; // not used value, only for VCC warning message
793 if (wl < 2) return ns;
794 // try omitting one char of word at a time
795 memcpy(candidate_utf, word, wl * sizeof(w_char));
796 for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) {
797 w_char tmpc2 = *p;
798 if (p < candidate_utf + wl - 1) *p = tmpc;
799 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
800 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
801 if (ns == -1) return -1;
802 tmpc = tmpc2;
804 return ns;
807 // error is word has an extra letter it does not need
808 int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)
810 char tmpc = '\0';
811 char candidate[MAXSWUTF8L];
812 char * p;
813 int wl = strlen(word);
814 if (wl < 2) return ns;
815 // try omitting one char of word at a time
816 strcpy (candidate, word);
817 for (p = candidate + wl - 1; p >=candidate; p--) {
818 char tmpc2 = *p;
819 *p = tmpc;
820 ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL);
821 if (ns == -1) return -1;
822 tmpc = tmpc2;
824 return ns;
827 // error is missing a letter it needs
828 int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
830 // TODO(rouslan): Remove the interim change below when this patch lands:
831 // http://sf.net/tracker/?func=detail&aid=3595024&group_id=143754&atid=756395
832 char candidate[MAXSWUTF8L + 4];
833 char * p;
834 clock_t timelimit = clock();
835 int timer = MINTIMER;
836 int wl = strlen(word);
837 // try inserting a tryme character before every letter (and the null terminator)
838 for (int i = 0; i < ctryl; i++) {
839 strcpy(candidate, word);
840 for (p = candidate + wl; p >= candidate; p--) {
841 *(p+1) = *p;
842 *p = ctry[i];
843 ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit);
844 if (ns == -1) return -1;
845 if (!timer) return ns;
848 return ns;
851 // error is missing a letter it needs
852 int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
854 // TODO(rouslan): Remove the interim change below when this patch lands:
855 // http://sf.net/tracker/?func=detail&aid=3595024&group_id=143754&atid=756395
856 w_char candidate_utf[MAXSWL + 1];
857 char candidate[MAXSWUTF8L + 4];
858 w_char * p;
859 clock_t timelimit = clock();
860 int timer = MINTIMER;
861 // try inserting a tryme character at the end of the word and before every letter
862 for (int i = 0; i < ctryl; i++) {
863 memcpy (candidate_utf, word, wl * sizeof(w_char));
864 for (p = candidate_utf + wl; p >= candidate_utf; p--) {
865 *(p + 1) = *p;
866 *p = ctry_utf[i];
867 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
868 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
869 if (ns == -1) return -1;
870 if (!timer) return ns;
873 return ns;
877 /* error is should have been two words */
878 int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest)
880 char candidate[MAXSWUTF8L];
881 char * p;
882 int c1, c2;
883 int forbidden = 0;
884 int cwrd;
886 int wl=strlen(word);
887 if (wl < 3) return ns;
889 if (langnum == LANG_hu) forbidden = check_forbidden(word, wl);
891 strcpy(candidate + 1, word);
892 // split the string into two pieces after every char
893 // if both pieces are good words make them a suggestion
894 for (p = candidate + 1; p[1] != '\0'; p++) {
895 p[-1] = *p;
896 // go to end of the UTF-8 character
897 while (utf8 && ((p[1] & 0xc0) == 0x80)) {
898 *p = p[1];
899 p++;
901 if (utf8 && p[1] == '\0') break; // last UTF-8 character
902 *p = '\0';
903 c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL);
904 if (c1) {
905 c2 = checkword((p+1),strlen(p+1), cpdsuggest, NULL, NULL);
906 if (c2) {
907 *p = ' ';
909 // spec. Hungarian code (need a better compound word support)
910 if ((langnum == LANG_hu) && !forbidden &&
911 // if 3 repeating letter, use - instead of space
912 (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
913 // or multiple compounding, with more, than 6 syllables
914 ((c1 == 3) && (c2 >= 2)))) *p = '-';
916 cwrd = 1;
917 for (int k=0; k < ns; k++)
918 if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
919 if (ns < maxSug) {
920 if (cwrd) {
921 wlst[ns] = mystrdup(candidate);
922 if (wlst[ns] == NULL) return -1;
923 ns++;
925 } else return ns;
926 // add two word suggestion with dash, if TRY string contains
927 // "a" or "-"
928 // NOTE: cwrd doesn't modified for REP twoword sugg.
929 if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&
930 mystrlen(p + 1) > 1 &&
931 mystrlen(candidate) - mystrlen(p) > 1) {
932 *p = '-';
933 for (int k=0; k < ns; k++)
934 if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
935 if (ns < maxSug) {
936 if (cwrd) {
937 wlst[ns] = mystrdup(candidate);
938 if (wlst[ns] == NULL) return -1;
939 ns++;
941 } else return ns;
946 return ns;
950 // error is adjacent letter were swapped
951 int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
953 char candidate[MAXSWUTF8L];
954 char * p;
955 char tmpc;
956 int wl=strlen(word);
957 // try swapping adjacent chars one by one
958 strcpy(candidate, word);
959 for (p = candidate; p[1] != 0; p++) {
960 tmpc = *p;
961 *p = p[1];
962 p[1] = tmpc;
963 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
964 if (ns == -1) return -1;
965 p[1] = *p;
966 *p = tmpc;
968 // try double swaps for short words
969 // ahev -> have, owudl -> would
970 if (wl == 4 || wl == 5) {
971 candidate[0] = word[1];
972 candidate[1] = word[0];
973 candidate[2] = word[2];
974 candidate[wl - 2] = word[wl - 1];
975 candidate[wl - 1] = word[wl - 2];
976 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
977 if (ns == -1) return -1;
978 if (wl == 5) {
979 candidate[0] = word[0];
980 candidate[1] = word[2];
981 candidate[2] = word[1];
982 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
983 if (ns == -1) return -1;
986 return ns;
989 // error is adjacent letter were swapped
990 int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
992 w_char candidate_utf[MAXSWL];
993 char candidate[MAXSWUTF8L];
994 w_char * p;
995 w_char tmpc;
996 int len = 0;
997 // try swapping adjacent chars one by one
998 memcpy (candidate_utf, word, wl * sizeof(w_char));
999 for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
1000 tmpc = *p;
1001 *p = p[1];
1002 p[1] = tmpc;
1003 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1004 if (len == 0) len = strlen(candidate);
1005 ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
1006 if (ns == -1) return -1;
1007 p[1] = *p;
1008 *p = tmpc;
1010 // try double swaps for short words
1011 // ahev -> have, owudl -> would, suodn -> sound
1012 if (wl == 4 || wl == 5) {
1013 candidate_utf[0] = word[1];
1014 candidate_utf[1] = word[0];
1015 candidate_utf[2] = word[2];
1016 candidate_utf[wl - 2] = word[wl - 1];
1017 candidate_utf[wl - 1] = word[wl - 2];
1018 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1019 ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
1020 if (ns == -1) return -1;
1021 if (wl == 5) {
1022 candidate_utf[0] = word[0];
1023 candidate_utf[1] = word[2];
1024 candidate_utf[2] = word[1];
1025 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1026 ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
1027 if (ns == -1) return -1;
1030 return ns;
1033 // error is not adjacent letter were swapped
1034 int SuggestMgr::longswapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
1036 char candidate[MAXSWUTF8L];
1037 char * p;
1038 char * q;
1039 char tmpc;
1040 int wl=strlen(word);
1041 // try swapping not adjacent chars one by one
1042 strcpy(candidate, word);
1043 for (p = candidate; *p != 0; p++) {
1044 for (q = candidate; *q != 0; q++) {
1045 if (abs((int)(p-q)) > 1) {
1046 tmpc = *p;
1047 *p = *q;
1048 *q = tmpc;
1049 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
1050 if (ns == -1) return -1;
1051 *q = *p;
1052 *p = tmpc;
1056 return ns;
1060 // error is adjacent letter were swapped
1061 int SuggestMgr::longswapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
1063 w_char candidate_utf[MAXSWL];
1064 char candidate[MAXSWUTF8L];
1065 w_char * p;
1066 w_char * q;
1067 w_char tmpc;
1068 // try swapping not adjacent chars
1069 memcpy (candidate_utf, word, wl * sizeof(w_char));
1070 for (p = candidate_utf; p < (candidate_utf + wl); p++) {
1071 for (q = candidate_utf; q < (candidate_utf + wl); q++) {
1072 if (abs((int)(p-q)) > 1) {
1073 tmpc = *p;
1074 *p = *q;
1075 *q = tmpc;
1076 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1077 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
1078 if (ns == -1) return -1;
1079 *q = *p;
1080 *p = tmpc;
1084 return ns;
1087 // error is a letter was moved
1088 int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest)
1090 char candidate[MAXSWUTF8L];
1091 char * p;
1092 char * q;
1093 char tmpc;
1095 int wl=strlen(word);
1096 // try moving a char
1097 strcpy(candidate, word);
1098 for (p = candidate; *p != 0; p++) {
1099 for (q = p + 1; (*q != 0) && ((q - p) < 10); q++) {
1100 tmpc = *(q-1);
1101 *(q-1) = *q;
1102 *q = tmpc;
1103 if ((q-p) < 2) continue; // omit swap char
1104 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
1105 if (ns == -1) return -1;
1107 strcpy(candidate, word);
1109 for (p = candidate + wl - 1; p > candidate; p--) {
1110 for (q = p - 1; (q >= candidate) && ((p - q) < 10); q--) {
1111 tmpc = *(q+1);
1112 *(q+1) = *q;
1113 *q = tmpc;
1114 if ((p-q) < 2) continue; // omit swap char
1115 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
1116 if (ns == -1) return -1;
1118 strcpy(candidate, word);
1120 return ns;
1123 // error is a letter was moved
1124 int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
1126 w_char candidate_utf[MAXSWL];
1127 char candidate[MAXSWUTF8L];
1128 w_char * p;
1129 w_char * q;
1130 w_char tmpc;
1131 // try moving a char
1132 memcpy (candidate_utf, word, wl * sizeof(w_char));
1133 for (p = candidate_utf; p < (candidate_utf + wl); p++) {
1134 for (q = p + 1; (q < (candidate_utf + wl)) && ((q - p) < 10); q++) {
1135 tmpc = *(q-1);
1136 *(q-1) = *q;
1137 *q = tmpc;
1138 if ((q-p) < 2) continue; // omit swap char
1139 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1140 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
1141 if (ns == -1) return -1;
1143 memcpy (candidate_utf, word, wl * sizeof(w_char));
1145 for (p = candidate_utf + wl - 1; p > candidate_utf; p--) {
1146 for (q = p - 1; (q >= candidate_utf) && ((p - q) < 10); q--) {
1147 tmpc = *(q+1);
1148 *(q+1) = *q;
1149 *q = tmpc;
1150 if ((p-q) < 2) continue; // omit swap char
1151 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1152 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
1153 if (ns == -1) return -1;
1155 memcpy (candidate_utf, word, wl * sizeof(w_char));
1157 return ns;
1160 // generate a set of suggestions for very poorly spelled words
1161 int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md)
1164 int i, j;
1165 int lval;
1166 int sc, scphon;
1167 int lp, lpphon;
1168 int nonbmp = 0;
1170 // exhaustively search through all root words
1171 // keeping track of the MAX_ROOTS most similar root words
1172 struct hentry * roots[MAX_ROOTS];
1173 char * rootsphon[MAX_ROOTS];
1174 int scores[MAX_ROOTS];
1175 int scoresphon[MAX_ROOTS];
1176 for (i = 0; i < MAX_ROOTS; i++) {
1177 roots[i] = NULL;
1178 scores[i] = -100 * i;
1179 rootsphon[i] = NULL;
1180 scoresphon[i] = -100 * i;
1182 lp = MAX_ROOTS - 1;
1183 lpphon = MAX_ROOTS - 1;
1184 scphon = -20000;
1185 int low = NGRAM_LOWERING;
1187 char w2[MAXWORDUTF8LEN];
1188 char f[MAXSWUTF8L];
1189 char * word = w;
1191 // word reversing wrapper for complex prefixes
1192 if (complexprefixes) {
1193 strcpy(w2, w);
1194 if (utf8) reverseword_utf(w2); else reverseword(w2);
1195 word = w2;
1198 char mw[MAXSWUTF8L];
1199 w_char u8[MAXSWL];
1200 int nc = strlen(word);
1201 int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc;
1203 // set character based ngram suggestion for words with non-BMP Unicode characters
1204 if (n == -1) {
1205 utf8 = 0; // XXX not state-free
1206 n = nc;
1207 nonbmp = 1;
1208 low = 0;
1211 struct hentry* hp = NULL;
1212 int col = -1;
1213 #ifdef HUNSPELL_CHROME_CLIENT
1214 ScopedHashEntryFactory hash_entry_factory;
1215 #endif
1216 phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
1217 char target[MAXSWUTF8L];
1218 char candidate[MAXSWUTF8L];
1219 if (ph) {
1220 if (utf8) {
1221 w_char _w[MAXSWL];
1222 int _wl = u8_u16(_w, MAXSWL, word);
1223 mkallcap_utf(_w, _wl, langnum);
1224 u16_u8(candidate, MAXSWUTF8L, _w, _wl);
1225 } else {
1226 strcpy(candidate, word);
1227 if (!nonbmp) mkallcap(candidate, csconv);
1229 phonet(candidate, target, nc, *ph); // XXX phonet() is 8-bit (nc, not n)
1232 FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL;
1233 FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL;
1234 FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;
1235 FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;
1237 for (i = 0; i < md; i++) {
1238 while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) {
1239 if ((hp->astr) && (pAMgr) &&
1240 (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
1241 TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
1242 TESTAFF(hp->astr, nosuggest, hp->alen) ||
1243 TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
1244 TESTAFF(hp->astr, onlyincompound, hp->alen))) continue;
1246 sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
1247 leftcommonsubstring(word, HENTRY_WORD(hp));
1249 // check special pronounciation
1250 if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
1251 int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
1252 + leftcommonsubstring(word, f);
1253 if (sc2 > sc) sc = sc2;
1256 scphon = -20000;
1257 if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) {
1258 char target2[MAXSWUTF8L];
1259 if (utf8) {
1260 w_char _w[MAXSWL];
1261 int _wl = u8_u16(_w, MAXSWL, HENTRY_WORD(hp));
1262 mkallcap_utf(_w, _wl, langnum);
1263 u16_u8(candidate, MAXSWUTF8L, _w, _wl);
1264 } else {
1265 strcpy(candidate, HENTRY_WORD(hp));
1266 mkallcap(candidate, csconv);
1268 phonet(candidate, target2, -1, *ph);
1269 scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
1272 if (sc > scores[lp]) {
1273 scores[lp] = sc;
1274 #ifdef HUNSPELL_CHROME_CLIENT
1275 roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp);
1276 #else
1277 roots[lp] = hp;
1278 #endif
1279 lval = sc;
1280 for (j=0; j < MAX_ROOTS; j++)
1281 if (scores[j] < lval) {
1282 lp = j;
1283 lval = scores[j];
1288 if (scphon > scoresphon[lpphon]) {
1289 scoresphon[lpphon] = scphon;
1290 rootsphon[lpphon] = HENTRY_WORD(hp);
1291 lval = scphon;
1292 for (j=0; j < MAX_ROOTS; j++)
1293 if (scoresphon[j] < lval) {
1294 lpphon = j;
1295 lval = scoresphon[j];
1300 // find minimum threshold for a passable suggestion
1301 // mangle original word three differnt ways
1302 // and score them to generate a minimum acceptable score
1303 int thresh = 0;
1304 for (int sp = 1; sp < 4; sp++) {
1305 if (utf8) {
1306 for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';
1307 u16_u8(mw, MAXSWUTF8L, u8, n);
1308 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
1309 } else {
1310 strcpy(mw, word);
1311 for (int k=sp; k < n; k+=4) *(mw + k) = '*';
1312 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
1315 thresh = thresh / 3;
1316 thresh--;
1318 // now expand affixes on each of these root words and
1319 // and use length adjusted ngram scores to select
1320 // possible suggestions
1321 char * guess[MAX_GUESS];
1322 char * guessorig[MAX_GUESS];
1323 int gscore[MAX_GUESS];
1324 for(i=0;i<MAX_GUESS;i++) {
1325 guess[i] = NULL;
1326 guessorig[i] = NULL;
1327 gscore[i] = -100 * i;
1330 lp = MAX_GUESS - 1;
1332 struct guessword * glst;
1333 glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
1334 if (! glst) {
1335 if (nonbmp) utf8 = 1;
1336 return ns;
1339 for (i = 0; i < MAX_ROOTS; i++) {
1340 if (roots[i]) {
1341 struct hentry * rp = roots[i];
1342 int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen,
1343 rp->astr, rp->alen, word, nc,
1344 ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL));
1346 for (int k = 0; k < nw ; k++) {
1347 sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +
1348 leftcommonsubstring(word, glst[k].word);
1350 if (sc > thresh) {
1351 if (sc > gscore[lp]) {
1352 if (guess[lp]) {
1353 free (guess[lp]);
1354 if (guessorig[lp]) {
1355 free(guessorig[lp]);
1356 guessorig[lp] = NULL;
1359 gscore[lp] = sc;
1360 guess[lp] = glst[k].word;
1361 guessorig[lp] = glst[k].orig;
1362 lval = sc;
1363 for (j=0; j < MAX_GUESS; j++)
1364 if (gscore[j] < lval) {
1365 lp = j;
1366 lval = gscore[j];
1368 } else {
1369 free(glst[k].word);
1370 if (glst[k].orig) free(glst[k].orig);
1372 } else {
1373 free(glst[k].word);
1374 if (glst[k].orig) free(glst[k].orig);
1379 free(glst);
1381 // now we are done generating guesses
1382 // sort in order of decreasing score
1385 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1386 if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1388 // weight suggestions with a similarity index, based on
1389 // the longest common subsequent algorithm and resort
1391 int is_swap = 0;
1392 int re = 0;
1393 double fact = 1.0;
1394 if (pAMgr) {
1395 int maxd = pAMgr->get_maxdiff();
1396 if (maxd >= 0) fact = (10.0 - maxd)/5.0;
1399 for (i=0; i < MAX_GUESS; i++) {
1400 if (guess[i]) {
1401 // lowering guess[i]
1402 char gl[MAXSWUTF8L];
1403 int len;
1404 if (utf8) {
1405 w_char _w[MAXSWL];
1406 len = u8_u16(_w, MAXSWL, guess[i]);
1407 mkallsmall_utf(_w, len, langnum);
1408 u16_u8(gl, MAXSWUTF8L, _w, len);
1409 } else {
1410 strcpy(gl, guess[i]);
1411 if (!nonbmp) mkallsmall(gl, csconv);
1412 len = strlen(guess[i]);
1415 int _lcs = lcslen(word, gl);
1417 // same characters with different casing
1418 if ((n == len) && (n == _lcs)) {
1419 gscore[i] += 2000;
1420 break;
1422 // using 2-gram instead of 3, and other weightening
1424 re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
1425 ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
1427 gscore[i] =
1428 // length of longest common subsequent minus length difference
1429 2 * _lcs - abs((int) (n - len)) +
1430 // weight length of the left common substring
1431 leftcommonsubstring(word, gl) +
1432 // weight equal character positions
1433 (!nonbmp && commoncharacterpositions(word, gl, &is_swap) ? 1: 0) +
1434 // swap character (not neighboring)
1435 ((is_swap) ? 10 : 0) +
1436 // ngram
1437 ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) +
1438 // weighted ngrams
1439 re +
1440 // different limit for dictionaries with PHONE rules
1441 (ph ? (re < len * fact ? -1000 : 0) : (re < (n + len)*fact? -1000 : 0));
1445 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1447 // phonetic version
1448 if (ph) for (i=0; i < MAX_ROOTS; i++) {
1449 if (rootsphon[i]) {
1450 // lowering rootphon[i]
1451 char gl[MAXSWUTF8L];
1452 int len;
1453 if (utf8) {
1454 w_char _w[MAXSWL];
1455 len = u8_u16(_w, MAXSWL, rootsphon[i]);
1456 mkallsmall_utf(_w, len, langnum);
1457 u16_u8(gl, MAXSWUTF8L, _w, len);
1458 } else {
1459 strcpy(gl, rootsphon[i]);
1460 if (!nonbmp) mkallsmall(gl, csconv);
1461 len = strlen(rootsphon[i]);
1464 // heuristic weigthing of ngram scores
1465 scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) +
1466 // weight length of the left common substring
1467 leftcommonsubstring(word, gl);
1471 if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1473 // copy over
1474 int oldns = ns;
1476 int same = 0;
1477 for (i=0; i < MAX_GUESS; i++) {
1478 if (guess[i]) {
1479 if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
1480 int unique = 1;
1481 // leave only excellent suggestions, if exists
1482 if (gscore[i] > 1000) same = 1; else if (gscore[i] < -100) {
1483 same = 1;
1484 // keep the best ngram suggestions, unless in ONLYMAXDIFF mode
1485 if (ns > oldns || (pAMgr && pAMgr->get_onlymaxdiff())) {
1486 free(guess[i]);
1487 if (guessorig[i]) free(guessorig[i]);
1488 continue;
1491 for (j = 0; j < ns; j++) {
1492 // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1493 if ((!guessorig[i] && strstr(guess[i], wlst[j])) ||
1494 (guessorig[i] && strstr(guessorig[i], wlst[j])) ||
1495 // check forbidden words
1496 !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;
1498 if (unique) {
1499 wlst[ns++] = guess[i];
1500 if (guessorig[i]) {
1501 free(guess[i]);
1502 wlst[ns-1] = guessorig[i];
1504 } else {
1505 free(guess[i]);
1506 if (guessorig[i]) free(guessorig[i]);
1508 } else {
1509 free(guess[i]);
1510 if (guessorig[i]) free(guessorig[i]);
1515 oldns = ns;
1516 if (ph) for (i=0; i < MAX_ROOTS; i++) {
1517 if (rootsphon[i]) {
1518 if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) {
1519 int unique = 1;
1520 for (j = 0; j < ns; j++) {
1521 // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1522 if (strstr(rootsphon[i], wlst[j]) ||
1523 // check forbidden words
1524 !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0;
1526 if (unique) {
1527 wlst[ns++] = mystrdup(rootsphon[i]);
1528 if (!wlst[ns - 1]) return ns - 1;
1534 if (nonbmp) utf8 = 1;
1535 return ns;
1539 // see if a candidate suggestion is spelled correctly
1540 // needs to check both root words and words with affixes
1542 // obsolote MySpell-HU modifications:
1543 // return value 2 and 3 marks compounding with hyphen (-)
1544 // `3' marks roots without suffix
1545 int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit)
1547 struct hentry * rv=NULL;
1548 struct hentry * rv2=NULL;
1549 int nosuffix = 0;
1551 // check time limit
1552 if (timer) {
1553 (*timer)--;
1554 if (!(*timer) && timelimit) {
1555 if ((clock() - *timelimit) > TIMELIMIT) return 0;
1556 *timer = MAXPLUSTIMER;
1560 if (pAMgr) {
1561 if (cpdsuggest==1) {
1562 if (pAMgr->get_compound()) {
1563 rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1, 0); //EXT
1564 if (rv && (!(rv2 = pAMgr->lookup(word)) || !rv2->astr ||
1565 !(TESTAFF(rv2->astr,pAMgr->get_forbiddenword(),rv2->alen) ||
1566 TESTAFF(rv2->astr,pAMgr->get_nosuggest(),rv2->alen)))) return 3; // XXX obsolote categorisation + only ICONV needs affix flag check?
1568 return 0;
1571 rv = pAMgr->lookup(word);
1573 if (rv) {
1574 if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
1575 || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;
1576 while (rv) {
1577 if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
1578 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1579 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
1580 rv = rv->next_homonym;
1581 } else break;
1583 } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX
1585 if (rv) {
1586 nosuffix=1;
1587 } else {
1588 rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix
1591 if (!rv && pAMgr->have_contclass()) {
1592 rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL);
1593 if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL);
1596 // check forbidden words
1597 if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) ||
1598 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1599 TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
1600 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;
1602 if (rv) { // XXX obsolote
1603 if ((pAMgr->get_compoundflag()) &&
1604 TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix;
1605 return 1;
1608 return 0;
1611 int SuggestMgr::check_forbidden(const char * word, int len)
1613 struct hentry * rv = NULL;
1615 if (pAMgr) {
1616 rv = pAMgr->lookup(word);
1617 if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
1618 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
1619 if (!(pAMgr->prefix_check(word,len,1)))
1620 rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix
1621 // check forbidden words
1622 if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)) return 1;
1624 return 0;
1627 #ifdef HUNSPELL_EXPERIMENTAL
1628 // suggest possible stems
1629 int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
1631 char ** wlst;
1633 struct hentry * rv = NULL;
1635 char w2[MAXSWUTF8L];
1636 const char * word = w;
1638 // word reversing wrapper for complex prefixes
1639 if (complexprefixes) {
1640 strcpy(w2, w);
1641 if (utf8) reverseword_utf(w2); else reverseword(w2);
1642 word = w2;
1645 int wl = strlen(word);
1648 if (*slst) {
1649 wlst = *slst;
1650 } else {
1651 wlst = (char **) calloc(maxSug, sizeof(char *));
1652 if (wlst == NULL) return -1;
1655 rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);
1657 // delete dash from end of word
1658 if (nsug > 0) {
1659 for (int j=0; j < nsug; j++) {
1660 if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1] = '\0';
1664 *slst = wlst;
1665 return nsug;
1667 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1670 char * SuggestMgr::suggest_morph(const char * w)
1672 char result[MAXLNLEN];
1673 char * r = (char *) result;
1674 char * st;
1676 struct hentry * rv = NULL;
1678 *result = '\0';
1680 if (! pAMgr) return NULL;
1682 char w2[MAXSWUTF8L];
1683 const char * word = w;
1685 // word reversing wrapper for complex prefixes
1686 if (complexprefixes) {
1687 strcpy(w2, w);
1688 if (utf8) reverseword_utf(w2); else reverseword(w2);
1689 word = w2;
1692 rv = pAMgr->lookup(word);
1694 while (rv) {
1695 if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
1696 TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
1697 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
1698 if (!HENTRY_FIND(rv, MORPH_STEM)) {
1699 mystrcat(result, " ", MAXLNLEN);
1700 mystrcat(result, MORPH_STEM, MAXLNLEN);
1701 mystrcat(result, word, MAXLNLEN);
1703 if (HENTRY_DATA(rv)) {
1704 mystrcat(result, " ", MAXLNLEN);
1705 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
1707 mystrcat(result, "\n", MAXLNLEN);
1709 rv = rv->next_homonym;
1712 st = pAMgr->affix_check_morph(word,strlen(word));
1713 if (st) {
1714 mystrcat(result, st, MAXLNLEN);
1715 free(st);
1718 if (pAMgr->get_compound() && (*result == '\0'))
1719 pAMgr->compound_check_morph(word, strlen(word),
1720 0, 0, 100, 0,NULL, 0, &r, NULL);
1722 return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL;
1725 #ifdef HUNSPELL_EXPERIMENTAL
1726 char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
1728 char * p = NULL;
1729 char ** wlst = (char **) calloc(maxSug, sizeof(char *));
1730 if (!**wlst) return NULL;
1731 // we will use only the first suggestion
1732 for (int i = 0; i < maxSug - 1; i++) wlst[i] = "";
1733 int ns = suggest(&wlst, word, maxSug - 1, NULL);
1734 if (ns == maxSug) {
1735 p = suggest_morph(wlst[maxSug - 1]);
1736 free(wlst[maxSug - 1]);
1738 if (wlst) free(wlst);
1739 return p;
1741 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1743 /* affixation */
1744 char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern)
1746 char result[MAXLNLEN];
1747 *result = '\0';
1748 int sfxcount = get_sfxcount(pattern);
1750 if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL;
1752 if (HENTRY_DATA(rv)) {
1753 char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen,
1754 HENTRY_DATA(rv), pattern, 0);
1755 if (aff) {
1756 mystrcat(result, aff, MAXLNLEN);
1757 mystrcat(result, "\n", MAXLNLEN);
1758 free(aff);
1762 // check all allomorphs
1763 char allomorph[MAXLNLEN];
1764 char * p = NULL;
1765 if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);
1766 while (p) {
1767 struct hentry * rv2 = NULL;
1768 p += MORPH_TAG_LEN;
1769 int plen = fieldlen(p);
1770 strncpy(allomorph, p, plen);
1771 allomorph[plen] = '\0';
1772 rv2 = pAMgr->lookup(allomorph);
1773 while (rv2) {
1774 // if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) {
1775 if (HENTRY_DATA(rv2)) {
1776 char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM);
1777 if (st && (strncmp(st + MORPH_TAG_LEN,
1778 HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) {
1779 char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen,
1780 HENTRY_DATA(rv2), pattern, 0);
1781 if (aff) {
1782 mystrcat(result, aff, MAXLNLEN);
1783 mystrcat(result, "\n", MAXLNLEN);
1784 free(aff);
1788 rv2 = rv2->next_homonym;
1790 p = strstr(p + plen, MORPH_ALLOMORPH);
1793 return (*result) ? mystrdup(result) : NULL;
1796 char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) {
1797 char result[MAXLNLEN];
1798 char result2[MAXLNLEN];
1799 char newpattern[MAXLNLEN];
1800 *newpattern = '\0';
1801 if (n == 0) return 0;
1802 *result2 = '\0';
1803 struct hentry * rv = NULL;
1804 if (!pAMgr) return NULL;
1806 // search affixed forms with and without derivational suffixes
1807 while(1) {
1809 for (int k = 0; k < n; k++) {
1810 *result = '\0';
1811 // add compound word parts (except the last one)
1812 char * s = (char *) desc[k];
1813 char * part = strstr(s, MORPH_PART);
1814 if (part) {
1815 char * nextpart = strstr(part + 1, MORPH_PART);
1816 while (nextpart) {
1817 copy_field(result + strlen(result), part, MORPH_PART);
1818 part = nextpart;
1819 nextpart = strstr(part + 1, MORPH_PART);
1821 s = part;
1824 char **pl;
1825 char tok[MAXLNLEN];
1826 strcpy(tok, s);
1827 char * alt = strstr(tok, " | ");
1828 while (alt) {
1829 alt[1] = MSEP_ALT;
1830 alt = strstr(alt, " | ");
1832 int pln = line_tok(tok, &pl, MSEP_ALT);
1833 for (int i = 0; i < pln; i++) {
1834 // remove inflectional and terminal suffixes
1835 char * is = strstr(pl[i], MORPH_INFL_SFX);
1836 if (is) *is = '\0';
1837 char * ts = strstr(pl[i], MORPH_TERM_SFX);
1838 while (ts) {
1839 *ts = '_';
1840 ts = strstr(pl[i], MORPH_TERM_SFX);
1842 char * st = strstr(s, MORPH_STEM);
1843 if (st) {
1844 copy_field(tok, st, MORPH_STEM);
1845 rv = pAMgr->lookup(tok);
1846 while (rv) {
1847 char newpat[MAXLNLEN];
1848 strcpy(newpat, pl[i]);
1849 strcat(newpat, pattern);
1850 char * sg = suggest_hentry_gen(rv, newpat);
1851 if (!sg) sg = suggest_hentry_gen(rv, pattern);
1852 if (sg) {
1853 char ** gen;
1854 int genl = line_tok(sg, &gen, MSEP_REC);
1855 free(sg);
1856 sg = NULL;
1857 for (int j = 0; j < genl; j++) {
1858 if (strstr(pl[i], MORPH_SURF_PFX)) {
1859 int r2l = strlen(result2);
1860 result2[r2l] = MSEP_REC;
1861 strcpy(result2 + r2l + 1, result);
1862 copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX);
1863 mystrcat(result2, gen[j], MAXLNLEN);
1864 } else {
1865 sprintf(result2 + strlen(result2), "%c%s%s",
1866 MSEP_REC, result, gen[j]);
1869 freelist(&gen, genl);
1871 rv = rv->next_homonym;
1875 freelist(&pl, pln);
1878 if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break;
1879 strcpy(newpattern, pattern);
1880 pattern = newpattern;
1881 char * ds = strstr(pattern, MORPH_DERI_SFX);
1882 while (ds) {
1883 strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN);
1884 ds = strstr(pattern, MORPH_DERI_SFX);
1887 return (*result2 ? mystrdup(result2) : NULL);
1891 // generate an n-gram score comparing s1 and s2
1892 int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt)
1894 int nscore = 0;
1895 int ns;
1896 int l1;
1897 int l2;
1898 int test = 0;
1900 if (utf8) {
1901 w_char su1[MAXSWL];
1902 w_char su2[MAXSWL];
1903 l1 = u8_u16(su1, MAXSWL, s1);
1904 l2 = u8_u16(su2, MAXSWL, s2);
1905 if ((l2 <= 0) || (l1 == -1)) return 0;
1906 // lowering dictionary word
1907 if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum);
1908 for (int j = 1; j <= n; j++) {
1909 ns = 0;
1910 for (int i = 0; i <= (l1-j); i++) {
1911 int k = 0;
1912 for (int l = 0; l <= (l2-j); l++) {
1913 for (k = 0; k < j; k++) {
1914 w_char * c1 = su1 + i + k;
1915 w_char * c2 = su2 + l + k;
1916 if ((c1->l != c2->l) || (c1->h != c2->h)) break;
1918 if (k == j) {
1919 ns++;
1920 break;
1923 if (k != j && opt & NGRAM_WEIGHTED) {
1924 ns--;
1925 test++;
1926 if (i == 0 || i == l1-j) ns--; // side weight
1929 nscore = nscore + ns;
1930 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break;
1932 } else {
1933 l2 = strlen(s2);
1934 if (l2 == 0) return 0;
1935 l1 = strlen(s1);
1936 char *t = mystrdup(s2);
1937 if (opt & NGRAM_LOWERING) mkallsmall(t, csconv);
1938 for (int j = 1; j <= n; j++) {
1939 ns = 0;
1940 for (int i = 0; i <= (l1-j); i++) {
1941 char c = *(s1 + i + j);
1942 *(s1 + i + j) = '\0';
1943 if (strstr(t,(s1+i))) {
1944 ns++;
1945 } else if (opt & NGRAM_WEIGHTED) {
1946 ns--;
1947 test++;
1948 if (i == 0 || i == l1-j) ns--; // side weight
1950 *(s1 + i + j ) = c;
1952 nscore = nscore + ns;
1953 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break;
1955 free(t);
1958 ns = 0;
1959 if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
1960 if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
1961 ns = (nscore - ((ns > 0) ? ns : 0));
1962 return ns;
1965 // length of the left common substring of s1 and (decapitalised) s2
1966 int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) {
1967 if (utf8) {
1968 w_char su1[MAXSWL];
1969 w_char su2[MAXSWL];
1970 su1[0].l = su2[0].l = su1[0].h = su2[0].h = 0;
1971 // decapitalize dictionary word
1972 if (complexprefixes) {
1973 int l1 = u8_u16(su1, MAXSWL, s1);
1974 int l2 = u8_u16(su2, MAXSWL, s2);
1975 if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1;
1976 } else {
1977 int i;
1978 u8_u16(su1, 1, s1);
1979 u8_u16(su2, 1, s2);
1980 unsigned short idx = (su2->h << 8) + su2->l;
1981 unsigned short otheridx = (su1->h << 8) + su1->l;
1982 if (otheridx != idx &&
1983 (otheridx != unicodetolower(idx, langnum))) return 0;
1984 int l1 = u8_u16(su1, MAXSWL, s1);
1985 int l2 = u8_u16(su2, MAXSWL, s2);
1986 for(i = 1; (i < l1) && (i < l2) &&
1987 (su1[i].l == su2[i].l) && (su1[i].h == su2[i].h); i++);
1988 return i;
1990 } else {
1991 if (complexprefixes) {
1992 int l1 = strlen(s1);
1993 int l2 = strlen(s2);
1994 if (*(s2+l1-1) == *(s2+l2-1)) return 1;
1995 } else {
1996 char * olds = s1;
1997 // decapitalise dictionary word
1998 if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0;
1999 do {
2000 s1++; s2++;
2001 } while ((*s1 == *s2) && (*s1 != '\0'));
2002 return (int)(s1 - olds);
2005 return 0;
2008 int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_swap) {
2009 int num = 0;
2010 int diff = 0;
2011 int diffpos[2];
2012 *is_swap = 0;
2013 if (utf8) {
2014 w_char su1[MAXSWL];
2015 w_char su2[MAXSWL];
2016 int l1 = u8_u16(su1, MAXSWL, s1);
2017 int l2 = u8_u16(su2, MAXSWL, s2);
2018 // decapitalize dictionary word
2019 if (complexprefixes) {
2020 mkallsmall_utf(su2+l2-1, 1, langnum);
2021 } else {
2022 mkallsmall_utf(su2, 1, langnum);
2024 for (int i = 0; (i < l1) && (i < l2); i++) {
2025 if (((short *) su1)[i] == ((short *) su2)[i]) {
2026 num++;
2027 } else {
2028 if (diff < 2) diffpos[diff] = i;
2029 diff++;
2032 if ((diff == 2) && (l1 == l2) &&
2033 (((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) &&
2034 (((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap = 1;
2035 } else {
2036 int i;
2037 char t[MAXSWUTF8L];
2038 strcpy(t, s2);
2039 // decapitalize dictionary word
2040 if (complexprefixes) {
2041 int l2 = strlen(t);
2042 *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower;
2043 } else {
2044 mkallsmall(t, csconv);
2046 for (i = 0; (*(s1+i) != 0) && (*(t+i) != 0); i++) {
2047 if (*(s1+i) == *(t+i)) {
2048 num++;
2049 } else {
2050 if (diff < 2) diffpos[diff] = i;
2051 diff++;
2054 if ((diff == 2) && (*(s1+i) == 0) && (*(t+i) == 0) &&
2055 (*(s1+diffpos[0]) == *(t+diffpos[1])) &&
2056 (*(s1+diffpos[1]) == *(t+diffpos[0]))) *is_swap = 1;
2058 return num;
2061 int SuggestMgr::mystrlen(const char * word) {
2062 if (utf8) {
2063 w_char w[MAXSWL];
2064 return u8_u16(w, MAXSWL, word);
2065 } else return strlen(word);
2068 // sort in decreasing order of score
2069 void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n )
2071 int m = 1;
2072 while (m < n) {
2073 int j = m;
2074 while (j > 0) {
2075 if (rsc[j-1] < rsc[j]) {
2076 int sctmp = rsc[j-1];
2077 char * wdtmp = rword[j-1];
2078 rsc[j-1] = rsc[j];
2079 rword[j-1] = rword[j];
2080 rsc[j] = sctmp;
2081 rword[j] = wdtmp;
2082 if (rword2) {
2083 wdtmp = rword2[j-1];
2084 rword2[j-1] = rword2[j];
2085 rword2[j] = wdtmp;
2087 j--;
2088 } else break;
2090 m++;
2092 return;
2095 // longest common subsequence
2096 void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) {
2097 int n, m;
2098 w_char su[MAXSWL];
2099 w_char su2[MAXSWL];
2100 char * b;
2101 char * c;
2102 int i;
2103 int j;
2104 if (utf8) {
2105 m = u8_u16(su, MAXSWL, s);
2106 n = u8_u16(su2, MAXSWL, s2);
2107 } else {
2108 m = strlen(s);
2109 n = strlen(s2);
2111 c = (char *) calloc(m + 1, n + 1);
2112 b = (char *) calloc(m + 1, n + 1);
2113 if (!c || !b) {
2114 if (c) free(c);
2115 if (b) free(b);
2116 *result = NULL;
2117 return;
2119 for (i = 1; i <= m; i++) {
2120 for (j = 1; j <= n; j++) {
2121 if ( ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1)))
2122 || ((!utf8) && ((*(s+i-1)) == (*(s2+j-1))))) {
2123 c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1;
2124 b[i*(n+1) + j] = LCS_UPLEFT;
2125 } else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) {
2126 c[i*(n+1) + j] = c[(i-1)*(n+1) + j];
2127 b[i*(n+1) + j] = LCS_UP;
2128 } else {
2129 c[i*(n+1) + j] = c[i*(n+1) + j-1];
2130 b[i*(n+1) + j] = LCS_LEFT;
2134 *result = b;
2135 free(c);
2136 *l1 = m;
2137 *l2 = n;
2140 int SuggestMgr::lcslen(const char * s, const char* s2) {
2141 int m;
2142 int n;
2143 int i;
2144 int j;
2145 char * result;
2146 int len = 0;
2147 lcs(s, s2, &m, &n, &result);
2148 if (!result) return 0;
2149 i = m;
2150 j = n;
2151 while ((i != 0) && (j != 0)) {
2152 if (result[i*(n+1) + j] == LCS_UPLEFT) {
2153 len++;
2154 i--;
2155 j--;
2156 } else if (result[i*(n+1) + j] == LCS_UP) {
2157 i--;
2158 } else j--;
2160 free(result);
2161 return len;