1 /******* BEGIN LICENSE BLOCK *******
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 * and László Németh (Hunspell). Portions created by the Initial Developers
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 * David Einstein (deinst@world.std.com)
20 * László Németh (nemethl@gyorsposta.hu)
43 * Alternatively, the contents of this file may be used under the terms of
44 * either the GNU General Public License Version 2 or later (the "GPL"), or
45 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
46 * in which case the provisions of the GPL or the LGPL are applicable instead
47 * of those above. If you wish to allow use of your version of this file only
48 * under the terms of either the GPL or the LGPL, and not to allow others to
49 * use your version of this file under the terms of the MPL, indicate your
50 * decision by deleting the provisions above and replace them with the notice
51 * and other provisions required by the GPL or the LGPL. If you do not delete
52 * the provisions above, a recipient may use your version of this file under
53 * the terms of any one of the MPL, the GPL or the LGPL.
55 ******* END LICENSE BLOCK *******/
57 #ifndef MOZILLA_CLIENT
71 #include "suggestmgr.hxx"
73 #ifndef MOZILLA_CLIENT
79 const w_char W_VLINE
= { '\0', '|' };
81 SuggestMgr::SuggestMgr(const char * tryme
, int maxn
,
85 // register affix manager and check in string of chars to
86 // try when building candidate suggestions
103 maxngramsugs
= MAXNGRAMSUGS
;
106 char * enc
= pAMgr
->get_encoding();
107 csconv
= get_current_cs(enc
);
109 langnum
= pAMgr
->get_langnum();
110 ckey
= pAMgr
->get_key_string();
111 nosplitsugs
= pAMgr
->get_nosplitsugs();
112 if (pAMgr
->get_maxngramsugs() >= 0) maxngramsugs
= pAMgr
->get_maxngramsugs();
113 utf8
= pAMgr
->get_utf8();
114 complexprefixes
= pAMgr
->get_complexprefixes();
120 ckeyl
= u8_u16(t
, MAXSWL
, ckey
);
121 ckey_utf
= (w_char
*) malloc(ckeyl
* sizeof(w_char
));
122 if (ckey_utf
) memcpy(ckey_utf
, t
, ckeyl
* sizeof(w_char
));
124 ckeyl
= strlen(ckey
);
129 ctry
= mystrdup(tryme
);
130 if (ctry
) ctryl
= strlen(ctry
);
133 ctryl
= u8_u16(t
, MAXSWL
, tryme
);
134 ctry_utf
= (w_char
*) malloc(ctryl
* sizeof(w_char
));
135 if (ctry_utf
) memcpy(ctry_utf
, t
, ctryl
* sizeof(w_char
));
// Destructor: releases the four heap buffers held by the manager
// (ckey, ckey_utf, ctry, ctry_utf), each only when it is non-null.
// NOTE(review): several original lines are absent from this listing, so
// other cleanup (e.g. resetting the pointers) may exist -- confirm against
// the full file.
142 SuggestMgr::~SuggestMgr()
// keyboard-neighbour string (byte form) and its w_char copy
145 if (ckey
) free(ckey
);
147 if (ckey_utf
) free(ckey_utf
);
// TRY character string (byte form) and its w_char copy
150 if (ctry
) free(ctry
);
152 if (ctry_utf
) free(ctry_utf
);
// Tests one candidate word and, if it is acceptable, stores a copy of it in
// the suggestion list.
//   wlst        suggestion list; entries 0..ns-1 are compared with candidate
//   candidate   word to test
//   wl          length passed through to checkword()
//   ns          number of suggestions currently stored in wlst
//   cpdsuggest  passed through to checkword()
//   timer, timelimit  passed through to checkword(); callers in this file
//               pass either two NULLs or the addresses of local variables
// Returns maxSug immediately when the list is already full (ns == maxSug).
// NOTE(review): the lines that declare cwrd, advance ns and produce the
// remaining return values are not part of this listing -- confirm against
// the full file. Callers in this file treat -1 as an out-of-memory signal.
158 int SuggestMgr::testsug(char** wlst
, const char * candidate
, int wl
, int ns
, int cpdsuggest
,
159 int * timer
, clock_t * timelimit
) {
// list already full: nothing can be added
161 if (ns
== maxSug
) return maxSug
;
// clear cwrd when the candidate is already among the stored suggestions
162 for (int k
=0; k
< ns
; k
++) {
163 if (strcmp(candidate
,wlst
[k
]) == 0) cwrd
= 0;
// store a private copy only when cwrd is still set and checkword() accepts it
165 if ((cwrd
) && checkword(candidate
, wl
, cpdsuggest
, timer
, timelimit
)) {
166 wlst
[ns
] = mystrdup(candidate
);
// copy failed: free every suggestion stored so far
167 if (wlst
[ns
] == NULL
) {
168 for (int j
=0; j
<ns
; j
++) free(wlst
[j
]);
176 // generate suggestions for a misspelled word
177 // pass in address of array of char * pointers
178 // onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
180 int SuggestMgr::suggest(char*** slst
, const char * w
, int nsug
,
181 int * onlycompoundsug
)
183 int nocompoundtwowords
= 0;
185 w_char word_utf
[MAXSWL
];
188 char w2
[MAXWORDUTF8LEN
];
189 const char * word
= w
;
191 // word reversing wrapper for complex prefixes
192 if (complexprefixes
) {
194 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
201 wlst
= (char **) malloc(maxSug
* sizeof(char *));
202 if (wlst
== NULL
) return -1;
203 for (int i
= 0; i
< maxSug
; i
++) {
209 wl
= u8_u16(word_utf
, MAXSWL
, word
);
212 for (int cpdsuggest
=0; (cpdsuggest
<2) && (nocompoundtwowords
==0); cpdsuggest
++) {
214 // suggestions for an uppercase word (html -> HTML)
215 if ((nsug
< maxSug
) && (nsug
> -1)) {
216 nsug
= (utf8
) ? capchars_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
217 capchars(wlst
, word
, nsug
, cpdsuggest
);
220 // perhaps we made a typical spelling mistake
221 if ((nsug
< maxSug
) && (nsug
> -1))
222 nsug
= replchars(wlst
, word
, nsug
, cpdsuggest
);
224 // perhaps we chose the wrong char from a related set
225 if ((nsug
< maxSug
) && (nsug
> -1)) {
226 nsug
= mapchars(wlst
, word
, nsug
, cpdsuggest
);
229 // did we swap the order of chars by mistake
230 if ((nsug
< maxSug
) && (nsug
> -1)) {
231 nsug
= (utf8
) ? swapchar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
232 swapchar(wlst
, word
, nsug
, cpdsuggest
);
235 // did we swap the order of non adjacent chars by mistake
236 if ((nsug
< maxSug
) && (nsug
> -1)) {
237 nsug
= (utf8
) ? longswapchar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
238 longswapchar(wlst
, word
, nsug
, cpdsuggest
);
241 // did we just hit the wrong key in place of a good char (case and keyboard)
242 if ((nsug
< maxSug
) && (nsug
> -1)) {
243 nsug
= (utf8
) ? badcharkey_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
244 badcharkey(wlst
, word
, nsug
, cpdsuggest
);
247 // only suggest compound words when no other suggestion
248 if ((cpdsuggest
== 0) && (nsug
> 0)) nocompoundtwowords
=1;
250 // did we add a char that should not be there
251 if ((nsug
< maxSug
) && (nsug
> -1)) {
252 nsug
= (utf8
) ? extrachar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
253 extrachar(wlst
, word
, nsug
, cpdsuggest
);
257 // did we forget a char
258 if ((nsug
< maxSug
) && (nsug
> -1)) {
259 nsug
= (utf8
) ? forgotchar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
260 forgotchar(wlst
, word
, nsug
, cpdsuggest
);
263 // did we move a char
264 if ((nsug
< maxSug
) && (nsug
> -1)) {
265 nsug
= (utf8
) ? movechar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
266 movechar(wlst
, word
, nsug
, cpdsuggest
);
269 // did we just hit the wrong key in place of a good char
270 if ((nsug
< maxSug
) && (nsug
> -1)) {
271 nsug
= (utf8
) ? badchar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
272 badchar(wlst
, word
, nsug
, cpdsuggest
);
275 // did we double two characters
276 if ((nsug
< maxSug
) && (nsug
> -1)) {
277 nsug
= (utf8
) ? doubletwochars_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
278 doubletwochars(wlst
, word
, nsug
, cpdsuggest
);
281 // perhaps we forgot to hit space and two words ran together
282 if ((!nosplitsugs
) && (nsug
< maxSug
) && (nsug
> -1)) {
283 nsug
= twowords(wlst
, word
, nsug
, cpdsuggest
);
286 } // repeating ``for'' statement compounding support
289 // we ran out of memory - we should free up as much as possible
290 for (int i
= 0; i
< maxSug
; i
++)
291 if (wlst
[i
] != NULL
) free(wlst
[i
]);
296 if (!nocompoundtwowords
&& (nsug
> 0) && onlycompoundsug
) *onlycompoundsug
= 1;
302 // generate suggestions for a word with typical mistake
303 // pass in address of array of char * pointers
304 #ifdef HUNSPELL_EXPERIMENTAL
305 int SuggestMgr::suggest_auto(char*** slst
, const char * w
, int nsug
)
307 int nocompoundtwowords
= 0;
310 char w2
[MAXWORDUTF8LEN
];
311 const char * word
= w
;
313 // word reversing wrapper for complex prefixes
314 if (complexprefixes
) {
316 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
323 wlst
= (char **) malloc(maxSug
* sizeof(char *));
324 if (wlst
== NULL
) return -1;
327 for (int cpdsuggest
=0; (cpdsuggest
<2) && (nocompoundtwowords
==0); cpdsuggest
++) {
329 // perhaps we made a typical spelling mistake
330 if ((nsug
< maxSug
) && (nsug
> -1))
331 nsug
= replchars(wlst
, word
, nsug
, cpdsuggest
);
333 // perhaps we chose the wrong char from a related set
334 if ((nsug
< maxSug
) && (nsug
> -1))
335 nsug
= mapchars(wlst
, word
, nsug
, cpdsuggest
);
337 if ((cpdsuggest
==0) && (nsug
>0)) nocompoundtwowords
=1;
339 // perhaps we forgot to hit space and two words ran together
341 if ((nsug
< maxSug
) && (nsug
> -1) && check_forbidden(word
, strlen(word
))) {
342 nsug
= twowords(wlst
, word
, nsug
, cpdsuggest
);
345 } // repeating ``for'' statement compounding support
348 for (int i
=0;i
<maxSug
; i
++)
349 if (wlst
[i
] != NULL
) free(wlst
[i
]);
357 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
359 // suggestions for an uppercase word (html -> HTML)
// w_char variant: copies the wl characters of word into a local buffer,
// converts the copy with mkallcap_utf() using langnum, re-encodes it with
// u16_u8() and hands the result to testsug() with no timer (NULL, NULL).
// Returns whatever testsug() returns.
// NOTE(review): the memcpy assumes wl <= MAXSWL -- confirm that callers
// guarantee this.
360 int SuggestMgr::capchars_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
362 char candidate
[MAXSWUTF8L
];
363 w_char candidate_utf
[MAXSWL
];
// work on a copy so the caller's word is left untouched
364 memcpy(candidate_utf
, word
, wl
* sizeof(w_char
));
365 mkallcap_utf(candidate_utf
, wl
, langnum
);
366 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
// the length is re-measured with strlen() on the u16_u8() output rather than taken from wl
367 return testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
370 // suggestions for an uppercase word (html -> HTML)
// Byte-string variant: copies word into a local buffer, converts the copy
// with mkallcap() using the csconv table and hands the result to testsug()
// with no timer (NULL, NULL). Returns whatever testsug() returns.
// NOTE(review): strcpy() is unbounded; this assumes strlen(word) is below
// MAXSWUTF8L -- confirm that callers guarantee this.
371 int SuggestMgr::capchars(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
373 char candidate
[MAXSWUTF8L
];
// work on a copy so the caller's word is left untouched
374 strcpy(candidate
, word
);
375 mkallcap(candidate
, csconv
);
376 return testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
379 // suggestions for when chose the wrong char out of a related set
380 int SuggestMgr::mapchars(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
385 int wl
= strlen(word
);
386 if (wl
< 2 || ! pAMgr
) return ns
;
388 int nummap
= pAMgr
->get_nummap();
389 struct mapentry
* maptable
= pAMgr
->get_maptable();
390 if (maptable
==NULL
) return ns
;
396 int len
= u8_u16(w
, MAXSWL
, word
);
397 ns
= map_related_utf(w
, len
, 0, cpdsuggest
, wlst
, ns
, maptable
, nummap
, &timer
, &timelimit
);
398 } else ns
= map_related(word
, 0, wlst
, cpdsuggest
, ns
, maptable
, nummap
, &timer
, &timelimit
);
402 int SuggestMgr::map_related(const char * word
, int i
, char** wlst
,
403 int cpdsuggest
, int ns
,
404 const mapentry
* maptable
, int nummap
, int * timer
, clock_t * timelimit
)
406 char c
= *(word
+ i
);
409 int wl
= strlen(word
);
410 for (int m
=0; m
< ns
; m
++)
411 if (strcmp(word
,wlst
[m
]) == 0) cwrd
= 0;
412 if ((cwrd
) && checkword(word
, wl
, cpdsuggest
, timer
, timelimit
)) {
414 wlst
[ns
] = mystrdup(word
);
415 if (wlst
[ns
] == NULL
) return -1;
422 for (int j
= 0; j
< nummap
; j
++) {
423 if (strchr(maptable
[j
].set
,c
) != 0) {
425 char * newword
= mystrdup(word
);
426 if (!newword
) return -1;
427 for (int k
= 0; k
< maptable
[j
].len
; k
++) {
428 *(newword
+ i
) = *(maptable
[j
].set
+ k
);
429 ns
= map_related(newword
, (i
+1), wlst
, cpdsuggest
,
430 ns
, maptable
, nummap
, timer
, timelimit
);
431 if (!(*timer
)) return ns
;
438 ns
= map_related(word
, i
, wlst
, cpdsuggest
,
439 ns
, maptable
, nummap
, timer
, timelimit
);
444 int SuggestMgr::map_related_utf(w_char
* word
, int len
, int i
, int cpdsuggest
,
445 char** wlst
, int ns
, const mapentry
* maptable
, int nummap
,
446 int * timer
, clock_t * timelimit
)
452 u16_u8(s
, MAXSWUTF8L
, word
, len
);
454 for (int m
=0; m
< ns
; m
++)
455 if (strcmp(s
,wlst
[m
]) == 0) cwrd
= 0;
456 if ((cwrd
) && checkword(s
, wl
, cpdsuggest
, timer
, timelimit
)) {
458 wlst
[ns
] = mystrdup(s
);
459 if (wlst
[ns
] == NULL
) return -1;
466 unsigned short c
= *((unsigned short *) word
+ i
);
467 for (int j
= 0; j
< nummap
; j
++) {
468 if (flag_bsearch((unsigned short *) maptable
[j
].set_utf16
, c
, maptable
[j
].len
)) {
470 for (int k
= 0; k
< maptable
[j
].len
; k
++) {
471 *(word
+ i
) = *(maptable
[j
].set_utf16
+ k
);
472 ns
= map_related_utf(word
, len
, i
+ 1, cpdsuggest
,
473 wlst
, ns
, maptable
, nummap
, timer
, timelimit
);
474 if (!(*timer
)) return ns
;
476 *((unsigned short *) word
+ i
) = c
;
481 ns
= map_related_utf(word
, len
, i
, cpdsuggest
,
482 wlst
, ns
, maptable
, nummap
, timer
, timelimit
);
489 // suggestions for a typical spelling mistake that
490 // differs by more than 1 letter from the correct form.
491 int SuggestMgr::replchars(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
493 char candidate
[MAXSWUTF8L
];
496 int wl
= strlen(word
);
497 if (wl
< 2 || ! pAMgr
) return ns
;
498 int numrep
= pAMgr
->get_numrep();
499 struct replentry
* reptable
= pAMgr
->get_reptable();
500 if (reptable
==NULL
) return ns
;
501 for (int i
=0; i
< numrep
; i
++ ) {
503 lenr
= strlen(reptable
[i
].pattern2
);
504 lenp
= strlen(reptable
[i
].pattern
);
505 // search every occurrence of the pattern in the word
506 while ((r
=strstr(r
, reptable
[i
].pattern
)) != NULL
) {
507 strcpy(candidate
, word
);
508 if (r
-word
+ lenr
+ strlen(r
+lenp
) >= MAXSWUTF8L
) break;
509 strcpy(candidate
+(r
-word
),reptable
[i
].pattern2
);
510 strcpy(candidate
+(r
-word
)+lenr
, r
+lenp
);
511 ns
= testsug(wlst
, candidate
, wl
-lenp
+lenr
, ns
, cpdsuggest
, NULL
, NULL
);
512 if (ns
== -1) return -1;
513 // check REP suggestions with space
514 char * sp
= strchr(candidate
, ' ');
517 if (checkword(candidate
, strlen(candidate
), 0, NULL
, NULL
)) {
520 ns
= testsug(wlst
, sp
+ 1, strlen(sp
+ 1), ns
, cpdsuggest
, NULL
, NULL
);
521 if (ns
== -1) return -1;
524 wlst
[ns
- 1] = mystrdup(candidate
);
525 if (!wlst
[ns
- 1]) return -1;
530 r
++; // search for the next letter
536 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
537 int SuggestMgr::doubletwochars(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
539 char candidate
[MAXSWUTF8L
];
541 int wl
= strlen(word
);
542 if (wl
< 5 || ! pAMgr
) return ns
;
543 for (int i
=2; i
< wl
; i
++ ) {
544 if (word
[i
]==word
[i
-2]) {
547 strcpy(candidate
,word
);
548 strcpy(candidate
+i
-1,word
+i
+1);
549 ns
= testsug(wlst
, candidate
, wl
-2, ns
, cpdsuggest
, NULL
, NULL
);
550 if (ns
== -1) return -1;
560 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
561 int SuggestMgr::doubletwochars_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
563 w_char candidate_utf
[MAXSWL
];
564 char candidate
[MAXSWUTF8L
];
566 if (wl
< 5 || ! pAMgr
) return ns
;
567 for (int i
=2; i
< wl
; i
++) {
568 if (w_char_eq(word
[i
], word
[i
-2])) {
571 memcpy(candidate_utf
, word
, (i
- 1) * sizeof(w_char
));
572 memcpy(candidate_utf
+i
-1, word
+i
+1, (wl
-i
-1) * sizeof(w_char
));
573 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
-2);
574 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
575 if (ns
== -1) return -1;
585 // error is wrong char in place of correct one (case and keyboard related version)
586 int SuggestMgr::badcharkey(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
589 char candidate
[MAXSWUTF8L
];
590 int wl
= strlen(word
);
591 strcpy(candidate
, word
);
592 // swap out each char one by one and try uppercase and neighbor
593 // keyboard chars in its place to see if that makes a good word
595 for (int i
=0; i
< wl
; i
++) {
597 // check with uppercase letters
598 candidate
[i
] = csconv
[((unsigned char)tmpc
)].cupper
;
599 if (tmpc
!= candidate
[i
]) {
600 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
601 if (ns
== -1) return -1;
604 // check neighbor characters in keyboard string
606 char * loc
= strchr(ckey
, tmpc
);
608 if ((loc
> ckey
) && (*(loc
- 1) != '|')) {
609 candidate
[i
] = *(loc
- 1);
610 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
611 if (ns
== -1) return -1;
613 if ((*(loc
+ 1) != '|') && (*(loc
+ 1) != '\0')) {
614 candidate
[i
] = *(loc
+ 1);
615 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
616 if (ns
== -1) return -1;
618 loc
= strchr(loc
+ 1, tmpc
);
625 // error is wrong char in place of correct one (case and keyboard related version)
626 int SuggestMgr::badcharkey_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
629 w_char candidate_utf
[MAXSWL
];
630 char candidate
[MAXSWUTF8L
];
631 memcpy(candidate_utf
, word
, wl
* sizeof(w_char
));
632 // swap out each char one by one and try uppercase and neighbor
633 // keyboard chars in its place to see if that makes a good word
634 for (int i
=0; i
< wl
; i
++) {
635 tmpc
= candidate_utf
[i
];
636 // check with uppercase letters
637 mkallcap_utf(candidate_utf
+ i
, 1, langnum
);
638 if (!w_char_eq(tmpc
, candidate_utf
[i
])) {
639 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
640 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
641 if (ns
== -1) return -1;
642 candidate_utf
[i
] = tmpc
;
644 // check neighbor characters in keyboard string
646 w_char
* loc
= ckey_utf
;
647 while ((loc
< (ckey_utf
+ ckeyl
)) && !w_char_eq(*loc
, tmpc
)) loc
++;
648 while (loc
< (ckey_utf
+ ckeyl
)) {
649 if ((loc
> ckey_utf
) && !w_char_eq(*(loc
- 1), W_VLINE
)) {
650 candidate_utf
[i
] = *(loc
- 1);
651 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
652 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
653 if (ns
== -1) return -1;
655 if (((loc
+ 1) < (ckey_utf
+ ckeyl
)) && !w_char_eq(*(loc
+ 1), W_VLINE
)) {
656 candidate_utf
[i
] = *(loc
+ 1);
657 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
658 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
659 if (ns
== -1) return -1;
661 do { loc
++; } while ((loc
< (ckey_utf
+ ckeyl
)) && !w_char_eq(*loc
, tmpc
));
663 candidate_utf
[i
] = tmpc
;
668 // error is wrong char in place of correct one
669 int SuggestMgr::badchar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
672 char candidate
[MAXSWUTF8L
];
673 clock_t timelimit
= clock();
674 int timer
= MINTIMER
;
675 int wl
= strlen(word
);
676 strcpy(candidate
, word
);
677 // swap out each char one by one and try all the tryme
678 // chars in its place to see if that makes a good word
679 for (int j
=0; j
< ctryl
; j
++) {
680 for (int i
=wl
-1; i
>= 0; i
--) {
682 if (ctry
[j
] == tmpc
) continue;
683 candidate
[i
] = ctry
[j
];
684 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, &timer
, &timelimit
);
685 if (ns
== -1) return -1;
686 if (!timer
) return ns
;
693 // error is wrong char in place of correct one
694 int SuggestMgr::badchar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
697 w_char candidate_utf
[MAXSWL
];
698 char candidate
[MAXSWUTF8L
];
699 clock_t timelimit
= clock();
700 int timer
= MINTIMER
;
701 memcpy(candidate_utf
, word
, wl
* sizeof(w_char
));
702 // swap out each char one by one and try all the tryme
703 // chars in its place to see if that makes a good word
704 for (int j
=0; j
< ctryl
; j
++) {
705 for (int i
=wl
-1; i
>= 0; i
--) {
706 tmpc
= candidate_utf
[i
];
707 if (w_char_eq(tmpc
, ctry_utf
[j
])) continue;
708 candidate_utf
[i
] = ctry_utf
[j
];
709 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
710 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, &timer
, &timelimit
);
711 if (ns
== -1) return -1;
712 if (!timer
) return ns
;
713 candidate_utf
[i
] = tmpc
;
719 // error is word has an extra letter it does not need
720 int SuggestMgr::extrachar_utf(char** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
722 char candidate
[MAXSWUTF8L
];
723 w_char candidate_utf
[MAXSWL
];
725 w_char tmpc
= W_VLINE
; // not used value, only for VCC warning message
726 if (wl
< 2) return ns
;
727 // try omitting one char of word at a time
728 memcpy(candidate_utf
, word
, wl
* sizeof(w_char
));
729 for (p
= candidate_utf
+ wl
- 1; p
>= candidate_utf
; p
--) {
731 if (p
< candidate_utf
+ wl
- 1) *p
= tmpc
;
732 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
- 1);
733 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
734 if (ns
== -1) return -1;
740 // error is word has an extra letter it does not need
741 int SuggestMgr::extrachar(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
744 char candidate
[MAXSWUTF8L
];
746 int wl
= strlen(word
);
747 if (wl
< 2) return ns
;
748 // try omitting one char of word at a time
749 strcpy (candidate
, word
);
750 for (p
= candidate
+ wl
- 1; p
>=candidate
; p
--) {
753 ns
= testsug(wlst
, candidate
, wl
-1, ns
, cpdsuggest
, NULL
, NULL
);
754 if (ns
== -1) return -1;
760 // error is missing a letter it needs
761 int SuggestMgr::forgotchar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
763 char candidate
[MAXSWUTF8L
];
765 clock_t timelimit
= clock();
766 int timer
= MINTIMER
;
767 int wl
= strlen(word
);
768 // try inserting a tryme character before every letter (and the null terminator)
769 for (int i
= 0; i
< ctryl
; i
++) {
770 strcpy(candidate
, word
);
771 for (p
= candidate
+ wl
; p
>= candidate
; p
--) {
774 ns
= testsug(wlst
, candidate
, wl
+1, ns
, cpdsuggest
, &timer
, &timelimit
);
775 if (ns
== -1) return -1;
776 if (!timer
) return ns
;
782 // error is missing a letter it needs
783 int SuggestMgr::forgotchar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
785 w_char candidate_utf
[MAXSWL
];
786 char candidate
[MAXSWUTF8L
];
788 clock_t timelimit
= clock();
789 int timer
= MINTIMER
;
790 // try inserting a tryme character at the end of the word and before every letter
791 for (int i
= 0; i
< ctryl
; i
++) {
792 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
793 for (p
= candidate_utf
+ wl
; p
>= candidate_utf
; p
--) {
796 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
+ 1);
797 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, &timer
, &timelimit
);
798 if (ns
== -1) return -1;
799 if (!timer
) return ns
;
806 /* error is should have been two words */
807 int SuggestMgr::twowords(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
809 char candidate
[MAXSWUTF8L
];
816 if (wl
< 3) return ns
;
818 if (langnum
== LANG_hu
) forbidden
= check_forbidden(word
, wl
);
820 strcpy(candidate
+ 1, word
);
821 // split the string into two pieces after every char
822 // if both pieces are good words make them a suggestion
823 for (p
= candidate
+ 1; p
[1] != '\0'; p
++) {
825 // go to end of the UTF-8 character
826 while (utf8
&& ((p
[1] & 0xc0) == 0x80)) {
830 if (utf8
&& p
[1] == '\0') break; // last UTF-8 character
832 c1
= checkword(candidate
,strlen(candidate
), cpdsuggest
, NULL
, NULL
);
834 c2
= checkword((p
+1),strlen(p
+1), cpdsuggest
, NULL
, NULL
);
838 // spec. Hungarian code (need a better compound word support)
839 if ((langnum
== LANG_hu
) && !forbidden
&&
840 // if 3 repeating letter, use - instead of space
841 (((p
[-1] == p
[1]) && (((p
>candidate
+1) && (p
[-1] == p
[-2])) || (p
[-1] == p
[2]))) ||
842 // or multiple compounding, with more than 6 syllables
843 ((c1
== 3) && (c2
>= 2)))) *p
= '-';
846 for (int k
=0; k
< ns
; k
++)
847 if (strcmp(candidate
,wlst
[k
]) == 0) cwrd
= 0;
850 wlst
[ns
] = mystrdup(candidate
);
851 if (wlst
[ns
] == NULL
) return -1;
855 // add two word suggestion with dash, if TRY string contains
857 // NOTE: cwrd is not modified for REP two-word suggestions.
858 if (ctry
&& (strchr(ctry
, 'a') || strchr(ctry
, '-')) &&
859 mystrlen(p
+ 1) > 1 &&
860 mystrlen(candidate
) - mystrlen(p
) > 1) {
862 for (int k
=0; k
< ns
; k
++)
863 if (strcmp(candidate
,wlst
[k
]) == 0) cwrd
= 0;
866 wlst
[ns
] = mystrdup(candidate
);
867 if (wlst
[ns
] == NULL
) return -1;
879 // error is that adjacent letters were swapped
880 int SuggestMgr::swapchar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
882 char candidate
[MAXSWUTF8L
];
886 // try swapping adjacent chars one by one
887 strcpy(candidate
, word
);
888 for (p
= candidate
; p
[1] != 0; p
++) {
892 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
893 if (ns
== -1) return -1;
897 // try double swaps for short words
898 // ahev -> have, owudl -> would
899 if (wl
== 4 || wl
== 5) {
900 candidate
[0] = word
[1];
901 candidate
[1] = word
[0];
902 candidate
[2] = word
[2];
903 candidate
[wl
- 2] = word
[wl
- 1];
904 candidate
[wl
- 1] = word
[wl
- 2];
905 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
906 if (ns
== -1) return -1;
908 candidate
[0] = word
[0];
909 candidate
[1] = word
[2];
910 candidate
[2] = word
[1];
911 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
912 if (ns
== -1) return -1;
918 // error is that adjacent letters were swapped
919 int SuggestMgr::swapchar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
921 w_char candidate_utf
[MAXSWL
];
922 char candidate
[MAXSWUTF8L
];
926 // try swapping adjacent chars one by one
927 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
928 for (p
= candidate_utf
; p
< (candidate_utf
+ wl
- 1); p
++) {
932 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
933 if (len
== 0) len
= strlen(candidate
);
934 ns
= testsug(wlst
, candidate
, len
, ns
, cpdsuggest
, NULL
, NULL
);
935 if (ns
== -1) return -1;
939 // try double swaps for short words
940 // ahev -> have, owudl -> would, suodn -> sound
941 if (wl
== 4 || wl
== 5) {
942 candidate_utf
[0] = word
[1];
943 candidate_utf
[1] = word
[0];
944 candidate_utf
[2] = word
[2];
945 candidate_utf
[wl
- 2] = word
[wl
- 1];
946 candidate_utf
[wl
- 1] = word
[wl
- 2];
947 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
948 ns
= testsug(wlst
, candidate
, len
, ns
, cpdsuggest
, NULL
, NULL
);
949 if (ns
== -1) return -1;
951 candidate_utf
[0] = word
[0];
952 candidate_utf
[1] = word
[2];
953 candidate_utf
[2] = word
[1];
954 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
955 ns
= testsug(wlst
, candidate
, len
, ns
, cpdsuggest
, NULL
, NULL
);
956 if (ns
== -1) return -1;
962 // error is that non-adjacent letters were swapped
963 int SuggestMgr::longswapchar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
965 char candidate
[MAXSWUTF8L
];
970 // try swapping not adjacent chars one by one
971 strcpy(candidate
, word
);
972 for (p
= candidate
; *p
!= 0; p
++) {
973 for (q
= candidate
; *q
!= 0; q
++) {
978 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
979 if (ns
== -1) return -1;
989 // error is that non-adjacent letters were swapped
990 int SuggestMgr::longswapchar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
992 w_char candidate_utf
[MAXSWL
];
993 char candidate
[MAXSWUTF8L
];
997 // try swapping not adjacent chars
998 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
999 for (p
= candidate_utf
; p
< (candidate_utf
+ wl
); p
++) {
1000 for (q
= candidate_utf
; q
< (candidate_utf
+ wl
); q
++) {
1005 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
1006 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
1007 if (ns
== -1) return -1;
1016 // error is a letter was moved
1017 int SuggestMgr::movechar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
1019 char candidate
[MAXSWUTF8L
];
1024 int wl
=strlen(word
);
1025 // try moving a char
1026 strcpy(candidate
, word
);
1027 for (p
= candidate
; *p
!= 0; p
++) {
1028 for (q
= p
+ 1; (*q
!= 0) && ((q
- p
) < 10); q
++) {
1032 if ((q
-p
) < 2) continue; // omit swap char
1033 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
1034 if (ns
== -1) return -1;
1036 strcpy(candidate
, word
);
1038 for (p
= candidate
+ wl
- 1; p
> candidate
; p
--) {
1039 for (q
= p
- 1; (q
>= candidate
) && ((p
- q
) < 10); q
--) {
1043 if ((p
-q
) < 2) continue; // omit swap char
1044 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
1045 if (ns
== -1) return -1;
1047 strcpy(candidate
, word
);
1052 // error is a letter was moved
1053 int SuggestMgr::movechar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
1055 w_char candidate_utf
[MAXSWL
];
1056 char candidate
[MAXSWUTF8L
];
1060 // try moving a char
1061 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
1062 for (p
= candidate_utf
; p
< (candidate_utf
+ wl
); p
++) {
1063 for (q
= p
+ 1; (q
< (candidate_utf
+ wl
)) && ((q
- p
) < 10); q
++) {
1067 if ((q
-p
) < 2) continue; // omit swap char
1068 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
1069 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
1070 if (ns
== -1) return -1;
1072 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
1074 for (p
= candidate_utf
+ wl
- 1; p
> candidate_utf
; p
--) {
1075 for (q
= p
- 1; (q
>= candidate_utf
) && ((p
- q
) < 10); q
--) {
1079 if ((p
-q
) < 2) continue; // omit swap char
1080 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
1081 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
1082 if (ns
== -1) return -1;
1084 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
1089 // generate a set of suggestions for very poorly spelled words
1090 int SuggestMgr::ngsuggest(char** wlst
, char * w
, int ns
, HashMgr
** pHMgr
, int md
)
1099 // exhaustively search through all root words
1100 // keeping track of the MAX_ROOTS most similar root words
1101 struct hentry
* roots
[MAX_ROOTS
];
1102 char * rootsphon
[MAX_ROOTS
];
1103 int scores
[MAX_ROOTS
];
1104 int scoresphon
[MAX_ROOTS
];
1105 for (i
= 0; i
< MAX_ROOTS
; i
++) {
1107 scores
[i
] = -100 * i
;
1108 rootsphon
[i
] = NULL
;
1109 scoresphon
[i
] = -100 * i
;
1112 lpphon
= MAX_ROOTS
- 1;
1113 scphon
= scoresphon
[MAX_ROOTS
-1];
1115 char w2
[MAXWORDUTF8LEN
];
1119 // word reversing wrapper for complex prefixes
1120 if (complexprefixes
) {
1122 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
1126 char mw
[MAXSWUTF8L
];
1128 int nc
= strlen(word
);
1129 int n
= (utf8
) ? u8_u16(u8
, MAXSWL
, word
) : nc
;
1131 // set character based ngram suggestion for words with non-BMP Unicode characters
1138 struct hentry
* hp
= NULL
;
1140 phonetable
* ph
= (pAMgr
) ? pAMgr
->get_phonetable() : NULL
;
1141 char target
[MAXSWUTF8L
];
1142 char candidate
[MAXSWUTF8L
];
1144 strcpy(candidate
, word
);
1145 mkallcap(candidate
, csconv
);
1146 phonet(candidate
, target
, n
, *ph
);
1149 for (i
= 0; i
< md
; i
++) {
1150 while (0 != (hp
= (pHMgr
[i
])->walk_hashtable(col
, hp
))) {
1151 if ((hp
->astr
) && (pAMgr
) &&
1152 (TESTAFF(hp
->astr
, pAMgr
->get_forbiddenword(), hp
->alen
) ||
1153 TESTAFF(hp
->astr
, ONLYUPCASEFLAG
, hp
->alen
) ||
1154 TESTAFF(hp
->astr
, pAMgr
->get_nosuggest(), hp
->alen
) ||
1155 TESTAFF(hp
->astr
, pAMgr
->get_onlyincompound(), hp
->alen
))) continue;
1157 sc
= ngram(3, word
, HENTRY_WORD(hp
), NGRAM_LONGER_WORSE
+ NGRAM_LOWERING
) +
1158 leftcommonsubstring(word
, HENTRY_WORD(hp
));
1160 // check special pronunciation
1161 if ((hp
->var
& H_OPT_PHON
) && copy_field(f
, HENTRY_DATA(hp
), MORPH_PHON
)) {
1162 int sc2
= ngram(3, word
, f
, NGRAM_LONGER_WORSE
+ NGRAM_LOWERING
) +
1163 leftcommonsubstring(word
, f
);
1164 if (sc2
> sc
) sc
= sc2
;
1167 if (ph
&& (sc
> 2) && (abs(n
- (int) hp
->clen
) <= 3)) {
1168 char target2
[MAXSWUTF8L
];
1169 strcpy(candidate
, HENTRY_WORD(hp
));
1170 mkallcap(candidate
, csconv
);
1171 phonet(candidate
, target2
, -1, *ph
);
1172 scphon
= 2 * ngram(3, target
, target2
, NGRAM_LONGER_WORSE
);
1175 if (sc
> scores
[lp
]) {
1179 for (j
=0; j
< MAX_ROOTS
; j
++)
1180 if (scores
[j
] < lval
) {
1186 if (scphon
> scoresphon
[lpphon
]) {
1187 scoresphon
[lpphon
] = scphon
;
1188 rootsphon
[lpphon
] = HENTRY_WORD(hp
);
1190 for (j
=0; j
< MAX_ROOTS
; j
++)
1191 if (scoresphon
[j
] < lval
) {
1193 lval
= scoresphon
[j
];
1198 // find minimum threshold for a passable suggestion
1199 // mangle original word three different ways
1200 // and score them to generate a minimum acceptable score
1202 for (int sp
= 1; sp
< 4; sp
++) {
1204 for (int k
=sp
; k
< n
; k
+=4) *((unsigned short *) u8
+ k
) = '*';
1205 u16_u8(mw
, MAXSWUTF8L
, u8
, n
);
1206 thresh
= thresh
+ ngram(n
, word
, mw
, NGRAM_ANY_MISMATCH
+ NGRAM_LOWERING
);
1209 for (int k
=sp
; k
< n
; k
+=4) *(mw
+ k
) = '*';
1210 thresh
= thresh
+ ngram(n
, word
, mw
, NGRAM_ANY_MISMATCH
+ NGRAM_LOWERING
);
1213 thresh
= thresh
/ 3;
1216 // now expand affixes on each of these root words and
1217 // and use length adjusted ngram scores to select
1218 // possible suggestions
1219 char * guess
[MAX_GUESS
];
1220 char * guessorig
[MAX_GUESS
];
1221 int gscore
[MAX_GUESS
];
1222 for(i
=0;i
<MAX_GUESS
;i
++) {
1224 guessorig
[i
] = NULL
;
1225 gscore
[i
] = -100 * i
;
1230 struct guessword
* glst
;
1231 glst
= (struct guessword
*) calloc(MAX_WORDS
,sizeof(struct guessword
));
1233 if (nonbmp
) utf8
= 1;
1237 for (i
= 0; i
< MAX_ROOTS
; i
++) {
1239 struct hentry
* rp
= roots
[i
];
1240 int nw
= pAMgr
->expand_rootword(glst
, MAX_WORDS
, HENTRY_WORD(rp
), rp
->blen
,
1241 rp
->astr
, rp
->alen
, word
, nc
,
1242 ((rp
->var
& H_OPT_PHON
) ? copy_field(f
, HENTRY_DATA(rp
), MORPH_PHON
) : NULL
));
1244 for (int k
= 0; k
< nw
; k
++) {
1245 sc
= ngram(n
, word
, glst
[k
].word
, NGRAM_ANY_MISMATCH
+ NGRAM_LOWERING
) +
1246 leftcommonsubstring(word
, glst
[k
].word
);
1248 if ((sc
> thresh
)) {
1249 if (sc
> gscore
[lp
]) {
1252 if (guessorig
[lp
]) {
1253 free(guessorig
[lp
]);
1254 guessorig
[lp
] = NULL
;
1258 guess
[lp
] = glst
[k
].word
;
1259 guessorig
[lp
] = glst
[k
].orig
;
1261 for (j
=0; j
< MAX_GUESS
; j
++)
1262 if (gscore
[j
] < lval
) {
1268 if (glst
[k
].orig
) free(glst
[k
].orig
);
1272 if (glst
[k
].orig
) free(glst
[k
].orig
);
1279 // now we are done generating guesses
1280 // sort in order of decreasing score
1283 bubblesort(&guess
[0], &guessorig
[0], &gscore
[0], MAX_GUESS
);
1284 if (ph
) bubblesort(&rootsphon
[0], NULL
, &scoresphon
[0], MAX_ROOTS
);
1286 // weight suggestions with a similarity index, based on
1287 // the longest common subsequent algorithm and resort
1290 for (i
=0; i
< MAX_GUESS
; i
++) {
1292 // lowering guess[i]
1293 char gl
[MAXSWUTF8L
];
1297 len
= u8_u16(_w
, MAXSWL
, guess
[i
]);
1298 mkallsmall_utf(_w
, len
, langnum
);
1299 u16_u8(gl
, MAXSWUTF8L
, _w
, len
);
1301 strcpy(gl
, guess
[i
]);
1302 mkallsmall(gl
, csconv
);
1303 len
= strlen(guess
[i
]);
1306 int _lcs
= lcslen(word
, gl
);
1308 // same characters with different casing
1309 if ((n
== len
) && (n
== _lcs
)) {
1314 // heuristic weigthing of ngram scores
1316 // length of longest common subsequent minus length difference
1317 2 * _lcs
- abs((int) (n
- len
)) +
1318 // weight length of the left common substring
1319 leftcommonsubstring(word
, gl
) +
1320 // weight equal character positions
1321 ((_lcs
== commoncharacterpositions(word
, gl
, &is_swap
)) ? 1: 0) +
1322 // swap character (not neighboring)
1323 ((is_swap
) ? 1000 : 0);
1327 bubblesort(&guess
[0], &guessorig
[0], &gscore
[0], MAX_GUESS
);
1330 if (ph
) for (i
=0; i
< MAX_ROOTS
; i
++) {
1332 // lowering rootphon[i]
1333 char gl
[MAXSWUTF8L
];
1337 len
= u8_u16(_w
, MAXSWL
, rootsphon
[i
]);
1338 mkallsmall_utf(_w
, len
, langnum
);
1339 u16_u8(gl
, MAXSWUTF8L
, _w
, len
);
1341 strcpy(gl
, rootsphon
[i
]);
1342 mkallsmall(gl
, csconv
);
1343 len
= strlen(rootsphon
[i
]);
1346 // heuristic weigthing of ngram scores
1347 scoresphon
[i
] += 2 * lcslen(word
, gl
) - abs((int) (n
- len
)) +
1348 // weight length of the left common substring
1349 leftcommonsubstring(word
, gl
);
1353 if (ph
) bubblesort(&rootsphon
[0], NULL
, &scoresphon
[0], MAX_ROOTS
);
1359 for (i
=0; i
< MAX_GUESS
; i
++) {
1361 if ((ns
< oldns
+ maxngramsugs
) && (ns
< maxSug
) && (!same
|| (gscore
[i
] > 1000))) {
1363 // leave only excellent suggestions, if exists
1364 if (gscore
[i
] > 1000) same
= 1;
1365 for (j
= 0; j
< ns
; j
++) {
1366 // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1367 if ((!guessorig
[i
] && strstr(guess
[i
], wlst
[j
])) ||
1368 (guessorig
[i
] && strstr(guessorig
[i
], wlst
[j
])) ||
1369 // check forbidden words
1370 !checkword(guess
[i
], strlen(guess
[i
]), 0, NULL
, NULL
)) unique
= 0;
1373 wlst
[ns
++] = guess
[i
];
1376 wlst
[ns
-1] = guessorig
[i
];
1380 if (guessorig
[i
]) free(guessorig
[i
]);
1384 if (guessorig
[i
]) free(guessorig
[i
]);
1390 if (ph
) for (i
=0; i
< MAX_ROOTS
; i
++) {
1392 if ((ns
< oldns
+ MAXPHONSUGS
) && (ns
< maxSug
)) {
1394 for (j
= 0; j
< ns
; j
++) {
1395 // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1396 if (strstr(rootsphon
[i
], wlst
[j
]) ||
1397 // check forbidden words
1398 !checkword(rootsphon
[i
], strlen(rootsphon
[i
]), 0, NULL
, NULL
)) unique
= 0;
1401 wlst
[ns
++] = mystrdup(rootsphon
[i
]);
1402 if (!wlst
[ns
- 1]) return ns
- 1;
1408 if (nonbmp
) utf8
= 1;
1413 // see if a candidate suggestion is spelled correctly
1414 // needs to check both root words and words with affixes
1416 // obsolote MySpell-HU modifications:
1417 // return value 2 and 3 marks compounding with hyphen (-)
1418 // `3' marks roots without suffix
1419 int SuggestMgr::checkword(const char * word
, int len
, int cpdsuggest
, int * timer
, clock_t * timelimit
)
1421 struct hentry
* rv
=NULL
;
1427 if (!(*timer
) && timelimit
) {
1428 if ((clock() - *timelimit
) > TIMELIMIT
) return 0;
1429 *timer
= MAXPLUSTIMER
;
1434 if (cpdsuggest
==1) {
1435 if (pAMgr
->get_compound()) {
1436 rv
= pAMgr
->compound_check(word
, len
, 0, 0, 100, 0, NULL
, 0, 1); //EXT
1437 if (rv
) return 3; // XXX obsolote categorisation
1442 rv
= pAMgr
->lookup(word
);
1445 if ((rv
->astr
) && (TESTAFF(rv
->astr
,pAMgr
->get_forbiddenword(),rv
->alen
)
1446 || TESTAFF(rv
->astr
,pAMgr
->get_nosuggest(),rv
->alen
))) return 0;
1448 if (rv
->astr
&& (TESTAFF(rv
->astr
,pAMgr
->get_needaffix(),rv
->alen
) ||
1449 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
) ||
1450 TESTAFF(rv
->astr
,pAMgr
->get_onlyincompound(),rv
->alen
))) {
1451 rv
= rv
->next_homonym
;
1454 } else rv
= pAMgr
->prefix_check(word
, len
, 0); // only prefix, and prefix + suffix XXX
1459 rv
= pAMgr
->suffix_check(word
, len
, 0, NULL
, NULL
, 0, NULL
); // only suffix
1462 if (!rv
&& pAMgr
->have_contclass()) {
1463 rv
= pAMgr
->suffix_check_twosfx(word
, len
, 0, NULL
, FLAG_NULL
);
1464 if (!rv
) rv
= pAMgr
->prefix_check_twosfx(word
, len
, 1, FLAG_NULL
);
1467 // check forbidden words
1468 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
,pAMgr
->get_forbiddenword(),rv
->alen
) ||
1469 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
) ||
1470 TESTAFF(rv
->astr
,pAMgr
->get_nosuggest(),rv
->alen
) ||
1471 TESTAFF(rv
->astr
,pAMgr
->get_onlyincompound(),rv
->alen
))) return 0;
1473 if (rv
) { // XXX obsolote
1474 if ((pAMgr
->get_compoundflag()) &&
1475 TESTAFF(rv
->astr
, pAMgr
->get_compoundflag(), rv
->alen
)) return 2 + nosuffix
;
1482 int SuggestMgr::check_forbidden(const char * word
, int len
)
1484 struct hentry
* rv
= NULL
;
1487 rv
= pAMgr
->lookup(word
);
1488 if (rv
&& rv
->astr
&& (TESTAFF(rv
->astr
,pAMgr
->get_needaffix(),rv
->alen
) ||
1489 TESTAFF(rv
->astr
,pAMgr
->get_onlyincompound(),rv
->alen
))) rv
= NULL
;
1490 if (!(pAMgr
->prefix_check(word
,len
,1)))
1491 rv
= pAMgr
->suffix_check(word
,len
, 0, NULL
, NULL
, 0, NULL
); // prefix+suffix, suffix
1492 // check forbidden words
1493 if ((rv
) && (rv
->astr
) && TESTAFF(rv
->astr
,pAMgr
->get_forbiddenword(),rv
->alen
)) return 1;
#ifdef HUNSPELL_EXPERIMENTAL
// Suggest possible stems.
//
// slst: in/out suggestion list. When *slst is NULL a list of maxSug slots is
//       allocated here; the list actually used is stored back into *slst.
// w:    word to analyse (reversed first when complex prefixes are enabled).
// nsug: number of suggestions already in the list; suffix_check receives its
//       address and may update it.
// Returns the resulting suggestion count, or -1 if the list cannot be
// allocated.
int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
{
    char ** wlst;

    char w2[MAXSWUTF8L];
    const char * word = w;

    // word reversing wrapper for complex prefixes
    if (complexprefixes) {
        strcpy(w2, w);
        if (utf8) reverseword_utf(w2); else reverseword(w2);
        word = w2;
    }

    int wl = strlen(word);

    if (*slst) {
        wlst = *slst;
    } else {
        wlst = (char **) calloc(maxSug, sizeof(char *));
        if (wlst == NULL) return -1;
    }

    // only the side effects on wlst / nsug matter; the returned entry is unused
    pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);

    // delete dash from end of word
    for (int j = 0; j < nsug; j++) {
        size_t sl = strlen(wlst[j]);
        // an empty suggestion has no last character: never index at sl - 1
        if ((sl > 0) && (wlst[j][sl - 1] == '-')) wlst[j][sl - 1] = '\0';
    }

    *slst = wlst;
    return nsug;
}
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
1541 char * SuggestMgr::suggest_morph(const char * w
)
1543 char result
[MAXLNLEN
];
1544 char * r
= (char *) result
;
1547 struct hentry
* rv
= NULL
;
1551 if (! pAMgr
) return NULL
;
1553 char w2
[MAXSWUTF8L
];
1554 const char * word
= w
;
1556 // word reversing wrapper for complex prefixes
1557 if (complexprefixes
) {
1559 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
1563 rv
= pAMgr
->lookup(word
);
1566 if ((!rv
->astr
) || !(TESTAFF(rv
->astr
, pAMgr
->get_forbiddenword(), rv
->alen
) ||
1567 TESTAFF(rv
->astr
, pAMgr
->get_needaffix(), rv
->alen
) ||
1568 TESTAFF(rv
->astr
,pAMgr
->get_onlyincompound(),rv
->alen
))) {
1569 if (!HENTRY_FIND(rv
, MORPH_STEM
)) {
1570 mystrcat(result
, " ", MAXLNLEN
);
1571 mystrcat(result
, MORPH_STEM
, MAXLNLEN
);
1572 mystrcat(result
, word
, MAXLNLEN
);
1574 if (HENTRY_DATA(rv
)) {
1575 mystrcat(result
, " ", MAXLNLEN
);
1576 mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
1578 mystrcat(result
, "\n", MAXLNLEN
);
1580 rv
= rv
->next_homonym
;
1583 st
= pAMgr
->affix_check_morph(word
,strlen(word
));
1585 mystrcat(result
, st
, MAXLNLEN
);
1589 if (pAMgr
->get_compound() && (*result
== '\0'))
1590 pAMgr
->compound_check_morph(word
, strlen(word
),
1591 0, 0, 100, 0,NULL
, 0, &r
, NULL
);
1593 return (*result
) ? mystrdup(line_uniq(result
, MSEP_REC
)) : NULL
;
#ifdef HUNSPELL_EXPERIMENTAL
// Morphological description of the first suggestion for a misspelled word.
//
// Returns the result of suggest_morph() for that suggestion, or NULL when the
// list cannot be allocated or suggest() does not fill the last slot.
char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
{
    char * p = NULL;
    char ** wlst = (char **) calloc(maxSug, sizeof(char *));
    // test the allocation itself: the slots of a fresh calloc'ed list are all
    // NULL and must not be dereferenced
    if (!wlst) return NULL;
    // we will use only the first suggestion: the first maxSug - 1 slots get
    // placeholders and suggest() is told that many entries already exist, so
    // a new suggestion can only land in the last slot
    for (int i = 0; i < maxSug - 1; i++) wlst[i] = (char *) "";
    int ns = suggest(&wlst, word, maxSug - 1, NULL);
    if (ns == maxSug) {
        p = suggest_morph(wlst[maxSug - 1]);
        free(wlst[maxSug - 1]);
    }
    // NOTE(review): the placeholders are string literals; this relies on
    // suggest() never freeing or writing to the pre-filled slots -- confirm.
    if (wlst) free(wlst);
    return p;
}
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
1615 char * SuggestMgr::suggest_hentry_gen(hentry
* rv
, char * pattern
)
1617 char result
[MAXLNLEN
];
1619 int sfxcount
= get_sfxcount(pattern
);
1621 if (get_sfxcount(HENTRY_DATA(rv
)) > sfxcount
) return NULL
;
1623 if (HENTRY_DATA(rv
)) {
1624 char * aff
= pAMgr
->morphgen(HENTRY_WORD(rv
), rv
->blen
, rv
->astr
, rv
->alen
,
1625 HENTRY_DATA(rv
), pattern
, 0);
1627 mystrcat(result
, aff
, MAXLNLEN
);
1628 mystrcat(result
, "\n", MAXLNLEN
);
1633 // check all allomorphs
1634 char allomorph
[MAXLNLEN
];
1636 if (HENTRY_DATA(rv
)) p
= (char *) strstr(HENTRY_DATA2(rv
), MORPH_ALLOMORPH
);
1638 struct hentry
* rv2
= NULL
;
1640 int plen
= fieldlen(p
);
1641 strncpy(allomorph
, p
, plen
);
1642 allomorph
[plen
] = '\0';
1643 rv2
= pAMgr
->lookup(allomorph
);
1645 // if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) {
1646 if (HENTRY_DATA(rv2
)) {
1647 char * st
= (char *) strstr(HENTRY_DATA2(rv2
), MORPH_STEM
);
1648 if (st
&& (strncmp(st
+ MORPH_TAG_LEN
,
1649 HENTRY_WORD(rv
), fieldlen(st
+ MORPH_TAG_LEN
)) == 0)) {
1650 char * aff
= pAMgr
->morphgen(HENTRY_WORD(rv2
), rv2
->blen
, rv2
->astr
, rv2
->alen
,
1651 HENTRY_DATA(rv2
), pattern
, 0);
1653 mystrcat(result
, aff
, MAXLNLEN
);
1654 mystrcat(result
, "\n", MAXLNLEN
);
1659 rv2
= rv2
->next_homonym
;
1661 p
= strstr(p
+ plen
, MORPH_ALLOMORPH
);
1664 return (*result
) ? mystrdup(result
) : NULL
;
1667 char * SuggestMgr::suggest_gen(char ** desc
, int n
, char * pattern
) {
1668 char result
[MAXLNLEN
];
1669 char result2
[MAXLNLEN
];
1670 char newpattern
[MAXLNLEN
];
1672 if (n
== 0) return 0;
1674 struct hentry
* rv
= NULL
;
1675 if (!pAMgr
) return NULL
;
1677 // search affixed forms with and without derivational suffixes
1680 for (int k
= 0; k
< n
; k
++) {
1682 // add compound word parts (except the last one)
1683 char * s
= (char *) desc
[k
];
1684 char * part
= strstr(s
, MORPH_PART
);
1686 char * nextpart
= strstr(part
+ 1, MORPH_PART
);
1688 copy_field(result
+ strlen(result
), part
, MORPH_PART
);
1690 nextpart
= strstr(part
+ 1, MORPH_PART
);
1698 char * alt
= strstr(tok
, " | ");
1701 alt
= strstr(alt
, " | ");
1703 int pln
= line_tok(tok
, &pl
, MSEP_ALT
);
1704 for (int i
= 0; i
< pln
; i
++) {
1705 // remove inflectional and terminal suffixes
1706 char * is
= strstr(pl
[i
], MORPH_INFL_SFX
);
1708 char * ts
= strstr(pl
[i
], MORPH_TERM_SFX
);
1711 ts
= strstr(pl
[i
], MORPH_TERM_SFX
);
1713 char * st
= strstr(s
, MORPH_STEM
);
1715 copy_field(tok
, st
, MORPH_STEM
);
1716 rv
= pAMgr
->lookup(tok
);
1718 char newpat
[MAXLNLEN
];
1719 strcpy(newpat
, pl
[i
]);
1720 strcat(newpat
, pattern
);
1721 char * sg
= suggest_hentry_gen(rv
, newpat
);
1722 if (!sg
) sg
= suggest_hentry_gen(rv
, pattern
);
1725 int genl
= line_tok(sg
, &gen
, MSEP_REC
);
1728 for (int j
= 0; j
< genl
; j
++) {
1729 if (strstr(pl
[i
], MORPH_SURF_PFX
)) {
1730 int r2l
= strlen(result2
);
1731 result2
[r2l
] = MSEP_REC
;
1732 strcpy(result2
+ r2l
+ 1, result
);
1733 copy_field(result2
+ strlen(result2
), pl
[i
], MORPH_SURF_PFX
);
1734 mystrcat(result2
, gen
[j
], MAXLNLEN
);
1736 sprintf(result2
+ strlen(result2
), "%c%s%s",
1737 MSEP_REC
, result
, gen
[j
]);
1740 freelist(&gen
, genl
);
1742 rv
= rv
->next_homonym
;
1749 if (*result2
|| !strstr(pattern
, MORPH_DERI_SFX
)) break;
1750 strcpy(newpattern
, pattern
);
1751 pattern
= newpattern
;
1752 char * ds
= strstr(pattern
, MORPH_DERI_SFX
);
1754 strncpy(ds
, MORPH_TERM_SFX
, MORPH_TAG_LEN
);
1755 ds
= strstr(pattern
, MORPH_DERI_SFX
);
1758 return (*result2
? mystrdup(result2
) : NULL
);
1762 // generate an n-gram score comparing s1 and s2
1763 int SuggestMgr::ngram(int n
, char * s1
, const char * s2
, int opt
)
1773 l1
= u8_u16(su1
, MAXSWL
, s1
);
1774 l2
= u8_u16(su2
, MAXSWL
, s2
);
1775 if ((l2
<= 0) || (l1
== -1)) return 0;
1776 // lowering dictionary word
1777 if (opt
& NGRAM_LOWERING
) mkallsmall_utf(su2
, l2
, langnum
);
1778 for (int j
= 1; j
<= n
; j
++) {
1780 for (int i
= 0; i
<= (l1
-j
); i
++) {
1781 for (int l
= 0; l
<= (l2
-j
); l
++) {
1783 for (k
= 0; (k
< j
); k
++) {
1784 w_char
* c1
= su1
+ i
+ k
;
1785 w_char
* c2
= su2
+ l
+ k
;
1786 if ((c1
->l
!= c2
->l
) || (c1
->h
!= c2
->h
)) break;
1794 nscore
= nscore
+ ns
;
1801 if (l2
== 0) return 0;
1803 if (opt
& NGRAM_LOWERING
) mkallsmall(t
, csconv
);
1804 for (int j
= 1; j
<= n
; j
++) {
1806 for (int i
= 0; i
<= (l1
-j
); i
++) {
1807 char c
= *(s1
+ i
+ j
);
1808 *(s1
+ i
+ j
) = '\0';
1809 if (strstr(t
,(s1
+i
))) ns
++;
1812 nscore
= nscore
+ ns
;
1818 if (opt
& NGRAM_LONGER_WORSE
) ns
= (l2
-l1
)-2;
1819 if (opt
& NGRAM_ANY_MISMATCH
) ns
= abs(l2
-l1
)-2;
1820 ns
= (nscore
- ((ns
> 0) ? ns
: 0));
1824 // length of the left common substring of s1 and (decapitalised) s2
1825 int SuggestMgr::leftcommonsubstring(char * s1
, const char * s2
) {
1829 // decapitalize dictionary word
1830 if (complexprefixes
) {
1831 int l1
= u8_u16(su1
, MAXSWL
, s1
);
1832 int l2
= u8_u16(su2
, MAXSWL
, s2
);
1833 if (*((short *)su1
+l1
-1) == *((short *)su2
+l2
-1)) return 1;
1838 unsigned short idx
= (su2
->h
<< 8) + su2
->l
;
1839 if (*((short *)su1
) != *((short *)su2
) &&
1840 (*((unsigned short *)su1
) != unicodetolower(idx
, langnum
))) return 0;
1841 int l1
= u8_u16(su1
, MAXSWL
, s1
);
1842 int l2
= u8_u16(su2
, MAXSWL
, s2
);
1843 for(i
= 1; (i
< l1
) && (i
< l2
) &&
1844 (*((short *)(su1
+ i
)) == *((short *)(su2
+ i
))); i
++);
1848 if (complexprefixes
) {
1849 int l1
= strlen(s1
);
1850 int l2
= strlen(s2
);
1851 if (*(s2
+l1
-1) == *(s2
+l2
-1)) return 1;
1854 // decapitalise dictionary word
1855 if ((*s1
!= *s2
) && (*s1
!= csconv
[((unsigned char)*s2
)].clower
)) return 0;
1858 } while ((*s1
== *s2
) && (*s1
!= '\0'));
1865 int SuggestMgr::commoncharacterpositions(char * s1
, const char * s2
, int * is_swap
) {
1873 int l1
= u8_u16(su1
, MAXSWL
, s1
);
1874 int l2
= u8_u16(su2
, MAXSWL
, s2
);
1875 // decapitalize dictionary word
1876 if (complexprefixes
) {
1877 mkallsmall_utf(su2
+l2
-1, 1, langnum
);
1879 mkallsmall_utf(su2
, 1, langnum
);
1881 for (int i
= 0; (i
< l1
) && (i
< l2
); i
++) {
1882 if (((short *) su1
)[i
] == ((short *) su2
)[i
]) {
1885 if (diff
< 2) diffpos
[diff
] = i
;
1889 if ((diff
== 2) && (l1
== l2
) &&
1890 (((short *) su1
)[diffpos
[0]] == ((short *) su2
)[diffpos
[1]]) &&
1891 (((short *) su1
)[diffpos
[1]] == ((short *) su2
)[diffpos
[0]])) *is_swap
= 1;
1896 // decapitalize dictionary word
1897 if (complexprefixes
) {
1899 *(t
+l2
-1) = csconv
[((unsigned char)*(t
+l2
-1))].clower
;
1901 mkallsmall(t
, csconv
);
1903 for (i
= 0; (*(s1
+i
) != 0) && (*(t
+i
) != 0); i
++) {
1904 if (*(s1
+i
) == *(t
+i
)) {
1907 if (diff
< 2) diffpos
[diff
] = i
;
1911 if ((diff
== 2) && (*(s1
+i
) == 0) && (*(t
+i
) == 0) &&
1912 (*(s1
+diffpos
[0]) == *(t
+diffpos
[1])) &&
1913 (*(s1
+diffpos
[1]) == *(t
+diffpos
[0]))) *is_swap
= 1;
1918 int SuggestMgr::mystrlen(const char * word
) {
1921 return u8_u16(w
, MAXSWL
, word
);
1922 } else return strlen(word
);
1925 // sort in decreasing order of score
1926 void SuggestMgr::bubblesort(char** rword
, char** rword2
, int* rsc
, int n
)
1932 if (rsc
[j
-1] < rsc
[j
]) {
1933 int sctmp
= rsc
[j
-1];
1934 char * wdtmp
= rword
[j
-1];
1936 rword
[j
-1] = rword
[j
];
1940 wdtmp
= rword2
[j
-1];
1941 rword2
[j
-1] = rword2
[j
];
1952 // longest common subsequence
1953 void SuggestMgr::lcs(const char * s
, const char * s2
, int * l1
, int * l2
, char ** result
) {
1962 m
= u8_u16(su
, MAXSWL
, s
);
1963 n
= u8_u16(su2
, MAXSWL
, s2
);
1968 c
= (char *) malloc((m
+ 1) * (n
+ 1));
1969 b
= (char *) malloc((m
+ 1) * (n
+ 1));
1976 for (i
= 1; i
<= m
; i
++) c
[i
*(n
+1)] = 0;
1977 for (j
= 0; j
<= n
; j
++) c
[j
] = 0;
1978 for (i
= 1; i
<= m
; i
++) {
1979 for (j
= 1; j
<= n
; j
++) {
1980 if ((utf8
) && (*((short *) su
+i
-1) == *((short *)su2
+j
-1))
1981 || (!utf8
) && ((*(s
+i
-1)) == (*(s2
+j
-1)))) {
1982 c
[i
*(n
+1) + j
] = c
[(i
-1)*(n
+1) + j
-1]+1;
1983 b
[i
*(n
+1) + j
] = LCS_UPLEFT
;
1984 } else if (c
[(i
-1)*(n
+1) + j
] >= c
[i
*(n
+1) + j
-1]) {
1985 c
[i
*(n
+1) + j
] = c
[(i
-1)*(n
+1) + j
];
1986 b
[i
*(n
+1) + j
] = LCS_UP
;
1988 c
[i
*(n
+1) + j
] = c
[i
*(n
+1) + j
-1];
1989 b
[i
*(n
+1) + j
] = LCS_LEFT
;
1999 int SuggestMgr::lcslen(const char * s
, const char* s2
) {
2006 lcs(s
, s2
, &m
, &n
, &result
);
2007 if (!result
) return 0;
2010 while ((i
!= 0) && (j
!= 0)) {
2011 if (result
[i
*(n
+1) + j
] == LCS_UPLEFT
) {
2015 } else if (result
[i
*(n
+1) + j
] == LCS_UP
) {