1 in addition to that: configure.ac portion was fixed to not have unbalanced []
3 From d9f392dc35f75b1246862b2db8090e8d5b6ec068 Mon Sep 17 00:00:00 2001
4 From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
5 Date: Sun, 17 Jun 2018 17:21:01 +0200
6 Subject: [PATCH] recent Hunspell fixes for suggestion, spelling and analysis
8 6f976bf fix compiling on WIN32, use time.h and thread_local
9 24f0963 [morph] better time limitation for morphological analysis
10 8e6ceaa [spelling] tdf#118162 better time limitation for compounding
11 3f00ff3 [suggestion] tdf#118162 time limit for a HunspellImpl::suggest() call
12 a1f9dfa [suggestion] tdf#118162 time limit for a SuggestMgr::suggest() call
13 d70bf2d [spelling] optimize IGNORE to speed up dictionary loading
14 16b4900 [spelling] add time limit for compound word handling
15 b0ded55 [suggestion] lower limit for doubletwochars
16 b3a44fa [suggestion] limit longswapchar, lower limit for movechar
17 a295af9 [morph] clean up for separators of morphological analysis
18 ca5f629 [morph] add missing field separator for members with prefixes
22 src/hunspell/affentry.cxx | 12 +--
23 src/hunspell/affixmgr.cxx | 89 +++++++++++++------
24 src/hunspell/atypes.hxx | 10 +++
25 src/hunspell/csutil.hxx | 12 +++
26 src/hunspell/hashmgr.cxx | 2 +-
27 src/hunspell/hunspell.cxx | 210 ++++++++++++++++++++++++++------------------
28 src/hunspell/hunvisapi.h | 12 ++-
29 src/hunspell/hunvisapi.h.in | 12 ++-
30 src/hunspell/suggestmgr.cxx | 72 +++++++++++----
31 src/hunspell/suggestmgr.hxx | 5 --
32 12 files changed, 300 insertions(+), 145 deletions(-)
34 diff --git a/Makefile.in b/Makefile.in
35 index 06d933e..241f797 100644
38 @@ -296,6 +296,7 @@ GMSGFMT = @GMSGFMT@
39 GMSGFMT_015 = @GMSGFMT_015@
41 HAVE_ASPRINTF = @HAVE_ASPRINTF@
42 +HAVE_CXX11 = @HAVE_CXX11@
43 HAVE_NEWLOCALE = @HAVE_NEWLOCALE@
44 HAVE_POSIX_PRINTF = @HAVE_POSIX_PRINTF@
45 HAVE_SNPRINTF = @HAVE_SNPRINTF@
46 diff --git a/configure.ac b/configure.ac
47 index fb79d0d..2936107 100644
50 @@ -16,6 +16,14 @@ HUNSPELL_VERSION_MINOR=`echo $VERSION | cut -d"." -f2`
51 AC_SUBST(HUNSPELL_VERSION_MAJOR)
52 AC_SUBST(HUNSPELL_VERSION_MINOR)
54 +# check C++11 compiling environment for thread_local
55 +# to handle time limits better also with threads
57 + [*-std=c++11*], [HAVE_CXX11=1],
62 # Checks for programs.
65 diff --git a/src/hunspell/affentry.cxx b/src/hunspell/affentry.cxx
66 index 4ef0c00..ffcdb21 100644
67 --- a/src/hunspell/affentry.cxx
68 +++ b/src/hunspell/affentry.cxx
69 @@ -399,28 +399,28 @@ std::string PfxEntry::check_morph(const char* word,
70 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
71 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
74 + result.push_back(MSEP_FLD);
75 result.append(morphcode);
77 result.append(getKey());
78 if (!HENTRY_FIND(he, MORPH_STEM)) {
80 + result.push_back(MSEP_FLD);
81 result.append(MORPH_STEM);
82 result.append(HENTRY_WORD(he));
84 // store the pointer of the hash entry
85 if (HENTRY_DATA(he)) {
87 + result.push_back(MSEP_FLD);
88 result.append(HENTRY_DATA2(he));
90 // return with debug information
91 char* flag = pmyMgr->encode_flag(getFlag());
93 + result.push_back(MSEP_FLD);
94 result.append(MORPH_FLAG);
98 - result.append("\n");
99 + result.push_back(MSEP_REC);
101 he = he->next_homonym;
103 @@ -804,7 +804,7 @@ std::string SfxEntry::check_twosfx_morph(const char* word,
105 if (ppfx->getMorph()) {
106 result.append(ppfx->getMorph());
107 - result.append(" ");
108 + result.push_back(MSEP_FLD);
112 diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
113 index 2c540f2..1610ef0 100644
114 --- a/src/hunspell/affixmgr.cxx
115 +++ b/src/hunspell/affixmgr.cxx
124 @@ -1014,7 +1015,7 @@ int AffixMgr::process_sfx_order() {
125 // add flags to the result for dictionary debugging
126 std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
127 char* st = encode_flag(flag);
128 - result.append(" ");
129 + result.push_back(MSEP_FLD);
130 result.append(MORPH_FLAG);
133 @@ -1594,6 +1595,17 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
137 + // add a time limit to handle possible
138 + // combinatorial explosion of the overlapping words
140 + HUNSPELL_THREAD_LOCAL clock_t timelimit;
143 + timelimit = clock();
144 + else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
148 setcminmax(&cmin, &cmax, word.c_str(), len);
151 @@ -1618,6 +1630,9 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
153 do { // simplified checkcompoundpattern loop
155 + if (timelimit == 0)
159 for (; scpd <= checkcpdtable.size() &&
160 (checkcpdtable[scpd - 1].pattern3.empty() ||
161 @@ -2186,6 +2201,17 @@ int AffixMgr::compound_check_morph(const char* word,
163 hentry** oldwords = words;
165 + // add a time limit to handle possible
166 + // combinatorial explosion of the overlapping words
168 + HUNSPELL_THREAD_LOCAL clock_t timelimit;
171 + timelimit = clock();
172 + else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
176 setcminmax(&cmin, &cmax, word, len);
179 @@ -2204,6 +2230,9 @@ int AffixMgr::compound_check_morph(const char* word,
181 do { // onlycpdrule loop
183 + if (timelimit == 0)
186 oldnumsyllable = numsyllable;
187 oldwordnum = wordnum;
189 @@ -2245,6 +2274,9 @@ int AffixMgr::compound_check_morph(const char* word,
190 rv = rv->next_homonym;
193 + if (timelimit == 0)
199 @@ -2435,22 +2467,22 @@ int AffixMgr::compound_check_morph(const char* word,
201 if (rv && words && words[wnum + 1]) {
202 result.append(presult);
203 - result.append(" ");
204 + result.push_back(MSEP_FLD);
205 result.append(MORPH_PART);
206 result.append(word + i);
207 if (complexprefixes && HENTRY_DATA(rv))
208 result.append(HENTRY_DATA2(rv));
209 if (!HENTRY_FIND(rv, MORPH_STEM)) {
210 - result.append(" ");
211 + result.push_back(MSEP_FLD);
212 result.append(MORPH_STEM);
213 result.append(HENTRY_WORD(rv));
215 // store the pointer of the hash entry
216 if (!complexprefixes && HENTRY_DATA(rv)) {
217 - result.append(" ");
218 + result.push_back(MSEP_FLD);
219 result.append(HENTRY_DATA2(rv));
221 - result.append("\n");
222 + result.push_back(MSEP_REC);
226 @@ -2492,7 +2524,7 @@ int AffixMgr::compound_check_morph(const char* word,
227 ((!checkcompounddup || (rv != rv_first)))) {
229 result.append(presult);
230 - result.append(" ");
231 + result.push_back(MSEP_FLD);
232 result.append(MORPH_PART);
233 result.append(word + i);
235 @@ -2500,17 +2532,17 @@ int AffixMgr::compound_check_morph(const char* word,
237 result.append(HENTRY_DATA2(rv));
238 if (!HENTRY_FIND(rv, MORPH_STEM)) {
239 - result.append(" ");
240 + result.push_back(MSEP_FLD);
241 result.append(MORPH_STEM);
242 result.append(HENTRY_WORD(rv));
244 // store the pointer of the hash entry
245 if (!complexprefixes) {
246 - result.append(" ");
247 + result.push_back(MSEP_FLD);
248 result.append(HENTRY_DATA2(rv));
251 - result.append("\n");
252 + result.push_back(MSEP_REC);
256 @@ -2549,7 +2581,7 @@ int AffixMgr::compound_check_morph(const char* word,
257 line_uniq_app(m, MSEP_REC);
260 - result.append("\n");
261 + result.push_back(MSEP_REC);
265 @@ -2639,6 +2671,7 @@ int AffixMgr::compound_check_morph(const char* word,
266 result.append(MORPH_PART);
267 result.append(word + i);
268 line_uniq_app(m, MSEP_REC);
269 + result.push_back(MSEP_FLD);
272 result.push_back(MSEP_REC);
273 @@ -2864,17 +2897,17 @@ std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
275 if (ppfx->getMorph()) {
276 result.append(ppfx->getMorph());
277 - result.append(" ");
278 + result.push_back(MSEP_FLD);
280 debugflag(result, ppfx->getFlag());
283 if (se->getMorph()) {
284 - result.append(" ");
285 + result.push_back(MSEP_FLD);
286 result.append(se->getMorph());
288 debugflag(result, se->getFlag());
289 - result.append("\n");
290 + result.push_back(MSEP_REC);
294 @@ -2899,12 +2932,12 @@ std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
297 if (sptr->getMorph()) {
298 - result3.append(" ");
299 + result3.push_back(MSEP_FLD);
300 result3.append(sptr->getMorph());
302 debugflag(result3, sptr->getFlag());
303 strlinecat(result2, result3);
304 - result2.append("\n");
305 + result2.push_back(MSEP_REC);
306 result.append(result2);
309 @@ -2967,28 +3000,28 @@ std::string AffixMgr::suffix_check_morph(const char* word,
311 if (ppfx->getMorph()) {
312 result.append(ppfx->getMorph());
313 - result.append(" ");
314 + result.push_back(MSEP_FLD);
316 debugflag(result, ppfx->getFlag());
318 if (complexprefixes && HENTRY_DATA(rv))
319 result.append(HENTRY_DATA2(rv));
320 if (!HENTRY_FIND(rv, MORPH_STEM)) {
321 - result.append(" ");
322 + result.push_back(MSEP_FLD);
323 result.append(MORPH_STEM);
324 result.append(HENTRY_WORD(rv));
327 if (!complexprefixes && HENTRY_DATA(rv)) {
328 - result.append(" ");
329 + result.push_back(MSEP_FLD);
330 result.append(HENTRY_DATA2(rv));
332 if (se->getMorph()) {
333 - result.append(" ");
334 + result.push_back(MSEP_FLD);
335 result.append(se->getMorph());
337 debugflag(result, se->getFlag());
338 - result.append("\n");
339 + result.push_back(MSEP_REC);
340 rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
343 @@ -3034,29 +3067,29 @@ std::string AffixMgr::suffix_check_morph(const char* word,
345 if (ppfx->getMorph()) {
346 result.append(ppfx->getMorph());
347 - result.append(" ");
348 + result.push_back(MSEP_FLD);
350 debugflag(result, ppfx->getFlag());
352 if (complexprefixes && HENTRY_DATA(rv))
353 result.append(HENTRY_DATA2(rv));
354 if (!HENTRY_FIND(rv, MORPH_STEM)) {
355 - result.append(" ");
356 + result.push_back(MSEP_FLD);
357 result.append(MORPH_STEM);
358 result.append(HENTRY_WORD(rv));
361 if (!complexprefixes && HENTRY_DATA(rv)) {
362 - result.append(" ");
363 + result.push_back(MSEP_FLD);
364 result.append(HENTRY_DATA2(rv));
367 if (sptr->getMorph()) {
368 - result.append(" ");
369 + result.push_back(MSEP_FLD);
370 result.append(sptr->getMorph());
372 debugflag(result, sptr->getFlag());
373 - result.append("\n");
374 + result.push_back(MSEP_REC);
375 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
377 sptr = sptr->getNextEQ();
378 @@ -3245,7 +3278,7 @@ std::string AffixMgr::morphgen(const char* ts,
379 // use input suffix fields, if exist
380 if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
381 mymorph.assign(morph);
382 - mymorph.append(" ");
383 + mymorph.push_back(MSEP_FLD);
384 stemmorphcatpos = mymorph.size();
386 stemmorphcatpos = std::string::npos;
387 @@ -4557,7 +4590,7 @@ bool AffixMgr::parse_affix(const std::string& line,
388 entry->appnd = std::string(start_piece, dash);
389 std::string dash_str(dash + 1, iter);
391 - if (!ignorechars.empty()) {
392 + if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
394 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
396 @@ -4593,7 +4626,7 @@ bool AffixMgr::parse_affix(const std::string& line,
398 entry->appnd = std::string(start_piece, iter);
400 - if (!ignorechars.empty()) {
401 + if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
403 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
405 diff --git a/src/hunspell/atypes.hxx b/src/hunspell/atypes.hxx
406 index f841523..38396db 100644
407 --- a/src/hunspell/atypes.hxx
408 +++ b/src/hunspell/atypes.hxx
409 @@ -95,6 +95,16 @@ static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {}
411 #define TESTAFF(a, b, c) (std::binary_search(a, a + c, b))
413 +// timelimit: max. ~1/4 sec (process time on Linux)
414 +// for a suggestion, including max. ~1/10 sec for a case
415 +// sensitive plain or compound word suggestion, and max.
416 +// ~1/20 sec for each long, time-consuming suggestion function
417 +#define TIMELIMIT_GLOBAL (CLOCKS_PER_SEC / 4)
418 +#define TIMELIMIT_SUGGESTION (CLOCKS_PER_SEC / 10)
419 +#define TIMELIMIT (CLOCKS_PER_SEC / 20)
420 +#define MINTIMER 100
421 +#define MAXPLUSTIMER 100
426 diff --git a/src/hunspell/csutil.hxx b/src/hunspell/csutil.hxx
427 index 01c0a24..3397257 100644
428 --- a/src/hunspell/csutil.hxx
429 +++ b/src/hunspell/csutil.hxx
430 @@ -311,4 +311,16 @@ LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry* h,
431 return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
434 +// to avoid unnecessary string copies and Unicode conversions
435 +// we simply check the ignored_chars characters in the word
436 +// (in the case of UTF-8 encoded strings, "false" means
437 +// "likely false", if ignored_chars characters are not ASCII)
438 +inline bool has_no_ignored_chars(const std::string& word,
439 + const std::string& ignored_chars) {
440 + for (std::string::const_iterator it = ignored_chars.begin(), end = ignored_chars.end(); it != end; ++it)
441 + if (word.find(*it) != std::string::npos)
447 diff --git a/src/hunspell/hashmgr.cxx b/src/hunspell/hashmgr.cxx
448 index 5183f02..7e843c3 100644
449 --- a/src/hunspell/hashmgr.cxx
450 +++ b/src/hunspell/hashmgr.cxx
451 @@ -190,7 +190,7 @@ int HashMgr::add_word(const std::string& in_word,
453 std::string *word_copy = NULL;
454 std::string *desc_copy = NULL;
455 - if (!ignorechars.empty() || complexprefixes) {
456 + if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) {
457 word_copy = new std::string(in_word);
459 if (!ignorechars.empty()) {
460 diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
461 index d6e871f..0dcd748 100644
462 --- a/src/hunspell/hunspell.cxx
463 +++ b/src/hunspell/hunspell.cxx
470 #include "affixmgr.hxx"
471 #include "hunspell.hxx"
472 @@ -101,7 +102,8 @@ public:
473 bool spell(const std::string& word, int* info = NULL, std::string* root = NULL);
474 bool spell_internal(const std::string& word, int* info = NULL, std::string* root = NULL);
475 std::vector<std::string> suggest(const std::string& word);
476 - std::vector<std::string> suggest_internal(const std::string& word);
477 + std::vector<std::string> suggest_internal(const std::string& word,
478 + bool& capitalized, size_t& abbreviated, int& captype);
479 const std::string& get_wordchars() const;
480 const std::vector<w_char>& get_wordchars_utf16() const;
481 const std::string& get_dict_encoding() const;
482 @@ -755,7 +757,7 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
485 const char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL;
486 - if (ignoredchars != NULL) {
487 + if (ignoredchars != NULL && !has_no_ignored_chars(w, ignoredchars)) {
490 const std::vector<w_char>& ignoredchars_utf16 =
491 @@ -887,8 +889,83 @@ std::vector<std::string> Hunspell::suggest(const std::string& word) {
494 std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
495 - std::vector<std::string> slst;
496 - slst = suggest_internal(word);
500 + std::vector<std::string> slst = suggest_internal(word, capwords, abbv, captype);
501 + // word reversing wrapper for complex prefixes
502 + if (complexprefixes) {
503 + for (size_t j = 0; j < slst.size(); ++j) {
505 + reverseword_utf(slst[j]);
507 + reverseword(slst[j]);
513 + for (size_t j = 0; j < slst.size(); ++j) {
514 + mkinitcap(slst[j]);
517 + // expand suggestions with dot(s)
518 + if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
519 + for (size_t j = 0; j < slst.size(); ++j) {
520 + slst[j].append(word.substr(word.size() - abbv));
524 + // remove bad capitalized and forbidden forms
525 + if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
530 + for (size_t j = 0; j < slst.size(); ++j) {
531 + if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) {
533 + std::vector<w_char> w;
535 + u8_u16(w, slst[j]);
560 + // remove duplications
562 + for (size_t j = 0; j < slst.size(); ++j) {
564 + for (size_t k = 0; k < l; ++k) {
565 + if (slst[k] == slst[j]) {
575 RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
577 @@ -902,7 +979,8 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
581 -std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word) {
582 +std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
583 + bool& capwords, size_t& abbv, int& captype) {
584 std::vector<std::string> slst;
587 @@ -920,8 +998,8 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
588 if (word.size() >= MAXWORDLEN)
591 - int captype = NOCAP;
598 @@ -942,9 +1020,13 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
606 + HUNSPELL_THREAD_LOCAL clock_t timelimit;
607 + // initialize in every suggestion call
608 + timelimit = clock();
610 // check capitalized form for FORCEUCASE
611 if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
612 int info = SPELL_ORIGCAP;
613 @@ -959,26 +1041,36 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
616 good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
617 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
620 std::string wspace(scw);
621 wspace.push_back('.');
622 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
623 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
632 good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
633 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
635 std::string wspace(scw);
636 mkallsmall2(wspace, sunicw);
637 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
638 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
646 good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
647 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
649 // something.The -> something. The
650 size_t dot_pos = scw.find('.');
651 if (dot_pos != std::string::npos) {
652 @@ -1005,6 +1097,8 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
654 mkinitsmall2(wspace, sunicw);
655 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
656 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
660 mkallsmall2(wspace, sunicw);
661 @@ -1012,11 +1106,15 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
662 insert_sug(slst, wspace);
663 size_t prevns = slst.size();
664 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
665 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
667 if (captype == HUHINITCAP) {
668 mkinitcap2(wspace, sunicw);
669 if (spell(wspace.c_str()))
670 insert_sug(slst, wspace);
671 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
672 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
675 // aNew -> "a New" (instead of "a new")
676 for (size_t j = prevns; j < slst.size(); ++j) {
677 @@ -1044,10 +1142,14 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
678 std::string wspace(scw);
679 mkallsmall2(wspace, sunicw);
680 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
681 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
683 if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
684 insert_sug(slst, wspace);
685 mkinitcap2(wspace, sunicw);
686 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
687 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
689 for (size_t j = 0; j < slst.size(); ++j) {
691 if (pAMgr && pAMgr->get_checksharps()) {
692 @@ -1084,21 +1186,27 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
695 pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs, NOCAP);
696 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
704 std::string wspace(scw);
705 mkallsmall2(wspace, sunicw);
706 pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, HUHCAP);
707 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
714 std::string wspace(scw);
715 mkallsmall2(wspace, sunicw);
716 pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, INITCAP);
717 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
722 @@ -1106,6 +1214,8 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
723 mkallsmall2(wspace, sunicw);
724 size_t oldns = slst.size();
725 pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, ALLCAP);
726 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
728 for (size_t j = oldns; j < slst.size(); ++j) {
731 @@ -1137,6 +1247,8 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
732 std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
733 if (!spell(chunk.c_str())) {
734 std::vector<std::string> nlst = suggest(chunk.c_str());
735 + if (clock() > timelimit + TIMELIMIT_GLOBAL)
737 for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) {
738 std::string wspace = scw.substr(0, prev_pos);
740 @@ -1160,80 +1272,6 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word)
741 dash_pos = scw.size();
745 - // word reversing wrapper for complex prefixes
746 - if (complexprefixes) {
747 - for (size_t j = 0; j < slst.size(); ++j) {
749 - reverseword_utf(slst[j]);
751 - reverseword(slst[j]);
757 - for (size_t j = 0; j < slst.size(); ++j) {
758 - mkinitcap(slst[j]);
761 - // expand suggestions with dot(s)
762 - if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
763 - for (size_t j = 0; j < slst.size(); ++j) {
764 - slst[j].append(word.substr(word.size() - abbv));
768 - // remove bad capitalized and forbidden forms
769 - if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
774 - for (size_t j = 0; j < slst.size(); ++j) {
775 - if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) {
777 - std::vector<w_char> w;
779 - u8_u16(w, slst[j]);
804 - // remove duplications
806 - for (size_t j = 0; j < slst.size(); ++j) {
808 - for (size_t k = 0; k < l; ++k) {
809 - if (slst[k] == slst[j]) {
821 diff --git a/src/hunspell/hunvisapi.h b/src/hunspell/hunvisapi.h
822 index eb2b348..8283017 100644
823 --- a/src/hunspell/hunvisapi.h
824 +++ b/src/hunspell/hunvisapi.h
827 #if defined(HUNSPELL_STATIC)
828 # define LIBHUNSPELL_DLL_EXPORTED
829 -#elif defined(_MSC_VER)
830 +#elif defined(_WIN32)
831 # if defined(BUILDING_LIBHUNSPELL)
832 # define LIBHUNSPELL_DLL_EXPORTED __declspec(dllexport)
835 # define LIBHUNSPELL_DLL_EXPORTED
838 +/* use thread_local, if it's possible, otherwise static */
841 +# define HUNSPELL_THREAD_LOCAL thread_local
843 +# define HUNSPELL_THREAD_LOCAL thread_local
845 +# define HUNSPELL_THREAD_LOCAL static
849 diff --git a/src/hunspell/hunvisapi.h.in b/src/hunspell/hunvisapi.h.in
850 index a1020c8..85972dd 100644
851 --- a/src/hunspell/hunvisapi.h.in
852 +++ b/src/hunspell/hunvisapi.h.in
855 #if defined(HUNSPELL_STATIC)
856 # define LIBHUNSPELL_DLL_EXPORTED
857 -#elif defined(_MSC_VER)
858 +#elif defined(_WIN32)
859 # if defined(BUILDING_LIBHUNSPELL)
860 # define LIBHUNSPELL_DLL_EXPORTED __declspec(dllexport)
863 # define LIBHUNSPELL_DLL_EXPORTED
866 +/* use thread_local, if it's possible, otherwise static */
869 +# define HUNSPELL_THREAD_LOCAL thread_local
871 +# define HUNSPELL_THREAD_LOCAL thread_local
873 +# define HUNSPELL_THREAD_LOCAL static
877 diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
878 index ade85af..d9fabca 100644
879 --- a/src/hunspell/suggestmgr.cxx
880 +++ b/src/hunspell/suggestmgr.cxx
887 #include "suggestmgr.hxx"
888 #include "htypes.hxx"
891 const w_char W_VLINE = {'\0', '|'};
893 +#define MAX_CHAR_DISTANCE 4
895 SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) {
896 // register affix manager and check in string of chars to
897 // try when building candidate suggestions
898 @@ -211,6 +214,11 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
900 for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0) && !good_suggestion;
903 + HUNSPELL_THREAD_LOCAL clock_t timelimit;
904 + // initialize both in non-compound and compound cycles
905 + timelimit = clock();
907 // limit compound suggestion
909 oldSug = slst.size();
910 @@ -233,12 +241,16 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
912 good_suggestion = true;
914 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
915 + return good_suggestion;
917 // perhaps we made chose the wrong char from a related set
918 if ((slst.size() < maxSug) &&
919 (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
920 mapchars(slst, word, cpdsuggest);
922 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
923 + return good_suggestion;
925 // only suggest compound words when no other suggestion
926 if ((cpdsuggest == 0) && (slst.size() > nsugorig))
927 @@ -251,6 +263,8 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
929 swapchar(slst, word, cpdsuggest);
931 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
932 + return good_suggestion;
934 // did we swap the order of non adjacent chars by mistake
935 if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
936 @@ -259,6 +273,8 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
938 longswapchar(slst, word, cpdsuggest);
940 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
941 + return good_suggestion;
943 // did we just hit the wrong key in place of a good char (case and keyboard)
944 if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
945 @@ -267,6 +283,8 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
947 badcharkey(slst, word, cpdsuggest);
949 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
950 + return good_suggestion;
952 // did we add a char that should not be there
953 if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
954 @@ -275,6 +293,8 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
956 extrachar(slst, word, cpdsuggest);
958 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
959 + return good_suggestion;
961 // did we forgot a char
962 if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
963 @@ -283,6 +303,8 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
965 forgotchar(slst, word, cpdsuggest);
967 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
968 + return good_suggestion;
970 // did we move a char
971 if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
972 @@ -291,6 +313,8 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
974 movechar(slst, word, cpdsuggest);
976 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
977 + return good_suggestion;
979 // did we just hit the wrong key in place of a good char
980 if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
981 @@ -299,6 +323,8 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
983 badchar(slst, word, cpdsuggest);
985 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
986 + return good_suggestion;
988 // did we double two characters
989 if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
990 @@ -307,6 +333,8 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
992 doubletwochars(slst, word, cpdsuggest);
994 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
995 + return good_suggestion;
997 // perhaps we forgot to hit space and two words ran together
998 // (dictionary word pairs have top priority here, so
999 @@ -315,6 +343,8 @@ bool SuggestMgr::suggest(std::vector<std::string>& slst,
1000 if (!cpdsuggest || (!nosplitsugs && slst.size() < oldSug + maxcpdsugs)) {
1001 good_suggestion = twowords(slst, word, cpdsuggest, good_suggestion);
1003 + if (clock() > timelimit + TIMELIMIT_SUGGESTION)
1004 + return good_suggestion;
1006 } // repeating ``for'' statement compounding support
1008 @@ -469,8 +499,11 @@ int SuggestMgr::replchars(std::vector<std::string>& wlst,
1012 -// perhaps we doubled two characters (pattern aba -> ababa, for example vacation
1014 +// perhaps we doubled two characters
1015 +// (for example vacation -> vacacation)
1016 +// The recognized pattern with regex back-references:
1017 +// "(.)(.)\1\2\1" or "..(.)(.)\1\2"
1019 int SuggestMgr::doubletwochars(std::vector<std::string>& wlst,
1022 @@ -481,7 +514,7 @@ int SuggestMgr::doubletwochars(std::vector<std::string>& wlst,
1023 for (int i = 2; i < wl; i++) {
1024 if (word[i] == word[i - 2]) {
1027 + if (state == 3 || (state == 2 && i >= 4)) {
1028 std::string candidate(word, word + i - 1);
1029 candidate.insert(candidate.end(), word + i + 1, word + wl);
1030 testsug(wlst, candidate, cpdsuggest, NULL, NULL);
1031 @@ -494,8 +527,11 @@ int SuggestMgr::doubletwochars(std::vector<std::string>& wlst,
1035 -// perhaps we doubled two characters (pattern aba -> ababa, for example vacation
1037 +// perhaps we doubled two characters
1038 +// (for example vacation -> vacacation)
1039 +// The recognized pattern with regex back-references:
1040 +// "(.)(.)\1\2\1" or "..(.)(.)\1\2"
1042 int SuggestMgr::doubletwochars_utf(std::vector<std::string>& wlst,
1045 @@ -506,7 +542,7 @@ int SuggestMgr::doubletwochars_utf(std::vector<std::string>& wlst,
1046 for (int i = 2; i < wl; i++) {
1047 if (word[i] == word[i - 2]) {
1050 + if (state == 3 || (state == 2 && i >= 4)) {
1051 std::vector<w_char> candidate_utf(word, word + i - 1);
1052 candidate_utf.insert(candidate_utf.end(), word + i + 1, word + wl);
1053 std::string candidate;
1054 @@ -939,7 +975,8 @@ int SuggestMgr::longswapchar(std::vector<std::string>& wlst,
1055 // try swapping not adjacent chars one by one
1056 for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) {
1057 for (std::string::iterator q = candidate.begin(); q < candidate.end(); ++q) {
1058 - if (std::abs(std::distance(q, p)) > 1) {
1059 + size_t distance = std::abs(std::distance(q, p));
1060 + if (distance > 1 && distance <= MAX_CHAR_DISTANCE) {
1062 testsug(wlst, candidate, cpdsuggest, NULL, NULL);
1064 @@ -958,7 +995,8 @@ int SuggestMgr::longswapchar_utf(std::vector<std::string>& wlst,
1065 // try swapping not adjacent chars
1066 for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) {
1067 for (std::vector<w_char>::iterator q = candidate_utf.begin(); q < candidate_utf.end(); ++q) {
1068 - if (std::abs(std::distance(q, p)) > 1) {
1069 + size_t distance = std::abs(std::distance(q, p));
1070 + if (distance > 1 && distance <= MAX_CHAR_DISTANCE) {
1072 std::string candidate;
1073 u16_u8(candidate, candidate_utf);
1074 @@ -980,7 +1018,7 @@ int SuggestMgr::movechar(std::vector<std::string>& wlst,
1076 // try moving a char
1077 for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) {
1078 - for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) < 10; ++q) {
1079 + for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
1080 std::swap(*q, *(q - 1));
1081 if (std::distance(p, q) < 2)
1082 continue; // omit swap char
1083 @@ -990,7 +1028,7 @@ int SuggestMgr::movechar(std::vector<std::string>& wlst,
1086 for (std::string::reverse_iterator p = candidate.rbegin(), pEnd = candidate.rend() - 1; p != pEnd; ++p) {
1087 - for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) < 10; ++q) {
1088 + for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
1089 std::swap(*q, *(q - 1));
1090 if (std::distance(p, q) < 2)
1091 continue; // omit swap char
1092 @@ -1013,7 +1051,7 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst,
1094 // try moving a char
1095 for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) {
1096 - for (std::vector<w_char>::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) < 10; ++q) {
1097 + for (std::vector<w_char>::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
1098 std::swap(*q, *(q - 1));
1099 if (std::distance(p, q) < 2)
1100 continue; // omit swap char
1101 @@ -1025,7 +1063,7 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst,
1104 for (std::vector<w_char>::reverse_iterator p = candidate_utf.rbegin(); p < candidate_utf.rend(); ++p) {
1105 - for (std::vector<w_char>::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) < 10; ++q) {
1106 + for (std::vector<w_char>::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
1107 std::swap(*q, *(q - 1));
1108 if (std::distance(p, q) < 2)
1109 continue; // omit swap char
1110 @@ -1715,15 +1753,15 @@ std::string SuggestMgr::suggest_morph(const std::string& in_w) {
1111 TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
1112 TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) {
1113 if (!HENTRY_FIND(rv, MORPH_STEM)) {
1114 - result.append(" ");
1115 + result.push_back(MSEP_FLD);
1116 result.append(MORPH_STEM);
1119 if (HENTRY_DATA(rv)) {
1120 - result.append(" ");
1121 + result.push_back(MSEP_FLD);
1122 result.append(HENTRY_DATA2(rv));
1124 - result.append("\n");
1125 + result.push_back(MSEP_REC);
1127 rv = rv->next_homonym;
1129 @@ -1779,7 +1817,7 @@ std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) {
1130 HENTRY_DATA(rv), pattern, 0);
1133 - result.append("\n");
1134 + result.push_back(MSEP_REC);
1138 @@ -1803,7 +1841,7 @@ std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) {
1139 rv2->alen, HENTRY_DATA(rv2), pattern, 0);
1142 - result.append("\n");
1143 + result.push_back(MSEP_REC);
1147 diff --git a/src/hunspell/suggestmgr.hxx b/src/hunspell/suggestmgr.hxx
1148 index f0daf23..a435aac 100644
1149 --- a/src/hunspell/suggestmgr.hxx
1150 +++ b/src/hunspell/suggestmgr.hxx
1152 #define MAXPHONSUGS 2
1153 #define MAXCOMPOUNDSUGS 3
1155 -// timelimit: max ~1/4 sec (process time on Linux) for a time consuming function
1156 -#define TIMELIMIT (CLOCKS_PER_SEC >> 2)
1157 -#define MINTIMER 100
1158 -#define MAXPLUSTIMER 100
1160 #define NGRAM_LONGER_WORSE (1 << 0)
1161 #define NGRAM_ANY_MISMATCH (1 << 1)
1162 #define NGRAM_LOWERING (1 << 2)