external/hunspell/0001-Recent-Hunspell-fixes-and-improvements.patch

   1 From 9ad1696fb13d65e5d569b7106749dd4014877c15 Mon Sep 17 00:00:00 2001
   2 From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
   3 Date: Wed, 13 Dec 2017 19:27:30 +0100
   4 Subject: [PATCH] Recent Hunspell fixes and improvements
   5 MIME-Version: 1.0
   6 Content-Type: text/plain; charset=UTF-8
   7 Content-Transfer-Encoding: 8bit
   8
   9 Contains the following upstream patches:
  10
  11 commit 7ba5beb517310a942bafd7d6d08fc92beae0e439
  12 Author: László Németh <nemeth@numbertext.org>
  13 Date:   Wed Dec 13 19:01:35 2017 +0100
  14
  15     Support dictionary based REP replacements
  16
  17     using the following syntax in the dic file:
  18
  19     word ph:pattern->replacement
  20
  21 commit 711466a276d5d9f3a5f6e9089bb3262894196fbc
  22 Author: László Németh <nemeth@numbertext.org>
  23 Date:   Tue Dec 12 15:09:36 2017 +0100
  24
  25     fix compiler warnings
  26
  27 commit db142a3addc87bbbdd9a76bc519c69e8ad95af73
  28 Author: László Németh <nemeth@numbertext.org>
  29 Date:   Fri Dec 1 17:24:17 2017 +0100
  30
  31     Fix regression in Hungarian "moving rule"
  32
  33     from commit eb97eb789cec47a172f6e9a01db3e6cf3b8dc81d.
  34
  35     Dictionary words with COMPOUNDFORBIDFLAG are removed
  36     from the beginning and middle of compound words,
  37     overriding the effect of COMPOUNDPERMITFLAG,
  38     except in Hungarian "moving rule".
  39
  40     Add test example.
  41
  42 commit 05082b4e8a917cfbddefbc5fd2d543895b27f4c1
  43 Author: László Németh <nemeth@numbertext.org>
  44 Date:   Fri Dec 1 16:11:20 2017 +0100
  45
  46     BREAK: keep also break-at-first-break-point breaking
  47
  48     to handle the case of suffixes with dashes in compounds.
  49
  50     Add also test example.
  51
  52 commit caa24d60f1a4514d4e0ef48fa14105e85eb6514c
  53 Author: László Németh <nemeth@numbertext.org>
  54 Date:   Fri Dec 1 11:16:35 2017 +0100
  55
  56     Improve ph: usage for capitalization and Unicode
  57
  58     - at capitalized dictionary words, add lowercase ph: patterns
  59       to the REP rules in a capitalized form, too, to get correct
  60       suggestions for lowercase and capitalized misspellings:
  61
  62       Wednesday ph:wendsay (in dic file) results in
  63
  64       both wendsay and Wendsay -> Wednesday suggestions.
  65
  66       For German and Hungarian:
  67
  68       add also lowercase pattern -> lowercase dictionary word
  69       replacement to the REP rules, supporting lowercasing
  70       by compound word generation or derivational suffixes.
  71
  72     - fix UTF-8 support of starred ph: fields
  73
  74     - test examples
  75
  76 commit 8912f2ade54cdc186fe0580471063d92d99eb572
  77 Author: László Németh <nemeth@numbertext.org>
  78 Date:   Fri Dec 1 10:26:07 2017 +0100
  79
  80     Allow suggestion search for prefix + *two suffixes*
  81
  82     Remove artificial performance limit to get correct
  83     suggestions for relatively simple misspellings in
  84     Hungarian, etc., when the word form contains prefix
  85     and both derivative and inflectional suffixes, too:
  86
  87     lefikszálása -> lefixálása
  88
  89 commit eb97eb789cec47a172f6e9a01db3e6cf3b8dc81d
  90 Author: László Németh <nemeth@numbertext.org>
  91 Date:   Fri Dec 1 08:03:38 2017 +0100
  92
  93     Dictionary words with COMPOUNDFORBIDFLAG are removed
  94
  95     from the beginning and middle of compound words,
  96     overriding the effect of COMPOUNDPERMITFLAG.
  97
  98 commit 526f600e194aacbc3817df26f01d8c95c38bf582
  99 Author: László Németh <nemeth@numbertext.org>
 100 Date:   Wed Nov 29 14:58:46 2017 +0100
 101
 102     skip empty ph: field and support character stripping
 103
 104     at replacement rule creation.
 105
 106     When the ph: field ends with the character *,
 107     strip last character of the replacement (the correct word)
 108     and last two characters of the field (the * and last
 109     character of the matching pattern) in the replacement rule.
 110
 111     For example,
 112
 113     pretty ph:prity*
 114
 115     dictionary item results in a "prit -> prett" REP replacement
 116     rule instead of "prity -> pretty", to get
 117     "prity -> pretty" and "pritiest -> prettiest" suggestions.
 118
 119 commit ebdd308463a0e8432f56f12804976ba7029a95c4
 120 Author: László Németh <nemeth@numbertext.org>
 121 Date:   Wed Nov 29 13:13:21 2017 +0100
 122
 123     clean-up suggestion
 124
 125     - no ngram and compound word suggestions, if "good" suggestion
 126       exists, ie. uppercase, REP, ph: or dictionary word pair suggestions
 127
 128     - word pairs are always suggested, if they exist in the dic file
 129
 130     - word pairs have top priority in suggestions, and
 131       these are the only suggestions if there is no other good suggestion.
 132
 133     - also dictionary word pairs separated by dash instead of space
 134       are handled specially in two-word suggestion (depending on the
 135       language)
 136
 137 commit 066704985ae474999345f309c71b4929eff1ca95
 138 Author: László Németh <nemeth@numbertext.org>
 139 Date:   Tue Nov 28 12:55:35 2017 +0100
 140
 141     check dictionary word pairs to filter compound word overgeneration
 142
 143     Now it's possible to filter bad compound words by listing
 144     the correct word pairs with space in the dictionary.
 145
 146 commit bbf2eb4ad0c589c38d03321c8b126826d2284a3f
 147 Author: László Németh <nemeth@numbertext.org>
 148 Date:   Tue Nov 28 11:25:01 2017 +0100
 149
 150     word pairs of the dic file get highest suggestion priority
 151
 152     when the words are written without space.
 153
 154     Instead of using REP replacements, now it's enough to add
 155
 156     a lot
 157
 158     to the English dic file (like in a traditional spelling
 159     dictionary) to get suggestions for "alot" in the requested
 160     order:
 161
 162     alot
 163     & alot 7 0: a lot, alto, slot, alt, lot...
 164
 165     (without using word pairs or the REP replacements, the order was
 166
 167     alot
 168     & alot 7 0: alto, slot, alt, lot, a lot...)
 169
 170 commit 90cb55f8f1a21c7f62539baf8f3cf6f062080afd
 171 Author: László Németh <nemeth@numbertext.org>
 172 Date:   Tue Nov 28 09:57:23 2017 +0100
 173
 174     Clean-up ngram suggestions for lowercase words
 175
 176     don't suggest capitalized dictionary words for lower
 177     case misspellings in ngram suggestions, except
 178      - PHONE usage, or
 179      - in the case of German, where not only proper
 180        nouns are capitalized, or
 181      - the capitalized word has special pronunciation
 182
 183     - fix typos and comments
 184
 185 commit e80685c83d591b834c61866295577a9e214969cb
 186 Author: László Németh <nemeth@numbertext.org>
 187 Date:   Mon Nov 27 18:26:42 2017 +0100
 188
 189     Remove SUBSTANDARD dictionary roots from suggestions.
 190
 191 commit 89a8ec6ce47ac4442992f4f6ed606012b1a2b799
 192 Author: László Németh <nemeth@numbertext.org>
 193 Date:   Mon Nov 27 08:52:24 2017 +0100
 194
 195     Optimize condition order in walk_hashtable loop
 196
 197 commit 4e4106fc64bc26df10f8dc24e0e578abb70025c7
 198 Author: László Németh <nemeth@numbertext.org>
 199 Date:   Sat Nov 25 01:37:52 2017 +0100
 200
 201     Reduce strange ngram suggestions
 202
 203     - don't suggest proper names for lowercase
 204       misspellings, except in German
 205
 206     - length difference of misspellings and
 207       suggestions must be less than 5 characters
 208
 209     Other: search capitalized suggestions for lowercase misspellings
 210     without ngram suggestions, too.
 211
 212 commit 0b8a4d8851c94485dcc13cf8b8688c8d3fb9a783
 213 Author: László Németh <nemeth@numbertext.org>
 214 Date:   Fri Nov 24 20:01:09 2017 +0100
 215
 216     Use only middle replentries for compound word checking
 217
 218     allowing compound filtering for compound stems and affixed
 219     forms in every language.
 220
 221     This replaces the partial fix for the CHECKCOMPOUNDREP regression
 222     in commit 1fada01663b29b57c010a9c274e45a5cf9ecf222.
 223
 224 commit 957950b792fb0fda8fa95983434be265729bb75b
 225 Author: László Németh <nemeth@numbertext.org>
 226 Date:   Fri Nov 24 10:56:13 2017 +0100
 227
 228     Spelling dictionary should be a real spelling dictionary
 229
 230     Listing common misspellings of words and *word sequences*
 231     is the new recommended method to fix missing, incomplete or
 232     verbose suggestions. Combined with CHECKCOMPOUNDREP,
 233     this method can limit overgeneration of compound words
 234     in important cases, too.
 235
 236     For example, the following line in the dic file
 237
 238     a lot ph:alot
 239
 240     will result in the best suggestion ("a lot") for the bad "alot"
 241     appearing in first place in the suggestion list.
 242
 243     Use for:
 244
 245     - give correct suggestions (wendsay or wensday -> Wednesday)
 246
 247     Wednesday ph:wendsay ph:wensday
 248
 249     - set priority of good suggestions (eg. wich -> which, witch, winch)
 250
 251     which ph:wich
 252     witch ph:wich
 253
 254     - suggest with one or *more* spaces (eg. inspite->in spite)
 255
 256     in spite ph:inspite
 257     Oh, my gosh! ph:omg
 258
 259     - switch off ngram suggestions for a common misspelling
 260
 261     - better suggestion during affixation and compounding
 262
 263     With CHECKCOMPOUNDREP
 264
 265     - forbid bad compound words
 266
 267     Implementation details:
 268
 269     REP reptable created from REP definitions of the aff file and from
 270     "ph:" fields of the dic file (reptable contains phonetic and other
 271     common misspellings of letters, letter groups, morphemes and words
 272     for better suggestions). REP suggestions have greater priority in
 273     the suggestion list, and they switch off ngram suggestion
 274     search, avoiding overgeneration of suggestions.
 275
 276 commit 4a8921bd65b39e24344ef38c396e797384b74677
 277 Author: László Németh <nemeth@numbertext.org>
 278 Date:   Wed Nov 22 23:27:00 2017 +0100
 279
 280     BREAK tries to break at the second word break
 281
 282     to recognize dictionary words with word break characters
 283     (at the beginning of the compound word).
 284
 285     This fixes the problems with the new Hungarian orthography
 286     about compounding of words with n-dash.
 287
 288     Example:
 289
 290     The Hungarian compound word "e-mail-cím" (e-mail address)
 291     will break into "e-mail" (dictionary word) and "cím", instead
 292     of "e" and "mail-cím" ("mail" is not a dictionary word) at
 293     first level of recursive word breaking.
 294 ---
 295  src/hunspell/affixmgr.cxx   | 183 +++++++++++-----------------------
 296  src/hunspell/affixmgr.hxx   |   5 +-
 297  src/hunspell/csutil.hxx     |   6 +-
 298  src/hunspell/hashmgr.cxx    | 236 +++++++++++++++++++++++++++++++++++++++++---
 299  src/hunspell/hashmgr.hxx    |  15 ++-
 300  src/hunspell/htypes.hxx     |   9 +-
 301  src/hunspell/hunspell.cxx   |  75 ++++++++++----
 302  src/hunspell/suggestmgr.cxx | 200 ++++++++++++++++++++++++-------------
 303  src/hunspell/suggestmgr.hxx |   7 +-
 304  9 files changed, 503 insertions(+), 233 deletions(-)
 305
 306 diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
 307 index ffce7bb..a98071a 100644
 308 --- a/src/hunspell/affixmgr.cxx
 309 +++ b/src/hunspell/affixmgr.cxx
 310 @@ -96,7 +96,6 @@ AffixMgr::AffixMgr(const char* affpath,
 311    complexprefixes = 0;
 312    parsedmaptable = false;
 313    parsedbreaktable = false;
 314 -  parsedrep = false;
 315    iconvtable = NULL;
 316    oconvtable = NULL;
 317    // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
 318 @@ -529,14 +528,6 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
 319        }
 320      }
 321
 322 -    /* parse in the typical fault correcting table */
 323 -    if (line.compare(0, 3, "REP", 3) == 0) {
 324 -      if (!parse_reptable(line, afflst)) {
 325 -        finishFileMgr(afflst);
 326 -        return 1;
 327 -      }
 328 -    }
 329 -
 330      /* parse in the input conversion table */
 331      if (line.compare(0, 5, "ICONV", 5) == 0) {
 332        if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
 333 @@ -1278,22 +1269,41 @@ std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
 334  // Is word a non compound with a REP substitution (see checkcompoundrep)?
 335  int AffixMgr::cpdrep_check(const char* word, int wl) {
 336
 337 -  if ((wl < 2) || reptable.empty())
 338 +  if ((wl < 2) || get_reptable().empty())
 339      return 0;
 340
 341 -  for (size_t i = 0; i < reptable.size(); ++i) {
 342 -    const char* r = word;
 343 -    const size_t lenp = reptable[i].pattern.size();
 344 -    // search every occurence of the pattern in the word
 345 -    while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) {
 346 -      std::string candidate(word);
 347 -      size_t type = r == word && langnum != LANG_hu ? 1 : 0;
 348 -      if (r - word + reptable[i].pattern.size() == lenp && langnum != LANG_hu)
 349 -        type += 2;
 350 -      candidate.replace(r - word, lenp, reptable[i].outstrings[type]);
 351 +  for (size_t i = 0; i < get_reptable().size(); ++i) {
 352 +    // use only available mid patterns
 353 +    if (!get_reptable()[i].outstrings[0].empty()) {
 354 +      const char* r = word;
 355 +      const size_t lenp = get_reptable()[i].pattern.size();
 356 +      // search every occurrence of the pattern in the word
 357 +      while ((r = strstr(r, get_reptable()[i].pattern.c_str())) != NULL) {
 358 +        std::string candidate(word);
 359 +        candidate.replace(r - word, lenp, get_reptable()[i].outstrings[0]);
 360 +        if (candidate_check(candidate.c_str(), candidate.size()))
 361 +          return 1;
 362 +        ++r;  // search for the next letter
 363 +      }
 364 +    }
 365 +  }
 366 +
 367 + return 0;
 368 +}
 369 +
 370 +// forbid compound words, if they are in the dictionary as a
 371 +// word pair separated by space
 372 +int AffixMgr::cpdwordpair_check(const char * word, int wl) {
 373 +  if (wl > 2) {
 374 +    std::string candidate(word);
 375 +    for (size_t i = 1; i < candidate.size(); i++) {
 376 +      // go to end of the UTF-8 character
 377 +      if (utf8 && ((word[i] & 0xc0) == 0x80))
 378 +          continue;
 379 +      candidate.insert(i, 1, ' ');
 380        if (candidate_check(candidate.c_str(), candidate.size()))
 381          return 1;
 382 -      ++r;  // search for the next letter
 383 +      candidate.erase(i, 1);
 384      }
 385    }
 386
 387 @@ -1647,6 +1657,12 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
 388          affixed = 1;
 389          rv = lookup(st.c_str());  // perhaps without prefix
 390
 391 +        // forbid dictionary stems with COMPOUNDFORBIDFLAG in
 392 +        // compound words, overriding the effect of COMPOUNDPERMITFLAG
 393 +        if ((rv) && compoundforbidflag &&
 394 +                TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
 395 +            continue;
 396 +
 397          // search homonym with compound flag
 398          while ((rv) && !hu_mov_rule &&
 399                 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
 400 @@ -1911,7 +1927,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
 401                   TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) {
 402                // forbid compound word, if it is a non compound word with typical
 403                // fault
 404 -              if (checkcompoundrep && cpdrep_check(word.c_str(), len))
 405 +              if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
 406 +                      cpdwordpair_check(word.c_str(), len))
 407                  return NULL;
 408                return rv_first;
 409              }
 410 @@ -2035,7 +2052,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
 411                  ((!checkcompounddup || (rv != rv_first)))) {
 412                // forbid compound word, if it is a non compound word with typical
 413                // fault
 414 -              if (checkcompoundrep && cpdrep_check(word.c_str(), len))
 415 +              if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
 416 +                      cpdwordpair_check(word.c_str(), len))
 417                  return NULL;
 418                return rv_first;
 419              }
 420 @@ -2060,7 +2078,11 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
 421              }
 422              if (rv) {
 423                // forbid compound word, if it is a non compound word with typical
 424 -              // fault
 425 +              // fault, or a dictionary word pair
 426 +
 427 +              if (cpdwordpair_check(word.c_str(), len))
 428 +                  return NULL;
 429 +
 430                if (checkcompoundrep || forbiddenword) {
 431
 432                  if (checkcompoundrep && cpdrep_check(word.c_str(), len))
 433 @@ -2071,7 +2093,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
 434                    char r = st[i + rv->blen];
 435                    st[i + rv->blen] = '\0';
 436
 437 -                  if (checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) {
 438 +                  if ((checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) ||
 439 +                      cpdwordpair_check(st.c_str(), i + rv->blen)) {
 440                      st[ + i + rv->blen] = r;
 441                      continue;
 442                    }
 443 @@ -2198,6 +2221,12 @@ int AffixMgr::compound_check_morph(const char* word,
 444
 445        rv = lookup(st.c_str());  // perhaps without prefix
 446
 447 +      // forbid dictionary stems with COMPOUNDFORBIDFLAG in
 448 +      // compound words, overriding the effect of COMPOUNDPERMITFLAG
 449 +      if ((rv) && compoundforbidflag &&
 450 +              TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
 451 +          continue;
 452 +
 453        // search homonym with compound flag
 454        while ((rv) && !hu_mov_rule &&
 455               ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
 456 @@ -3414,7 +3443,7 @@ int AffixMgr::expand_rootword(struct guessword* wlst,
 457
 458  // return replacing table
 459  const std::vector<replentry>& AffixMgr::get_reptable() const {
 460 -  return reptable;
 461 +  return pHMgr->get_reptable();
 462  }
 463
 464  // return iconv table
 465 @@ -3554,6 +3583,11 @@ FLAG AffixMgr::get_nongramsuggest() const {
 466    return nongramsuggest;
 467  }
 468
 469 +// return the substandard root/affix control flag
 470 +FLAG AffixMgr::get_substandard() const {
 471 +  return substandard;
 472 +}
 473 +
 474  // return the forbidden words flag modify flag
 475  FLAG AffixMgr::get_needaffix() const {
 476    return needaffix;
 477 @@ -3692,103 +3726,6 @@ bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
 478    return true;
 479  }
 480
 481 -/* parse in the typical fault correcting table */
 482 -bool AffixMgr::parse_reptable(const std::string& line, FileMgr* af) {
 483 -  if (parsedrep) {
 484 -    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
 485 -                     af->getlinenum());
 486 -    return false;
 487 -  }
 488 -  parsedrep = true;
 489 -  int numrep = -1;
 490 -  int i = 0;
 491 -  int np = 0;
 492 -  std::string::const_iterator iter = line.begin();
 493 -  std::string::const_iterator start_piece = mystrsep(line, iter);
 494 -  while (start_piece != line.end()) {
 495 -    switch (i) {
 496 -      case 0: {
 497 -        np++;
 498 -        break;
 499 -      }
 500 -      case 1: {
 501 -        numrep = atoi(std::string(start_piece, iter).c_str());
 502 -        if (numrep < 1) {
 503 -          HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
 504 -                           af->getlinenum());
 505 -          return false;
 506 -        }
 507 -        reptable.reserve(numrep);
 508 -        np++;
 509 -        break;
 510 -      }
 511 -      default:
 512 -        break;
 513 -    }
 514 -    ++i;
 515 -    start_piece = mystrsep(line, iter);
 516 -  }
 517 -  if (np != 2) {
 518 -    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
 519 -                     af->getlinenum());
 520 -    return false;
 521 -  }
 522 -
 523 -  /* now parse the numrep lines to read in the remainder of the table */
 524 -  for (int j = 0; j < numrep; ++j) {
 525 -    std::string nl;
 526 -    if (!af->getline(nl))
 527 -      return false;
 528 -    mychomp(nl);
 529 -    reptable.push_back(replentry());
 530 -    iter = nl.begin();
 531 -    i = 0;
 532 -    int type = 0;
 533 -    start_piece = mystrsep(nl, iter);
 534 -    while (start_piece != nl.end()) {
 535 -      switch (i) {
 536 -        case 0: {
 537 -          if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
 538 -            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
 539 -                             af->getlinenum());
 540 -            reptable.clear();
 541 -            return false;
 542 -          }
 543 -          break;
 544 -        }
 545 -        case 1: {
 546 -          if (*start_piece == '^')
 547 -            type = 1;
 548 -          reptable.back().pattern.assign(start_piece + type, iter);
 549 -          mystrrep(reptable.back().pattern, "_", " ");
 550 -          if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
 551 -            type += 2;
 552 -            reptable.back().pattern.resize(reptable.back().pattern.size() - 1);
 553 -          }
 554 -          break;
 555 -        }
 556 -        case 2: {
 557 -          reptable.back().outstrings[type].assign(start_piece, iter);
 558 -          mystrrep(reptable.back().outstrings[type], "_", " ");
 559 -          break;
 560 -        }
 561 -        default:
 562 -          break;
 563 -      }
 564 -      ++i;
 565 -      start_piece = mystrsep(nl, iter);
 566 -    }
 567 -    if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) {
 568 -      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
 569 -                       af->getlinenum());
 570 -      reptable.clear();
 571 -      return false;
 572 -    }
 573 -  }
 574 -  return true;
 575 -}
 576 -
 577 -/* parse in the typical fault correcting table */
 578  bool AffixMgr::parse_convtable(const std::string& line,
 579                                FileMgr* af,
 580                                RepList** rl,
 581 diff --git a/src/hunspell/affixmgr.hxx b/src/hunspell/affixmgr.hxx
 582 index d41e69c..38842a3 100644
 583 --- a/src/hunspell/affixmgr.hxx
 584 +++ b/src/hunspell/affixmgr.hxx
 585 @@ -120,8 +120,6 @@ class AffixMgr {
 586    FLAG nongramsuggest;
 587    FLAG needaffix;
 588    int cpdmin;
 589 -  bool parsedrep;
 590 -  std::vector<replentry> reptable;
 591    RepList* iconvtable;
 592    RepList* oconvtable;
 593    bool parsedmaptable;
 594 @@ -251,6 +249,7 @@ class AffixMgr {
 595
 596    short get_syllable(const std::string& word);
 597    int cpdrep_check(const char* word, int len);
 598 +  int cpdwordpair_check(const char * word, int len);
 599    int cpdpat_check(const char* word,
 600                     int len,
 601                     hentry* r1,
 602 @@ -311,6 +310,7 @@ class AffixMgr {
 603    FLAG get_forbiddenword() const;
 604    FLAG get_nosuggest() const;
 605    FLAG get_nongramsuggest() const;
 606 +  FLAG get_substandard() const;
 607    FLAG get_needaffix() const;
 608    FLAG get_onlyincompound() const;
 609    const char* get_derived() const;
 610 @@ -338,7 +338,6 @@ class AffixMgr {
 611    bool parse_flag(const std::string& line, unsigned short* out, FileMgr* af);
 612    bool parse_num(const std::string& line, int* out, FileMgr* af);
 613    bool parse_cpdsyllable(const std::string& line, FileMgr* af);
 614 -  bool parse_reptable(const std::string& line, FileMgr* af);
 615    bool parse_convtable(const std::string& line,
 616                        FileMgr* af,
 617                        RepList** rl,
 618 diff --git a/src/hunspell/csutil.hxx b/src/hunspell/csutil.hxx
 619 index 5d83f80..01c0a24 100644
 620 --- a/src/hunspell/csutil.hxx
 621 +++ b/src/hunspell/csutil.hxx
 622 @@ -272,7 +272,7 @@ LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
 623  // hash entry macros
 624  LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) {
 625    char* ret;
 626 -  if (!h->var)
 627 +  if (!(h->var & H_OPT))
 628      ret = NULL;
 629    else if (h->var & H_OPT_ALIASM)
 630      ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
 631 @@ -284,7 +284,7 @@ LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) {
 632  LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA(
 633      const struct hentry* h) {
 634    const char* ret;
 635 -  if (!h->var)
 636 +  if (!(h->var & H_OPT))
 637      ret = NULL;
 638    else if (h->var & H_OPT_ALIASM)
 639      ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
 640 @@ -297,7 +297,7 @@ LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA(
 641  LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2(
 642      const struct hentry* h) {
 643    const char* ret;
 644 -  if (!h->var)
 645 +  if (!(h->var & H_OPT))
 646      ret = "";
 647    else if (h->var & H_OPT_ALIASM)
 648      ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
 649 diff --git a/src/hunspell/hashmgr.cxx b/src/hunspell/hashmgr.cxx
 650 index 23421b5..ec3803b 100644
 651 --- a/src/hunspell/hashmgr.cxx
 652 +++ b/src/hunspell/hashmgr.cxx
 653 @@ -78,6 +78,7 @@
 654  #include "hashmgr.hxx"
 655  #include "csutil.hxx"
 656  #include "atypes.hxx"
 657 +#include "langnum.hxx"
 658
 659  // build a hash table from a munched word list
 660
 661 @@ -182,7 +183,8 @@ int HashMgr::add_word(const std::string& in_word,
 662                        unsigned short* aff,
 663                        int al,
 664                        const std::string* in_desc,
 665 -                      bool onlyupcase) {
 666 +                      bool onlyupcase,
 667 +                      int captype) {
 668    const std::string* word = &in_word;
 669    const std::string* desc = in_desc;
 670
 671 @@ -243,20 +245,119 @@ int HashMgr::add_word(const std::string& in_word,
 672    hp->astr = aff;
 673    hp->next = NULL;
 674    hp->next_homonym = NULL;
 675 +  hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0;
 676
 677    // store the description string or its pointer
 678    if (desc) {
 679 -    hp->var = H_OPT;
 680 +    hp->var += H_OPT;
 681      if (aliasm) {
 682        hp->var += H_OPT_ALIASM;
 683        store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
 684      } else {
 685        strcpy(hpw + word->size() + 1, desc->c_str());
 686      }
 687 -    if (strstr(HENTRY_DATA(hp), MORPH_PHON))
 688 +    if (strstr(HENTRY_DATA(hp), MORPH_PHON)) {
 689        hp->var += H_OPT_PHON;
 690 -  } else
 691 -    hp->var = 0;
 692 +      // store ph: fields (pronunciation, misspellings, old orthography etc.)
 693 +      // of a morphological description in reptable to use in REP replacements.
 694 +      if (reptable.capacity() < (unsigned int)(tablesize/MORPH_PHON_RATIO))
 695 +          reptable.reserve(tablesize/MORPH_PHON_RATIO);
 696 +      std::string fields = HENTRY_DATA(hp);
 697 +      std::string::const_iterator iter = fields.begin();
 698 +      std::string::const_iterator start_piece = mystrsep(fields, iter);
 699 +      while (start_piece != fields.end()) {
 700 +        if (std::string(start_piece, iter).find(MORPH_PHON) == 0) {
 701 +          std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1);
 702 +          if (ph.size() > 0) {
 703 +            std::vector<w_char> w;
 704 +            size_t strippatt;
 705 +            std::string wordpart;
 706 +            // dictionary based REP replacement, separated by "->"
 707 +            // for example "pretty ph:prity ph:priti->pretti" to handle
 708 +            // both prity -> pretty and pritiest -> prettiest suggestions.
 709 +            if (((strippatt = ph.find("->")) != std::string::npos) &&
 710 +                    (strippatt > 0) && (strippatt < ph.size() - 2)) {
 711 +                wordpart = ph.substr(strippatt + 2);
 712 +                ph.erase(ph.begin() + strippatt, ph.end());
 713 +            } else
 714 +                wordpart = in_word;
 715 +            // when the ph: field ends with the character *,
 716 +            // strip last character of the pattern and the replacement
 717 +            // to match in REP suggestions also at character changes,
 718 +            // for example, "pretty ph:prity*" results in a "prit->prett"
 719 +            // REP replacement instead of "prity->pretty", to get
 720 +            // prity->pretty and pritiest->prettiest suggestions.
 721 +            if (ph.at(ph.size()-1) == '*') {
 722 +              strippatt = 1;
 723 +              size_t stripword = 0;
 724 +              if (utf8) {
 725 +                while ((strippatt < ph.size()) &&
 726 +                  ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80))
 727 +                     ++strippatt;
 728 +                while ((stripword < wordpart.size()) &&
 729 +                  ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80))
 730 +                     ++stripword;
 731 +              }
 732 +              ++strippatt;
 733 +              ++stripword;
 734 +              if ((ph.size() > strippatt) && (wordpart.size() > stripword)) {
 735 +                ph.erase(ph.size()-strippatt, strippatt);
 736 +                wordpart.erase(wordpart.size()-stripword, stripword);  // offset from wordpart itself: it differs from in_word for "pattern*->replacement"
 737 +              }
 738 +            }
 739 +            // capitalize lowercase pattern for capitalized words to support
 740 +            // good suggestions also for capitalized misspellings, eg.
 741 +            // Wednesday ph:wendsay
 742 +            // results wendsay -> Wednesday and Wendsay -> Wednesday, too.
 743 +            if (captype==INITCAP) {
 744 +              std::string ph_capitalized;
 745 +              if (utf8) {
 746 +                u8_u16(w, ph);
 747 +                if (get_captype_utf8(w, langnum) == NOCAP) {
 748 +                  mkinitcap_utf(w, langnum);
 749 +                  u16_u8(ph_capitalized, w);
 750 +                }
 751 +              } else if (get_captype(ph, csconv) == NOCAP)
 752 +                  mkinitcap(ph_capitalized.assign(ph), csconv);  // seed from ph: capitalizing the empty string left it empty
 753 +
 754 +              if (ph_capitalized.size() > 0) {
 755 +                // add also lowercase word in the case of German or
 756 +                // Hungarian to support lowercase suggestions lowercased by
 757 +                // compound word generation or derivational suffixes
 758 +                // (for example by adjectival suffix "-i" of geographical
 759 +                // names in Hungarian:
 760 +                // Massachusetts ph:messzecsuzec
 761 +                // messzecsuzeci -> massachusettsi (adjective)
 762 +                // For lowercasing by conditional PFX rules, see
 763 +                // tests/germancompounding test example or the
 764 +                // Hungarian dictionary.)
 765 +                if (langnum == LANG_de || langnum == LANG_hu) {
 766 +                  std::string wordpart_lower(wordpart);
 767 +                  if (utf8) {
 768 +                    u8_u16(w, wordpart_lower);
 769 +                    mkallsmall_utf(w, langnum);
 770 +                    u16_u8(wordpart_lower, w);
 771 +                  } else {
 772 +                    mkallsmall(wordpart_lower, csconv);
 773 +                  }
 774 +                  reptable.push_back(replentry());
 775 +                  reptable.back().pattern.assign(ph);
 776 +                  reptable.back().outstrings[0].assign(wordpart_lower);
 777 +                }
 778 +                reptable.push_back(replentry());
 779 +                reptable.back().pattern.assign(ph_capitalized);
 780 +                reptable.back().outstrings[0].assign(wordpart);
 781 +              }
 782 +            }
 783 +            reptable.push_back(replentry());
 784 +            reptable.back().pattern.assign(ph);
 785 +            reptable.back().outstrings[0].assign(wordpart);
 786 +          }
 787 +        }
 788 +        start_piece = mystrsep(fields, iter);
 789 +      }
 790 +    }
 791 +  }
 792
 793    struct hentry* dp = tableptr[i];
 794    if (!dp) {
 795 @@ -347,12 +448,12 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word,
 796        mkallsmall_utf(w, langnum);
 797        mkinitcap_utf(w, langnum);
 798        u16_u8(st, w);
 799 -      return add_word(st, wcl, flags2, flagslen + 1, dp, true);
 800 +      return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP);
 801      } else {
 802        std::string new_word(word);
 803        mkallsmall(new_word, csconv);
 804        mkinitcap(new_word, csconv);
 805 -      int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true);
 806 +      int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP);
 807        return ret;
 808      }
 809    }
 810 @@ -435,7 +536,7 @@ int HashMgr::add(const std::string& word) {
 811      int al = 0;
 812      unsigned short* flags = NULL;
 813      int wcl = get_clen_and_captype(word, &captype);
 814 -    add_word(word, wcl, flags, al, NULL, false);
 815 +    add_word(word, wcl, flags, al, NULL, false, captype);
 816      return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
 817                                         captype);
 818    }
 819 @@ -450,14 +551,14 @@ int HashMgr::add_with_affix(const std::string& word, const std::string& example)
 820      int captype;
 821      int wcl = get_clen_and_captype(word, &captype);
 822      if (aliasf) {
 823 -      add_word(word, wcl, dp->astr, dp->alen, NULL, false);
 824 +      add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype);
 825      } else {
 826        unsigned short* flags =
 827            (unsigned short*)malloc(dp->alen * sizeof(unsigned short));
 828        if (flags) {
 829          memcpy((void*)flags, (void*)dp->astr,
 830                 dp->alen * sizeof(unsigned short));
 831 -        add_word(word, wcl, flags, dp->alen, NULL, false);
 832 +        add_word(word, wcl, flags, dp->alen, NULL, false, captype);
 833        } else
 834          return 1;
 835      }
 836 @@ -605,7 +706,7 @@ int HashMgr::load_tables(const char* tpath, const char* key) {
 837      int wcl = get_clen_and_captype(ts, &captype, workbuf);
 838      const std::string *dp_str = dp.empty() ? NULL : &dp;
 839      // add the word and its index plus its capitalized form optionally
 840 -    if (add_word(ts, wcl, flags, al, dp_str, false) ||
 841 +    if (add_word(ts, wcl, flags, al, dp_str, false, captype) ||
 842          add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
 843        delete dict;
 844        return 5;
 845 @@ -940,8 +1041,19 @@ int HashMgr::load_config(const char* affpath, const char* key) {
 846      if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
 847        complexprefixes = 1;
 848
 849 +    /* parse in the typical fault correcting table */
 850 +    if (line.compare(0, 3, "REP", 3) == 0) {
 851 +      if (!parse_reptable(line, afflst)) {
 852 +        delete afflst;
 853 +        return 1;
 854 +      }
 855 +    }
 856 +
 857 +    // don't check the full affix file, yet
 858      if (((line.compare(0, 3, "SFX", 3) == 0) ||
 859 -         (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3]))
 860 +         (line.compare(0, 3, "PFX", 3) == 0)) &&
 861 +            line.size() > 3 && isspace(line[3]) &&
 862 +            !reptable.empty()) // (REP table is at the end of the Afrikaans aff file)
 863        break;
 864    }
 865
 866 @@ -1191,3 +1303,103 @@ char* HashMgr::get_aliasm(int index) const {
 867    HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
 868    return NULL;
 869  }
 870 +
 871 +/* parse in the typical fault correcting table */
 872 +bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) {
 873 +  if (!reptable.empty()) {
 874 +    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
 875 +                     af->getlinenum());
 876 +    return false;
 877 +  }
 878 +  int numrep = -1;
 879 +  int i = 0;
 880 +  int np = 0;
 881 +  std::string::const_iterator iter = line.begin();
 882 +  std::string::const_iterator start_piece = mystrsep(line, iter);
 883 +  while (start_piece != line.end()) {
 884 +    switch (i) {
 885 +      case 0: {
 886 +        np++;
 887 +        break;
 888 +      }
 889 +      case 1: {
 890 +        numrep = atoi(std::string(start_piece, iter).c_str());
 891 +        if (numrep < 1) {
 892 +          HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
 893 +                           af->getlinenum());
 894 +          return false;
 895 +        }
 896 +        reptable.reserve(numrep);
 897 +        np++;
 898 +        break;
 899 +      }
 900 +      default:
 901 +        break;
 902 +    }
 903 +    ++i;
 904 +    start_piece = mystrsep(line, iter);
 905 +  }
 906 +  if (np != 2) {
 907 +    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
 908 +                     af->getlinenum());
 909 +    return false;
 910 +  }
 911 +
 912 +  /* now parse the numrep lines to read in the remainder of the table */
 913 +  for (int j = 0; j < numrep; ++j) {
 914 +    std::string nl;
 915 +    if (!af->getline(nl))
 916 +      return false;
 917 +    mychomp(nl);
 918 +    reptable.push_back(replentry());
 919 +    iter = nl.begin();
 920 +    i = 0;
 921 +    int type = 0;
 922 +    start_piece = mystrsep(nl, iter);
 923 +    while (start_piece != nl.end()) {
 924 +      switch (i) {
 925 +        case 0: {
 926 +          if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
 927 +            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
 928 +                             af->getlinenum());
 929 +            reptable.clear();
 930 +            return false;
 931 +          }
 932 +          break;
 933 +        }
 934 +        case 1: {
 935 +          if (*start_piece == '^')
 936 +            type = 1;
 937 +          reptable.back().pattern.assign(start_piece + type, iter);
 938 +          mystrrep(reptable.back().pattern, "_", " ");
 939 +          if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
 940 +            type += 2;
 941 +            reptable.back().pattern.resize(reptable.back().pattern.size() - 1);
 942 +          }
 943 +          break;
 944 +        }
 945 +        case 2: {
 946 +          reptable.back().outstrings[type].assign(start_piece, iter);
 947 +          mystrrep(reptable.back().outstrings[type], "_", " ");
 948 +          break;
 949 +        }
 950 +        default:
 951 +          break;
 952 +      }
 953 +      ++i;
 954 +      start_piece = mystrsep(nl, iter);
 955 +    }
 956 +    if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) {
 957 +      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
 958 +                       af->getlinenum());
 959 +      reptable.clear();
 960 +      return false;
 961 +    }
 962 +  }
 963 +  return true;
 964 +}
 965 +
 966 +// return the replacement table
 967 +const std::vector<replentry>& HashMgr::get_reptable() const {
 968 +  return reptable;
 969 +}
 970 diff --git a/src/hunspell/hashmgr.hxx b/src/hunspell/hashmgr.hxx
 971 index da485d7..b6eaddd 100644
 972 --- a/src/hunspell/hashmgr.hxx
 973 +++ b/src/hunspell/hashmgr.hxx
 974 @@ -81,6 +81,12 @@
 975
 976  enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI };
 977
 978 +// the morphological description of a dictionary item can contain an
 979 +// arbitrary number of "ph:" (MORPH_PHON) fields to store typical
 980 +// phonetic or other misspellings of that word.
 981 +// ratio of lines with "ph:" to all lines in the dic file: 1/MORPH_PHON_RATIO
 982 +#define MORPH_PHON_RATIO 500
 983 +
 984  class HashMgr {
 985    int tablesize;
 986    struct hentry** tableptr;
 987 @@ -99,6 +105,10 @@ class HashMgr {
 988    unsigned short* aliasflen;
 989    int numaliasm;  // morphological desciption `compression' with aliases
 990    char** aliasm;
 991 +  // reptable created from REP table of aff file and from "ph:" fields
 992 +  // of the dic file. It contains phonetic and other common misspellings
 993 +  // (letters, letter groups and words) for better suggestions
 994 +  std::vector<replentry> reptable;
 995
 996   public:
 997    HashMgr(const char* tpath, const char* apath, const char* key = NULL);
 998 @@ -119,6 +129,7 @@ class HashMgr {
 999    int get_aliasf(int index, unsigned short** fvec, FileMgr* af) const;
1000    int is_aliasm() const;
1001    char* get_aliasm(int index) const;
1002 +  const std::vector<replentry>& get_reptable() const;
1003
1004   private:
1005    int get_clen_and_captype(const std::string& word, int* captype);
1006 @@ -129,7 +140,8 @@ class HashMgr {
1007                 unsigned short* ap,
1008                 int al,
1009                 const std::string* desc,
1010 -               bool onlyupcase);
1011 +               bool onlyupcase,
1012 +               int captype);
1013    int load_config(const char* affpath, const char* key);
1014    bool parse_aliasf(const std::string& line, FileMgr* af);
1015    int add_hidden_capitalized_word(const std::string& word,
1016 @@ -139,6 +151,7 @@ class HashMgr {
1017                                    const std::string* dp,
1018                                    int captype);
1019    bool parse_aliasm(const std::string& line, FileMgr* af);
1020 +  bool parse_reptable(const std::string& line, FileMgr* af);
1021    int remove_forbidden_flag(const std::string& word);
1022  };
1023
1024 diff --git a/src/hunspell/htypes.hxx b/src/hunspell/htypes.hxx
1025 index 8f66a00..76228c4 100644
1026 --- a/src/hunspell/htypes.hxx
1027 +++ b/src/hunspell/htypes.hxx
1028 @@ -44,9 +44,10 @@
1029    (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q)) - 1));
1030
1031  // hentry options
1032 -#define H_OPT (1 << 0)
1033 -#define H_OPT_ALIASM (1 << 1)
1034 -#define H_OPT_PHON (1 << 2)
1035 +#define H_OPT (1 << 0)          // is there optional morphological data?
1036 +#define H_OPT_ALIASM (1 << 1)   // using alias compression?
1037 +#define H_OPT_PHON (1 << 2)     // is there ph: field in the morphological data?
1038 +#define H_OPT_INITCAP (1 << 3)  // is dictionary word capitalized?
1039
1040  // see also csutil.hxx
1041  #define HENTRY_WORD(h) &(h->word[0])
1042 @@ -61,7 +62,7 @@ struct hentry {
1043    unsigned short* astr;  // affix flag vector
1044    struct hentry* next;   // next word with same hash code
1045    struct hentry* next_homonym;  // next homonym word (with same hash code)
1046 -  char var;      // variable fields (only for special pronounciation yet)
1047 +  char var;      // bit vector of H_OPT hentry options
1048    char word[1];  // variable-length word (8-bit or UTF-8 encoding)
1049  };
1050
1051 diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
1052 index 1ef11df..6c5aeb6 100644
1053 --- a/src/hunspell/hunspell.cxx
1054 +++ b/src/hunspell/hunspell.cxx
1055 @@ -666,6 +666,37 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
1056        size_t plen = wordbreak[j].size();
1057        size_t found = scw.find(wordbreak[j]);
1058        if ((found > 0) && (found < wl - plen)) {
1059 +        size_t found2 = scw.find(wordbreak[j], found + 1);
 1060 +        // try to break at the second occurrence
1061 +        // to recognize dictionary words with wordbreak
1062 +        if (found2 > 0 && (found2 < wl - plen))
1063 +            found = found2;
1064 +        if (!spell(scw.substr(found + plen)))
1065 +          continue;
1066 +        std::string suffix(scw.substr(found));
1067 +        scw.resize(found);
1068 +        // examine 2 sides of the break point
1069 +        if (spell(scw))
1070 +          return true;
1071 +        scw.append(suffix);
1072 +
1073 +        // LANG_hu: spec. dash rule
1074 +        if (langnum == LANG_hu && wordbreak[j] == "-") {
1075 +          suffix = scw.substr(found + 1);
1076 +          scw.resize(found + 1);
1077 +          if (spell(scw))
1078 +            return true;  // check the first part with dash
1079 +          scw.append(suffix);
1080 +        }
1081 +        // end of LANG specific region
1082 +      }
1083 +    }
1084 +
1085 +    // other patterns (break at first break point)
1086 +    for (size_t j = 0; j < wordbreak.size(); ++j) {
1087 +      size_t plen = wordbreak[j].size();
1088 +      size_t found = scw.find(wordbreak[j]);
1089 +      if ((found > 0) && (found < wl - plen)) {
1090          if (!spell(scw.substr(found + plen)))
1091            continue;
1092          std::string suffix(scw.substr(found));
1093 @@ -870,6 +901,7 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
1094    }
1095
1096    int capwords = 0;
1097 +  bool good = false;
1098
1099    // check capitalized form for FORCEUCASE
1100    if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
1101 @@ -884,22 +916,27 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
1102
1103    switch (captype) {
1104      case NOCAP: {
1105 -      pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
1106 +      good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
1107 +      if (abbv) {
1108 +        std::string wspace(scw);
1109 +        wspace.push_back('.');
1110 +        good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1111 +      }
1112        break;
1113      }
1114
1115      case INITCAP: {
1116        capwords = 1;
1117 -      pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
1118 +      good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
1119        std::string wspace(scw);
1120        mkallsmall2(wspace, sunicw);
1121 -      pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1122 +      good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1123        break;
1124      }
1125      case HUHINITCAP:
1126        capwords = 1;
1127      case HUHCAP: {
1128 -      pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
1129 +      good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
1130        // something.The -> something. The
1131        size_t dot_pos = scw.find('.');
1132        if (dot_pos != std::string::npos) {
1133 @@ -925,19 +962,19 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
1134          // TheOpenOffice.org -> The OpenOffice.org
1135          wspace = scw;
1136          mkinitsmall2(wspace, sunicw);
1137 -        pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1138 +        good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1139        }
1140        wspace = scw;
1141        mkallsmall2(wspace, sunicw);
1142        if (spell(wspace.c_str()))
1143          insert_sug(slst, wspace);
1144        size_t prevns = slst.size();
1145 -      pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1146 +      good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1147        if (captype == HUHINITCAP) {
1148          mkinitcap2(wspace, sunicw);
1149          if (spell(wspace.c_str()))
1150            insert_sug(slst, wspace);
1151 -        pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1152 +        good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1153        }
1154        // aNew -> "a New" (instead of "a new")
1155        for (size_t j = prevns; j < slst.size(); ++j) {
1156 @@ -964,11 +1001,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
1157      case ALLCAP: {
1158        std::string wspace(scw);
1159        mkallsmall2(wspace, sunicw);
1160 -      pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1161 +      good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1162        if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
1163          insert_sug(slst, wspace);
1164        mkinitcap2(wspace, sunicw);
1165 -      pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1166 +      good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1167        for (size_t j = 0; j < slst.size(); ++j) {
1168          mkallcap(slst[j]);
1169          if (pAMgr && pAMgr->get_checksharps()) {
1170 @@ -1000,12 +1037,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
1171      }
1172    }
1173    // END OF LANG_hu section
1174 -
1175 -  // try ngram approach since found nothing or only compound words
1176 -  if (pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) {
 1177 +  // try ngram approach since no good suggestion was found
1178 +  if (!good && pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) {
1179      switch (captype) {
1180        case NOCAP: {
1181 -        pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs);
1182 +        pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs, NOCAP);
1183          break;
1184        }
1185        case HUHINITCAP:
1186 @@ -1013,21 +1049,21 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
1187        case HUHCAP: {
1188          std::string wspace(scw);
1189          mkallsmall2(wspace, sunicw);
1190 -        pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
1191 +        pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, HUHCAP);
1192          break;
1193        }
1194        case INITCAP: {
1195          capwords = 1;
1196          std::string wspace(scw);
1197          mkallsmall2(wspace, sunicw);
1198 -        pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
1199 +        pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, INITCAP);
1200          break;
1201        }
1202        case ALLCAP: {
1203          std::string wspace(scw);
1204          mkallsmall2(wspace, sunicw);
1205          size_t oldns = slst.size();
1206 -        pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
1207 +        pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, ALLCAP);
1208          for (size_t j = oldns; j < slst.size(); ++j) {
1209            mkallcap(slst[j]);
1210          }
1211 @@ -1037,6 +1073,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
1212    }
1213
1214    // try dash suggestion (Afo-American -> Afro-American)
1215 +  // Note: LibreOffice was modified to treat dashes as word
1216 +  // characters to check "scot-free" etc. word forms, but
1217 +  // we need to handle suggestions for "Afo-American", etc.,
1218 +  // while "Afro-American" is missing from the dictionary.
1219 +  // TODO avoid possible overgeneration
1220    size_t dash_pos = scw.find('-');
1221    if (dash_pos != std::string::npos) {
1222      int nodashsug = 1;
1223 @@ -1048,7 +1089,7 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
1224      size_t prev_pos = 0;
1225      bool last = false;
1226
1227 -    while (nodashsug && !last) {
1228 +    while (!good && nodashsug && !last) {
1229        if (dash_pos == scw.size())
1230          last = 1;
1231        std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
1232 diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
1233 index 73ea91e..ade85af 100644
1234 --- a/src/hunspell/suggestmgr.cxx
1235 +++ b/src/hunspell/suggestmgr.cxx
1236 @@ -132,6 +132,11 @@ SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) {
1237        ctryl = u8_u16(ctry_utf, tryme);
1238      }
1239    }
1240 +
1241 +  // language with possible dash usage
1242 +  // (latin letters or dash in TRY characters)
1243 +  lang_with_dash_usage = (ctry &&
1244 +      ((strchr(ctry, '-') != NULL) || (strchr(ctry, 'a') != NULL)));
1245  }
1246
1247  SuggestMgr::~SuggestMgr() {
1248 @@ -169,10 +174,13 @@ void SuggestMgr::testsug(std::vector<std::string>& wlst,
1249    }
1250  }
1251
1252 -// generate suggestions for a misspelled word
1253 -//    pass in address of array of char * pointers
1254 -// onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
1255 -void SuggestMgr::suggest(std::vector<std::string>& slst,
1256 +/* generate suggestions for a misspelled word
 1257 + *    slst: vector that collects the suggestions
1258 + * onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
1259 + * return value: true, if there is a good suggestion
1260 + * (REP, ph: or a dictionary word pair)
1261 + */
1262 +bool SuggestMgr::suggest(std::vector<std::string>& slst,
1263                          const char* w,
1264                          int* onlycompoundsug) {
1265    int nocompoundtwowords = 0;
1266 @@ -182,6 +190,7 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
1267    std::string w2;
1268    const char* word = w;
1269    size_t oldSug = 0;
1270 +  bool good_suggestion = false;
1271
1272    // word reversing wrapper for complex prefixes
1273    if (complexprefixes) {
1274 @@ -196,11 +205,11 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
1275    if (utf8) {
1276      wl = u8_u16(word_utf, word);
1277      if (wl == -1) {
1278 -      return;
1279 +      return false;
1280      }
1281    }
1282
1283 -  for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0);
1284 +  for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0) && !good_suggestion;
1285         cpdsuggest++) {
1286      // limit compound suggestion
1287      if (cpdsuggest > 0)
1288 @@ -208,15 +217,21 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
1289
1290      // suggestions for an uppercase word (html -> HTML)
1291      if (slst.size() < maxSug) {
1292 +      size_t i = slst.size();
1293        if (utf8)
1294          capchars_utf(slst, &word_utf[0], wl, cpdsuggest);
1295        else
1296          capchars(slst, word, cpdsuggest);
1297 +      if (slst.size() > i)
1298 +        good_suggestion = true;
1299      }
1300
1301      // perhaps we made a typical fault of spelling
1302      if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
1303 +      size_t i = slst.size();
1304        replchars(slst, word, cpdsuggest);
1305 +      if (slst.size() > i)
1306 +        good_suggestion = true;
1307      }
1308
1309      // perhaps we made chose the wrong char from a related set
1310 @@ -294,15 +309,19 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
1311      }
1312
1313      // perhaps we forgot to hit space and two words ran together
1314 -    if (!nosplitsugs && (slst.size() < maxSug) &&
1315 -        (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
1316 -      twowords(slst, word, cpdsuggest);
1317 +    // (dictionary word pairs have top priority here, so
 1318 +    // we always suggest them, in spite of nosplitsugs, and
1319 +    // drop compound word and other suggestions)
1320 +    if (!cpdsuggest || (!nosplitsugs && slst.size() < oldSug + maxcpdsugs)) {
1321 +      good_suggestion = twowords(slst, word, cpdsuggest, good_suggestion);
1322      }
1323
1324    }  // repeating ``for'' statement compounding support
1325
1326    if (!nocompoundtwowords && (!slst.empty()) && onlycompoundsug)
1327      *onlycompoundsug = 1;
1328 +
1329 +  return good_suggestion;
1330  }
1331
1332  // suggestions for an uppercase word (html -> HTML)
1333 @@ -721,17 +740,22 @@ int SuggestMgr::forgotchar_utf(std::vector<std::string>& wlst,
1334    return wlst.size();
1335  }
1336
1337 -/* error is should have been two words */
1338 -int SuggestMgr::twowords(std::vector<std::string>& wlst,
 1339 +/* error is that it should have been two words
1340 + * return value is true, if there is a dictionary word pair,
1341 + * or there was already a good suggestion before calling
1342 + * this function.
1343 + */
1344 +bool SuggestMgr::twowords(std::vector<std::string>& wlst,
1345                           const char* word,
1346 -                         int cpdsuggest) {
1347 +                         int cpdsuggest,
1348 +                         bool good) {
1349    int c2;
1350    int forbidden = 0;
1351    int cwrd;
1352
1353    int wl = strlen(word);
1354    if (wl < 3)
1355 -    return wlst.size();
1356 +    return false;
1357
1358    if (langnum == LANG_hu)
1359      forbidden = check_forbidden(word, wl);
1360 @@ -750,63 +774,87 @@ int SuggestMgr::twowords(std::vector<std::string>& wlst,
1361      }
1362      if (utf8 && p[1] == '\0')
1363        break;  // last UTF-8 character
1364 -    *p = '\0';
1365 -    int c1 = checkword(candidate, cpdsuggest, NULL, NULL);
1366 -    if (c1) {
1367 -      c2 = checkword((p + 1), cpdsuggest, NULL, NULL);
1368 -      if (c2) {
1369 -        *p = ' ';
1370 -
1371 -        // spec. Hungarian code (need a better compound word support)
1372 -        if ((langnum == LANG_hu) && !forbidden &&
1373 -            // if 3 repeating letter, use - instead of space
1374 -            (((p[-1] == p[1]) &&
1375 -              (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
1376 -             // or multiple compounding, with more, than 6 syllables
1377 -             ((c1 == 3) && (c2 >= 2))))
1378 -          *p = '-';
1379 -
1380 -        cwrd = 1;
1381 -        for (size_t k = 0; k < wlst.size(); ++k) {
1382 -          if (wlst[k] == candidate) {
1383 -            cwrd = 0;
1384 -            break;
1385 -          }
1386 -        }
1387 -        if (wlst.size() < maxSug) {
1388 -          if (cwrd) {
1389 -            wlst.push_back(candidate);
1390 -          }
1391 -        } else {
1392 -          free(candidate);
1393 -          return wlst.size();
1394 +
1395 +    // Suggest only word pairs, if they are listed in the dictionary.
1396 +    // For example, adding "a lot" to the English dic file will
 1397 +    // result in only the "alot" -> "a lot" suggestion instead of
1398 +    // "alto, slot, alt, lot, allot, aloft, aloe, clot, plot, blot, a lot".
1399 +    // Note: using "ph:alot" keeps the other suggestions:
1400 +    // a lot ph:alot
1401 +    // alot -> a lot, alto, slot...
1402 +    *p = ' ';
1403 +    if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) {
 1404 +      // remove suggestions that are not word pairs
1405 +      if (!good) {
1406 +        good = true;
1407 +        wlst.clear();
1408 +      }
1409 +      wlst.insert(wlst.begin(), candidate);
1410 +    }
1411 +
1412 +    // word pairs with dash?
1413 +    if (lang_with_dash_usage) {
1414 +      *p = '-';
1415 +
1416 +      if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) {
 1417 +        // remove suggestions that are not word pairs
1418 +        if (!good) {
1419 +          good = true;
1420 +          wlst.clear();
1421          }
1422 -        // add two word suggestion with dash, if TRY string contains
1423 -        // "a" or "-"
1424 -        // NOTE: cwrd doesn't modified for REP twoword sugg.
1425 -        if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&
1426 -            mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) {
1427 -          *p = '-';
1428 +        wlst.insert(wlst.begin(), candidate);
1429 +      }
1430 +    }
1431 +
1432 +    if (wlst.size() < maxSug && !nosplitsugs && !good) {
1433 +      *p = '\0';
1434 +      int c1 = checkword(candidate, cpdsuggest, NULL, NULL);
1435 +      if (c1) {
1436 +        c2 = checkword((p + 1), cpdsuggest, NULL, NULL);
1437 +        if (c2) {
1438 +          // spec. Hungarian code (TODO need a better compound word support)
1439 +          if ((langnum == LANG_hu) && !forbidden &&
 1440 +              // if 3 repeating letters, use - instead of space
1441 +              (((p[-1] == p[1]) &&
1442 +              (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
 1443 +              // or multiple compounding, with more than 6 syllables
1444 +              ((c1 == 3) && (c2 >= 2))))
1445 +            *p = '-';
1446 +          else
1447 +            *p = ' ';
1448 +
1449 +          cwrd = 1;
1450            for (size_t k = 0; k < wlst.size(); ++k) {
1451              if (wlst[k] == candidate) {
1452                cwrd = 0;
1453                break;
1454              }
1455            }
1456 -          if (wlst.size() < maxSug) {
1457 -            if (cwrd) {
1458 +
1459 +          if (cwrd && (wlst.size() < maxSug))
1460                wlst.push_back(candidate);
1461 +
1462 +          // add two word suggestion with dash, depending on the language
 1463 +          // Note that cwrd isn't modified for REP two-word suggestions.
1464 +          if ( !nosplitsugs && lang_with_dash_usage &&
1465 +              mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) {
1466 +            *p = '-';
1467 +            for (size_t k = 0; k < wlst.size(); ++k) {
1468 +              if (wlst[k] == candidate) {
1469 +                cwrd = 0;
1470 +                break;
1471 +              }
1472              }
1473 -          } else {
1474 -            free(candidate);
1475 -            return wlst.size();
1476 +
1477 +            if ((wlst.size() < maxSug) && cwrd)
1478 +              wlst.push_back(candidate);
1479            }
1480          }
1481        }
1482      }
1483    }
1484    free(candidate);
1485 -  return wlst.size();
1486 +  return good;
1487  }
1488
1489  // error is adjacent letter were swapped
1490 @@ -994,7 +1042,8 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst,
1491  // generate a set of suggestions for very poorly spelled words
1492  void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
1493                            const char* w,
1494 -                          const std::vector<HashMgr*>& rHMgr) {
1495 +                          const std::vector<HashMgr*>& rHMgr,
1496 +                          int captype) {
1497    int lval;
1498    int sc;
1499    int lp, lpphon;
1500 @@ -1071,18 +1120,34 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
1501      u8_u16(w_word, word);
1502      u8_u16(w_target, target);
1503    }
1504 -
1505 +
1506    std::string f;
1507    std::vector<w_char> w_f;
1508 -
1509 +
1510    for (size_t i = 0; i < rHMgr.size(); ++i) {
1511      while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
1512 -      if ((hp->astr) && (pAMgr) &&
1513 -          (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
1514 -           TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
1515 -           TESTAFF(hp->astr, nosuggest, hp->alen) ||
1516 -           TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
1517 -           TESTAFF(hp->astr, onlyincompound, hp->alen)))
1518 +      // skip exceptions
1519 +      if (
 1520 +           // skip it, if the word length differs by 5 or
1521 +           // more characters (to avoid strange suggestions)
1522 +           // (except Unicode characters over BMP)
1523 +           (((abs(n - hp->clen) > 4) && !nonbmp)) ||
1524 +           // don't suggest capitalized dictionary words for
1525 +           // lower case misspellings in ngram suggestions, except
1526 +           // - PHONE usage, or
1527 +           // - in the case of German, where not only proper
1528 +           //   nouns are capitalized, or
1529 +           // - the capitalized word has special pronunciation
1530 +           ((captype == NOCAP) && (hp->var & H_OPT_INITCAP) &&
1531 +              !ph && (langnum != LANG_de) && !(hp->var & H_OPT_PHON)) ||
1532 +           // or it has one of the following special flags
1533 +           ((hp->astr) && (pAMgr) &&
1534 +             (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
1535 +             TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
1536 +             TESTAFF(hp->astr, nosuggest, hp->alen) ||
1537 +             TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
1538 +             TESTAFF(hp->astr, onlyincompound, hp->alen)))
1539 +         )
1540          continue;
1541
1542        if (utf8) {
1543 @@ -1105,7 +1170,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
1544          sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
1545        }
1546
1547 -      // check special pronounciation
1548 +      // check special pronunciation
1549        f.clear();
1550        if ((hp->var & H_OPT_PHON) &&
1551            copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
1552 @@ -1559,7 +1624,8 @@ int SuggestMgr::checkword(const std::string& word,
1553      if (rv) {
1554        if ((rv->astr) &&
1555            (TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
1556 -           TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen)))
1557 +           TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen) ||
1558 +           TESTAFF(rv->astr, pAMgr->get_substandard(), rv->alen)))
1559          return 0;
1560        while (rv) {
1561          if (rv->astr &&
1562 @@ -1584,7 +1650,7 @@ int SuggestMgr::checkword(const std::string& word,
1563      if (!rv && pAMgr->have_contclass()) {
1564        rv = pAMgr->suffix_check_twosfx(word.c_str(), word.size(), 0, NULL, FLAG_NULL);
1565        if (!rv)
1566 -        rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 1, FLAG_NULL);
1567 +        rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 0, FLAG_NULL);
1568      }
1569
1570      // check forbidden words
1571 diff --git a/src/hunspell/suggestmgr.hxx b/src/hunspell/suggestmgr.hxx
1572 index 19ffc03..f0daf23 100644
1573 --- a/src/hunspell/suggestmgr.hxx
1574 +++ b/src/hunspell/suggestmgr.hxx
1575 @@ -109,6 +109,7 @@ class SuggestMgr {
1576    char* ctry;
1577    size_t ctryl;
1578    std::vector<w_char> ctry_utf;
1579 +  bool lang_with_dash_usage;
1580
1581    AffixMgr* pAMgr;
1582    unsigned int maxSug;
1583 @@ -124,8 +125,8 @@ class SuggestMgr {
1584    SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr);
1585    ~SuggestMgr();
1586
1587 -  void suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug);
1588 -  void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr);
1589 +  bool suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug);
1590 +  void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr, int captype);
1591
1592    std::string suggest_morph(const std::string& word);
1593    std::string suggest_gen(const std::vector<std::string>& pl, const std::string& pattern);
1594 @@ -149,7 +150,7 @@ class SuggestMgr {
1595    int extrachar(std::vector<std::string>&, const char*, int);
1596    int badcharkey(std::vector<std::string>&, const char*, int);
1597    int badchar(std::vector<std::string>&, const char*, int);
1598 -  int twowords(std::vector<std::string>&, const char*, int);
1599 +  bool twowords(std::vector<std::string>&, const char*, int, bool);
1600
1601    void capchars_utf(std::vector<std::string>&, const w_char*, int wl, int);
1602    int doubletwochars_utf(std::vector<std::string>&, const w_char*, int wl, int);
1603 --
1604 2.7.4
1605