extensions/spellcheck/hunspell/src/affentry.cpp

   1 /******* BEGIN LICENSE BLOCK *******
   2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   3  *
   4  * The contents of this file are subject to the Mozilla Public License Version
   5  * 1.1 (the "License"); you may not use this file except in compliance with
   6  * the License. You may obtain a copy of the License at
   7  * http://www.mozilla.org/MPL/
   8  *
   9  * Software distributed under the License is distributed on an "AS IS" basis,
  10  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11  * for the specific language governing rights and limitations under the
  12  * License.
  13  *
  14  * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
  15  * and László Németh (Hunspell). Portions created by the Initial Developers
  16  * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
  17  *
  18  * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
  19  *                 David Einstein (deinst@world.std.com)
  20  *                 László Németh (nemethl@gyorsposta.hu)
  21  *                 Davide Prina
  22  *                 Giuseppe Modugno
  23  *                 Gianluca Turconi
  24  *                 Simon Brouwer
  25  *                 Noll Janos
  26  *                 Biro Arpad
  27  *                 Goldman Eleonora
  28  *                 Sarlos Tamas
  29  *                 Bencsath Boldizsar
  30  *                 Halacsy Peter
  31  *                 Dvornik Laszlo
  32  *                 Gefferth Andras
  33  *                 Nagy Viktor
  34  *                 Varga Daniel
  35  *                 Chris Halls
  36  *                 Rene Engelhard
  37  *                 Bram Moolenaar
  38  *                 Dafydd Jones
  39  *                 Harri Pitkanen
  40  *                 Andras Timar
  41  *                 Tor Lillqvist
  42  *
  43  * Alternatively, the contents of this file may be used under the terms of
  44  * either the GNU General Public License Version 2 or later (the "GPL"), or
  45  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  46  * in which case the provisions of the GPL or the LGPL are applicable instead
  47  * of those above. If you wish to allow use of your version of this file only
  48  * under the terms of either the GPL or the LGPL, and not to allow others to
  49  * use your version of this file under the terms of the MPL, indicate your
  50  * decision by deleting the provisions above and replace them with the notice
  51  * and other provisions required by the GPL or the LGPL. If you do not delete
  52  * the provisions above, a recipient may use your version of this file under
  53  * the terms of any one of the MPL, the GPL or the LGPL.
  54  *
  55  ******* END LICENSE BLOCK *******/
  56
  57 #ifndef MOZILLA_CLIENT
  58 #include <cstdlib>
  59 #include <cstring>
  60 #include <cctype>
  61 #include <cstdio>
  62 #else
  63 #include <stdlib.h>
  64 #include <string.h>
  65 #include <stdio.h>
  66 #include <ctype.h>
  67 #endif
  68
  69 #include "affentry.hxx"
  70 #include "csutil.hxx"
  71
  72 #ifndef MOZILLA_CLIENT
  73 #ifndef W32
  74 using namespace std;
  75 #endif
  76 #endif
  77
  78
  79 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
  80 {
  81   // register affix manager
  82   pmyMgr = pmgr;
  83
  84   // set up its intial values
  85
  86   aflag = dp->aflag;         // flag
  87   strip = dp->strip;         // string to strip
  88   appnd = dp->appnd;         // string to append
  89   stripl = dp->stripl;       // length of strip string
  90   appndl = dp->appndl;       // length of append string
  91   numconds = dp->numconds;   // length of the condition
  92   opts = dp->opts;           // cross product flag
  93   // then copy over all of the conditions
  94   if (opts & aeLONGCOND) {
  95     memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
  96     c.l.conds2 = dp->c.l.conds2;
  97   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
  98   next = NULL;
  99   nextne = NULL;
 100   nexteq = NULL;
 101   morphcode = dp->morphcode;
 102   contclass = dp->contclass;
 103   contclasslen = dp->contclasslen;
 104 }
 105
 106
 107 PfxEntry::~PfxEntry()
 108 {
 109     aflag = 0;
 110     if (appnd) free(appnd);
 111     if (strip) free(strip);
 112     pmyMgr = NULL;
 113     appnd = NULL;
 114     strip = NULL;
 115     if (opts & aeLONGCOND) free(c.l.conds2);
 116     if (morphcode && !(opts & aeALIASM)) free(morphcode);
 117     if (contclass && !(opts & aeALIASF)) free(contclass);
 118 }
 119
 120 // add prefix to this word assuming conditions hold
 121 char * PfxEntry::add(const char * word, int len)
 122 {
 123     char tword[MAXWORDUTF8LEN + 4];
 124
 125     if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
 126        (len >= numconds) && test_condition(word) &&
 127        (!stripl || (strncmp(word, strip, stripl) == 0)) &&
 128        ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
 129     /* we have a match so add prefix */
 130               char * pp = tword;
 131               if (appndl) {
 132                   strcpy(tword,appnd);
 133                   pp += appndl;
 134                }
 135                strcpy(pp, (word + stripl));
 136                return mystrdup(tword);
 137      }
 138      return NULL;
 139 }
 140
 141 inline char * PfxEntry::nextchar(char * p) {
 142     if (p) {
 143         p++;
 144         if (opts & aeLONGCOND) {
 145             // jump to the 2nd part of the condition
 146             if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
 147         // end of the MAXCONDLEN length condition
 148         } else if (p == c.conds + MAXCONDLEN) return NULL;
 149         return *p ? p : NULL;
 150     }
 151     return NULL;
 152 }
 153
 154 inline int PfxEntry::test_condition(const char * st)
 155 {
         // Match the beginning of the NUL-terminated string `st` against this
         // entry's condition pattern.  Returns 1 when the pattern is satisfied
         // (always so when numconds is 0) and 0 otherwise.
         // Pattern syntax handled below: '[' ... ']' is a character group,
         // '^' sets the negation flag, '.' outside a group consumes one
         // character, every other byte is compared literally.  When the
         // aeUTF8 bit is set in opts, a character may span several bytes.
 156     const char * pos = NULL; // non-NULL while inside a group: input position at '['
 157     bool neg = false;        // true after '^' was seen
 158     bool ingroup = false;    // true once a member of the current group has matched
 159     if (numconds == 0) return 1;
 160     char * p = c.conds;
 161     while (1) {
 162       switch (*p) {
               // end of the pattern: everything matched
 163         case '\0': return 1;
               // group start: reset the group state and remember where the
               // input stood so that failed alternatives can rewind to it
 164         case '[': {
 165                 neg = false;
 166                 ingroup = false;
 167                 p = nextchar(p);
 168                 pos = st; break;
 169             }
               // NOTE(review): presumably only meaningful right after '[';
               // the flag is set wherever '^' appears in the pattern
 170         case '^': { p = nextchar(p); neg = true; break; }
 171         case ']': {
                     // fail if a negated group matched or a plain group did not
 172                 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
 173                 pos = NULL;
 174                 p = nextchar(p);
 175                 // no member matched (negated group): step over one input
                     // character, including UTF-8 continuation bytes; after a
                     // match the default branch has already advanced st
 176                 if (!ingroup) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
 177                 if (*st == '\0' && p) return 0; // input exhausted but pattern continues
 178                 break;
 179             }
               // inside a group (pos != NULL) the if-body is skipped and
               // control falls through to the literal comparison below
 180          case '.': if (!pos) { // dots are not metacharacters in groups: [.]
 181                 p = nextchar(p);
 182                 // skip one input character (with its UTF-8 continuation bytes)
 183                 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
 184                 if (*st == '\0' && p) return 0; // input exhausted but pattern continues
 185                 break;
 186             }
 187     default: {
                     // literal byte comparison
 188                 if (*st == *p) {
 189                     st++;
 190                     p = nextchar(p);
 191                     if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
                             // compare the continuation bytes of the pattern
                             // character with the following input bytes
 192                         while (p && (*p & 0xc0) == 0x80) {       // character
 193                             if (*p != *st) {
                                     // mismatch: fatal outside a group, inside a
                                     // group rewind the input to the group start
 194                                 if (!pos) return 0;
 195                                 st = pos;
 196                                 break;
 197                             }
 198                             p = nextchar(p);
 199                             st++;
 200                         }
                             // input moved past the group start: a member
                             // matched, so skip the pattern ahead to ']'
 201                         if (pos && st != pos) {
 202                             ingroup = true;
 203                             while (p && *p != ']' && (p = nextchar(p)));
 204                         }
 205                     } else if (pos) {
                             // single-byte member matched: skip ahead to ']'
 206                         ingroup = true;
 207                         while (p && *p != ']' && (p = nextchar(p)));
 208                     }
 209                 } else if (pos) { // group: try the next pattern byte
 210                     p = nextchar(p);
 211                 } else return 0;  // literal mismatch outside a group
 212             }
 213       }
           // nextchar() returned NULL: the pattern is exhausted
 214       if (!p) return 1;
 215     }
 216 }
 217
 218 // check if this prefix entry matches
 219 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
 220 {
 221     int                 tmpl;   // length of tmpword
 222     struct hentry *     he;     // hash entry of root word or NULL
 223     char                tmpword[MAXWORDUTF8LEN + 4];
 224
 225     // on entry prefix is 0 length or already matches the beginning of the word.
 226     // So if the remaining root word has positive length
 227     // and if there are enough chars in root word and added back strip chars
 228     // to meet the number of characters conditions, then test it
 229
 230      tmpl = len - appndl;
 231
 232      if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
 233
 234             // generate new root word by removing prefix and adding
 235             // back any characters that would have been stripped
 236
 237             if (stripl) strcpy (tmpword, strip);
 238             strcpy ((tmpword + stripl), (word + appndl));
 239
 240             // now make sure all of the conditions on characters
 241             // are met.  Please see the appendix at the end of
 242             // this file for more info on exactly what is being
 243             // tested
 244
 245             // if all conditions are met then check if resulting
 246             // root word in the dictionary
 247
 248             if (test_condition(tmpword)) {
 249                 tmpl += stripl;
 250                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
 251                    do {
 252                       if (TESTAFF(he->astr, aflag, he->alen) &&
 253                         // forbid single prefixes with needaffix flag
 254                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
 255                         // needflag
 256                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
 257                          (contclass && TESTAFF(contclass, needflag, contclasslen))))
 258                             return he;
 259                       he = he->next_homonym; // check homonyms
 260                    } while (he);
 261                 }
 262
 263                 // prefix matched but no root word was found
 264                 // if aeXPRODUCT is allowed, try again but now
 265                 // ross checked combined with a suffix
 266
 267                 //if ((opts & aeXPRODUCT) && in_compound) {
 268                 if ((opts & aeXPRODUCT)) {
 269                    he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL,
 270                         0, NULL, FLAG_NULL, needflag, in_compound);
 271                    if (he) return he;
 272                 }
 273             }
 274      }
 275     return NULL;
 276 }
 277
 278 // check if this prefix entry matches
 279 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
 280     char in_compound, const FLAG needflag)
 281 {
 282     int                 tmpl;   // length of tmpword
 283     struct hentry *     he;     // hash entry of root word or NULL
 284     char                tmpword[MAXWORDUTF8LEN + 4];
 285
 286     // on entry prefix is 0 length or already matches the beginning of the word.
 287     // So if the remaining root word has positive length
 288     // and if there are enough chars in root word and added back strip chars
 289     // to meet the number of characters conditions, then test it
 290
 291      tmpl = len - appndl;
 292
 293      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 294         (tmpl + stripl >= numconds)) {
 295
 296             // generate new root word by removing prefix and adding
 297             // back any characters that would have been stripped
 298
 299             if (stripl) strcpy (tmpword, strip);
 300             strcpy ((tmpword + stripl), (word + appndl));
 301
 302             // now make sure all of the conditions on characters
 303             // are met.  Please see the appendix at the end of
 304             // this file for more info on exactly what is being
 305             // tested
 306
 307             // if all conditions are met then check if resulting
 308             // root word in the dictionary
 309
 310             if (test_condition(tmpword)) {
 311                 tmpl += stripl;
 312
 313                 // prefix matched but no root word was found
 314                 // if aeXPRODUCT is allowed, try again but now
 315                 // cross checked combined with a suffix
 316
 317                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
 318                    he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, needflag);
 319                    if (he) return he;
 320                 }
 321             }
 322      }
 323     return NULL;
 324 }
 325
 326 // check if this prefix entry matches
 327 char * PfxEntry::check_twosfx_morph(const char * word, int len,
 328          char in_compound, const FLAG needflag)
 329 {
 330     int                 tmpl;   // length of tmpword
 331     char                tmpword[MAXWORDUTF8LEN + 4];
 332
 333     // on entry prefix is 0 length or already matches the beginning of the word.
 334     // So if the remaining root word has positive length
 335     // and if there are enough chars in root word and added back strip chars
 336     // to meet the number of characters conditions, then test it
 337
 338      tmpl = len - appndl;
 339
 340      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 341         (tmpl + stripl >= numconds)) {
 342
 343             // generate new root word by removing prefix and adding
 344             // back any characters that would have been stripped
 345
 346             if (stripl) strcpy (tmpword, strip);
 347             strcpy ((tmpword + stripl), (word + appndl));
 348
 349             // now make sure all of the conditions on characters
 350             // are met.  Please see the appendix at the end of
 351             // this file for more info on exactly what is being
 352             // tested
 353
 354             // if all conditions are met then check if resulting
 355             // root word in the dictionary
 356
 357             if (test_condition(tmpword)) {
 358                 tmpl += stripl;
 359
 360                 // prefix matched but no root word was found
 361                 // if aeXPRODUCT is allowed, try again but now
 362                 // ross checked combined with a suffix
 363
 364                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
 365                     return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
 366                              aeXPRODUCT, (AffEntry *)this, needflag);
 367                 }
 368             }
 369      }
 370     return NULL;
 371 }
 372
 373 // check if this prefix entry matches
 374 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
 375 {
 376     int                 tmpl;   // length of tmpword
 377     struct hentry *     he;     // hash entry of root word or NULL
 378     char                tmpword[MAXWORDUTF8LEN + 4];
 379     char                result[MAXLNLEN];
 380     char * st;
 381
 382     *result = '\0';
 383
 384     // on entry prefix is 0 length or already matches the beginning of the word.
 385     // So if the remaining root word has positive length
 386     // and if there are enough chars in root word and added back strip chars
 387     // to meet the number of characters conditions, then test it
 388
 389      tmpl = len - appndl;
 390
 391      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 392         (tmpl + stripl >= numconds)) {
 393
 394             // generate new root word by removing prefix and adding
 395             // back any characters that would have been stripped
 396
 397             if (stripl) strcpy (tmpword, strip);
 398             strcpy ((tmpword + stripl), (word + appndl));
 399
 400             // now make sure all of the conditions on characters
 401             // are met.  Please see the appendix at the end of
 402             // this file for more info on exactly what is being
 403             // tested
 404
 405             // if all conditions are met then check if resulting
 406             // root word in the dictionary
 407
 408             if (test_condition(tmpword)) {
 409                 tmpl += stripl;
 410                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
 411                     do {
 412                       if (TESTAFF(he->astr, aflag, he->alen) &&
 413                         // forbid single prefixes with needaffix flag
 414                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
 415                         // needflag
 416                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
 417                          (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
 418                             if (morphcode) {
 419                                 mystrcat(result, " ", MAXLNLEN);
 420                                 mystrcat(result, morphcode, MAXLNLEN);
 421                             } else mystrcat(result,getKey(), MAXLNLEN);
 422                             if (!HENTRY_FIND(he, MORPH_STEM)) {
 423                                 mystrcat(result, " ", MAXLNLEN);
 424                                 mystrcat(result, MORPH_STEM, MAXLNLEN);
 425                                 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
 426                             }
 427                             // store the pointer of the hash entry
 428                             if (HENTRY_DATA(he)) {
 429                                 mystrcat(result, " ", MAXLNLEN);
 430                                 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
 431                             } else {
 432                                 // return with debug information
 433                                 char * flag = pmyMgr->encode_flag(getFlag());
 434                                 mystrcat(result, " ", MAXLNLEN);
 435                                 mystrcat(result, MORPH_FLAG, MAXLNLEN);
 436                                 mystrcat(result, flag, MAXLNLEN);
 437                                 free(flag);
 438                             }
 439                             mystrcat(result, "\n", MAXLNLEN);
 440                       }
 441                       he = he->next_homonym;
 442                     } while (he);
 443                 }
 444
 445                 // prefix matched but no root word was found
 446                 // if aeXPRODUCT is allowed, try again but now
 447                 // ross checked combined with a suffix
 448
 449                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
 450                    st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this,
 451                      FLAG_NULL, needflag);
 452                    if (st) {
 453                         mystrcat(result, st, MAXLNLEN);
 454                         free(st);
 455                    }
 456                 }
 457             }
 458      }
 459
 460     if (*result) return mystrdup(result);
 461     return NULL;
 462 }
 463
 464 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
 465 {
 466   // register affix manager
 467   pmyMgr = pmgr;
 468
 469   // set up its intial values
 470   aflag = dp->aflag;         // char flag
 471   strip = dp->strip;         // string to strip
 472   appnd = dp->appnd;         // string to append
 473   stripl = dp->stripl;       // length of strip string
 474   appndl = dp->appndl;       // length of append string
 475   numconds = dp->numconds;   // length of the condition
 476   opts = dp->opts;           // cross product flag
 477
 478   // then copy over all of the conditions
 479   if (opts & aeLONGCOND) {
 480     memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
 481     c.l.conds2 = dp->c.l.conds2;
 482   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
 483
 484   rappnd = myrevstrdup(appnd);
 485   morphcode = dp->morphcode;
 486   contclass = dp->contclass;
 487   contclasslen = dp->contclasslen;
 488 }
 489
 490
 491 SfxEntry::~SfxEntry()
 492 {
 493     aflag = 0;
 494     if (appnd) free(appnd);
 495     if (rappnd) free(rappnd);
 496     if (strip) free(strip);
 497     pmyMgr = NULL;
 498     appnd = NULL;
 499     strip = NULL;
 500     if (opts & aeLONGCOND) free(c.l.conds2);
 501     if (morphcode && !(opts & aeALIASM)) free(morphcode);
 502     if (contclass && !(opts & aeALIASF)) free(contclass);
 503 }
 504
 505 // add suffix to this word assuming conditions hold
 506 char * SfxEntry::add(const char * word, int len)
 507 {
 508     char                tword[MAXWORDUTF8LEN + 4];
 509
 510      /* make sure all conditions match */
 511      if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
 512         (len >= numconds) && test_condition(word + len, word) &&
 513         (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
 514         ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
 515               /* we have a match so add suffix */
 516               strcpy(tword,word);
 517               if (appndl) {
 518                   strcpy(tword + len - stripl, appnd);
 519               } else {
 520                   *(tword + len - stripl) = '\0';
 521               }
 522               return mystrdup(tword);
 523      }
 524      return NULL;
 525 }
 526
 527 inline char * SfxEntry::nextchar(char * p) {
 528     if (p) {
 529         p++;
 530         if (opts & aeLONGCOND) {
 531             // jump to the 2nd part of the condition
 532             if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
 533         // end of the MAXCONDLEN length condition
 534         } else if (p == c.conds + MAXCONDLEN) return NULL;
 535         return *p ? p : NULL;
 536     }
 537     return NULL;
 538 }
 539
 540 inline int SfxEntry::test_condition(const char * st, const char * beg)
 541 {
         // Match the end of a word against this entry's condition pattern.
         // `st` points one past the last byte to be examined and `beg` at the
         // first byte of the word; the input is walked backwards (st--) while
         // the pattern is walked forwards with nextchar().
         // NOTE(review): this implies the pattern is stored in reverse order
         // for suffixes -- confirm where the conditions are encoded.
         // Returns 1 when the pattern is satisfied (always so when numconds
         // is 0) and 0 otherwise.
 542     const char * pos = NULL;    // non-NULL while inside a group: input position at '['
 543     bool neg = false;           // true after '^' was seen
 544     bool ingroup = false;       // true once a member of the current group has matched
 545     if (numconds == 0) return 1;
 546     char * p = c.conds;
         // move to the last byte of the input
 547     st--;
         // counter compared with numconds below; incremented per literal
         // match outside a group and per closed group
 548     int i = 1;
 549     while (1) {
 550       switch (*p) {
               // end of the pattern: everything matched
 551         case '\0': return 1;
               // group start: remember the input position for rewinding
 552         case '[': { p = nextchar(p); pos = st; break; }
               // NOTE(review): presumably only meaningful right after '['
 553         case '^': { p = nextchar(p); neg = true; break; }
               // group end: a plain group without a matching member fails
               // (a matching member of a negated group already failed below)
 554         case ']': { if (!neg && !ingroup) return 0;
 555                 i++;
 556                 // no member matched (negated group): step back over one
                     // input character, including UTF-8 continuation bytes
 557                 if (!ingroup) {
 558                     for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
 559                     st--;
 560                 }
                     // leave the group and reset its state
 561                 pos = NULL;
 562                 neg = false;
 563                 ingroup = false;
 564                 p = nextchar(p);
 565                 if (st < beg && p) return 0; // input exhausted but pattern continues
 566                 break;
 567             }
               // inside a group (pos != NULL) the if-body is skipped and
               // control falls through to the literal comparison below
 568         case '.': if (!pos) { // dots are not metacharacters in groups: [.]
 569                 p = nextchar(p);
 570                 // step back one byte, then over UTF-8 continuation bytes
 571                 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
 572                 if (st < beg) { // input exhausted: fine only if the pattern ended too
 573                     if (p) return 0; else return 1;
 574                 }
                     // in UTF-8 mode, if the byte now under st has its high
                     // bit set, step back once more
 575                 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
 576                     st--;
 577                     if (st < beg) { // input exhausted: fine only if the pattern ended too
 578                         if (p) return 0; else return 1;
 579                     }
 580                 }
 581                 break;
 582             }
 583     default: {
                     // literal byte comparison
 584                 if (*st == *p) {
 585                     p = nextchar(p);
 586                     if ((opts & aeUTF8) && (*st & 0x80)) {
                             // multibyte character: compare the remaining
                             // bytes while moving backwards through the input
 587                         st--;
 588                         while (p && (st >= beg)) {
 589                             if (*p != *st) {
                                     // mismatch: fatal outside a group, inside a
                                     // group rewind the input to the group start
 590                                 if (!pos) return 0;
 591                                 st = pos;
 592                                 break;
 593                             }
 594                             // first byte of the UTF-8 multibyte character
 595                             if ((*p & 0xc0) != 0x80) break;
 596                             p = nextchar(p);
 597                             st--;
 598                         }
                             // input moved away from the group start: a
                             // member of the group matched
 599                         if (pos && st != pos) {
                                 // a match inside a negated group fails; a
                                 // match in the last condition succeeds at once
 600                             if (neg) return 0;
 601                             else if (i == numconds) return 1;
 602                             ingroup = true;
                                 // skip the pattern ahead to the closing ']'
 603                             while (p && *p != ']' && (p = nextchar(p)));
 604                             st--;
 605                         }
 606                         if (p && *p != ']') p = nextchar(p);
 607                     } else if (pos) {
                             // single-byte member of a group matched; same
                             // handling as in the multibyte case above
 608                         if (neg) return 0;
 609                         else if (i == numconds) return 1;
 610                         ingroup = true;
 611                         while (p && *p != ']' && (p = nextchar(p)));
 612 //                      if (p && *p != ']') p = nextchar(p);
 613                         st--;
 614                     }
                         // literal outside a group: count it and step back
 615                     if (!pos) {
 616                         i++;
 617                         st--;
 618                     }
 619                     if (st < beg && p && *p != ']') return 0; // input exhausted but pattern continues
 620                 } else if (pos) { // group: try the next pattern byte
 621                     p = nextchar(p);
 622                 } else return 0;  // literal mismatch outside a group
 623             }
 624       }
           // nextchar() returned NULL: the pattern is exhausted
 625       if (!p) return 1;
 626     }
 627 }
 628
 629 // see if this suffix is present in the word
 630 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
 631     AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
 632     const FLAG badflag)
 633 {
 634     int                 tmpl;            // length of tmpword
 635     struct hentry *     he;              // hash entry pointer
 636     unsigned char *     cp;
 637     char                tmpword[MAXWORDUTF8LEN + 4];
 638     PfxEntry* ep = (PfxEntry *) ppfx;
 639
 640     // if this suffix is being cross checked with a prefix
 641     // but it does not support cross products skip it
 642
 643     if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
 644         return NULL;
 645
 646     // upon entry suffix is 0 length or already matches the end of the word.
 647     // So if the remaining root word has positive length
 648     // and if there are enough chars in root word and added back strip chars
 649     // to meet the number of characters conditions, then test it
 650
 651     tmpl = len - appndl;
 652     // the second condition is not enough for UTF-8 strings
 653     // it checked in test_condition()
 654
 655     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 656         (tmpl + stripl >= numconds)) {
 657
 658             // generate new root word by removing suffix and adding
 659             // back any characters that would have been stripped or
 660             // or null terminating the shorter string
 661
 662             strcpy (tmpword, word);
 663             cp = (unsigned char *)(tmpword + tmpl);
 664             if (stripl) {
 665                 strcpy ((char *)cp, strip);
 666                 tmpl += stripl;
 667                 cp = (unsigned char *)(tmpword + tmpl);
 668             } else *cp = '\0';
 669
 670             // now make sure all of the conditions on characters
 671             // are met.  Please see the appendix at the end of
 672             // this file for more info on exactly what is being
 673             // tested
 674
 675             // if all conditions are met then check if resulting
 676             // root word in the dictionary
 677
 678             if (test_condition((char *) cp, (char *) tmpword)) {
 679
 680 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
 681                 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
 682 #endif
 683                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
 684                     do {
 685                         // check conditional suffix (enabled by prefix)
 686                         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
 687                                     TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
 688                             (((optflags & aeXPRODUCT) == 0) ||
 689                             TESTAFF(he->astr, ep->getFlag(), he->alen) ||
 690                              // enabled by prefix
 691                             ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
 692                             ) &&
 693                             // handle cont. class
 694                             ((!cclass) ||
 695                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
 696                             ) &&
 697                             // check only in compound homonyms (bad flags)
 698                             (!badflag || !TESTAFF(he->astr, badflag, he->alen)
 699                             ) &&
 700                             // handle required flag
 701                             ((!needflag) ||
 702                               (TESTAFF(he->astr, needflag, he->alen) ||
 703                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
 704                             )
 705                         ) return he;
 706                         he = he->next_homonym; // check homonyms
 707                     } while (he);
 708
 709                 // obsolote stemming code (used only by the
 710                 // experimental SuffixMgr:suggest_pos_stems)
 711                 // store resulting root in wlst
 712                 } else if (wlst && (*ns < maxSug)) {
 713                     int cwrd = 1;
 714                     for (int k=0; k < *ns; k++)
 715                         if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
 716                     if (cwrd) {
 717                         wlst[*ns] = mystrdup(tmpword);
 718                         if (wlst[*ns] == NULL) {
 719                             for (int j=0; j<*ns; j++) free(wlst[j]);
 720                             *ns = -1;
 721                             return NULL;
 722                         }
 723                         (*ns)++;
 724                     }
 725                 }
 726             }
 727     }
 728     return NULL;
 729 }
 730
 731 // see if two-level suffix is present in the word
 732 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
 733     AffEntry* ppfx, const FLAG needflag)
 734 {
 735     int                 tmpl;            // length of tmpword
 736     struct hentry *     he;              // hash entry pointer
 737     unsigned char *     cp;
 738     char                tmpword[MAXWORDUTF8LEN + 4];
 739     PfxEntry* ep = (PfxEntry *) ppfx;
 740
 741
 742     // if this suffix is being cross checked with a prefix
 743     // but it does not support cross products skip it
 744
 745     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
 746         return NULL;
 747
 748     // upon entry suffix is 0 length or already matches the end of the word.
 749     // So if the remaining root word has positive length
 750     // and if there are enough chars in root word and added back strip chars
 751     // to meet the number of characters conditions, then test it
 752
 753     tmpl = len - appndl;
 754
 755     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 756        (tmpl + stripl >= numconds)) {
 757
 758             // generate new root word by removing suffix and adding
 759             // back any characters that would have been stripped or
 760             // or null terminating the shorter string
 761
 762             strcpy (tmpword, word);
 763             cp = (unsigned char *)(tmpword + tmpl);
 764             if (stripl) {
 765                 strcpy ((char *)cp, strip);
 766                 tmpl += stripl;
 767                 cp = (unsigned char *)(tmpword + tmpl);
 768             } else *cp = '\0';
 769
 770             // now make sure all of the conditions on characters
 771             // are met.  Please see the appendix at the end of
 772             // this file for more info on exactly what is being
 773             // tested
 774
 775             // if all conditions are met then recall suffix_check
 776
 777             if (test_condition((char *) cp, (char *) tmpword)) {
 778                 if (ppfx) {
 779                     // handle conditional suffix
 780                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
 781                         he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
 782                     else
 783                         he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
 784                 } else {
 785                     he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
 786                 }
 787                 if (he) return he;
 788             }
 789     }
 790     return NULL;
 791 }
 792
 793 // see if two-level suffix is present in the word
 794 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
 795     AffEntry* ppfx, const FLAG needflag)
 796 {
 797     int                 tmpl;            // length of tmpword
 798     unsigned char *     cp;
 799     char                tmpword[MAXWORDUTF8LEN + 4];
 800     PfxEntry* ep = (PfxEntry *) ppfx;
 801     char * st;
 802
 803     char result[MAXLNLEN];
 804
 805     *result = '\0';
 806
 807     // if this suffix is being cross checked with a prefix
 808     // but it does not support cross products skip it
 809
 810     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
 811         return NULL;
 812
 813     // upon entry suffix is 0 length or already matches the end of the word.
 814     // So if the remaining root word has positive length
 815     // and if there are enough chars in root word and added back strip chars
 816     // to meet the number of characters conditions, then test it
 817
 818     tmpl = len - appndl;
 819
 820     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 821        (tmpl + stripl >= numconds)) {
 822
 823             // generate new root word by removing suffix and adding
 824             // back any characters that would have been stripped or
 825             // or null terminating the shorter string
 826
 827             strcpy (tmpword, word);
 828             cp = (unsigned char *)(tmpword + tmpl);
 829             if (stripl) {
 830                 strcpy ((char *)cp, strip);
 831                 tmpl += stripl;
 832                 cp = (unsigned char *)(tmpword + tmpl);
 833             } else *cp = '\0';
 834
 835             // now make sure all of the conditions on characters
 836             // are met.  Please see the appendix at the end of
 837             // this file for more info on exactly what is being
 838             // tested
 839
 840             // if all conditions are met then recall suffix_check
 841
 842             if (test_condition((char *) cp, (char *) tmpword)) {
 843                 if (ppfx) {
 844                     // handle conditional suffix
 845                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
 846                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
 847                         if (st) {
 848                             if (((PfxEntry *) ppfx)->getMorph()) {
 849                                 mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN);
 850                                 mystrcat(result, " ", MAXLNLEN);
 851                             }
 852                             mystrcat(result,st, MAXLNLEN);
 853                             free(st);
 854                             mychomp(result);
 855                         }
 856                     } else {
 857                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
 858                         if (st) {
 859                             mystrcat(result, st, MAXLNLEN);
 860                             free(st);
 861                             mychomp(result);
 862                         }
 863                     }
 864                 } else {
 865                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
 866                         if (st) {
 867                             mystrcat(result, st, MAXLNLEN);
 868                             free(st);
 869                             mychomp(result);
 870                         }
 871                 }
 872                 if (*result) return mystrdup(result);
 873             }
 874     }
 875     return NULL;
 876 }
 877
 878 // get next homonym with same affix
 879 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx,
 880     const FLAG cclass, const FLAG needflag)
 881 {
 882     PfxEntry* ep = (PfxEntry *) ppfx;
 883     FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
 884
 885     while (he->next_homonym) {
 886         he = he->next_homonym;
 887         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
 888                             ((optflags & aeXPRODUCT) == 0 ||
 889                             TESTAFF(he->astr, eFlag, he->alen) ||
 890                              // handle conditional suffix
 891                             ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
 892                             ) &&
 893                             // handle cont. class
 894                             ((!cclass) ||
 895                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
 896                             ) &&
 897                             // handle required flag
 898                             ((!needflag) ||
 899                               (TESTAFF(he->astr, needflag, he->alen) ||
 900                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
 901                             )
 902                         ) return he;
 903     }
 904     return NULL;
 905 }
 906
 907
 908 #if 0
 909
 910 Appendix:  Understanding Affix Code
 911
 912
 913 An affix is either a  prefix or a suffix attached to root words to make
 914 other words.
 915
 916 Basically a Prefix or a Suffix is a set of AffEntry objects
 917 which store information about the prefix or suffix along
 918 with supporting routines to check if a word has a particular
 919 prefix or suffix or a combination.
 920
 921 The structure affentry is defined as follows:
 922
 923 struct affentry
 924 {
 925    unsigned short aflag;    // ID used to represent the affix
 926    char * strip;            // string to strip before adding affix
 927    char * appnd;            // the affix string to add
 928    unsigned char stripl;    // length of the strip string
 929    unsigned char appndl;    // length of the affix string
 930    char numconds;           // the number of conditions that must be met
 931    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix
 932    char   conds[SETSIZE];   // array which encodes the conditions to be met
 933 };
 934
 935
 936 Here is a suffix borrowed from the en_US.aff file.  This file
 937 is whitespace delimited.
 938
 939 SFX D Y 4
 940 SFX D   0     e          d
 941 SFX D   y     ied        [^aeiou]y
 942 SFX D   0     ed         [^ey]
 943 SFX D   0     ed         [aeiou]y
 944
 945 This information can be interpreted as follows:
 946
 947 The first line has 4 fields
 948
 949 Field
 950 -----
 951 1     SFX - indicates this is a suffix
 952 2     D   - is the name of the character flag which represents this suffix
 953 3     Y   - indicates it can be combined with prefixes (cross product)
 954 4     4   - indicates that sequence of 4 affentry structures are needed to
 955                properly store the affix information
 956
 957 The remaining lines describe the unique information for the 4 SfxEntry
 958 objects that make up this affix.  Each line can be interpreted
 959 as follows: (note fields 1 and 2 are as a check against line 1 info)
 960
 961 Field
 962 -----
 963 1     SFX         - indicates this is a suffix
 964 2     D           - is the name of the character flag for this affix
 965 3     y           - the string of chars to strip off before adding affix
 966                          (a 0 here indicates the NULL string)
 967 4     ied         - the string of affix characters to add
 968 5     [^aeiou]y   - the conditions which must be met before the affix
 969                     can be applied
 970
 971 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
 972 there are 2 conditions that must be met.  The first condition is that
 973 the next to the last character in the word must *NOT* be any of the
 974 following "a", "e", "i", "o" or "u".  The second condition is that
 975 the last character of the word must end in "y".
 976
 977 So how can we encode this information concisely and be able to
 978 test for both conditions in a fast manner?  The answer is found
 979 by studying the wonderful ispell code of Geoff Kuenning, et al.
 980 (now available under a normal BSD license).
 981
 982 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
 983 using a character (cast to an unsigned char) of a string, we have 8 bits
 984 of information we can store about that character.  Specifically we
 985 could use each bit to say if that character is allowed in any of the
 986 last (or first for prefixes) 8 characters of the word.
 987
 988 Basically, each character at one end of the word (up to the number
 989 of conditions) is used to index into the conds array and the resulting
 990 value found there says whether that character is valid for a
 991 specific character position in the word.
 992
 993 For prefixes, it does this by setting bit 0 if that char is valid
 994 in the first position, bit 1 if valid in the second position, and so on.
 995
 996 If a bit is not set, then that char is not valid for that position in the
 997 word.
 998
 999 If working with suffixes bit 0 is used for the character closest
1000 to the front, bit 1 for the next character towards the end, ...,
1001 with bit numconds-1 representing the last char at the end of the string.
1002
1003 Note: since entries in the conds[] are 8 bits, only 8 conditions
1004 (read that only 8 character positions) can be examined at one
1005 end of a word (the beginning for prefixes and the end for suffixes).
1006
1007 So to make this clearer, let's encode the conds array values for the
1008 first two affentries for the suffix D described earlier.
1009
1010
1011   For the first affentry:
1012      numconds = 1             (only examine the last character)
1013
1014      conds['e'] =  (1 << 0)   (the word must end in an E)
1015      all others are all 0
1016
1017   For the second affentry:
1018      numconds = 2             (only examine the last two characters)
1019
1020      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
1021          where X is all characters *but* a, e, i, o, or u
1022
1023
1024      conds['y'] = (1 << 1)     (the last char must be a y)
1025      all other bits for all other entries in the conds array are zero
1026
1027
1028 #endif