Bug 470455 - test_database_sync_embed_visits.js leaks, r=sdwilsh
[wine-gecko.git] / extensions / spellcheck / hunspell / src / affentry.cpp
blob6e300e919766d865513d1854044d93eeb9d73048
1 /******* BEGIN LICENSE BLOCK *******
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 * and László Németh (Hunspell). Portions created by the Initial Developers
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 * David Einstein (deinst@world.std.com)
20 * László Németh (nemethl@gyorsposta.hu)
21 * Davide Prina
22 * Giuseppe Modugno
23 * Gianluca Turconi
24 * Simon Brouwer
25 * Noll Janos
26 * Biro Arpad
27 * Goldman Eleonora
28 * Sarlos Tamas
29 * Bencsath Boldizsar
30 * Halacsy Peter
31 * Dvornik Laszlo
32 * Gefferth Andras
33 * Nagy Viktor
34 * Varga Daniel
35 * Chris Halls
36 * Rene Engelhard
37 * Bram Moolenaar
38 * Dafydd Jones
39 * Harri Pitkanen
40 * Andras Timar
41 * Tor Lillqvist
43 * Alternatively, the contents of this file may be used under the terms of
44 * either the GNU General Public License Version 2 or later (the "GPL"), or
45 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
46 * in which case the provisions of the GPL or the LGPL are applicable instead
47 * of those above. If you wish to allow use of your version of this file only
48 * under the terms of either the GPL or the LGPL, and not to allow others to
49 * use your version of this file under the terms of the MPL, indicate your
50 * decision by deleting the provisions above and replace them with the notice
51 * and other provisions required by the GPL or the LGPL. If you do not delete
52 * the provisions above, a recipient may use your version of this file under
53 * the terms of any one of the MPL, the GPL or the LGPL.
55 ******* END LICENSE BLOCK *******/
57 #ifndef MOZILLA_CLIENT
58 #include <cstdlib>
59 #include <cstring>
60 #include <cctype>
61 #include <cstdio>
62 #else
63 #include <stdlib.h>
64 #include <string.h>
65 #include <stdio.h>
66 #include <ctype.h>
67 #endif
69 #include "affentry.hxx"
70 #include "csutil.hxx"
72 #ifndef MOZILLA_CLIENT
73 #ifndef W32
74 using namespace std;
75 #endif
76 #endif
79 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
81 // register affix manager
82 pmyMgr = pmgr;
84 // set up its intial values
86 aflag = dp->aflag; // flag
87 strip = dp->strip; // string to strip
88 appnd = dp->appnd; // string to append
89 stripl = dp->stripl; // length of strip string
90 appndl = dp->appndl; // length of append string
91 numconds = dp->numconds; // length of the condition
92 opts = dp->opts; // cross product flag
93 // then copy over all of the conditions
94 if (opts & aeLONGCOND) {
95 memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
96 c.l.conds2 = dp->c.l.conds2;
97 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
98 next = NULL;
99 nextne = NULL;
100 nexteq = NULL;
101 morphcode = dp->morphcode;
102 contclass = dp->contclass;
103 contclasslen = dp->contclasslen;
107 PfxEntry::~PfxEntry()
109 aflag = 0;
110 if (appnd) free(appnd);
111 if (strip) free(strip);
112 pmyMgr = NULL;
113 appnd = NULL;
114 strip = NULL;
115 if (opts & aeLONGCOND) free(c.l.conds2);
116 if (morphcode && !(opts & aeALIASM)) free(morphcode);
117 if (contclass && !(opts & aeALIASF)) free(contclass);
120 // add prefix to this word assuming conditions hold
121 char * PfxEntry::add(const char * word, int len)
123 char tword[MAXWORDUTF8LEN + 4];
125 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
126 (len >= numconds) && test_condition(word) &&
127 (!stripl || (strncmp(word, strip, stripl) == 0)) &&
128 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
129 /* we have a match so add prefix */
130 char * pp = tword;
131 if (appndl) {
132 strcpy(tword,appnd);
133 pp += appndl;
135 strcpy(pp, (word + stripl));
136 return mystrdup(tword);
138 return NULL;
141 inline char * PfxEntry::nextchar(char * p) {
142 if (p) {
143 p++;
144 if (opts & aeLONGCOND) {
145 // jump to the 2nd part of the condition
146 if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
147 // end of the MAXCONDLEN length condition
148 } else if (p == c.conds + MAXCONDLEN) return NULL;
149 return *p ? p : NULL;
151 return NULL;
// Test whether the beginning of `st` satisfies this prefix's condition
// pattern.  The pattern is read from c.conds and advanced with nextchar();
// the word is scanned forward.
// Returns 1 when the whole pattern has been consumed (or numconds is 0) and
// 0 on the first mismatch or when the word ends before the pattern does.
// Pattern bytes handled by the switch below:
//   '['  opens a character group; `pos` remembers the word position
//   '^'  marks the current group as negated
//   ']'  closes the group and decides whether it matched
//   '.'  outside a group, matches any single character
//   any other byte is compared literally with the current byte of `st`
// When aeUTF8 is set in opts, bytes of the form 10xxxxxx are handled as
// continuation bytes belonging to the preceding character.
154 inline int PfxEntry::test_condition(const char * st)
156 const char * pos = NULL; // word position where the current group started (NULL outside a group)
157 bool neg = false; // current group is negated ('^' seen)
158 bool ingroup = false; // a member of the current group matched the word
159 if (numconds == 0) return 1;
160 char * p = c.conds;
161 while (1) {
162 switch (*p) {
163 case '\0': return 1;
164 case '[': {
165 neg = false;
166 ingroup = false;
167 p = nextchar(p);
168 pos = st; break;
170 case '^': { p = nextchar(p); neg = true; break; }
171 case ']': {
// a negated group fails if a member matched; a plain group fails if none did
172 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
173 pos = NULL;
174 p = nextchar(p);
175 // skip the next character
// (when a member matched, st was already advanced past it in the default case)
176 if (!ingroup) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
177 if (*st == '\0' && p) return 0; // word <= condition
178 break;
180 case '.': if (!pos) { // dots are not metacharacters in groups: [.]
181 p = nextchar(p);
182 // skip the next character
183 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
184 if (*st == '\0' && p) return 0; // word <= condition
185 break;
187 default: {
188 if (*st == *p) {
189 st++;
190 p = nextchar(p);
191 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
192 while (p && (*p & 0xc0) == 0x80) { // character
193 if (*p != *st) {
// continuation byte differs: fail outside a group, otherwise rewind the word to the group start
194 if (!pos) return 0;
195 st = pos;
196 break;
198 p = nextchar(p);
199 st++;
201 if (pos && st != pos) {
// the multibyte character matched inside a group: skip the rest of the group
202 ingroup = true;
203 while (p && *p != ']' && (p = nextchar(p)));
205 } else if (pos) {
// single-byte match inside a group: skip the rest of the group
206 ingroup = true;
207 while (p && *p != ']' && (p = nextchar(p)));
209 } else if (pos) { // group
// no match for this group member: try the next member
210 p = nextchar(p);
211 } else return 0;
// pattern exhausted: every condition was satisfied
214 if (!p) return 1;
218 // check if this prefix entry matches
219 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
221 int tmpl; // length of tmpword
222 struct hentry * he; // hash entry of root word or NULL
223 char tmpword[MAXWORDUTF8LEN + 4];
225 // on entry prefix is 0 length or already matches the beginning of the word.
226 // So if the remaining root word has positive length
227 // and if there are enough chars in root word and added back strip chars
228 // to meet the number of characters conditions, then test it
230 tmpl = len - appndl;
232 if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
234 // generate new root word by removing prefix and adding
235 // back any characters that would have been stripped
237 if (stripl) strcpy (tmpword, strip);
238 strcpy ((tmpword + stripl), (word + appndl));
240 // now make sure all of the conditions on characters
241 // are met. Please see the appendix at the end of
242 // this file for more info on exactly what is being
243 // tested
245 // if all conditions are met then check if resulting
246 // root word in the dictionary
248 if (test_condition(tmpword)) {
249 tmpl += stripl;
250 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
251 do {
252 if (TESTAFF(he->astr, aflag, he->alen) &&
253 // forbid single prefixes with needaffix flag
254 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
255 // needflag
256 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
257 (contclass && TESTAFF(contclass, needflag, contclasslen))))
258 return he;
259 he = he->next_homonym; // check homonyms
260 } while (he);
263 // prefix matched but no root word was found
264 // if aeXPRODUCT is allowed, try again but now
265 // ross checked combined with a suffix
267 //if ((opts & aeXPRODUCT) && in_compound) {
268 if ((opts & aeXPRODUCT)) {
269 he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL,
270 0, NULL, FLAG_NULL, needflag, in_compound);
271 if (he) return he;
275 return NULL;
278 // check if this prefix entry matches
279 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
280 char in_compound, const FLAG needflag)
282 int tmpl; // length of tmpword
283 struct hentry * he; // hash entry of root word or NULL
284 char tmpword[MAXWORDUTF8LEN + 4];
286 // on entry prefix is 0 length or already matches the beginning of the word.
287 // So if the remaining root word has positive length
288 // and if there are enough chars in root word and added back strip chars
289 // to meet the number of characters conditions, then test it
291 tmpl = len - appndl;
293 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
294 (tmpl + stripl >= numconds)) {
296 // generate new root word by removing prefix and adding
297 // back any characters that would have been stripped
299 if (stripl) strcpy (tmpword, strip);
300 strcpy ((tmpword + stripl), (word + appndl));
302 // now make sure all of the conditions on characters
303 // are met. Please see the appendix at the end of
304 // this file for more info on exactly what is being
305 // tested
307 // if all conditions are met then check if resulting
308 // root word in the dictionary
310 if (test_condition(tmpword)) {
311 tmpl += stripl;
313 // prefix matched but no root word was found
314 // if aeXPRODUCT is allowed, try again but now
315 // cross checked combined with a suffix
317 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
318 he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, needflag);
319 if (he) return he;
323 return NULL;
326 // check if this prefix entry matches
327 char * PfxEntry::check_twosfx_morph(const char * word, int len,
328 char in_compound, const FLAG needflag)
330 int tmpl; // length of tmpword
331 char tmpword[MAXWORDUTF8LEN + 4];
333 // on entry prefix is 0 length or already matches the beginning of the word.
334 // So if the remaining root word has positive length
335 // and if there are enough chars in root word and added back strip chars
336 // to meet the number of characters conditions, then test it
338 tmpl = len - appndl;
340 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
341 (tmpl + stripl >= numconds)) {
343 // generate new root word by removing prefix and adding
344 // back any characters that would have been stripped
346 if (stripl) strcpy (tmpword, strip);
347 strcpy ((tmpword + stripl), (word + appndl));
349 // now make sure all of the conditions on characters
350 // are met. Please see the appendix at the end of
351 // this file for more info on exactly what is being
352 // tested
354 // if all conditions are met then check if resulting
355 // root word in the dictionary
357 if (test_condition(tmpword)) {
358 tmpl += stripl;
360 // prefix matched but no root word was found
361 // if aeXPRODUCT is allowed, try again but now
362 // ross checked combined with a suffix
364 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
365 return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
366 aeXPRODUCT, (AffEntry *)this, needflag);
370 return NULL;
373 // check if this prefix entry matches
374 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
376 int tmpl; // length of tmpword
377 struct hentry * he; // hash entry of root word or NULL
378 char tmpword[MAXWORDUTF8LEN + 4];
379 char result[MAXLNLEN];
380 char * st;
382 *result = '\0';
384 // on entry prefix is 0 length or already matches the beginning of the word.
385 // So if the remaining root word has positive length
386 // and if there are enough chars in root word and added back strip chars
387 // to meet the number of characters conditions, then test it
389 tmpl = len - appndl;
391 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
392 (tmpl + stripl >= numconds)) {
394 // generate new root word by removing prefix and adding
395 // back any characters that would have been stripped
397 if (stripl) strcpy (tmpword, strip);
398 strcpy ((tmpword + stripl), (word + appndl));
400 // now make sure all of the conditions on characters
401 // are met. Please see the appendix at the end of
402 // this file for more info on exactly what is being
403 // tested
405 // if all conditions are met then check if resulting
406 // root word in the dictionary
408 if (test_condition(tmpword)) {
409 tmpl += stripl;
410 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
411 do {
412 if (TESTAFF(he->astr, aflag, he->alen) &&
413 // forbid single prefixes with needaffix flag
414 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
415 // needflag
416 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
417 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
418 if (morphcode) {
419 mystrcat(result, " ", MAXLNLEN);
420 mystrcat(result, morphcode, MAXLNLEN);
421 } else mystrcat(result,getKey(), MAXLNLEN);
422 if (!HENTRY_FIND(he, MORPH_STEM)) {
423 mystrcat(result, " ", MAXLNLEN);
424 mystrcat(result, MORPH_STEM, MAXLNLEN);
425 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
427 // store the pointer of the hash entry
428 if (HENTRY_DATA(he)) {
429 mystrcat(result, " ", MAXLNLEN);
430 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
431 } else {
432 // return with debug information
433 char * flag = pmyMgr->encode_flag(getFlag());
434 mystrcat(result, " ", MAXLNLEN);
435 mystrcat(result, MORPH_FLAG, MAXLNLEN);
436 mystrcat(result, flag, MAXLNLEN);
437 free(flag);
439 mystrcat(result, "\n", MAXLNLEN);
441 he = he->next_homonym;
442 } while (he);
445 // prefix matched but no root word was found
446 // if aeXPRODUCT is allowed, try again but now
447 // ross checked combined with a suffix
449 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
450 st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this,
451 FLAG_NULL, needflag);
452 if (st) {
453 mystrcat(result, st, MAXLNLEN);
454 free(st);
460 if (*result) return mystrdup(result);
461 return NULL;
464 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
466 // register affix manager
467 pmyMgr = pmgr;
469 // set up its intial values
470 aflag = dp->aflag; // char flag
471 strip = dp->strip; // string to strip
472 appnd = dp->appnd; // string to append
473 stripl = dp->stripl; // length of strip string
474 appndl = dp->appndl; // length of append string
475 numconds = dp->numconds; // length of the condition
476 opts = dp->opts; // cross product flag
478 // then copy over all of the conditions
479 if (opts & aeLONGCOND) {
480 memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
481 c.l.conds2 = dp->c.l.conds2;
482 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
484 rappnd = myrevstrdup(appnd);
485 morphcode = dp->morphcode;
486 contclass = dp->contclass;
487 contclasslen = dp->contclasslen;
491 SfxEntry::~SfxEntry()
493 aflag = 0;
494 if (appnd) free(appnd);
495 if (rappnd) free(rappnd);
496 if (strip) free(strip);
497 pmyMgr = NULL;
498 appnd = NULL;
499 strip = NULL;
500 if (opts & aeLONGCOND) free(c.l.conds2);
501 if (morphcode && !(opts & aeALIASM)) free(morphcode);
502 if (contclass && !(opts & aeALIASF)) free(contclass);
505 // add suffix to this word assuming conditions hold
506 char * SfxEntry::add(const char * word, int len)
508 char tword[MAXWORDUTF8LEN + 4];
510 /* make sure all conditions match */
511 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
512 (len >= numconds) && test_condition(word + len, word) &&
513 (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
514 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
515 /* we have a match so add suffix */
516 strcpy(tword,word);
517 if (appndl) {
518 strcpy(tword + len - stripl, appnd);
519 } else {
520 *(tword + len - stripl) = '\0';
522 return mystrdup(tword);
524 return NULL;
527 inline char * SfxEntry::nextchar(char * p) {
528 if (p) {
529 p++;
530 if (opts & aeLONGCOND) {
531 // jump to the 2nd part of the condition
532 if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
533 // end of the MAXCONDLEN length condition
534 } else if (p == c.conds + MAXCONDLEN) return NULL;
535 return *p ? p : NULL;
537 return NULL;
// Test whether the end of a word satisfies this suffix's condition pattern.
// `st` points just past the last byte to examine and `beg` is the first byte
// of the word.  The word is scanned backward (st is decremented) while the
// pattern in c.conds is read forward with nextchar().
// NOTE(review): this implies the stored pattern is in reversed order —
// confirm against the code that fills c.conds.
// Returns 1 when the pattern is satisfied (or numconds is 0) and 0 on a
// mismatch or when the word is exhausted before the pattern is.
// Pattern bytes: '[' opens a group, '^' negates it, ']' closes it, '.'
// outside a group matches any character, any other byte is compared
// literally.  When aeUTF8 is set in opts, bytes of the form 10xxxxxx are
// handled as continuation bytes of a multibyte character.
540 inline int SfxEntry::test_condition(const char * st, const char * beg)
542 const char * pos = NULL; // word position where the current group started (NULL outside a group)
543 bool neg = false; // current group is negated ('^' seen)
544 bool ingroup = false; // a member of the current group matched the word
545 if (numconds == 0) return 1;
546 char * p = c.conds;
// step back onto the last byte of the word
547 st--;
// i is the 1-based index of the condition currently being tested
548 int i = 1;
549 while (1) {
550 switch (*p) {
551 case '\0': return 1;
552 case '[': { p = nextchar(p); pos = st; break; }
553 case '^': { p = nextchar(p); neg = true; break; }
// a plain group with no matching member fails here; a negated group with a
// matching member has already returned 0 in the default case
554 case ']': { if (!neg && !ingroup) return 0;
555 i++;
556 // skip the next character
557 if (!ingroup) {
558 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
559 st--;
// reset the group state for the next condition
561 pos = NULL;
562 neg = false;
563 ingroup = false;
564 p = nextchar(p);
565 if (st < beg && p) return 0; // word <= condition
566 break;
568 case '.': if (!pos) { // dots are not metacharacters in groups: [.]
569 p = nextchar(p);
570 // skip the next character
571 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
572 if (st < beg) { // word <= condition
573 if (p) return 0; else return 1;
575 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
576 st--;
577 if (st < beg) { // word <= condition
578 if (p) return 0; else return 1;
581 break;
583 default: {
584 if (*st == *p) {
585 p = nextchar(p);
586 if ((opts & aeUTF8) && (*st & 0x80)) {
// multibyte character: compare the remaining bytes while moving backward
587 st--;
588 while (p && (st >= beg)) {
589 if (*p != *st) {
// byte differs: fail outside a group, otherwise rewind the word to the group start
590 if (!pos) return 0;
591 st = pos;
592 break;
594 // first byte of the UTF-8 multibyte character
595 if ((*p & 0xc0) != 0x80) break;
596 p = nextchar(p);
597 st--;
599 if (pos && st != pos) {
// the multibyte character matched a group member
600 if (neg) return 0;
601 else if (i == numconds) return 1;
602 ingroup = true;
603 while (p && *p != ']' && (p = nextchar(p)));
604 st--;
606 if (p && *p != ']') p = nextchar(p);
607 } else if (pos) {
// single-byte match of a group member
608 if (neg) return 0;
609 else if (i == numconds) return 1;
610 ingroup = true;
611 while (p && *p != ']' && (p = nextchar(p)));
612 // if (p && *p != ']') p = nextchar(p);
613 st--;
615 if (!pos) {
// literal match outside a group: move on to the next condition
616 i++;
617 st--;
619 if (st < beg && p && *p != ']') return 0; // word <= condition
620 } else if (pos) { // group
// no match for this group member: try the next member
621 p = nextchar(p);
622 } else return 0;
// pattern exhausted: every condition was satisfied
625 if (!p) return 1;
629 // see if this suffix is present in the word
630 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
631 AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
632 const FLAG badflag)
634 int tmpl; // length of tmpword
635 struct hentry * he; // hash entry pointer
636 unsigned char * cp;
637 char tmpword[MAXWORDUTF8LEN + 4];
638 PfxEntry* ep = (PfxEntry *) ppfx;
640 // if this suffix is being cross checked with a prefix
641 // but it does not support cross products skip it
643 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
644 return NULL;
646 // upon entry suffix is 0 length or already matches the end of the word.
647 // So if the remaining root word has positive length
648 // and if there are enough chars in root word and added back strip chars
649 // to meet the number of characters conditions, then test it
651 tmpl = len - appndl;
652 // the second condition is not enough for UTF-8 strings
653 // it checked in test_condition()
655 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
656 (tmpl + stripl >= numconds)) {
658 // generate new root word by removing suffix and adding
659 // back any characters that would have been stripped or
660 // or null terminating the shorter string
662 strcpy (tmpword, word);
663 cp = (unsigned char *)(tmpword + tmpl);
664 if (stripl) {
665 strcpy ((char *)cp, strip);
666 tmpl += stripl;
667 cp = (unsigned char *)(tmpword + tmpl);
668 } else *cp = '\0';
670 // now make sure all of the conditions on characters
671 // are met. Please see the appendix at the end of
672 // this file for more info on exactly what is being
673 // tested
675 // if all conditions are met then check if resulting
676 // root word in the dictionary
678 if (test_condition((char *) cp, (char *) tmpword)) {
680 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
681 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
682 #endif
683 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
684 do {
685 // check conditional suffix (enabled by prefix)
686 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
687 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
688 (((optflags & aeXPRODUCT) == 0) ||
689 TESTAFF(he->astr, ep->getFlag(), he->alen) ||
690 // enabled by prefix
691 ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
692 ) &&
693 // handle cont. class
694 ((!cclass) ||
695 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
696 ) &&
697 // check only in compound homonyms (bad flags)
698 (!badflag || !TESTAFF(he->astr, badflag, he->alen)
699 ) &&
700 // handle required flag
701 ((!needflag) ||
702 (TESTAFF(he->astr, needflag, he->alen) ||
703 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
705 ) return he;
706 he = he->next_homonym; // check homonyms
707 } while (he);
709 // obsolote stemming code (used only by the
710 // experimental SuffixMgr:suggest_pos_stems)
711 // store resulting root in wlst
712 } else if (wlst && (*ns < maxSug)) {
713 int cwrd = 1;
714 for (int k=0; k < *ns; k++)
715 if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
716 if (cwrd) {
717 wlst[*ns] = mystrdup(tmpword);
718 if (wlst[*ns] == NULL) {
719 for (int j=0; j<*ns; j++) free(wlst[j]);
720 *ns = -1;
721 return NULL;
723 (*ns)++;
728 return NULL;
731 // see if two-level suffix is present in the word
732 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
733 AffEntry* ppfx, const FLAG needflag)
735 int tmpl; // length of tmpword
736 struct hentry * he; // hash entry pointer
737 unsigned char * cp;
738 char tmpword[MAXWORDUTF8LEN + 4];
739 PfxEntry* ep = (PfxEntry *) ppfx;
742 // if this suffix is being cross checked with a prefix
743 // but it does not support cross products skip it
745 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
746 return NULL;
748 // upon entry suffix is 0 length or already matches the end of the word.
749 // So if the remaining root word has positive length
750 // and if there are enough chars in root word and added back strip chars
751 // to meet the number of characters conditions, then test it
753 tmpl = len - appndl;
755 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
756 (tmpl + stripl >= numconds)) {
758 // generate new root word by removing suffix and adding
759 // back any characters that would have been stripped or
760 // or null terminating the shorter string
762 strcpy (tmpword, word);
763 cp = (unsigned char *)(tmpword + tmpl);
764 if (stripl) {
765 strcpy ((char *)cp, strip);
766 tmpl += stripl;
767 cp = (unsigned char *)(tmpword + tmpl);
768 } else *cp = '\0';
770 // now make sure all of the conditions on characters
771 // are met. Please see the appendix at the end of
772 // this file for more info on exactly what is being
773 // tested
775 // if all conditions are met then recall suffix_check
777 if (test_condition((char *) cp, (char *) tmpword)) {
778 if (ppfx) {
779 // handle conditional suffix
780 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
781 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
782 else
783 he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
784 } else {
785 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
787 if (he) return he;
790 return NULL;
793 // see if two-level suffix is present in the word
794 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
795 AffEntry* ppfx, const FLAG needflag)
797 int tmpl; // length of tmpword
798 unsigned char * cp;
799 char tmpword[MAXWORDUTF8LEN + 4];
800 PfxEntry* ep = (PfxEntry *) ppfx;
801 char * st;
803 char result[MAXLNLEN];
805 *result = '\0';
807 // if this suffix is being cross checked with a prefix
808 // but it does not support cross products skip it
810 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
811 return NULL;
813 // upon entry suffix is 0 length or already matches the end of the word.
814 // So if the remaining root word has positive length
815 // and if there are enough chars in root word and added back strip chars
816 // to meet the number of characters conditions, then test it
818 tmpl = len - appndl;
820 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
821 (tmpl + stripl >= numconds)) {
823 // generate new root word by removing suffix and adding
824 // back any characters that would have been stripped or
825 // or null terminating the shorter string
827 strcpy (tmpword, word);
828 cp = (unsigned char *)(tmpword + tmpl);
829 if (stripl) {
830 strcpy ((char *)cp, strip);
831 tmpl += stripl;
832 cp = (unsigned char *)(tmpword + tmpl);
833 } else *cp = '\0';
835 // now make sure all of the conditions on characters
836 // are met. Please see the appendix at the end of
837 // this file for more info on exactly what is being
838 // tested
840 // if all conditions are met then recall suffix_check
842 if (test_condition((char *) cp, (char *) tmpword)) {
843 if (ppfx) {
844 // handle conditional suffix
845 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
846 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
847 if (st) {
848 if (((PfxEntry *) ppfx)->getMorph()) {
849 mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN);
850 mystrcat(result, " ", MAXLNLEN);
852 mystrcat(result,st, MAXLNLEN);
853 free(st);
854 mychomp(result);
856 } else {
857 st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
858 if (st) {
859 mystrcat(result, st, MAXLNLEN);
860 free(st);
861 mychomp(result);
864 } else {
865 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
866 if (st) {
867 mystrcat(result, st, MAXLNLEN);
868 free(st);
869 mychomp(result);
872 if (*result) return mystrdup(result);
875 return NULL;
878 // get next homonym with same affix
879 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx,
880 const FLAG cclass, const FLAG needflag)
882 PfxEntry* ep = (PfxEntry *) ppfx;
883 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
885 while (he->next_homonym) {
886 he = he->next_homonym;
887 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
888 ((optflags & aeXPRODUCT) == 0 ||
889 TESTAFF(he->astr, eFlag, he->alen) ||
890 // handle conditional suffix
891 ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
892 ) &&
893 // handle cont. class
894 ((!cclass) ||
895 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
896 ) &&
897 // handle required flag
898 ((!needflag) ||
899 (TESTAFF(he->astr, needflag, he->alen) ||
900 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
902 ) return he;
904 return NULL;
908 #if 0
910 Appendix: Understanding Affix Code
913 An affix is either a prefix or a suffix attached to root words to make
914 other words.
916 Basically a Prefix or a Suffix is set of AffEntry objects
917 which store information about the prefix or suffix along
918 with supporting routines to check if a word has a particular
919 prefix or suffix or a combination.
921 The structure affentry is defined as follows:
923 struct affentry
925 unsigned short aflag; // ID used to represent the affix
926 char * strip; // string to strip before adding affix
927 char * appnd; // the affix string to add
928 unsigned char stripl; // length of the strip string
929 unsigned char appndl; // length of the affix string
930 char numconds; // the number of conditions that must be met
931 char opts; // flag: aeXPRODUCT- combine both prefix and suffix
932 char conds[SETSIZE]; // array which encodes the conditions to be met
936 Here is a suffix borrowed from the en_US.aff file. This file
937 is whitespace delimited.
939 SFX D Y 4
940 SFX D 0 e d
941 SFX D y ied [^aeiou]y
942 SFX D 0 ed [^ey]
943 SFX D 0 ed [aeiou]y
945 This information can be interpreted as follows:
947 The first line has 4 fields
949 Field
950 -----
951 1 SFX - indicates this is a suffix
952 2 D - is the name of the character flag which represents this suffix
953 3 Y - indicates it can be combined with prefixes (cross product)
954 4 4 - indicates that sequence of 4 affentry structures are needed to
955 properly store the affix information
957 The remaining lines describe the unique information for the 4 SfxEntry
958 objects that make up this affix. Each line can be interpreted
959 as follows: (note fields 1 and 2 are as a check against line 1 info)
961 Field
962 -----
963 1 SFX - indicates this is a suffix
964 2 D - is the name of the character flag for this affix
965 3 y - the string of chars to strip off before adding affix
966 (a 0 here indicates the NULL string)
967 4 ied - the string of affix characters to add
968 5 [^aeiou]y - the conditions which must be met before the affix
969 can be applied
971 Field 5 is interesting. Since this is a suffix, field 5 tells us that
972 there are 2 conditions that must be met. The first condition is that
973 the next to the last character in the word must *NOT* be any of the
974 following "a", "e", "i", "o" or "u". The second condition is that
975 the last character of the word must end in "y".
977 So how can we encode this information concisely and be able to
978 test for both conditions in a fast manner? The answer is found
979 by studying the wonderful ispell code of Geoff Kuenning, et al.
980 (now available under a normal BSD license).
982 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
983 using a character (cast to an unsigned char) of a string, we have 8 bits
984 of information we can store about that character. Specifically we
985 could use each bit to say if that character is allowed in any of the
986 last (or first for prefixes) 8 characters of the word.
988 Basically, each character at one end of the word (up to the number
989 of conditions) is used to index into the conds array and the resulting
990 value found there says whether that character is valid for a
991 specific character position in the word.
993 For prefixes, it does this by setting bit 0 if that char is valid
994 in the first position, bit 1 if valid in the second position, and so on.
996 If a bit is not set, then that char is not valid for that position in the
997 word.
999 If working with suffixes bit 0 is used for the character closest
1000 to the front, bit 1 for the next character towards the end, ...,
1001 with bit numconds-1 representing the last char at the end of the string.
1003 Note: since entries in the conds[] are 8 bits, only 8 conditions
1004 (read that only 8 character positions) can be examined at one
1005 end of a word (the beginning for prefixes and the end for suffixes).
1007 So to make this clearer, lets encode the conds array values for the
1008 first two affentries for the suffix D described earlier.
1011 For the first affentry:
1012 numconds = 1 (only examine the last character)
1014 conds['e'] = (1 << 0) (the word must end in an E)
1015 all others are all 0
1017 For the second affentry:
1018 numconds = 2 (only examine the last two characters)
1020 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
1021 where X is all characters *but* a, e, i, o, or u
1024 conds['y'] = (1 << 1) (the last char must be a y)
1025 all other bits for all other entries in the conds array are zero
1028 #endif