1 /******* BEGIN LICENSE BLOCK *******
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 * and László Németh (Hunspell). Portions created by the Initial Developers
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 * David Einstein (deinst@world.std.com)
20 * László Németh (nemethl@gyorsposta.hu)
43 * Alternatively, the contents of this file may be used under the terms of
44 * either the GNU General Public License Version 2 or later (the "GPL"), or
45 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
46 * in which case the provisions of the GPL or the LGPL are applicable instead
47 * of those above. If you wish to allow use of your version of this file only
48 * under the terms of either the GPL or the LGPL, and not to allow others to
49 * use your version of this file under the terms of the MPL, indicate your
50 * decision by deleting the provisions above and replace them with the notice
51 * and other provisions required by the GPL or the LGPL. If you do not delete
52 * the provisions above, a recipient may use your version of this file under
53 * the terms of any one of the MPL, the GPL or the LGPL.
55 ******* END LICENSE BLOCK *******/
57 #ifndef MOZILLA_CLIENT
69 #include "affentry.hxx"
72 #ifndef MOZILLA_CLIENT
79 PfxEntry::PfxEntry(AffixMgr
* pmgr
, affentry
* dp
)
81 // register affix manager
84 // set up its initial values
86 aflag
= dp
->aflag
; // flag
87 strip
= dp
->strip
; // string to strip
88 appnd
= dp
->appnd
; // string to append
89 stripl
= dp
->stripl
; // length of strip string
90 appndl
= dp
->appndl
; // length of append string
91 numconds
= dp
->numconds
; // length of the condition
92 opts
= dp
->opts
; // cross product flag
93 // then copy over all of the conditions
94 if (opts
& aeLONGCOND
) {
95 memcpy(c
.conds
, dp
->c
.l
.conds1
, MAXCONDLEN_1
);
96 c
.l
.conds2
= dp
->c
.l
.conds2
;
97 } else memcpy(c
.conds
, dp
->c
.conds
, MAXCONDLEN
);
101 morphcode
= dp
->morphcode
;
102 contclass
= dp
->contclass
;
103 contclasslen
= dp
->contclasslen
;
107 PfxEntry::~PfxEntry()
110 if (appnd
) free(appnd
);
111 if (strip
) free(strip
);
115 if (opts
& aeLONGCOND
) free(c
.l
.conds2
);
116 if (morphcode
&& !(opts
& aeALIASM
)) free(morphcode
);
117 if (contclass
&& !(opts
& aeALIASF
)) free(contclass
);
120 // add prefix to this word assuming conditions hold
121 char * PfxEntry::add(const char * word
, int len
)
123 char tword
[MAXWORDUTF8LEN
+ 4];
125 if ((len
> stripl
|| (len
== 0 && pmyMgr
->get_fullstrip())) &&
126 (len
>= numconds
) && test_condition(word
) &&
127 (!stripl
|| (strncmp(word
, strip
, stripl
) == 0)) &&
128 ((MAXWORDUTF8LEN
+ 4) > (len
+ appndl
- stripl
))) {
129 /* we have a match so add prefix */
135 strcpy(pp
, (word
+ stripl
));
136 return mystrdup(tword
);
141 inline char * PfxEntry::nextchar(char * p
) {
144 if (opts
& aeLONGCOND
) {
145 // jump to the 2nd part of the condition
146 if (p
== c
.conds
+ MAXCONDLEN_1
) return c
.l
.conds2
;
147 // end of the MAXCONDLEN length condition
148 } else if (p
== c
.conds
+ MAXCONDLEN
) return NULL
;
149 return *p
? p
: NULL
;
154 inline int PfxEntry::test_condition(const char * st
)
156 const char * pos
= NULL
; // group with pos input position
157 bool neg
= false; // complementer
158 bool ingroup
= false; // character in the group
159 if (numconds
== 0) return 1;
170 case '^': { p
= nextchar(p
); neg
= true; break; }
172 if ((neg
&& ingroup
) || (!neg
&& !ingroup
)) return 0;
175 // skip the next character
176 if (!ingroup
) for (st
++; (opts
& aeUTF8
) && (*st
& 0xc0) == 0x80; st
++);
177 if (*st
== '\0' && p
) return 0; // word <= condition
180 case '.': if (!pos
) { // dots are not metacharacters in groups: [.]
182 // skip the next character
183 for (st
++; (opts
& aeUTF8
) && (*st
& 0xc0) == 0x80; st
++);
184 if (*st
== '\0' && p
) return 0; // word <= condition
191 if ((opts
& aeUTF8
) && (*(st
- 1) & 0x80)) { // multibyte
192 while (p
&& (*p
& 0xc0) == 0x80) { // character
201 if (pos
&& st
!= pos
) {
203 while (p
&& *p
!= ']' && (p
= nextchar(p
)));
207 while (p
&& *p
!= ']' && (p
= nextchar(p
)));
209 } else if (pos
) { // group
218 // check if this prefix entry matches
219 struct hentry
* PfxEntry::checkword(const char * word
, int len
, char in_compound
, const FLAG needflag
)
221 int tmpl
; // length of tmpword
222 struct hentry
* he
; // hash entry of root word or NULL
223 char tmpword
[MAXWORDUTF8LEN
+ 4];
225 // on entry prefix is 0 length or already matches the beginning of the word.
226 // So if the remaining root word has positive length
227 // and if there are enough chars in root word and added back strip chars
228 // to meet the number of characters conditions, then test it
232 if (tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) {
234 // generate new root word by removing prefix and adding
235 // back any characters that would have been stripped
237 if (stripl
) strcpy (tmpword
, strip
);
238 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
240 // now make sure all of the conditions on characters
241 // are met. Please see the appendix at the end of
242 // this file for more info on exactly what is being
245 // if all conditions are met then check if resulting
246 // root word in the dictionary
248 if (test_condition(tmpword
)) {
250 if ((he
= pmyMgr
->lookup(tmpword
)) != NULL
) {
252 if (TESTAFF(he
->astr
, aflag
, he
->alen
) &&
253 // forbid single prefixes with needaffix flag
254 ! TESTAFF(contclass
, pmyMgr
->get_needaffix(), contclasslen
) &&
256 ((!needflag
) || TESTAFF(he
->astr
, needflag
, he
->alen
) ||
257 (contclass
&& TESTAFF(contclass
, needflag
, contclasslen
))))
259 he
= he
->next_homonym
; // check homonyms
263 // prefix matched but no root word was found
264 // if aeXPRODUCT is allowed, try again but now
265 // cross checked combined with a suffix
267 //if ((opts & aeXPRODUCT) && in_compound) {
268 if ((opts
& aeXPRODUCT
)) {
269 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, aeXPRODUCT
, (AffEntry
*)this, NULL
,
270 0, NULL
, FLAG_NULL
, needflag
, in_compound
);
278 // check if this prefix entry matches
279 struct hentry
* PfxEntry::check_twosfx(const char * word
, int len
,
280 char in_compound
, const FLAG needflag
)
282 int tmpl
; // length of tmpword
283 struct hentry
* he
; // hash entry of root word or NULL
284 char tmpword
[MAXWORDUTF8LEN
+ 4];
286 // on entry prefix is 0 length or already matches the beginning of the word.
287 // So if the remaining root word has positive length
288 // and if there are enough chars in root word and added back strip chars
289 // to meet the number of characters conditions, then test it
293 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
294 (tmpl
+ stripl
>= numconds
)) {
296 // generate new root word by removing prefix and adding
297 // back any characters that would have been stripped
299 if (stripl
) strcpy (tmpword
, strip
);
300 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
302 // now make sure all of the conditions on characters
303 // are met. Please see the appendix at the end of
304 // this file for more info on exactly what is being
307 // if all conditions are met then check if resulting
308 // root word in the dictionary
310 if (test_condition(tmpword
)) {
313 // prefix matched but no root word was found
314 // if aeXPRODUCT is allowed, try again but now
315 // cross checked combined with a suffix
317 if ((opts
& aeXPRODUCT
) && (in_compound
!= IN_CPD_BEGIN
)) {
318 he
= pmyMgr
->suffix_check_twosfx(tmpword
, tmpl
, aeXPRODUCT
, (AffEntry
*)this, needflag
);
326 // check if this prefix entry matches
327 char * PfxEntry::check_twosfx_morph(const char * word
, int len
,
328 char in_compound
, const FLAG needflag
)
330 int tmpl
; // length of tmpword
331 char tmpword
[MAXWORDUTF8LEN
+ 4];
333 // on entry prefix is 0 length or already matches the beginning of the word.
334 // So if the remaining root word has positive length
335 // and if there are enough chars in root word and added back strip chars
336 // to meet the number of characters conditions, then test it
340 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
341 (tmpl
+ stripl
>= numconds
)) {
343 // generate new root word by removing prefix and adding
344 // back any characters that would have been stripped
346 if (stripl
) strcpy (tmpword
, strip
);
347 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
349 // now make sure all of the conditions on characters
350 // are met. Please see the appendix at the end of
351 // this file for more info on exactly what is being
354 // if all conditions are met then check if resulting
355 // root word in the dictionary
357 if (test_condition(tmpword
)) {
360 // prefix matched but no root word was found
361 // if aeXPRODUCT is allowed, try again but now
362 // cross checked combined with a suffix
364 if ((opts
& aeXPRODUCT
) && (in_compound
!= IN_CPD_BEGIN
)) {
365 return pmyMgr
->suffix_check_twosfx_morph(tmpword
, tmpl
,
366 aeXPRODUCT
, (AffEntry
*)this, needflag
);
373 // check if this prefix entry matches
374 char * PfxEntry::check_morph(const char * word
, int len
, char in_compound
, const FLAG needflag
)
376 int tmpl
; // length of tmpword
377 struct hentry
* he
; // hash entry of root word or NULL
378 char tmpword
[MAXWORDUTF8LEN
+ 4];
379 char result
[MAXLNLEN
];
384 // on entry prefix is 0 length or already matches the beginning of the word.
385 // So if the remaining root word has positive length
386 // and if there are enough chars in root word and added back strip chars
387 // to meet the number of characters conditions, then test it
391 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
392 (tmpl
+ stripl
>= numconds
)) {
394 // generate new root word by removing prefix and adding
395 // back any characters that would have been stripped
397 if (stripl
) strcpy (tmpword
, strip
);
398 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
400 // now make sure all of the conditions on characters
401 // are met. Please see the appendix at the end of
402 // this file for more info on exactly what is being
405 // if all conditions are met then check if resulting
406 // root word in the dictionary
408 if (test_condition(tmpword
)) {
410 if ((he
= pmyMgr
->lookup(tmpword
)) != NULL
) {
412 if (TESTAFF(he
->astr
, aflag
, he
->alen
) &&
413 // forbid single prefixes with needaffix flag
414 ! TESTAFF(contclass
, pmyMgr
->get_needaffix(), contclasslen
) &&
416 ((!needflag
) || TESTAFF(he
->astr
, needflag
, he
->alen
) ||
417 (contclass
&& TESTAFF(contclass
, needflag
, contclasslen
)))) {
419 mystrcat(result
, " ", MAXLNLEN
);
420 mystrcat(result
, morphcode
, MAXLNLEN
);
421 } else mystrcat(result
,getKey(), MAXLNLEN
);
422 if (!HENTRY_FIND(he
, MORPH_STEM
)) {
423 mystrcat(result
, " ", MAXLNLEN
);
424 mystrcat(result
, MORPH_STEM
, MAXLNLEN
);
425 mystrcat(result
, HENTRY_WORD(he
), MAXLNLEN
);
427 // store the pointer of the hash entry
428 if (HENTRY_DATA(he
)) {
429 mystrcat(result
, " ", MAXLNLEN
);
430 mystrcat(result
, HENTRY_DATA2(he
), MAXLNLEN
);
432 // return with debug information
433 char * flag
= pmyMgr
->encode_flag(getFlag());
434 mystrcat(result
, " ", MAXLNLEN
);
435 mystrcat(result
, MORPH_FLAG
, MAXLNLEN
);
436 mystrcat(result
, flag
, MAXLNLEN
);
439 mystrcat(result
, "\n", MAXLNLEN
);
441 he
= he
->next_homonym
;
445 // prefix matched but no root word was found
446 // if aeXPRODUCT is allowed, try again but now
447 // cross checked combined with a suffix
449 if ((opts
& aeXPRODUCT
) && (in_compound
!= IN_CPD_BEGIN
)) {
450 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, aeXPRODUCT
, (AffEntry
*)this,
451 FLAG_NULL
, needflag
);
453 mystrcat(result
, st
, MAXLNLEN
);
460 if (*result
) return mystrdup(result
);
464 SfxEntry::SfxEntry(AffixMgr
* pmgr
, affentry
* dp
)
466 // register affix manager
469 // set up its initial values
470 aflag
= dp
->aflag
; // char flag
471 strip
= dp
->strip
; // string to strip
472 appnd
= dp
->appnd
; // string to append
473 stripl
= dp
->stripl
; // length of strip string
474 appndl
= dp
->appndl
; // length of append string
475 numconds
= dp
->numconds
; // length of the condition
476 opts
= dp
->opts
; // cross product flag
478 // then copy over all of the conditions
479 if (opts
& aeLONGCOND
) {
480 memcpy(c
.l
.conds1
, dp
->c
.l
.conds1
, MAXCONDLEN_1
);
481 c
.l
.conds2
= dp
->c
.l
.conds2
;
482 } else memcpy(c
.conds
, dp
->c
.conds
, MAXCONDLEN
);
484 rappnd
= myrevstrdup(appnd
);
485 morphcode
= dp
->morphcode
;
486 contclass
= dp
->contclass
;
487 contclasslen
= dp
->contclasslen
;
491 SfxEntry::~SfxEntry()
494 if (appnd
) free(appnd
);
495 if (rappnd
) free(rappnd
);
496 if (strip
) free(strip
);
500 if (opts
& aeLONGCOND
) free(c
.l
.conds2
);
501 if (morphcode
&& !(opts
& aeALIASM
)) free(morphcode
);
502 if (contclass
&& !(opts
& aeALIASF
)) free(contclass
);
505 // add suffix to this word assuming conditions hold
506 char * SfxEntry::add(const char * word
, int len
)
508 char tword
[MAXWORDUTF8LEN
+ 4];
510 /* make sure all conditions match */
511 if ((len
> stripl
|| (len
== 0 && pmyMgr
->get_fullstrip())) &&
512 (len
>= numconds
) && test_condition(word
+ len
, word
) &&
513 (!stripl
|| (strcmp(word
+ len
- stripl
, strip
) == 0)) &&
514 ((MAXWORDUTF8LEN
+ 4) > (len
+ appndl
- stripl
))) {
515 /* we have a match so add suffix */
518 strcpy(tword
+ len
- stripl
, appnd
);
520 *(tword
+ len
- stripl
) = '\0';
522 return mystrdup(tword
);
527 inline char * SfxEntry::nextchar(char * p
) {
530 if (opts
& aeLONGCOND
) {
531 // jump to the 2nd part of the condition
532 if (p
== c
.l
.conds1
+ MAXCONDLEN_1
) return c
.l
.conds2
;
533 // end of the MAXCONDLEN length condition
534 } else if (p
== c
.conds
+ MAXCONDLEN
) return NULL
;
535 return *p
? p
: NULL
;
540 inline int SfxEntry::test_condition(const char * st
, const char * beg
)
542 const char * pos
= NULL
; // group with pos input position
543 bool neg
= false; // complementer
544 bool ingroup
= false; // character in the group
545 if (numconds
== 0) return 1;
552 case '[': { p
= nextchar(p
); pos
= st
; break; }
553 case '^': { p
= nextchar(p
); neg
= true; break; }
554 case ']': { if (!neg
&& !ingroup
) return 0;
556 // skip the next character
558 for (; (opts
& aeUTF8
) && (st
>= beg
) && (*st
& 0xc0) == 0x80; st
--);
565 if (st
< beg
&& p
) return 0; // word <= condition
568 case '.': if (!pos
) { // dots are not metacharacters in groups: [.]
570 // skip the next character
571 for (st
--; (opts
& aeUTF8
) && (st
>= beg
) && (*st
& 0xc0) == 0x80; st
--);
572 if (st
< beg
) { // word <= condition
573 if (p
) return 0; else return 1;
575 if ((opts
& aeUTF8
) && (*st
& 0x80)) { // head of the UTF-8 character
577 if (st
< beg
) { // word <= condition
578 if (p
) return 0; else return 1;
586 if ((opts
& aeUTF8
) && (*st
& 0x80)) {
588 while (p
&& (st
>= beg
)) {
594 // first byte of the UTF-8 multibyte character
595 if ((*p
& 0xc0) != 0x80) break;
599 if (pos
&& st
!= pos
) {
601 else if (i
== numconds
) return 1;
603 while (p
&& *p
!= ']' && (p
= nextchar(p
)));
606 if (p
&& *p
!= ']') p
= nextchar(p
);
609 else if (i
== numconds
) return 1;
611 while (p
&& *p
!= ']' && (p
= nextchar(p
)));
612 // if (p && *p != ']') p = nextchar(p);
619 if (st
< beg
&& p
&& *p
!= ']') return 0; // word <= condition
620 } else if (pos
) { // group
629 // see if this suffix is present in the word
630 struct hentry
* SfxEntry::checkword(const char * word
, int len
, int optflags
,
631 AffEntry
* ppfx
, char ** wlst
, int maxSug
, int * ns
, const FLAG cclass
, const FLAG needflag
,
634 int tmpl
; // length of tmpword
635 struct hentry
* he
; // hash entry pointer
637 char tmpword
[MAXWORDUTF8LEN
+ 4];
638 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
640 // if this suffix is being cross checked with a prefix
641 // but it does not support cross products skip it
643 if (((optflags
& aeXPRODUCT
) != 0) && ((opts
& aeXPRODUCT
) == 0))
646 // upon entry suffix is 0 length or already matches the end of the word.
647 // So if the remaining root word has positive length
648 // and if there are enough chars in root word and added back strip chars
649 // to meet the number of characters conditions, then test it
652 // the second condition is not enough for UTF-8 strings
653 // it checked in test_condition()
655 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
656 (tmpl
+ stripl
>= numconds
)) {
658 // generate new root word by removing suffix and adding
659 // back any characters that would have been stripped or
660 // null terminating the shorter string
662 strcpy (tmpword
, word
);
663 cp
= (unsigned char *)(tmpword
+ tmpl
);
665 strcpy ((char *)cp
, strip
);
667 cp
= (unsigned char *)(tmpword
+ tmpl
);
670 // now make sure all of the conditions on characters
671 // are met. Please see the appendix at the end of
672 // this file for more info on exactly what is being
675 // if all conditions are met then check if resulting
676 // root word in the dictionary
678 if (test_condition((char *) cp
, (char *) tmpword
)) {
680 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
681 fprintf(stdout
,"%s %s %c\n", word
, tmpword
, aflag
);
683 if ((he
= pmyMgr
->lookup(tmpword
)) != NULL
) {
685 // check conditional suffix (enabled by prefix)
686 if ((TESTAFF(he
->astr
, aflag
, he
->alen
) || (ep
&& ep
->getCont() &&
687 TESTAFF(ep
->getCont(), aflag
, ep
->getContLen()))) &&
688 (((optflags
& aeXPRODUCT
) == 0) ||
689 TESTAFF(he
->astr
, ep
->getFlag(), he
->alen
) ||
691 ((contclass
) && TESTAFF(contclass
, ep
->getFlag(), contclasslen
))
693 // handle cont. class
695 ((contclass
) && TESTAFF(contclass
, cclass
, contclasslen
))
697 // check only in compound homonyms (bad flags)
698 (!badflag
|| !TESTAFF(he
->astr
, badflag
, he
->alen
)
700 // handle required flag
702 (TESTAFF(he
->astr
, needflag
, he
->alen
) ||
703 ((contclass
) && TESTAFF(contclass
, needflag
, contclasslen
)))
706 he
= he
->next_homonym
; // check homonyms
709 // obsolete stemming code (used only by the
710 // experimental SuffixMgr:suggest_pos_stems)
711 // store resulting root in wlst
712 } else if (wlst
&& (*ns
< maxSug
)) {
714 for (int k
=0; k
< *ns
; k
++)
715 if (strcmp(tmpword
, wlst
[k
]) == 0) cwrd
= 0;
717 wlst
[*ns
] = mystrdup(tmpword
);
718 if (wlst
[*ns
] == NULL
) {
719 for (int j
=0; j
<*ns
; j
++) free(wlst
[j
]);
731 // see if two-level suffix is present in the word
732 struct hentry
* SfxEntry::check_twosfx(const char * word
, int len
, int optflags
,
733 AffEntry
* ppfx
, const FLAG needflag
)
735 int tmpl
; // length of tmpword
736 struct hentry
* he
; // hash entry pointer
738 char tmpword
[MAXWORDUTF8LEN
+ 4];
739 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
742 // if this suffix is being cross checked with a prefix
743 // but it does not support cross products skip it
745 if ((optflags
& aeXPRODUCT
) != 0 && (opts
& aeXPRODUCT
) == 0)
748 // upon entry suffix is 0 length or already matches the end of the word.
749 // So if the remaining root word has positive length
750 // and if there are enough chars in root word and added back strip chars
751 // to meet the number of characters conditions, then test it
755 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
756 (tmpl
+ stripl
>= numconds
)) {
758 // generate new root word by removing suffix and adding
759 // back any characters that would have been stripped or
760 // null terminating the shorter string
762 strcpy (tmpword
, word
);
763 cp
= (unsigned char *)(tmpword
+ tmpl
);
765 strcpy ((char *)cp
, strip
);
767 cp
= (unsigned char *)(tmpword
+ tmpl
);
770 // now make sure all of the conditions on characters
771 // are met. Please see the appendix at the end of
772 // this file for more info on exactly what is being
775 // if all conditions are met then recall suffix_check
777 if (test_condition((char *) cp
, (char *) tmpword
)) {
779 // handle conditional suffix
780 if ((contclass
) && TESTAFF(contclass
, ep
->getFlag(), contclasslen
))
781 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, 0, NULL
, NULL
, 0, NULL
, (FLAG
) aflag
, needflag
);
783 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, optflags
, ppfx
, NULL
, 0, NULL
, (FLAG
) aflag
, needflag
);
785 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, 0, NULL
, NULL
, 0, NULL
, (FLAG
) aflag
, needflag
);
793 // see if two-level suffix is present in the word
794 char * SfxEntry::check_twosfx_morph(const char * word
, int len
, int optflags
,
795 AffEntry
* ppfx
, const FLAG needflag
)
797 int tmpl
; // length of tmpword
799 char tmpword
[MAXWORDUTF8LEN
+ 4];
800 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
803 char result
[MAXLNLEN
];
807 // if this suffix is being cross checked with a prefix
808 // but it does not support cross products skip it
810 if ((optflags
& aeXPRODUCT
) != 0 && (opts
& aeXPRODUCT
) == 0)
813 // upon entry suffix is 0 length or already matches the end of the word.
814 // So if the remaining root word has positive length
815 // and if there are enough chars in root word and added back strip chars
816 // to meet the number of characters conditions, then test it
820 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
821 (tmpl
+ stripl
>= numconds
)) {
823 // generate new root word by removing suffix and adding
824 // back any characters that would have been stripped or
825 // null terminating the shorter string
827 strcpy (tmpword
, word
);
828 cp
= (unsigned char *)(tmpword
+ tmpl
);
830 strcpy ((char *)cp
, strip
);
832 cp
= (unsigned char *)(tmpword
+ tmpl
);
835 // now make sure all of the conditions on characters
836 // are met. Please see the appendix at the end of
837 // this file for more info on exactly what is being
840 // if all conditions are met then recall suffix_check
842 if (test_condition((char *) cp
, (char *) tmpword
)) {
844 // handle conditional suffix
845 if ((contclass
) && TESTAFF(contclass
, ep
->getFlag(), contclasslen
)) {
846 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, 0, NULL
, aflag
, needflag
);
848 if (((PfxEntry
*) ppfx
)->getMorph()) {
849 mystrcat(result
, ((PfxEntry
*) ppfx
)->getMorph(), MAXLNLEN
);
850 mystrcat(result
, " ", MAXLNLEN
);
852 mystrcat(result
,st
, MAXLNLEN
);
857 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, optflags
, ppfx
, aflag
, needflag
);
859 mystrcat(result
, st
, MAXLNLEN
);
865 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, 0, NULL
, aflag
, needflag
);
867 mystrcat(result
, st
, MAXLNLEN
);
872 if (*result
) return mystrdup(result
);
878 // get next homonym with same affix
879 struct hentry
* SfxEntry::get_next_homonym(struct hentry
* he
, int optflags
, AffEntry
* ppfx
,
880 const FLAG cclass
, const FLAG needflag
)
882 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
883 FLAG eFlag
= ep
? ep
->getFlag() : FLAG_NULL
;
885 while (he
->next_homonym
) {
886 he
= he
->next_homonym
;
887 if ((TESTAFF(he
->astr
, aflag
, he
->alen
) || (ep
&& ep
->getCont() && TESTAFF(ep
->getCont(), aflag
, ep
->getContLen()))) &&
888 ((optflags
& aeXPRODUCT
) == 0 ||
889 TESTAFF(he
->astr
, eFlag
, he
->alen
) ||
890 // handle conditional suffix
891 ((contclass
) && TESTAFF(contclass
, eFlag
, contclasslen
))
893 // handle cont. class
895 ((contclass
) && TESTAFF(contclass
, cclass
, contclasslen
))
897 // handle required flag
899 (TESTAFF(he
->astr
, needflag
, he
->alen
) ||
900 ((contclass
) && TESTAFF(contclass
, needflag
, contclasslen
)))
910 Appendix
: Understanding Affix Code
913 An affix is either a prefix
or a suffix attached to root words to make
916 Basically a Prefix
or a Suffix is a set of AffEntry objects
917 which store information about the prefix
or suffix along
918 with supporting routines to check
if a word has a particular
919 prefix
or suffix
or a combination
.
921 The structure affentry is defined as follows
:
925 unsigned short aflag
; // ID used to represent the affix
926 char * strip
; // string to strip before adding affix
927 char * appnd
; // the affix string to add
928 unsigned char stripl
; // length of the strip string
929 unsigned char appndl
; // length of the affix string
930 char numconds
; // the number of conditions that must be met
931 char opts
; // flag: aeXPRODUCT- combine both prefix and suffix
932 char conds
[SETSIZE
]; // array which encodes the conditions to be met
936 Here is a suffix borrowed from the en_US
.aff file
. This file
937 is whitespace delimited
.
941 SFX D y ied
[^aeiou
]y
945 This information can be interpreted as follows
:
947 The first line has
4 fields
951 1 SFX
- indicates
this is a suffix
952 2 D
- is the name of the character flag which represents
this suffix
953 3 Y
- indicates it can be combined with
prefixes (cross product
)
954 4 4 - indicates that a sequence of
4 affentry structures are needed to
955 properly store the affix information
957 The remaining lines describe the unique information
for the
4 SfxEntry
958 objects that make up
this affix
. Each line can be interpreted
959 as follows
: (note fields
1 and 2 serve as a check against line
1 info
)
963 1 SFX
- indicates
this is a suffix
964 2 D
- is the name of the character flag
for this affix
965 3 y
- the string of chars to strip off before adding affix
966 (a
0 here indicates the NULL string
)
967 4 ied
- the string of affix characters to add
968 5 [^aeiou
]y
- the conditions which must be met before the affix
971 Field
5 is interesting
. Since
this is a suffix
, field
5 tells us that
972 there are
2 conditions that must be met
. The first condition is that
973 the next to the last character in the word must
*NOT
* be any of the
974 following
"a", "e", "i", "o" or "u". The second condition is that
975 the last character of the word must be
"y".
977 So how can we encode
this information concisely
and be able to
978 test
for both conditions in a fast manner
? The answer is found
979 by studying the wonderful ispell code of Geoff Kuenning
, et
.al
.
980 (now available under a normal BSD license
).
982 If we set up a conds array of
256 bytes
indexed (0 to
255) and access it
983 using a
character (cast to an
unsigned char) of a string
, we have
8 bits
984 of information we can store about that character
. Specifically we
985 could use each bit to say
if that character is allowed in any of the
986 last (or first
for prefixes
) 8 characters of the word
.
988 Basically
, each character at one end of the
word (up to the number
989 of conditions
) is used to index into the conds array
and the resulting
990 value found there says whether that character is valid
for a
991 specific character position in the word
.
993 For prefixes
, it does
this by setting bit
0 if that
char is valid
994 in the first position
, bit
1 if valid in the second position
, and so on
.
996 If a bit is
not set
, then that
char is
not valid
for that position in the
999 If working with suffixes bit
0 is used
for the character closest
1000 to the front
, bit
1 for the next character towards the end
, ...,
1001 with bit numconds
-1 representing the last
char at the end of the string
.
1003 Note
: since entries in the conds
[] are
8 bits
, only
8 conditions
1004 (read that only
8 character positions
) can be examined at one
1005 end of a
word (the beginning
for prefixes
and the end
for suffixes
).
1007 So to make
this clearer
, let's encode the conds array values
for the
1008 first two affentries
for the suffix D described earlier
.
1011 For the first affentry
:
1012 numconds
= 1 (only examine the last character
)
1014 conds
['e'] = (1 << 0) (the word must end in an E
)
1015 all others are all
0
1017 For the second affentry
:
1018 numconds
= 2 (only examine the last two characters
)
1020 conds
[X
] = conds
[X
] | (1 << 0) (aeiou are
not allowed
)
1021 where X is all characters
*but
* a
, e
, i
, o
, or u
1024 conds
['y'] = (1 << 1) (the last
char must be a y
)
1025 all other bits
for all other entries in the conds array are zero