Bug 470455 - test_database_sync_embed_visits.js leaks, r=sdwilsh
[wine-gecko.git] / extensions / spellcheck / hunspell / src / affixmgr.cpp
blob9fa04e128244c4d2a2f8a1b103b611ff8917c12f
1 /******* BEGIN LICENSE BLOCK *******
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 * and László Németh (Hunspell). Portions created by the Initial Developers
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 * David Einstein (deinst@world.std.com)
20 * László Németh (nemethl@gyorsposta.hu)
21 * Davide Prina
22 * Giuseppe Modugno
23 * Gianluca Turconi
24 * Simon Brouwer
25 * Noll Janos
26 * Biro Arpad
27 * Goldman Eleonora
28 * Sarlos Tamas
29 * Bencsath Boldizsar
30 * Halacsy Peter
31 * Dvornik Laszlo
32 * Gefferth Andras
33 * Nagy Viktor
34 * Varga Daniel
35 * Chris Halls
36 * Rene Engelhard
37 * Bram Moolenaar
38 * Dafydd Jones
39 * Harri Pitkanen
40 * Andras Timar
41 * Tor Lillqvist
43 * Alternatively, the contents of this file may be used under the terms of
44 * either the GNU General Public License Version 2 or later (the "GPL"), or
45 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
46 * in which case the provisions of the GPL or the LGPL are applicable instead
47 * of those above. If you wish to allow use of your version of this file only
48 * under the terms of either the GPL or the LGPL, and not to allow others to
49 * use your version of this file under the terms of the MPL, indicate your
50 * decision by deleting the provisions above and replace them with the notice
51 * and other provisions required by the GPL or the LGPL. If you do not delete
52 * the provisions above, a recipient may use your version of this file under
53 * the terms of any one of the MPL, the GPL or the LGPL.
55 ******* END LICENSE BLOCK *******/
57 #ifndef MOZILLA_CLIENT
58 #include <cstdlib>
59 #include <cstring>
60 #include <cctype>
61 #include <cstdio>
62 #else
63 #include <stdlib.h>
64 #include <string.h>
65 #include <stdio.h>
66 #include <ctype.h>
67 #endif
69 #include "affentry.hxx"
70 #include "affixmgr.hxx"
71 #include "csutil.hxx"
72 #include "langnum.hxx"
74 #ifndef MOZILLA_CLIENT
75 #ifndef W32
76 using namespace std;
77 #endif
78 #endif
80 AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
82 // register hash manager and load affix data from aff file
83 pHMgr = ptr[0];
84 alldic = ptr;
85 maxdic = md;
86 keystring = NULL;
87 trystring = NULL;
88 encoding=NULL;
89 utf8 = 0;
90 complexprefixes = 0;
91 maptable = NULL;
92 nummap = 0;
93 breaktable = NULL;
94 numbreak = 0;
95 reptable = NULL;
96 numrep = 0;
97 iconvtable = NULL;
98 oconvtable = NULL;
99 checkcpdtable = NULL;
100 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
101 simplifiedcpd = 0;
102 numcheckcpd = 0;
103 defcpdtable = NULL;
104 numdefcpd = 0;
105 phone = NULL;
106 compoundflag = FLAG_NULL; // permits word in compound forms
107 compoundbegin = FLAG_NULL; // may be first word in compound forms
108 compoundmiddle = FLAG_NULL; // may be middle word in compound forms
109 compoundend = FLAG_NULL; // may be last word in compound forms
110 compoundroot = FLAG_NULL; // compound word signing flag
111 compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
112 compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
113 checkcompounddup = 0; // forbid double words in compounds
114 checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
115 checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
116 checkcompoundtriple = 0; // forbid compounds with triple letters
117 simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
118 forbiddenword = FORBIDDENWORD; // forbidden word signing flag
119 nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
120 lang = NULL; // language
121 langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
122 needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
123 cpdwordmax = -1; // default: unlimited wordcount in compound words
124 cpdmin = -1; // undefined
125 cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
126 cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
127 cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
128 cpdvowels_utf16_len=0; // vowels
129 pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
130 sfxappnd=NULL; // previous suffix for counting a special syllables BUG
131 cpdsyllablenum=NULL; // syllable count incrementing flag
132 checknum=0; // checking numbers, and word with numbers
133 wordchars=NULL; // letters + spec. word characters
134 wordchars_utf16=NULL; // letters + spec. word characters
135 wordchars_utf16_len=0; // letters + spec. word characters
136 ignorechars=NULL; // letters + spec. word characters
137 ignorechars_utf16=NULL; // letters + spec. word characters
138 ignorechars_utf16_len=0; // letters + spec. word characters
139 version=NULL; // affix and dictionary file version string
140 havecontclass=0; // flags of possible continuing classes (double affix)
141 // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
142 // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
143 lemma_present = FLAG_NULL;
144 circumfix = FLAG_NULL;
145 onlyincompound = FLAG_NULL;
146 maxngramsugs = -1; // undefined
147 nosplitsugs = 0;
148 sugswithdots = 0;
149 keepcase = 0;
150 checksharps = 0;
151 substandard = FLAG_NULL;
152 fullstrip = 0;
154 sfx = NULL;
155 pfx = NULL;
157 for (int i=0; i < SETSIZE; i++) {
158 pStart[i] = NULL;
159 sStart[i] = NULL;
160 pFlag[i] = NULL;
161 sFlag[i] = NULL;
164 for (int j=0; j < CONTSIZE; j++) {
165 contclasses[j] = 0;
168 if (parse_file(affpath, key)) {
169 HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
172 if (cpdmin == -1) cpdmin = MINCPDLEN;
177 AffixMgr::~AffixMgr()
180 // pass through linked prefix entries and clean up
181 for (int i=0; i < SETSIZE ;i++) {
182 pFlag[i] = NULL;
183 PfxEntry * ptr = (PfxEntry *)pStart[i];
184 PfxEntry * nptr = NULL;
185 while (ptr) {
186 nptr = ptr->getNext();
187 delete(ptr);
188 ptr = nptr;
189 nptr = NULL;
193 // pass through linked suffix entries and clean up
194 for (int j=0; j < SETSIZE ; j++) {
195 sFlag[j] = NULL;
196 SfxEntry * ptr = (SfxEntry *)sStart[j];
197 SfxEntry * nptr = NULL;
198 while (ptr) {
199 nptr = ptr->getNext();
200 delete(ptr);
201 ptr = nptr;
202 nptr = NULL;
204 sStart[j] = NULL;
207 if (keystring) free(keystring);
208 keystring=NULL;
209 if (trystring) free(trystring);
210 trystring=NULL;
211 if (encoding) free(encoding);
212 encoding=NULL;
213 if (maptable) {
214 for (int j=0; j < nummap; j++) {
215 if (maptable[j].set) free(maptable[j].set);
216 if (maptable[j].set_utf16) free(maptable[j].set_utf16);
217 maptable[j].set = NULL;
218 maptable[j].len = 0;
220 free(maptable);
221 maptable = NULL;
223 nummap = 0;
224 if (breaktable) {
225 for (int j=0; j < numbreak; j++) {
226 if (breaktable[j]) free(breaktable[j]);
227 breaktable[j] = NULL;
229 free(breaktable);
230 breaktable = NULL;
232 numbreak = 0;
233 if (reptable) {
234 for (int j=0; j < numrep; j++) {
235 free(reptable[j].pattern);
236 free(reptable[j].pattern2);
238 free(reptable);
239 reptable = NULL;
241 if (iconvtable) delete iconvtable;
242 if (oconvtable) delete oconvtable;
243 if (phone && phone->rules) {
244 for (int j=0; j < phone->num + 1; j++) {
245 free(phone->rules[j * 2]);
246 free(phone->rules[j * 2 + 1]);
248 free(phone->rules);
249 free(phone);
250 phone = NULL;
253 if (defcpdtable) {
254 for (int j=0; j < numdefcpd; j++) {
255 free(defcpdtable[j].def);
256 defcpdtable[j].def = NULL;
258 free(defcpdtable);
259 defcpdtable = NULL;
261 numrep = 0;
262 if (checkcpdtable) {
263 for (int j=0; j < numcheckcpd; j++) {
264 free(checkcpdtable[j].pattern);
265 free(checkcpdtable[j].pattern2);
266 free(checkcpdtable[j].pattern3);
267 checkcpdtable[j].pattern = NULL;
268 checkcpdtable[j].pattern2 = NULL;
269 checkcpdtable[j].pattern3 = NULL;
271 free(checkcpdtable);
272 checkcpdtable = NULL;
274 numcheckcpd = 0;
275 FREE_FLAG(compoundflag);
276 FREE_FLAG(compoundbegin);
277 FREE_FLAG(compoundmiddle);
278 FREE_FLAG(compoundend);
279 FREE_FLAG(compoundpermitflag);
280 FREE_FLAG(compoundforbidflag);
281 FREE_FLAG(compoundroot);
282 FREE_FLAG(forbiddenword);
283 FREE_FLAG(nosuggest);
284 FREE_FLAG(needaffix);
285 FREE_FLAG(lemma_present);
286 FREE_FLAG(circumfix);
287 FREE_FLAG(onlyincompound);
289 cpdwordmax = 0;
290 pHMgr = NULL;
291 cpdmin = 0;
292 cpdmaxsyllable = 0;
293 if (cpdvowels) free(cpdvowels);
294 if (cpdvowels_utf16) free(cpdvowels_utf16);
295 if (cpdsyllablenum) free(cpdsyllablenum);
296 free_utf_tbl();
297 if (lang) free(lang);
298 if (wordchars) free(wordchars);
299 if (wordchars_utf16) free(wordchars_utf16);
300 if (ignorechars) free(ignorechars);
301 if (ignorechars_utf16) free(ignorechars_utf16);
302 if (version) free(version);
303 checknum=0;
307 // read in aff file and build up prefix and suffix entry objects
308 int AffixMgr::parse_file(const char * affpath, const char * key)
310 char * line; // io buffers
311 char ft; // affix type
313 // checking flag duplication
314 char dupflags[CONTSIZE];
315 char dupflags_ini = 1;
317 // first line indicator for removing byte order mark
318 int firstline = 1;
320 // open the affix file
321 FileMgr * afflst = new FileMgr(affpath, key);
322 if (!afflst) {
323 HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
324 return 1;
327 // step one is to parse the affix file building up the internal
328 // affix data structures
330 // read in each line ignoring any that do not
331 // start with a known line type indicator
332 while ((line = afflst->getline())) {
333 mychomp(line);
335 /* remove byte order mark */
336 if (firstline) {
337 firstline = 0;
338 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
339 memmove(line, line+3, strlen(line+3)+1);
340 HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
344 /* parse in the keyboard string */
345 if (strncmp(line,"KEY",3) == 0) {
346 if (parse_string(line, &keystring, afflst->getlinenum())) {
347 delete afflst;
348 return 1;
352 /* parse in the try string */
353 if (strncmp(line,"TRY",3) == 0) {
354 if (parse_string(line, &trystring, afflst->getlinenum())) {
355 delete afflst;
356 return 1;
360 /* parse in the name of the character set used by the .dict and .aff */
361 if (strncmp(line,"SET",3) == 0) {
362 if (parse_string(line, &encoding, afflst->getlinenum())) {
363 delete afflst;
364 return 1;
366 if (strcmp(encoding, "UTF-8") == 0) {
367 utf8 = 1;
368 #ifndef OPENOFFICEORG
369 #ifndef MOZILLA_CLIENT
370 if (initialize_utf_tbl()) return 1;
371 #endif
372 #endif
376 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
377 if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
378 complexprefixes = 1;
380 /* parse in the flag used by the controlled compound words */
381 if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
382 if (parse_flag(line, &compoundflag, afflst)) {
383 delete afflst;
384 return 1;
388 /* parse in the flag used by compound words */
389 if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
390 if (complexprefixes) {
391 if (parse_flag(line, &compoundend, afflst)) {
392 delete afflst;
393 return 1;
395 } else {
396 if (parse_flag(line, &compoundbegin, afflst)) {
397 delete afflst;
398 return 1;
403 /* parse in the flag used by compound words */
404 if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
405 if (parse_flag(line, &compoundmiddle, afflst)) {
406 delete afflst;
407 return 1;
410 /* parse in the flag used by compound words */
411 if (strncmp(line,"COMPOUNDEND",11) == 0) {
412 if (complexprefixes) {
413 if (parse_flag(line, &compoundbegin, afflst)) {
414 delete afflst;
415 return 1;
417 } else {
418 if (parse_flag(line, &compoundend, afflst)) {
419 delete afflst;
420 return 1;
425 /* parse in the data used by compound_check() method */
426 if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
427 if (parse_num(line, &cpdwordmax, afflst)) {
428 delete afflst;
429 return 1;
433 /* parse in the flag sign compounds in dictionary */
434 if (strncmp(line,"COMPOUNDROOT",12) == 0) {
435 if (parse_flag(line, &compoundroot, afflst)) {
436 delete afflst;
437 return 1;
441 /* parse in the flag used by compound_check() method */
442 if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
443 if (parse_flag(line, &compoundpermitflag, afflst)) {
444 delete afflst;
445 return 1;
449 /* parse in the flag used by compound_check() method */
450 if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
451 if (parse_flag(line, &compoundforbidflag, afflst)) {
452 delete afflst;
453 return 1;
457 if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
458 checkcompounddup = 1;
461 if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
462 checkcompoundrep = 1;
465 if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
466 checkcompoundtriple = 1;
469 if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) {
470 simplifiedtriple = 1;
473 if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
474 checkcompoundcase = 1;
477 if (strncmp(line,"NOSUGGEST",9) == 0) {
478 if (parse_flag(line, &nosuggest, afflst)) {
479 delete afflst;
480 return 1;
484 /* parse in the flag used by forbidden words */
485 if (strncmp(line,"FORBIDDENWORD",13) == 0) {
486 if (parse_flag(line, &forbiddenword, afflst)) {
487 delete afflst;
488 return 1;
492 /* parse in the flag used by forbidden words */
493 if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
494 if (parse_flag(line, &lemma_present, afflst)) {
495 delete afflst;
496 return 1;
500 /* parse in the flag used by circumfixes */
501 if (strncmp(line,"CIRCUMFIX",9) == 0) {
502 if (parse_flag(line, &circumfix, afflst)) {
503 delete afflst;
504 return 1;
508 /* parse in the flag used by fogemorphemes */
509 if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
510 if (parse_flag(line, &onlyincompound, afflst)) {
511 delete afflst;
512 return 1;
516 /* parse in the flag used by `needaffixs' */
517 if (strncmp(line,"PSEUDOROOT",10) == 0) {
518 if (parse_flag(line, &needaffix, afflst)) {
519 delete afflst;
520 return 1;
524 /* parse in the flag used by `needaffixs' */
525 if (strncmp(line,"NEEDAFFIX",9) == 0) {
526 if (parse_flag(line, &needaffix, afflst)) {
527 delete afflst;
528 return 1;
532 /* parse in the minimal length for words in compounds */
533 if (strncmp(line,"COMPOUNDMIN",11) == 0) {
534 if (parse_num(line, &cpdmin, afflst)) {
535 delete afflst;
536 return 1;
538 if (cpdmin < 1) cpdmin = 1;
541 /* parse in the max. words and syllables in compounds */
542 if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
543 if (parse_cpdsyllable(line, afflst)) {
544 delete afflst;
545 return 1;
549 /* parse in the flag used by compound_check() method */
550 if (strncmp(line,"SYLLABLENUM",11) == 0) {
551 if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) {
552 delete afflst;
553 return 1;
557 /* parse in the flag used by the controlled compound words */
558 if (strncmp(line,"CHECKNUM",8) == 0) {
559 checknum=1;
562 /* parse in the extra word characters */
563 if (strncmp(line,"WORDCHARS",9) == 0) {
564 if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) {
565 delete afflst;
566 return 1;
570 /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
571 if (strncmp(line,"IGNORE",6) == 0) {
572 if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
573 delete afflst;
574 return 1;
578 /* parse in the typical fault correcting table */
579 if (strncmp(line,"REP",3) == 0) {
580 if (parse_reptable(line, afflst)) {
581 delete afflst;
582 return 1;
586 /* parse in the input conversion table */
587 if (strncmp(line,"ICONV",5) == 0) {
588 if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {
589 delete afflst;
590 return 1;
594 /* parse in the input conversion table */
595 if (strncmp(line,"OCONV",5) == 0) {
596 if (parse_convtable(line, afflst, &oconvtable, "OCONV")) {
597 delete afflst;
598 return 1;
602 /* parse in the phonetic translation table */
603 if (strncmp(line,"PHONE",5) == 0) {
604 if (parse_phonetable(line, afflst)) {
605 delete afflst;
606 return 1;
610 /* parse in the checkcompoundpattern table */
611 if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
612 if (parse_checkcpdtable(line, afflst)) {
613 delete afflst;
614 return 1;
618 /* parse in the defcompound table */
619 if (strncmp(line,"COMPOUNDRULE",12) == 0) {
620 if (parse_defcpdtable(line, afflst)) {
621 delete afflst;
622 return 1;
626 /* parse in the related character map table */
627 if (strncmp(line,"MAP",3) == 0) {
628 if (parse_maptable(line, afflst)) {
629 delete afflst;
630 return 1;
634 /* parse in the word breakpoints table */
635 if (strncmp(line,"BREAK",5) == 0) {
636 if (parse_breaktable(line, afflst)) {
637 delete afflst;
638 return 1;
642 /* parse in the language for language specific codes */
643 if (strncmp(line,"LANG",4) == 0) {
644 if (parse_string(line, &lang, afflst->getlinenum())) {
645 delete afflst;
646 return 1;
648 langnum = get_lang_num(lang);
651 if (strncmp(line,"VERSION",7) == 0) {
652 for(line = line + 7; *line == ' ' || *line == '\t'; line++);
653 version = mystrdup(line);
656 if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
657 if (parse_num(line, &maxngramsugs, afflst)) {
658 delete afflst;
659 return 1;
663 if (strncmp(line,"NOSPLITSUGS",11) == 0) {
664 nosplitsugs=1;
667 if (strncmp(line,"FULLSTRIP",9) == 0) {
668 fullstrip=1;
671 if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
672 sugswithdots=1;
675 /* parse in the flag used by forbidden words */
676 if (strncmp(line,"KEEPCASE",8) == 0) {
677 if (parse_flag(line, &keepcase, afflst)) {
678 delete afflst;
679 return 1;
683 /* parse in the flag used by the affix generator */
684 if (strncmp(line,"SUBSTANDARD",11) == 0) {
685 if (parse_flag(line, &substandard, afflst)) {
686 delete afflst;
687 return 1;
691 if (strncmp(line,"CHECKSHARPS",11) == 0) {
692 checksharps=1;
695 /* parse this affix: P - prefix, S - suffix */
696 ft = ' ';
697 if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
698 if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
699 if (ft != ' ') {
700 if (dupflags_ini) {
701 for (int i = 0; i < CONTSIZE; i++) dupflags[i] = 0;
702 dupflags_ini = 0;
704 if (parse_affix(line, ft, afflst, dupflags)) {
705 delete afflst;
706 process_pfx_tree_to_list();
707 process_sfx_tree_to_list();
708 return 1;
713 delete afflst;
715 // convert affix trees to sorted list
716 process_pfx_tree_to_list();
717 process_sfx_tree_to_list();
719 // now we can speed up performance greatly taking advantage of the
720 // relationship between the affixes and the idea of "subsets".
722 // View each prefix as a potential leading subset of another and view
723 // each suffix (reversed) as a potential trailing subset of another.
725 // To illustrate this relationship if we know the prefix "ab" is found in the
726 // word to examine, only prefixes that "ab" is a leading subset of need be examined.
727 // Furthermore is "ab" is not present then none of the prefixes that "ab" is
728 // is a subset need be examined.
729 // The same argument goes for suffix string that are reversed.
731 // Then to top this off why not examine the first char of the word to quickly
732 // limit the set of prefixes to examine (i.e. the prefixes to examine must
733 // be leading supersets of the first character of the word (if they exist)
735 // To take advantage of this "subset" relationship, we need to add two links
736 // from entry. One to take next if the current prefix is found (call it nexteq)
737 // and one to take next if the current prefix is not found (call it nextne).
739 // Since we have built ordered lists, all that remains is to properly intialize
740 // the nextne and nexteq pointers that relate them
742 process_pfx_order();
743 process_sfx_order();
745 /* get encoding for CHECKCOMPOUNDCASE */
746 if (!utf8) {
747 char * enc = get_encoding();
748 csconv = get_current_cs(enc);
749 free(enc);
750 enc = NULL;
752 char expw[MAXLNLEN];
753 if (wordchars) {
754 strcpy(expw, wordchars);
755 free(wordchars);
756 } else *expw = '\0';
758 for (int i = 0; i <= 255; i++) {
759 if ( (csconv[i].cupper != csconv[i].clower) &&
760 (! strchr(expw, (char) i))) {
761 *(expw + strlen(expw) + 1) = '\0';
762 *(expw + strlen(expw)) = (char) i;
766 wordchars = mystrdup(expw);
769 // default BREAK definition
770 if (!breaktable) {
771 breaktable = (char **) malloc(sizeof(char *) * 3);
772 if (!breaktable) return 1;
773 breaktable[0] = mystrdup("-");
774 breaktable[1] = mystrdup("^-");
775 breaktable[2] = mystrdup("-$");
776 if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3;
778 return 0;
782 // we want to be able to quickly access prefix information
783 // both by prefix flag, and sorted by prefix string itself
784 // so we need to set up two indexes
786 int AffixMgr::build_pfxtree(AffEntry* pfxptr)
788 PfxEntry * ptr;
789 PfxEntry * pptr;
790 PfxEntry * ep = (PfxEntry*) pfxptr;
792 // get the right starting points
793 const char * key = ep->getKey();
794 const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
796 // first index by flag which must exist
797 ptr = (PfxEntry*)pFlag[flg];
798 ep->setFlgNxt(ptr);
799 pFlag[flg] = (AffEntry *) ep;
802 // handle the special case of null affix string
803 if (strlen(key) == 0) {
804 // always inset them at head of list at element 0
805 ptr = (PfxEntry*)pStart[0];
806 ep->setNext(ptr);
807 pStart[0] = (AffEntry*)ep;
808 return 0;
811 // now handle the normal case
812 ep->setNextEQ(NULL);
813 ep->setNextNE(NULL);
815 unsigned char sp = *((const unsigned char *)key);
816 ptr = (PfxEntry*)pStart[sp];
818 // handle the first insert
819 if (!ptr) {
820 pStart[sp] = (AffEntry*)ep;
821 return 0;
825 // otherwise use binary tree insertion so that a sorted
826 // list can easily be generated later
827 pptr = NULL;
828 for (;;) {
829 pptr = ptr;
830 if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
831 ptr = ptr->getNextEQ();
832 if (!ptr) {
833 pptr->setNextEQ(ep);
834 break;
836 } else {
837 ptr = ptr->getNextNE();
838 if (!ptr) {
839 pptr->setNextNE(ep);
840 break;
844 return 0;
847 // we want to be able to quickly access suffix information
848 // both by suffix flag, and sorted by the reverse of the
849 // suffix string itself; so we need to set up two indexes
850 int AffixMgr::build_sfxtree(AffEntry* sfxptr)
852 SfxEntry * ptr;
853 SfxEntry * pptr;
854 SfxEntry * ep = (SfxEntry *) sfxptr;
856 /* get the right starting point */
857 const char * key = ep->getKey();
858 const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
860 // first index by flag which must exist
861 ptr = (SfxEntry*)sFlag[flg];
862 ep->setFlgNxt(ptr);
863 sFlag[flg] = (AffEntry *) ep;
865 // next index by affix string
867 // handle the special case of null affix string
868 if (strlen(key) == 0) {
869 // always inset them at head of list at element 0
870 ptr = (SfxEntry*)sStart[0];
871 ep->setNext(ptr);
872 sStart[0] = (AffEntry*)ep;
873 return 0;
876 // now handle the normal case
877 ep->setNextEQ(NULL);
878 ep->setNextNE(NULL);
880 unsigned char sp = *((const unsigned char *)key);
881 ptr = (SfxEntry*)sStart[sp];
883 // handle the first insert
884 if (!ptr) {
885 sStart[sp] = (AffEntry*)ep;
886 return 0;
889 // otherwise use binary tree insertion so that a sorted
890 // list can easily be generated later
891 pptr = NULL;
892 for (;;) {
893 pptr = ptr;
894 if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
895 ptr = ptr->getNextEQ();
896 if (!ptr) {
897 pptr->setNextEQ(ep);
898 break;
900 } else {
901 ptr = ptr->getNextNE();
902 if (!ptr) {
903 pptr->setNextNE(ep);
904 break;
908 return 0;
911 // convert from binary tree to sorted list
912 int AffixMgr::process_pfx_tree_to_list()
914 for (int i=1; i< SETSIZE; i++) {
915 pStart[i] = process_pfx_in_order(pStart[i],NULL);
917 return 0;
921 AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
923 if (ptr) {
924 nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
925 ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
926 nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
928 return nptr;
932 // convert from binary tree to sorted list
933 int AffixMgr:: process_sfx_tree_to_list()
935 for (int i=1; i< SETSIZE; i++) {
936 sStart[i] = process_sfx_in_order(sStart[i],NULL);
938 return 0;
941 AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
943 if (ptr) {
944 nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
945 ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
946 nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
948 return nptr;
952 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
953 // using the idea of leading subsets this time
954 int AffixMgr::process_pfx_order()
956 PfxEntry* ptr;
958 // loop through each prefix list starting point
959 for (int i=1; i < SETSIZE; i++) {
961 ptr = (PfxEntry*)pStart[i];
963 // look through the remainder of the list
964 // and find next entry with affix that
965 // the current one is not a subset of
966 // mark that as destination for NextNE
967 // use next in list that you are a subset
968 // of as NextEQ
970 for (; ptr != NULL; ptr = ptr->getNext()) {
972 PfxEntry * nptr = ptr->getNext();
973 for (; nptr != NULL; nptr = nptr->getNext()) {
974 if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
976 ptr->setNextNE(nptr);
977 ptr->setNextEQ(NULL);
978 if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
979 ptr->setNextEQ(ptr->getNext());
982 // now clean up by adding smart search termination strings:
983 // if you are already a superset of the previous prefix
984 // but not a subset of the next, search can end here
985 // so set NextNE properly
987 ptr = (PfxEntry *) pStart[i];
988 for (; ptr != NULL; ptr = ptr->getNext()) {
989 PfxEntry * nptr = ptr->getNext();
990 PfxEntry * mptr = NULL;
991 for (; nptr != NULL; nptr = nptr->getNext()) {
992 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
993 mptr = nptr;
995 if (mptr) mptr->setNextNE(NULL);
998 return 0;
1001 // initialize the SfxEntry links NextEQ and NextNE to speed searching
1002 // using the idea of leading subsets this time
1003 int AffixMgr::process_sfx_order()
1005 SfxEntry* ptr;
1007 // loop through each prefix list starting point
1008 for (int i=1; i < SETSIZE; i++) {
1010 ptr = (SfxEntry *) sStart[i];
1012 // look through the remainder of the list
1013 // and find next entry with affix that
1014 // the current one is not a subset of
1015 // mark that as destination for NextNE
1016 // use next in list that you are a subset
1017 // of as NextEQ
1019 for (; ptr != NULL; ptr = ptr->getNext()) {
1020 SfxEntry * nptr = ptr->getNext();
1021 for (; nptr != NULL; nptr = nptr->getNext()) {
1022 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
1024 ptr->setNextNE(nptr);
1025 ptr->setNextEQ(NULL);
1026 if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
1027 ptr->setNextEQ(ptr->getNext());
1031 // now clean up by adding smart search termination strings:
1032 // if you are already a superset of the previous suffix
1033 // but not a subset of the next, search can end here
1034 // so set NextNE properly
1036 ptr = (SfxEntry *) sStart[i];
1037 for (; ptr != NULL; ptr = ptr->getNext()) {
1038 SfxEntry * nptr = ptr->getNext();
1039 SfxEntry * mptr = NULL;
1040 for (; nptr != NULL; nptr = nptr->getNext()) {
1041 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
1042 mptr = nptr;
1044 if (mptr) mptr->setNextNE(NULL);
1047 return 0;
1050 // add flags to the result for dictionary debugging
1051 void AffixMgr::debugflag(char * result, unsigned short flag) {
1052 char * st = encode_flag(flag);
1053 mystrcat(result, " ", MAXLNLEN);
1054 mystrcat(result, MORPH_FLAG, MAXLNLEN);
1055 if (st) {
1056 mystrcat(result, st, MAXLNLEN);
1057 free(st);
1061 // calculate the character length of the condition
1062 int AffixMgr::condlen(char * st)
1064 int l = 0;
1065 bool group = false;
1066 for(; *st; st++) {
1067 if (*st == '[') {
1068 group = true;
1069 l++;
1070 } else if (*st == ']') group = false;
1071 else if (!group && (!utf8 ||
1072 (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
1074 return l;
1077 int AffixMgr::encodeit(struct affentry * ptr, char * cs)
1079 if (strcmp(cs,".") != 0) {
1080 ptr->numconds = (char) condlen(cs);
1081 strncpy(ptr->c.conds, cs, MAXCONDLEN);
1082 // long condition (end of conds padded by strncpy)
1083 if (ptr->c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
1084 ptr->opts += aeLONGCOND;
1085 ptr->c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
1086 if (!ptr->c.l.conds2) return 1;
1088 } else {
1089 ptr->numconds = 0;
1090 ptr->c.conds[0] = '\0';
1092 return 0;
1095 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
1096 inline int AffixMgr::isSubset(const char * s1, const char * s2)
1098 while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1099 s1++;
1100 s2++;
1102 return (*s1 == '\0');
1106 // check word for prefixes
1107 struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
1108 const FLAG needflag)
1110 struct hentry * rv= NULL;
1112 pfx = NULL;
1113 pfxappnd = NULL;
1114 sfxappnd = NULL;
1116 // first handle the special case of 0 length prefixes
1117 PfxEntry * pe = (PfxEntry *) pStart[0];
1118 while (pe) {
1119 if (
1120 // fogemorpheme
1121 ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
1122 (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1123 // permit prefixes in compounds
1124 ((in_compound != IN_CPD_END) || (pe->getCont() &&
1125 (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
1127 // check prefix
1128 rv = pe->checkword(word, len, in_compound, needflag);
1129 if (rv) {
1130 pfx=(AffEntry *)pe; // BUG: pfx not stateless
1131 return rv;
1134 pe = pe->getNext();
1137 // now handle the general case
1138 unsigned char sp = *((const unsigned char *)word);
1139 PfxEntry * pptr = (PfxEntry *)pStart[sp];
1141 while (pptr) {
1142 if (isSubset(pptr->getKey(),word)) {
1143 if (
1144 // fogemorpheme
1145 ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
1146 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1147 // permit prefixes in compounds
1148 ((in_compound != IN_CPD_END) || (pptr->getCont() &&
1149 (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
1151 // check prefix
1152 rv = pptr->checkword(word, len, in_compound, needflag);
1153 if (rv) {
1154 pfx=(AffEntry *)pptr; // BUG: pfx not stateless
1155 return rv;
1158 pptr = pptr->getNextEQ();
1159 } else {
1160 pptr = pptr->getNextNE();
1164 return NULL;
1167 // check word for prefixes
1168 struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
1169 char in_compound, const FLAG needflag)
1171 struct hentry * rv= NULL;
1173 pfx = NULL;
1174 sfxappnd = NULL;
1176 // first handle the special case of 0 length prefixes
1177 PfxEntry * pe = (PfxEntry *) pStart[0];
1179 while (pe) {
1180 rv = pe->check_twosfx(word, len, in_compound, needflag);
1181 if (rv) return rv;
1182 pe = pe->getNext();
1185 // now handle the general case
1186 unsigned char sp = *((const unsigned char *)word);
1187 PfxEntry * pptr = (PfxEntry *)pStart[sp];
1189 while (pptr) {
1190 if (isSubset(pptr->getKey(),word)) {
1191 rv = pptr->check_twosfx(word, len, in_compound, needflag);
1192 if (rv) {
1193 pfx = (AffEntry *)pptr;
1194 return rv;
1196 pptr = pptr->getNextEQ();
1197 } else {
1198 pptr = pptr->getNextNE();
1202 return NULL;
1205 // check word for prefixes
1206 char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
1207 const FLAG needflag)
1209 char * st;
1211 char result[MAXLNLEN];
1212 result[0] = '\0';
1214 pfx = NULL;
1215 sfxappnd = NULL;
1217 // first handle the special case of 0 length prefixes
1218 PfxEntry * pe = (PfxEntry *) pStart[0];
1219 while (pe) {
1220 st = pe->check_morph(word,len,in_compound, needflag);
1221 if (st) {
1222 mystrcat(result, st, MAXLNLEN);
1223 free(st);
1225 // if (rv) return rv;
1226 pe = pe->getNext();
1229 // now handle the general case
1230 unsigned char sp = *((const unsigned char *)word);
1231 PfxEntry * pptr = (PfxEntry *)pStart[sp];
1233 while (pptr) {
1234 if (isSubset(pptr->getKey(),word)) {
1235 st = pptr->check_morph(word,len,in_compound, needflag);
1236 if (st) {
1237 // fogemorpheme
1238 if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
1239 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
1240 mystrcat(result, st, MAXLNLEN);
1241 pfx = (AffEntry *)pptr;
1243 free(st);
1245 pptr = pptr->getNextEQ();
1246 } else {
1247 pptr = pptr->getNextNE();
1251 if (*result) return mystrdup(result);
1252 return NULL;
1256 // check word for prefixes
1257 char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
1258 char in_compound, const FLAG needflag)
1260 char * st;
1262 char result[MAXLNLEN];
1263 result[0] = '\0';
1265 pfx = NULL;
1266 sfxappnd = NULL;
1268 // first handle the special case of 0 length prefixes
1269 PfxEntry * pe = (PfxEntry *) pStart[0];
1270 while (pe) {
1271 st = pe->check_twosfx_morph(word,len,in_compound, needflag);
1272 if (st) {
1273 mystrcat(result, st, MAXLNLEN);
1274 free(st);
1276 pe = pe->getNext();
1279 // now handle the general case
1280 unsigned char sp = *((const unsigned char *)word);
1281 PfxEntry * pptr = (PfxEntry *)pStart[sp];
1283 while (pptr) {
1284 if (isSubset(pptr->getKey(),word)) {
1285 st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1286 if (st) {
1287 mystrcat(result, st, MAXLNLEN);
1288 free(st);
1289 pfx = (AffEntry *)pptr;
1291 pptr = pptr->getNextEQ();
1292 } else {
1293 pptr = pptr->getNextNE();
1297 if (*result) return mystrdup(result);
1298 return NULL;
1301 // Is word a non compound with a REP substitution (see checkcompoundrep)?
1302 int AffixMgr::cpdrep_check(const char * word, int wl)
1304 char candidate[MAXLNLEN];
1305 const char * r;
1306 int lenr, lenp;
1308 if ((wl < 2) || !numrep) return 0;
1310 for (int i=0; i < numrep; i++ ) {
1311 r = word;
1312 lenr = strlen(reptable[i].pattern2);
1313 lenp = strlen(reptable[i].pattern);
1314 // search every occurence of the pattern in the word
1315 while ((r=strstr(r, reptable[i].pattern)) != NULL) {
1316 strcpy(candidate, word);
1317 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
1318 strcpy(candidate+(r-word),reptable[i].pattern2);
1319 strcpy(candidate+(r-word)+lenr, r+lenp);
1320 if (candidate_check(candidate,strlen(candidate))) return 1;
1321 r++; // search for the next letter
1324 return 0;
1327 // forbid compoundings when there are special patterns at word bound
1328 int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2)
1330 int len;
1331 for (int i = 0; i < numcheckcpd; i++) {
1332 if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
1333 (!r1 || !checkcpdtable[i].cond ||
1334 (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
1335 (!r2 || !checkcpdtable[i].cond2 ||
1336 (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
1337 (len = strlen(checkcpdtable[i].pattern)) && (pos > len) &&
1338 (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1;
1340 return 0;
1343 // forbid compounding with neighbouring upper and lower case characters at word bounds
1344 int AffixMgr::cpdcase_check(const char * word, int pos)
1346 if (utf8) {
1347 w_char u, w;
1348 const char * p;
1349 u8_u16(&u, 1, word + pos);
1350 for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
1351 u8_u16(&w, 1, p);
1352 unsigned short a = (u.h << 8) + u.l;
1353 unsigned short b = (w.h << 8) + w.l;
1354 if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&
1355 (a != '-') && (b != '-')) return 1;
1356 } else {
1357 unsigned char a = *(word + pos - 1);
1358 unsigned char b = *(word + pos);
1359 if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
1361 return 0;
1364 // check compound patterns
1365 int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
1367 signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
1368 signed short btwp[MAXWORDLEN]; // word positions for metacharacters
1369 int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
1370 short bt = 0;
1371 int i, j;
1372 int ok;
1373 int w = 0;
1375 if (!*words) {
1376 w = 1;
1377 *words = def;
1379 (*words)[wnum] = rv;
1381 // has the last word COMPOUNDRULE flag?
1382 if (rv->alen == 0) {
1383 (*words)[wnum] = NULL;
1384 if (w) *words = NULL;
1385 return 0;
1387 ok = 0;
1388 for (i = 0; i < numdefcpd; i++) {
1389 for (j = 0; j < defcpdtable[i].len; j++) {
1390 if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
1391 TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1;
1394 if (ok == 0) {
1395 (*words)[wnum] = NULL;
1396 if (w) *words = NULL;
1397 return 0;
1400 for (i = 0; i < numdefcpd; i++) {
1401 signed short pp = 0; // pattern position
1402 signed short wp = 0; // "words" position
1403 int ok2;
1404 ok = 1;
1405 ok2 = 1;
1406 do {
1407 while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
1408 if (((pp+1) < defcpdtable[i].len) &&
1409 ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
1410 int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
1411 ok2 = 1;
1412 pp+=2;
1413 btpp[bt] = pp;
1414 btwp[bt] = wp;
1415 while (wp <= wend) {
1416 if (!(*words)[wp]->alen ||
1417 !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
1418 ok2 = 0;
1419 break;
1421 wp++;
1423 if (wp <= wnum) ok2 = 0;
1424 btnum[bt] = wp - btwp[bt];
1425 if (btnum[bt] > 0) bt++;
1426 if (ok2) break;
1427 } else {
1428 ok2 = 1;
1429 if (!(*words)[wp] || !(*words)[wp]->alen ||
1430 !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
1431 ok = 0;
1432 break;
1434 pp++;
1435 wp++;
1436 if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
1439 if (ok && ok2) {
1440 int r = pp;
1441 while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
1442 ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
1443 if (defcpdtable[i].len <= r) return 1;
1445 // backtrack
1446 if (bt) do {
1447 ok = 1;
1448 btnum[bt - 1]--;
1449 pp = btpp[bt - 1];
1450 wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
1451 } while ((btnum[bt - 1] < 0) && --bt);
1452 } while (bt);
1454 if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
1456 // check zero ending
1457 while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
1458 ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
1459 if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
1461 (*words)[wnum] = NULL;
1462 if (w) *words = NULL;
1463 return 0;
1466 inline int AffixMgr::candidate_check(const char * word, int len)
1468 struct hentry * rv=NULL;
1470 rv = lookup(word);
1471 if (rv) return 1;
1473 // rv = prefix_check(word,len,1);
1474 // if (rv) return 1;
1476 rv = affix_check(word,len);
1477 if (rv) return 1;
1478 return 0;
1481 // calculate number of syllable for compound-checking
1482 short AffixMgr::get_syllable(const char * word, int wlen)
1484 if (cpdmaxsyllable==0) return 0;
1486 short num=0;
1488 if (!utf8) {
1489 for (int i=0; i<wlen; i++) {
1490 if (strchr(cpdvowels, word[i])) num++;
1492 } else if (cpdvowels_utf16) {
1493 w_char w[MAXWORDUTF8LEN];
1494 int i = u8_u16(w, MAXWORDUTF8LEN, word);
1495 for (; i > 0; i--) {
1496 if (flag_bsearch((unsigned short *) cpdvowels_utf16,
1497 ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
1500 return num;
1503 void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) {
1504 if (utf8) {
1505 int i;
1506 for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
1507 for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++);
1509 for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
1510 for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--);
1512 } else {
1513 *cmin = cpdmin;
1514 *cmax = len - cpdmin + 1;
1518 // check if compound word is correctly spelled
1519 // hu_mov_rule = spec. Hungarian rule (XXX)
//
// Tries every split position i in [cmin, cmax) of `word`.  The text before i
// is accepted as the first member via lookup(), prefix_check() or
// suffix_check(); the remainder is then tried as a dictionary root, as an
// affixed form (affix_check) and finally through a recursive
// compound_check() call.
//
//   word, len    - text to split and its length
//   wordnum      - running member count; compared with cpdwordmax and
//                  maxwordnum, restored from oldwordnum per split position
//   numsyllable  - running syllable count; compared with cpdmaxsyllable
//   maxwordnum   - the recursive call is made only while wordnum < maxwordnum
//   wnum         - index into `words` handed to defcpd_check
//   words        - word buffer for defcpd_check; NULL selects the local
//                  rwords buffer on the first defcpd_check call
//   hu_mov_rule  - enables the Hungarian-specific branches
//   is_sug       - when non-zero, entries carrying the nosuggest flag are rejected
//
// Returns the entry of the first member (rv_first) on success, NULL otherwise.
// Writes the members sfx, pfx and sfxflag while running.
//
// NOTE(review): the embedded line numbers skip values and the braces and
// parentheses below do not balance, so this listing appears to have lost the
// lines that held only punctuation - confirm against the upstream file
// before editing the logic.
1520 struct hentry * AffixMgr::compound_check(const char * word, int len,
1521 short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
1522 char hu_mov_rule = 0, char is_sug = 0)
1524 int i;
1525 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1526 struct hentry * rv = NULL;
1527 struct hentry * rv_first;
1528 struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
1529 char st [MAXWORDUTF8LEN + 4];
1530 char ch;
1531 int cmin;
1532 int cmax;
1533 int striple = 0;
1534 int scpd = 0;
1535 int soldi = 0;
1536 int oldcmin = 0;
1537 int oldcmax = 0;
1538 int oldlen = 0;
1539 int checkedstriple = 0;
1541 int checked_prefix;
// cmin/cmax bound the split positions tried by the loop below
1543 setcminmax(&cmin, &cmax, word, len);
// st is a private, writable copy of word; it is cut at i and patched in place
1545 strcpy(st, word);
1547 for (i = cmin; i < cmax; i++) {
1549 oldnumsyllable = numsyllable;
1550 oldwordnum = wordnum;
1551 checked_prefix = 0;
1553 // go to end of the UTF-8 character
1554 if (utf8) {
1555 for (; (st[i] & 0xc0) == 0x80; i++);
1556 if (i >= cmax) return NULL;
1559 do { // simplified checkcompoundpattern loop
// scpd > 0 selects checkcpdtable[scpd-1]: entries whose pattern3 does not
// match at word + i are skipped, then st is rebuilt with pattern followed by
// pattern2 in place of pattern3; i, len, cmin and cmax are saved in
// soldi/oldlen/oldcmin/oldcmax so they can be restored afterwards
1561 if (scpd > 0) {
1562 for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 ||
1563 strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++);
1565 if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop
1566 strcpy(st + i, checkcpdtable[scpd-1].pattern);
1567 soldi = i;
1568 i += strlen(checkcpdtable[scpd-1].pattern);
1569 strcpy(st + i, checkcpdtable[scpd-1].pattern2);
1570 strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3));
1572 oldlen = len;
1573 len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3);
1574 oldcmin = cmin;
1575 oldcmax = cmax;
1576 setcminmax(&cmin, &cmax, st, len);
1578 cmax = len - cpdmin + 1;
// cut st at i so that it holds only the first member; ch keeps the
// overwritten byte so it can be put back later
1582 ch = st[i];
1583 st[i] = '\0';
1585 sfx = NULL;
1586 pfx = NULL;
1588 // FIRST WORD
1590 rv = lookup(st); // perhaps without prefix
1592 // search homonym with compound flag
1593 while ((rv) && !hu_mov_rule &&
1594 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1595 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1596 (compoundbegin && !wordnum &&
1597 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1598 (compoundmiddle && wordnum && !words &&
1599 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1600 (numdefcpd &&
1601 ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
1602 (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) ||
1603 (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL &&
1604 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)))
1606 rv = rv->next_homonym;
// no usable root found: try the first member as a prefixed or suffixed form
1609 if (!rv) {
1610 if (compoundflag &&
1611 !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
1612 if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
1613 FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
1614 ((SfxEntry*)sfx)->getCont() &&
1615 ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
1616 ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
1617 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
1618 ((SfxEntry*)sfx)->getContLen())))) {
1619 rv = NULL;
1623 if (rv ||
1624 (((wordnum == 0) && compoundbegin &&
1625 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1626 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
1627 ((wordnum > 0) && compoundmiddle &&
1628 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1629 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
1630 ) checked_prefix = 1;
1631 // else check forbiddenwords and needaffix
1632 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1633 TESTAFF(rv->astr, needaffix, rv->alen) ||
1634 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
1635 )) {
1636 st[i] = ch;
1637 continue;
1640 // check non_compound flag in suffix and prefix
1641 if ((rv) && !hu_mov_rule &&
1642 ((pfx && ((PfxEntry*)pfx)->getCont() &&
1643 TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
1644 ((PfxEntry*)pfx)->getContLen())) ||
1645 (sfx && ((SfxEntry*)sfx)->getCont() &&
1646 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
1647 ((SfxEntry*)sfx)->getContLen())))) {
1648 rv = NULL;
1651 // check compoundend flag in suffix and prefix
1652 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1653 ((pfx && ((PfxEntry*)pfx)->getCont() &&
1654 TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
1655 ((PfxEntry*)pfx)->getContLen())) ||
1656 (sfx && ((SfxEntry*)sfx)->getCont() &&
1657 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
1658 ((SfxEntry*)sfx)->getContLen())))) {
1659 rv = NULL;
1662 // check compoundmiddle flag in suffix and prefix
1663 if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
1664 ((pfx && ((PfxEntry*)pfx)->getCont() &&
1665 TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
1666 ((PfxEntry*)pfx)->getContLen())) ||
1667 (sfx && ((SfxEntry*)sfx)->getCont() &&
1668 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
1669 ((SfxEntry*)sfx)->getContLen())))) {
1670 rv = NULL;
1673 // check forbiddenwords
1674 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1675 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1676 return NULL;
1679 // increment word number, if the second root has a compoundroot flag
1680 if ((rv) && compoundroot &&
1681 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1682 wordnum++;
1685 // first word is acceptable in compound words?
1686 if (((rv) &&
1687 ( checked_prefix || (words && words[wnum]) ||
1688 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1689 ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1690 ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
1691 // (numdefcpd && )
1693 // LANG_hu section: spec. Hungarian rule
1694 || ((langnum == LANG_hu) && hu_mov_rule && (
1695 TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
1696 TESTAFF(rv->astr, 'G', rv->alen) ||
1697 TESTAFF(rv->astr, 'H', rv->alen)
1700 // END of LANG_hu section
1701 ) &&
1703 // test CHECKCOMPOUNDPATTERN conditions
1704 scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL ||
1705 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)
1707 && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters
1708 (word[i-1]==word[i]) && (
1709 ((i>1) && (word[i-1]==word[i-2])) ||
1710 ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
1712 ) ||
1714 checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i)
1717 // LANG_hu section: spec. Hungarian rule
1718 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
1719 (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes
1720 TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
1721 TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
1725 ) { // first word is ok condition
1727 // LANG_hu section: spec. Hungarian rule
1728 if (langnum == LANG_hu) {
1729 // calculate syllable number of the word
1730 numsyllable += get_syllable(st, i);
1732 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1733 if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
1735 // END of LANG_hu section
1738 // NEXT WORD(S)
// remember the accepted first member and restore the byte that was cut at i
1739 rv_first = rv;
1740 st[i] = ch;
1742 do { // striple loop
1744 // check simplifiedtriple
1745 if (simplifiedtriple) {
1746 if (striple) {
1747 checkedstriple = 1;
1748 i--; // check "fahrt" instead of "ahrt" in "Schiffahrt"
1749 } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1;
1752 rv = lookup((st+i)); // perhaps without prefix
1754 // search homonym with compound flag
1755 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1756 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1757 (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
1758 (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) ||
1759 (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL &&
1760 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
1761 )) {
1762 rv = rv->next_homonym;
1765 if (rv && words && words[wnum + 1]) return rv_first;
// saved so the counters can be rolled back if this second member is rejected
1767 oldnumsyllable2 = numsyllable;
1768 oldwordnum2 = wordnum;
1770 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
1771 if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
1772 numsyllable--;
1774 // END of LANG_hu section
1776 // increment word number, if the second root has a compoundroot flag
1777 if ((rv) && (compoundroot) &&
1778 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1779 wordnum++;
1782 // check forbiddenwords
1783 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1784 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1786 // second word is acceptable, as a root?
1787 // hungarian conventions: compounding is acceptable,
1788 // when compound forms consist of 2 words, or if more,
1789 // then the syllable number of root words must be 6, or lesser.
1791 if ((rv) && (
1792 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1793 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
1795 && (
1796 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
1797 ((cpdmaxsyllable!=0) &&
1798 (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable))
1799 ) &&
1801 // test CHECKCOMPOUNDPATTERN
1802 !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv)
1803 ) &&
1805 (!checkcompounddup || (rv != rv_first))
1807 // test CHECKCOMPOUNDPATTERN conditions
1808 && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
1809 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
1812 // forbid compound word, if it is a non compound word with typical fault
1813 if (checkcompoundrep && cpdrep_check(word,len)) return NULL;
1814 return rv_first;
1817 numsyllable = oldnumsyllable2;
1818 wordnum = oldwordnum2;
1820 // perhaps second word has prefix or/and suffix
1821 sfx = NULL;
1822 sfxflag = FLAG_NULL;
1823 rv = (compoundflag) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;
1824 if (!rv && compoundend) {
1825 sfx = NULL;
1826 pfx = NULL;
1827 rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);
1830 if (!rv && numdefcpd && words) {
1831 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
1832 if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first;
1833 rv = NULL;
1836 // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1837 if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
1838 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL;
1840 // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1841 if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv)) rv = NULL;
1843 // check non_compound flag in suffix and prefix
1844 if ((rv) &&
1845 ((pfx && ((PfxEntry*)pfx)->getCont() &&
1846 TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
1847 ((PfxEntry*)pfx)->getContLen())) ||
1848 (sfx && ((SfxEntry*)sfx)->getCont() &&
1849 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
1850 ((SfxEntry*)sfx)->getContLen())))) {
1851 rv = NULL;
1854 // check forbiddenwords
1855 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1856 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1858 // pfxappnd = prefix of word+i, or NULL
1859 // calculate syllable number of prefix.
1860 // hungarian convention: when syllable number of prefix is more,
1861 // than 1, the prefix+word counts as two words.
1863 if (langnum == LANG_hu) {
1864 // calculate syllable number of the word
1865 numsyllable += get_syllable(word + i, strlen(word + i));
1867 // - affix syllable num.
1868 // XXX only second suffix (inflections, not derivations)
1869 if (sfxappnd) {
1870 char * tmp = myrevstrdup(sfxappnd);
1871 numsyllable -= get_syllable(tmp, strlen(tmp));
1872 free(tmp);
1875 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1876 if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
1878 // increment syllable num, if last word has a SYLLABLENUM flag
1879 // and the suffix is beginning `s'
1881 if (cpdsyllablenum) {
1882 switch (sfxflag) {
1883 case 'c': { numsyllable+=2; break; }
1884 case 'J': { numsyllable += 1; break; }
1885 case 'I': { if (TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
1890 // increment word number, if the second word has a compoundroot flag
1891 if ((rv) && (compoundroot) &&
1892 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1893 wordnum++;
1896 // second word is acceptable, as a word with prefix or/and suffix?
1897 // hungarian conventions: compounding is acceptable,
1898 // when compound forms consist 2 word, otherwise
1899 // the syllable number of root words is 6, or lesser.
1900 if ((rv) &&
1902 ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
1903 ((cpdmaxsyllable != 0) &&
1904 (numsyllable <= cpdmaxsyllable))
1906 && (
1907 (!checkcompounddup || (rv != rv_first))
1908 )) {
1909 // forbid compound word, if it is a non compound word with typical fault
1910 if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
1911 return rv_first;
1914 numsyllable = oldnumsyllable2;
1915 wordnum = oldwordnum2;
1917 // perhaps second word is a compound word (recursive call)
1918 if (wordnum < maxwordnum) {
1919 rv = compound_check((st+i),strlen(st+i), wordnum+1,
1920 numsyllable, maxwordnum, wnum + 1, words, 0, is_sug);
1921 if (rv && numcheckcpd && (scpd == 0 && cpdpat_check(word, i, rv_first, rv) ||
1922 scpd != 0 && !cpdpat_check(word, i, rv_first, rv))) rv = NULL;
1923 } else {
1924 rv=NULL;
1926 if (rv) {
1927 // forbid compound word, if it is a non compound word with typical fault
1928 if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
1929 return rv_first;
1931 } while (striple && !checkedstriple); // end of striple loop
// undo the i-- performed for the simplified-triple retry
1933 if (checkedstriple) {
1934 i++;
1935 checkedstriple = 0;
1936 striple = 0;
1939 } // first word is ok condition
// restore i, len, cmin and cmax that were changed for the pattern substitution
1941 if (soldi != 0) {
1942 i = soldi;
1943 soldi = 0;
1944 len = oldlen;
1945 cmin = oldcmin;
1946 cmax = oldcmax;
1948 scpd++;
1950 } while (simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop
1952 if (soldi != 0) {
1953 i = soldi;
1954 strcpy(st, word); // XXX add more optim.
1955 soldi = 0;
1956 } else st[i] = ch;
// reset the per-position state before moving on to the next split position
1958 scpd = 0;
1959 wordnum = oldwordnum;
1960 numsyllable = oldnumsyllable;
1963 return NULL;
1966 // check if compound word is correctly spelled
1967 // hu_mov_rule = spec. Hungarian rule (XXX)
1968 int AffixMgr::compound_check_morph(const char * word, int len,
1969 short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
1970 char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)
1972 int i;
1973 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1974 int ok = 0;
1976 struct hentry * rv = NULL;
1977 struct hentry * rv_first;
1978 struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
1979 char st [MAXWORDUTF8LEN + 4];
1980 char ch;
1982 int checked_prefix;
1983 char presult[MAXLNLEN];
1985 int cmin;
1986 int cmax;
1988 setcminmax(&cmin, &cmax, word, len);
1990 strcpy(st, word);
1992 for (i = cmin; i < cmax; i++) {
1993 oldnumsyllable = numsyllable;
1994 oldwordnum = wordnum;
1995 checked_prefix = 0;
1997 // go to end of the UTF-8 character
1998 if (utf8) {
1999 for (; (st[i] & 0xc0) == 0x80; i++);
2000 if (i >= cmax) return 0;
2003 ch = st[i];
2004 st[i] = '\0';
2005 sfx = NULL;
2007 // FIRST WORD
2008 *presult = '\0';
2009 if (partresult) strcat(presult, partresult);
2011 rv = lookup(st); // perhaps without prefix
2013 // search homonym with compound flag
2014 while ((rv) && !hu_mov_rule &&
2015 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2016 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2017 (compoundbegin && !wordnum &&
2018 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2019 (compoundmiddle && wordnum && !words &&
2020 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
2021 (numdefcpd &&
2022 ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
2023 (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
2024 ))) {
2025 rv = rv->next_homonym;
2028 if (rv) {
2029 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st);
2030 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2031 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st);
2033 // store the pointer of the hash entry
2034 // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv);
2035 if (HENTRY_DATA(rv)) {
2036 sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv));
2039 if (!rv) {
2040 if (compoundflag &&
2041 !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
2042 if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
2043 FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
2044 ((SfxEntry*)sfx)->getCont() &&
2045 ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
2046 ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
2047 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
2048 ((SfxEntry*)sfx)->getContLen())))) {
2049 rv = NULL;
2053 if (rv ||
2054 (((wordnum == 0) && compoundbegin &&
2055 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2056 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
2057 ((wordnum > 0) && compoundmiddle &&
2058 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2059 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
2061 // char * p = prefix_check_morph(st, i, 0, compound);
2062 char * p = NULL;
2063 if (compoundflag) p = affix_check_morph(st, i, compoundflag);
2064 if (!p || (*p == '\0')) {
2065 if (p) free(p);
2066 p = NULL;
2067 if ((wordnum == 0) && compoundbegin) {
2068 p = affix_check_morph(st, i, compoundbegin);
2069 } else if ((wordnum > 0) && compoundmiddle) {
2070 p = affix_check_morph(st, i, compoundmiddle);
2073 if (p && (*p != '\0')) {
2074 sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD,
2075 MORPH_PART, st, line_uniq_app(&p, MSEP_REC));
2077 if (p) free(p);
2078 checked_prefix = 1;
2080 // else check forbiddenwords
2081 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2082 TESTAFF(rv->astr, needaffix, rv->alen))) {
2083 st[i] = ch;
2084 continue;
2087 // check non_compound flag in suffix and prefix
2088 if ((rv) && !hu_mov_rule &&
2089 ((pfx && ((PfxEntry*)pfx)->getCont() &&
2090 TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
2091 ((PfxEntry*)pfx)->getContLen())) ||
2092 (sfx && ((SfxEntry*)sfx)->getCont() &&
2093 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
2094 ((SfxEntry*)sfx)->getContLen())))) {
2095 continue;
2098 // check compoundend flag in suffix and prefix
2099 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2100 ((pfx && ((PfxEntry*)pfx)->getCont() &&
2101 TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
2102 ((PfxEntry*)pfx)->getContLen())) ||
2103 (sfx && ((SfxEntry*)sfx)->getCont() &&
2104 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
2105 ((SfxEntry*)sfx)->getContLen())))) {
2106 continue;
2109 // check compoundmiddle flag in suffix and prefix
2110 if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
2111 ((pfx && ((PfxEntry*)pfx)->getCont() &&
2112 TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
2113 ((PfxEntry*)pfx)->getContLen())) ||
2114 (sfx && ((SfxEntry*)sfx)->getCont() &&
2115 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
2116 ((SfxEntry*)sfx)->getContLen())))) {
2117 rv = NULL;
2120 // check forbiddenwords
2121 if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) continue;
2123 // increment word number, if the second root has a compoundroot flag
2124 if ((rv) && (compoundroot) &&
2125 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2126 wordnum++;
2129 // first word is acceptable in compound words?
2130 if (((rv) &&
2131 ( checked_prefix || (words && words[wnum]) ||
2132 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2133 ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2134 ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))
2135 // LANG_hu section: spec. Hungarian rule
2136 || ((langnum == LANG_hu) && // hu_mov_rule
2137 hu_mov_rule && (
2138 TESTAFF(rv->astr, 'F', rv->alen) ||
2139 TESTAFF(rv->astr, 'G', rv->alen) ||
2140 TESTAFF(rv->astr, 'H', rv->alen)
2143 // END of LANG_hu section
2145 && ! (( checkcompoundtriple && !words && // test triple letters
2146 (word[i-1]==word[i]) && (
2147 ((i>1) && (word[i-1]==word[i-2])) ||
2148 ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
2150 ) ||
2152 // test CHECKCOMPOUNDPATTERN
2153 numcheckcpd && !words && cpdpat_check(word, i, rv, NULL)
2154 ) ||
2156 checkcompoundcase && !words && cpdcase_check(word, i)
2159 // LANG_hu section: spec. Hungarian rule
2160 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
2161 (sfx && ((SfxEntry*)sfx)->getCont() && (
2162 TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
2163 TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
2167 // END of LANG_hu section
2170 // LANG_hu section: spec. Hungarian rule
2171 if (langnum == LANG_hu) {
2172 // calculate syllable number of the word
2173 numsyllable += get_syllable(st, i);
2175 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2176 if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
2178 // END of LANG_hu section
2180 // NEXT WORD(S)
2181 rv_first = rv;
2182 rv = lookup((word+i)); // perhaps without prefix
2184 // search homonym with compound flag
2185 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2186 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2187 (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
2188 (numdefcpd && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
2189 rv = rv->next_homonym;
2192 if (rv && words && words[wnum + 1]) {
2193 strcat(*result, presult);
2194 strcat(*result, " ");
2195 strcat(*result, MORPH_PART);
2196 strcat(*result, word+i);
2197 if (complexprefixes && HENTRY_DATA(rv)) strcat(*result, HENTRY_DATA2(rv));
2198 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2199 strcat(*result, " ");
2200 strcat(*result, MORPH_STEM);
2201 strcat(*result, HENTRY_WORD(rv));
2203 // store the pointer of the hash entry
2204 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2205 if (!complexprefixes && HENTRY_DATA(rv)) {
2206 strcat(*result, " ");
2207 strcat(*result, HENTRY_DATA2(rv));
2209 strcat(*result, "\n");
2210 ok = 1;
2211 return 0;
2214 oldnumsyllable2 = numsyllable;
2215 oldwordnum2 = wordnum;
2217 // LANG_hu section: spec. Hungarian rule
2218 if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
2219 numsyllable--;
2221 // END of LANG_hu section
2222 // increment word number, if the second root has a compoundroot flag
2223 if ((rv) && (compoundroot) &&
2224 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2225 wordnum++;
2228 // check forbiddenwords
2229 if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) {
2230 st[i] = ch;
2231 continue;
2234 // second word is acceptable, as a root?
2235 // hungarian conventions: compounding is acceptable,
2236 // when compound forms consist of 2 words, or if more,
2237 // then the syllable number of root words must be 6, or lesser.
2238 if ((rv) && (
2239 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2240 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
2242 && (
2243 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2244 ((cpdmaxsyllable!=0) &&
2245 (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable))
2247 && (
2248 (!checkcompounddup || (rv != rv_first))
2252 // bad compound word
2253 strcat(*result, presult);
2254 strcat(*result, " ");
2255 strcat(*result, MORPH_PART);
2256 strcat(*result, word+i);
2258 if (HENTRY_DATA(rv)) {
2259 if (complexprefixes) strcat(*result, HENTRY_DATA2(rv));
2260 if (! HENTRY_FIND(rv, MORPH_STEM)) {
2261 strcat(*result, " ");
2262 strcat(*result, MORPH_STEM);
2263 strcat(*result, HENTRY_WORD(rv));
2265 // store the pointer of the hash entry
2266 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2267 if (!complexprefixes) {
2268 strcat(*result, " ");
2269 strcat(*result, HENTRY_DATA2(rv));
2272 strcat(*result, "\n");
2273 ok = 1;
2276 numsyllable = oldnumsyllable2 ;
2277 wordnum = oldwordnum2;
2279 // perhaps second word has prefix or/and suffix
2280 sfx = NULL;
2281 sfxflag = FLAG_NULL;
2283 if (compoundflag) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;
2285 if (!rv && compoundend) {
2286 sfx = NULL;
2287 pfx = NULL;
2288 rv = affix_check((word+i),strlen(word+i), compoundend);
2291 if (!rv && numdefcpd && words) {
2292 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
2293 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
2294 char * m = NULL;
2295 if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2296 if ((!m || *m == '\0') && compoundend) {
2297 if (m) free(m);
2298 m = affix_check_morph((word+i),strlen(word+i), compoundend);
2300 strcat(*result, presult);
2301 if (m || (*m != '\0')) {
2302 sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
2303 MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
2305 if (m) free(m);
2306 strcat(*result, "\n");
2307 ok = 1;
2311 // check non_compound flag in suffix and prefix
2312 if ((rv) &&
2313 ((pfx && ((PfxEntry*)pfx)->getCont() &&
2314 TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
2315 ((PfxEntry*)pfx)->getContLen())) ||
2316 (sfx && ((SfxEntry*)sfx)->getCont() &&
2317 TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
2318 ((SfxEntry*)sfx)->getContLen())))) {
2319 rv = NULL;
2322 // check forbiddenwords
2323 if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen))
2324 && (! TESTAFF(rv->astr, needaffix, rv->alen))) {
2325 st[i] = ch;
2326 continue;
2329 if (langnum == LANG_hu) {
2330 // calculate syllable number of the word
2331 numsyllable += get_syllable(word + i, strlen(word + i));
2333 // - affix syllable num.
2334 // XXX only second suffix (inflections, not derivations)
2335 if (sfxappnd) {
2336 char * tmp = myrevstrdup(sfxappnd);
2337 numsyllable -= get_syllable(tmp, strlen(tmp));
2338 free(tmp);
2341 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2342 if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
2344 // increment syllable num, if last word has a SYLLABLENUM flag
2345 // and the suffix is beginning `s'
2347 if (cpdsyllablenum) {
2348 switch (sfxflag) {
2349 case 'c': { numsyllable+=2; break; }
2350 case 'J': { numsyllable += 1; break; }
2351 case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
2356 // increment word number, if the second word has a compoundroot flag
2357 if ((rv) && (compoundroot) &&
2358 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2359 wordnum++;
2361 // second word is acceptable, as a word with prefix or/and suffix?
2362 // hungarian conventions: compounding is acceptable,
2363 // when compound forms consist 2 word, otherwise
2364 // the syllable number of root words is 6, or lesser.
2365 if ((rv) &&
2367 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2368 ((cpdmaxsyllable!=0) &&
2369 (numsyllable <= cpdmaxsyllable))
2371 && (
2372 (!checkcompounddup || (rv != rv_first))
2373 )) {
2374 char * m = NULL;
2375 if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2376 if ((!m || *m == '\0') && compoundend) {
2377 if (m) free(m);
2378 m = affix_check_morph((word+i),strlen(word+i), compoundend);
2380 strcat(*result, presult);
2381 if (m && (*m != '\0')) {
2382 sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
2383 MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
2385 if (m) free(m);
2386 sprintf(*result + strlen(*result), "%c", MSEP_REC);
2387 ok = 1;
2390 numsyllable = oldnumsyllable2;
2391 wordnum = oldwordnum2;
2393 // perhaps second word is a compound word (recursive call)
2394 if ((wordnum < maxwordnum) && (ok == 0)) {
2395 compound_check_morph((word+i),strlen(word+i), wordnum+1,
2396 numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);
2397 } else {
2398 rv=NULL;
2401 st[i] = ch;
2402 wordnum = oldwordnum;
2403 numsyllable = oldnumsyllable;
2405 return 0;
2408 // return 1 if s1 (reversed) is a leading subset of end of s2
2409 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2411 while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
2412 s1++;
2413 end_of_s2--;
2414 len--;
2416 return (*s1 == '\0');
2420 inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2422 while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
2423 s1++;
2424 end_of_s2--;
2425 len--;
2427 return (*s1 == '\0');
2430 // check word for suffixes
2432 struct hentry * AffixMgr::suffix_check (const char * word, int len,
2433 int sfxopts, AffEntry * ppfx, char ** wlst, int maxSug, int * ns,
2434 const FLAG cclass, const FLAG needflag, char in_compound)
2436 struct hentry * rv = NULL;
2437 PfxEntry* ep = (PfxEntry *) ppfx;
2439 // first handle the special case of 0 length suffixes
2440 SfxEntry * se = (SfxEntry *) sStart[0];
2442 while (se) {
2443 if (!cclass || se->getCont()) {
2444 // suffixes are not allowed in beginning of compounds
2445 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2446 // except when signed with compoundpermitflag flag
2447 (se->getCont() && compoundpermitflag &&
2448 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
2449 // no circumfix flag in prefix and suffix
2450 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2451 circumfix, ep->getContLen())) &&
2452 (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
2453 // circumfix flag in prefix AND suffix
2454 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2455 circumfix, ep->getContLen())) &&
2456 (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
2457 // fogemorpheme
2458 (in_compound ||
2459 !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2460 // needaffix on prefix or first suffix
2461 (cclass ||
2462 !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2463 (ppfx && !((ep->getCont()) &&
2464 TESTAFF(ep->getCont(), needaffix,
2465 ep->getContLen())))
2468 rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass,
2469 needflag, (in_compound ? 0 : onlyincompound));
2470 if (rv) {
2471 sfx=(AffEntry *)se; // BUG: sfx not stateless
2472 return rv;
2476 se = se->getNext();
2479 // now handle the general case
2480 unsigned char sp = *((const unsigned char *)(word + len - 1));
2481 SfxEntry * sptr = (SfxEntry *) sStart[sp];
2483 while (sptr) {
2484 if (isRevSubset(sptr->getKey(), word + len - 1, len)
2486 // suffixes are not allowed in beginning of compounds
2487 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2488 // except when signed with compoundpermitflag flag
2489 (sptr->getCont() && compoundpermitflag &&
2490 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
2491 // no circumfix flag in prefix and suffix
2492 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2493 circumfix, ep->getContLen())) &&
2494 (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
2495 // circumfix flag in prefix AND suffix
2496 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2497 circumfix, ep->getContLen())) &&
2498 (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
2499 // fogemorpheme
2500 (in_compound ||
2501 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
2502 // needaffix on prefix or first suffix
2503 (cclass ||
2504 !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
2505 (ppfx && !((ep->getCont()) &&
2506 TESTAFF(ep->getCont(), needaffix,
2507 ep->getContLen())))
2510 rv = sptr->checkword(word,len, sfxopts, ppfx, wlst,
2511 maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound));
2512 if (rv) {
2513 sfx=(AffEntry *)sptr; // BUG: sfx not stateless
2514 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2515 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2516 return rv;
2519 sptr = sptr->getNextEQ();
2520 } else {
2521 sptr = sptr->getNextNE();
2525 return NULL;
2528 // check word for two-level suffixes
2530 struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,
2531 int sfxopts, AffEntry * ppfx, const FLAG needflag)
2533 struct hentry * rv = NULL;
2535 // first handle the special case of 0 length suffixes
2536 SfxEntry * se = (SfxEntry *) sStart[0];
2537 while (se) {
2538 if (contclasses[se->getFlag()])
2540 rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);
2541 if (rv) return rv;
2543 se = se->getNext();
2546 // now handle the general case
2547 unsigned char sp = *((const unsigned char *)(word + len - 1));
2548 SfxEntry * sptr = (SfxEntry *) sStart[sp];
2550 while (sptr) {
2551 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2552 if (contclasses[sptr->getFlag()])
2554 rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);
2555 if (rv) {
2556 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2557 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2558 return rv;
2561 sptr = sptr->getNextEQ();
2562 } else {
2563 sptr = sptr->getNextNE();
2567 return NULL;
2570 char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,
2571 int sfxopts, AffEntry * ppfx, const FLAG needflag)
2573 char result[MAXLNLEN];
2574 char result2[MAXLNLEN];
2575 char result3[MAXLNLEN];
2577 char * st;
2579 result[0] = '\0';
2580 result2[0] = '\0';
2581 result3[0] = '\0';
2583 // first handle the special case of 0 length suffixes
2584 SfxEntry * se = (SfxEntry *) sStart[0];
2585 while (se) {
2586 if (contclasses[se->getFlag()])
2588 st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2589 if (st) {
2590 if (ppfx) {
2591 if (((PfxEntry *) ppfx)->getMorph()) {
2592 mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN);
2593 mystrcat(result, " ", MAXLNLEN);
2594 } else debugflag(result, ((PfxEntry *) ppfx)->getFlag());
2596 mystrcat(result, st, MAXLNLEN);
2597 free(st);
2598 if (se->getMorph()) {
2599 mystrcat(result, " ", MAXLNLEN);
2600 mystrcat(result, se->getMorph(), MAXLNLEN);
2601 } else debugflag(result, se->getFlag());
2602 mystrcat(result, "\n", MAXLNLEN);
2605 se = se->getNext();
2608 // now handle the general case
2609 unsigned char sp = *((const unsigned char *)(word + len - 1));
2610 SfxEntry * sptr = (SfxEntry *) sStart[sp];
2612 while (sptr) {
2613 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2614 if (contclasses[sptr->getFlag()])
2616 st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2617 if (st) {
2618 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2619 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2620 strcpy(result2, st);
2621 free(st);
2623 result3[0] = '\0';
2625 if (sptr->getMorph()) {
2626 mystrcat(result3, " ", MAXLNLEN);
2627 mystrcat(result3, sptr->getMorph(), MAXLNLEN);
2628 } else debugflag(result3, sptr->getFlag());
2629 strlinecat(result2, result3);
2630 mystrcat(result2, "\n", MAXLNLEN);
2631 mystrcat(result, result2, MAXLNLEN);
2634 sptr = sptr->getNextEQ();
2635 } else {
2636 sptr = sptr->getNextNE();
2639 if (*result) return mystrdup(result);
2640 return NULL;
2643 char * AffixMgr::suffix_check_morph(const char * word, int len,
2644 int sfxopts, AffEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)
2646 char result[MAXLNLEN];
2648 struct hentry * rv = NULL;
2650 result[0] = '\0';
2652 PfxEntry* ep = (PfxEntry *) ppfx;
2654 // first handle the special case of 0 length suffixes
2655 SfxEntry * se = (SfxEntry *) sStart[0];
2656 while (se) {
2657 if (!cclass || se->getCont()) {
2658 // suffixes are not allowed in beginning of compounds
2659 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2660 // except when signed with compoundpermitflag flag
2661 (se->getCont() && compoundpermitflag &&
2662 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
2663 // no circumfix flag in prefix and suffix
2664 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2665 circumfix, ep->getContLen())) &&
2666 (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
2667 // circumfix flag in prefix AND suffix
2668 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2669 circumfix, ep->getContLen())) &&
2670 (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
2671 // fogemorpheme
2672 (in_compound ||
2673 !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2674 // needaffix on prefix or first suffix
2675 (cclass ||
2676 !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2677 (ppfx && !((ep->getCont()) &&
2678 TESTAFF(ep->getCont(), needaffix,
2679 ep->getContLen())))
2682 rv = se->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
2683 while (rv) {
2684 if (ppfx) {
2685 if (((PfxEntry *) ppfx)->getMorph()) {
2686 mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN);
2687 mystrcat(result, " ", MAXLNLEN);
2688 } else debugflag(result, ((PfxEntry *) ppfx)->getFlag());
2690 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2691 if (! HENTRY_FIND(rv, MORPH_STEM)) {
2692 mystrcat(result, " ", MAXLNLEN);
2693 mystrcat(result, MORPH_STEM, MAXLNLEN);
2694 mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
2696 // store the pointer of the hash entry
2697 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
2699 if (!complexprefixes && HENTRY_DATA(rv)) {
2700 mystrcat(result, " ", MAXLNLEN);
2701 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2703 if (se->getMorph()) {
2704 mystrcat(result, " ", MAXLNLEN);
2705 mystrcat(result, se->getMorph(), MAXLNLEN);
2706 } else debugflag(result, se->getFlag());
2707 mystrcat(result, "\n", MAXLNLEN);
2708 rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
2711 se = se->getNext();
2714 // now handle the general case
2715 unsigned char sp = *((const unsigned char *)(word + len - 1));
2716 SfxEntry * sptr = (SfxEntry *) sStart[sp];
2718 while (sptr) {
2719 if (isRevSubset(sptr->getKey(), word + len - 1, len)
2721 // suffixes are not allowed in beginning of compounds
2722 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2723 // except when signed with compoundpermitflag flag
2724 (sptr->getCont() && compoundpermitflag &&
2725 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
2726 // no circumfix flag in prefix and suffix
2727 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2728 circumfix, ep->getContLen())) &&
2729 (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
2730 // circumfix flag in prefix AND suffix
2731 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2732 circumfix, ep->getContLen())) &&
2733 (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
2734 // fogemorpheme
2735 (in_compound ||
2736 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
2737 // needaffix on first suffix
2738 (cclass || !(sptr->getCont() &&
2739 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))
2740 )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
2741 while (rv) {
2742 if (ppfx) {
2743 if (((PfxEntry *) ppfx)->getMorph()) {
2744 mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN);
2745 mystrcat(result, " ", MAXLNLEN);
2746 } else debugflag(result, ((PfxEntry *) ppfx)->getFlag());
2748 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2749 if (! HENTRY_FIND(rv, MORPH_STEM)) {
2750 mystrcat(result, " ", MAXLNLEN);
2751 mystrcat(result, MORPH_STEM, MAXLNLEN);
2752 mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
2754 // store the pointer of the hash entry
2755 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
2757 if (!complexprefixes && HENTRY_DATA(rv)) {
2758 mystrcat(result, " ", MAXLNLEN);
2759 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2762 if (sptr->getMorph()) {
2763 mystrcat(result, " ", MAXLNLEN);
2764 mystrcat(result, sptr->getMorph(), MAXLNLEN);
2765 } else debugflag(result, sptr->getFlag());
2766 mystrcat(result, "\n", MAXLNLEN);
2767 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
2769 sptr = sptr->getNextEQ();
2770 } else {
2771 sptr = sptr->getNextNE();
2775 if (*result) return mystrdup(result);
2776 return NULL;
2779 // check if word with affixes is correctly spelled
2780 struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)
2782 struct hentry * rv= NULL;
2784 // check all prefixes (also crossed with suffixes if allowed)
2785 rv = prefix_check(word, len, in_compound, needflag);
2786 if (rv) return rv;
2788 // if still not found check all suffixes
2789 rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);
2791 if (havecontclass) {
2792 sfx = NULL;
2793 pfx = NULL;
2794 if (rv) return rv;
2795 // if still not found check all two-level suffixes
2796 rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
2797 if (rv) return rv;
2798 // if still not found check all two-level suffixes
2799 rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
2801 return rv;
2804 // check if word with affixes is correctly spelled
2805 char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)
2807 char result[MAXLNLEN];
2808 char * st = NULL;
2810 *result = '\0';
2812 // check all prefixes (also crossed with suffixes if allowed)
2813 st = prefix_check_morph(word, len, in_compound);
2814 if (st) {
2815 mystrcat(result, st, MAXLNLEN);
2816 free(st);
2819 // if still not found check all suffixes
2820 st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
2821 if (st) {
2822 mystrcat(result, st, MAXLNLEN);
2823 free(st);
2826 if (havecontclass) {
2827 sfx = NULL;
2828 pfx = NULL;
2829 // if still not found check all two-level suffixes
2830 st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
2831 if (st) {
2832 mystrcat(result, st, MAXLNLEN);
2833 free(st);
2836 // if still not found check all two-level suffixes
2837 st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
2838 if (st) {
2839 mystrcat(result, st, MAXLNLEN);
2840 free(st);
2844 return mystrdup(result);
2847 char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap,
2848 unsigned short al, char * morph, char * targetmorph, int level)
2850 // handle suffixes
2851 char * stemmorph;
2852 char * stemmorphcatpos;
2853 char mymorph[MAXLNLEN];
2855 if (!morph && !targetmorph) return NULL;
2857 // check substandard flag
2858 if (TESTAFF(ap, substandard, al)) return NULL;
2860 if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts);
2862 // int targetcount = get_sfxcount(targetmorph);
2864 // use input suffix fields, if exist
2865 if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
2866 stemmorph = mymorph;
2867 strcpy(stemmorph, morph);
2868 strcat(stemmorph, " ");
2869 stemmorphcatpos = stemmorph + strlen(stemmorph);
2870 } else {
2871 stemmorph = morph;
2872 stemmorphcatpos = NULL;
2875 for (int i = 0; i < al; i++) {
2876 const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
2877 SfxEntry * sptr = (SfxEntry *)sFlag[c];
2878 while (sptr) {
2879 if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) ||
2880 // don't generate forms with substandard affixes
2881 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
2883 if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph());
2884 else stemmorph = (char *) sptr->getMorph();
2886 int cmp = morphcmp(stemmorph, targetmorph);
2888 if (cmp == 0) {
2889 char * newword = sptr->add(ts, wl);
2890 if (newword) {
2891 hentry * check = pHMgr->lookup(newword); // XXX extra dic
2892 if (!check || !check->astr ||
2893 !TESTAFF(check->astr, forbiddenword, check->alen)) {
2894 return newword;
2896 free(newword);
2900 // recursive call for secondary suffixes
2901 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
2902 // (get_sfxcount(stemmorph) < targetcount) &&
2903 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
2904 char * newword = sptr->add(ts, wl);
2905 if (newword) {
2906 char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(),
2907 sptr->getContLen(), stemmorph, targetmorph, 1);
2909 if (newword2) {
2910 free(newword);
2911 return newword2;
2913 free(newword);
2914 newword = NULL;
2918 sptr = (SfxEntry *)sptr ->getFlgNxt();
2921 return NULL;
2925 int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,
2926 int wl, const unsigned short * ap, unsigned short al, char * bad, int badl,
2927 char * phon)
2929 int nh=0;
2930 // first add root word to list
2931 if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
2932 (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
2933 wlst[nh].word = mystrdup(ts);
2934 if (!wlst[nh].word) return 0;
2935 wlst[nh].allow = (1 == 0);
2936 wlst[nh].orig = NULL;
2937 nh++;
2938 // add special phonetic version
2939 if (phon && (nh < maxn)) {
2940 wlst[nh].word = mystrdup(phon);
2941 if (!wlst[nh].word) return nh - 1;
2942 wlst[nh].allow = (1 == 0);
2943 wlst[nh].orig = mystrdup(ts);
2944 if (!wlst[nh].orig) return nh - 1;
2945 nh++;
2949 // handle suffixes
2950 for (int i = 0; i < al; i++) {
2951 const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
2952 SfxEntry * sptr = (SfxEntry *)sFlag[c];
2953 while (sptr) {
2954 if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&
2955 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
2956 // check needaffix flag
2957 !(sptr->getCont() && ((needaffix &&
2958 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
2959 (circumfix &&
2960 TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
2961 (onlyincompound &&
2962 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))
2964 char * newword = sptr->add(ts, wl);
2965 if (newword) {
2966 if (nh < maxn) {
2967 wlst[nh].word = newword;
2968 wlst[nh].allow = sptr->allowCross();
2969 wlst[nh].orig = NULL;
2970 nh++;
2971 // add special phonetic version
2972 if (phon && (nh < maxn)) {
2973 char st[MAXWORDUTF8LEN];
2974 strcpy(st, phon);
2975 strcat(st, sptr->getKey());
2976 reverseword(st + strlen(phon));
2977 wlst[nh].word = mystrdup(st);
2978 if (!wlst[nh].word) return nh - 1;
2979 wlst[nh].allow = (1 == 0);
2980 wlst[nh].orig = mystrdup(newword);
2981 if (!wlst[nh].orig) return nh - 1;
2982 nh++;
2984 } else {
2985 free(newword);
2989 sptr = (SfxEntry *)sptr ->getFlgNxt();
2993 int n = nh;
2995 // handle cross products of prefixes and suffixes
2996 for (int j=1;j<n ;j++)
2997 if (wlst[j].allow) {
2998 for (int k = 0; k < al; k++) {
2999 const unsigned char c = (unsigned char) (ap[k] & 0x00FF);
3000 PfxEntry * cptr = (PfxEntry *) pFlag[c];
3001 while (cptr) {
3002 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&
3003 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3004 int l1 = strlen(wlst[j].word);
3005 char * newword = cptr->add(wlst[j].word, l1);
3006 if (newword) {
3007 if (nh < maxn) {
3008 wlst[nh].word = newword;
3009 wlst[nh].allow = cptr->allowCross();
3010 wlst[nh].orig = NULL;
3011 nh++;
3012 } else {
3013 free(newword);
3017 cptr = (PfxEntry *)cptr ->getFlgNxt();
3023 // now handle pure prefixes
3024 for (int m = 0; m < al; m ++) {
3025 const unsigned char c = (unsigned char) (ap[m] & 0x00FF);
3026 PfxEntry * ptr = (PfxEntry *) pFlag[c];
3027 while (ptr) {
3028 if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&
3029 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3030 // check needaffix flag
3031 !(ptr->getCont() && ((needaffix &&
3032 TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
3033 (circumfix &&
3034 TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
3035 (onlyincompound &&
3036 TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))
3038 char * newword = ptr->add(ts, wl);
3039 if (newword) {
3040 if (nh < maxn) {
3041 wlst[nh].word = newword;
3042 wlst[nh].allow = ptr->allowCross();
3043 wlst[nh].orig = NULL;
3044 nh++;
3045 } else {
3046 free(newword);
3050 ptr = (PfxEntry *)ptr ->getFlgNxt();
3054 return nh;
3057 // return length of replacing table
3058 int AffixMgr::get_numrep()
3060 return numrep;
3063 // return replacing table
3064 struct replentry * AffixMgr::get_reptable()
3066 if (! reptable ) return NULL;
3067 return reptable;
3070 // return iconv table
3071 RepList * AffixMgr::get_iconvtable()
3073 if (! iconvtable ) return NULL;
3074 return iconvtable;
3077 // return oconv table
3078 RepList * AffixMgr::get_oconvtable()
3080 if (! oconvtable ) return NULL;
3081 return oconvtable;
3084 // return replacing table
3085 struct phonetable * AffixMgr::get_phonetable()
3087 if (! phone ) return NULL;
3088 return phone;
3091 // return length of character map table
3092 int AffixMgr::get_nummap()
3094 return nummap;
3097 // return character map table
3098 struct mapentry * AffixMgr::get_maptable()
3100 if (! maptable ) return NULL;
3101 return maptable;
3104 // return length of word break table
3105 int AffixMgr::get_numbreak()
3107 return numbreak;
3110 // return character map table
3111 char ** AffixMgr::get_breaktable()
3113 if (! breaktable ) return NULL;
3114 return breaktable;
3117 // return text encoding of dictionary
3118 char * AffixMgr::get_encoding()
3120 if (! encoding ) encoding = mystrdup(SPELL_ENCODING);
3121 return mystrdup(encoding);
3124 // return text encoding of dictionary
3125 int AffixMgr::get_langnum()
3127 return langnum;
3130 // return double prefix option
3131 int AffixMgr::get_complexprefixes()
3133 return complexprefixes;
3136 // return FULLSTRIP option
3137 int AffixMgr::get_fullstrip()
3139 return fullstrip;
3142 FLAG AffixMgr::get_keepcase()
3144 return keepcase;
3147 int AffixMgr::get_checksharps()
3149 return checksharps;
3152 char * AffixMgr::encode_flag(unsigned short aflag)
3154 return pHMgr->encode_flag(aflag);
3158 // return the preferred ignore string for suggestions
3159 char * AffixMgr::get_ignore()
3161 if (!ignorechars) return NULL;
3162 return ignorechars;
3165 // return the preferred ignore string for suggestions
3166 unsigned short * AffixMgr::get_ignore_utf16(int * len)
3168 *len = ignorechars_utf16_len;
3169 return ignorechars_utf16;
3172 // return the keyboard string for suggestions
3173 char * AffixMgr::get_key_string()
3175 if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING);
3176 return mystrdup(keystring);
3179 // return the preferred try string for suggestions
3180 char * AffixMgr::get_try_string()
3182 if (! trystring ) return NULL;
3183 return mystrdup(trystring);
3186 // return the preferred try string for suggestions
3187 const char * AffixMgr::get_wordchars()
3189 return wordchars;
3192 unsigned short * AffixMgr::get_wordchars_utf16(int * len)
3194 *len = wordchars_utf16_len;
3195 return wordchars_utf16;
3198 // is there compounding?
3199 int AffixMgr::get_compound()
3201 return compoundflag || compoundbegin || numdefcpd;
3204 // return the compound words control flag
3205 FLAG AffixMgr::get_compoundflag()
3207 return compoundflag;
3210 // return the forbidden words control flag
3211 FLAG AffixMgr::get_forbiddenword()
3213 return forbiddenword;
3216 // return the forbidden words control flag
3217 FLAG AffixMgr::get_nosuggest()
3219 return nosuggest;
3222 // return the forbidden words flag modify flag
3223 FLAG AffixMgr::get_needaffix()
3225 return needaffix;
3228 // return the onlyincompound flag
3229 FLAG AffixMgr::get_onlyincompound()
3231 return onlyincompound;
3234 // return the compound word signal flag
3235 FLAG AffixMgr::get_compoundroot()
3237 return compoundroot;
3240 // return the compound begin signal flag
3241 FLAG AffixMgr::get_compoundbegin()
3243 return compoundbegin;
3246 // return the value of checknum
3247 int AffixMgr::get_checknum()
3249 return checknum;
3252 // return the value of prefix
3253 const char * AffixMgr::get_prefix()
3255 if (pfx) return ((PfxEntry *)pfx)->getKey();
3256 return NULL;
3259 // return the value of suffix
3260 const char * AffixMgr::get_suffix()
3262 return sfxappnd;
3265 // return the value of suffix
3266 const char * AffixMgr::get_version()
3268 return version;
3271 // return lemma_present flag
3272 FLAG AffixMgr::get_lemma_present()
3274 return lemma_present;
3277 // utility method to look up root words in hash table
3278 struct hentry * AffixMgr::lookup(const char * word)
3280 int i;
3281 struct hentry * he = NULL;
3282 for (i = 0; i < *maxdic && !he; i++) {
3283 he = (alldic[i])->lookup(word);
3285 return he;
3288 // return the value of suffix
3289 const int AffixMgr::have_contclass()
3291 return havecontclass;
3294 // return utf8
3295 int AffixMgr::get_utf8()
3297 return utf8;
3300 // return nosplitsugs
3301 int AffixMgr::get_maxngramsugs(void)
3303 return maxngramsugs;
3306 // return nosplitsugs
3307 int AffixMgr::get_nosplitsugs(void)
3309 return nosplitsugs;
3312 // return sugswithdots
3313 int AffixMgr::get_sugswithdots(void)
3315 return sugswithdots;
3318 /* parse flag */
3319 int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) {
3320 char * s = NULL;
3321 if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
3322 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
3323 return 1;
3325 if (parse_string(line, &s, af->getlinenum())) return 1;
3326 *out = pHMgr->decode_flag(s);
3327 free(s);
3328 return 0;
3331 /* parse num */
3332 int AffixMgr::parse_num(char * line, int * out, FileMgr * af) {
3333 char * s = NULL;
3334 if (*out != -1) {
3335 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
3336 return 1;
3338 if (parse_string(line, &s, af->getlinenum())) return 1;
3339 *out = atoi(s);
3340 free(s);
3341 return 0;
3344 /* parse in the max syllablecount of compound words and */
3345 int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af)
3347 char * tp = line;
3348 char * piece;
3349 int i = 0;
3350 int np = 0;
3351 w_char w[MAXWORDLEN];
3352 piece = mystrsep(&tp, 0);
3353 while (piece) {
3354 if (*piece != '\0') {
3355 switch(i) {
3356 case 0: { np++; break; }
3357 case 1: { cpdmaxsyllable = atoi(piece); np++; break; }
3358 case 2: {
3359 if (!utf8) {
3360 cpdvowels = mystrdup(piece);
3361 } else {
3362 int n = u8_u16(w, MAXWORDLEN, piece);
3363 if (n > 0) {
3364 flag_qsort((unsigned short *) w, 0, n);
3365 cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));
3366 if (!cpdvowels_utf16) return 1;
3367 memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
3369 cpdvowels_utf16_len = n;
3371 np++;
3372 break;
3374 default: break;
3376 i++;
3378 piece = mystrsep(&tp, 0);
3380 if (np < 2) {
3381 HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum());
3382 return 1;
3384 if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");
3385 return 0;
3388 /* parse in the typical fault correcting table */
3389 int AffixMgr::parse_reptable(char * line, FileMgr * af)
3391 if (numrep != 0) {
3392 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3393 return 1;
3395 char * tp = line;
3396 char * piece;
3397 int i = 0;
3398 int np = 0;
3399 piece = mystrsep(&tp, 0);
3400 while (piece) {
3401 if (*piece != '\0') {
3402 switch(i) {
3403 case 0: { np++; break; }
3404 case 1: {
3405 numrep = atoi(piece);
3406 if (numrep < 1) {
3407 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
3408 return 1;
3410 reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
3411 if (!reptable) return 1;
3412 np++;
3413 break;
3415 default: break;
3417 i++;
3419 piece = mystrsep(&tp, 0);
3421 if (np != 2) {
3422 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3423 return 1;
3426 /* now parse the numrep lines to read in the remainder of the table */
3427 char * nl;
3428 for (int j=0; j < numrep; j++) {
3429 if (!(nl = af->getline())) return 1;
3430 mychomp(nl);
3431 tp = nl;
3432 i = 0;
3433 reptable[j].pattern = NULL;
3434 reptable[j].pattern2 = NULL;
3435 piece = mystrsep(&tp, 0);
3436 while (piece) {
3437 if (*piece != '\0') {
3438 switch(i) {
3439 case 0: {
3440 if (strncmp(piece,"REP",3) != 0) {
3441 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3442 numrep = 0;
3443 return 1;
3445 break;
3447 case 1: { reptable[j].pattern = mystrrep(mystrdup(piece),"_"," "); break; }
3448 case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; }
3449 default: break;
3451 i++;
3453 piece = mystrsep(&tp, 0);
3455 if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
3456 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3457 numrep = 0;
3458 return 1;
3461 return 0;
3464 /* parse in the typical fault correcting table */
3465 int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword)
3467 if (*rl) {
3468 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3469 return 1;
3471 char * tp = line;
3472 char * piece;
3473 int i = 0;
3474 int np = 0;
3475 int numrl = 0;
3476 piece = mystrsep(&tp, 0);
3477 while (piece) {
3478 if (*piece != '\0') {
3479 switch(i) {
3480 case 0: { np++; break; }
3481 case 1: {
3482 numrl = atoi(piece);
3483 if (numrl < 1) {
3484 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
3485 return 1;
3487 *rl = new RepList(numrl);
3488 if (!rl) return 1;
3489 np++;
3490 break;
3492 default: break;
3494 i++;
3496 piece = mystrsep(&tp, 0);
3498 if (np != 2) {
3499 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3500 return 1;
3503 /* now parse the num lines to read in the remainder of the table */
3504 char * nl;
3505 for (int j=0; j < numrl; j++) {
3506 if (!(nl = af->getline())) return 1;
3507 mychomp(nl);
3508 tp = nl;
3509 i = 0;
3510 char * pattern = NULL;
3511 char * pattern2 = NULL;
3512 piece = mystrsep(&tp, 0);
3513 while (piece) {
3514 if (*piece != '\0') {
3515 switch(i) {
3516 case 0: {
3517 if (strncmp(piece, keyword, sizeof(keyword)) != 0) {
3518 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3519 delete *rl;
3520 *rl = NULL;
3521 return 1;
3523 break;
3525 case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; }
3526 case 2: {
3527 pattern2 = mystrrep(mystrdup(piece),"_"," ");
3528 break;
3530 default: break;
3532 i++;
3534 piece = mystrsep(&tp, 0);
3536 if (!pattern || !pattern2) {
3537 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3538 return 1;
3540 (*rl)->add(pattern, pattern2);
3542 return 0;
3546 /* parse in the typical fault correcting table */
3547 int AffixMgr::parse_phonetable(char * line, FileMgr * af)
3549 if (phone) {
3550 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3551 return 1;
3553 char * tp = line;
3554 char * piece;
3555 int i = 0;
3556 int np = 0;
3557 piece = mystrsep(&tp, 0);
3558 while (piece) {
3559 if (*piece != '\0') {
3560 switch(i) {
3561 case 0: { np++; break; }
3562 case 1: {
3563 phone = (phonetable *) malloc(sizeof(struct phonetable));
3564 phone->num = atoi(piece);
3565 phone->rules = NULL;
3566 phone->utf8 = (char) utf8;
3567 if (!phone) return 1;
3568 if (phone->num < 1) {
3569 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3570 return 1;
3572 phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *));
3573 if (!phone->rules) return 1;
3574 np++;
3575 break;
3577 default: break;
3579 i++;
3581 piece = mystrsep(&tp, 0);
3583 if (np != 2) {
3584 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3585 return 1;
3588 /* now parse the phone->num lines to read in the remainder of the table */
3589 char * nl;
3590 for (int j=0; j < phone->num; j++) {
3591 if (!(nl = af->getline())) return 1;
3592 mychomp(nl);
3593 tp = nl;
3594 i = 0;
3595 phone->rules[j * 2] = NULL;
3596 phone->rules[j * 2 + 1] = NULL;
3597 piece = mystrsep(&tp, 0);
3598 while (piece) {
3599 if (*piece != '\0') {
3600 switch(i) {
3601 case 0: {
3602 if (strncmp(piece,"PHONE",5) != 0) {
3603 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3604 phone->num = 0;
3605 return 1;
3607 break;
3609 case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; }
3610 case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; }
3611 default: break;
3613 i++;
3615 piece = mystrsep(&tp, 0);
3617 if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) {
3618 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3619 phone->num = 0;
3620 return 1;
3623 phone->rules[phone->num * 2] = mystrdup("");
3624 phone->rules[phone->num * 2 + 1] = mystrdup("");
3625 init_phonet_hash(*phone);
3626 return 0;
3629 /* parse in the checkcompoundpattern table */
3630 int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af)
3632 if (numcheckcpd != 0) {
3633 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3634 return 1;
3636 char * tp = line;
3637 char * piece;
3638 int i = 0;
3639 int np = 0;
3640 piece = mystrsep(&tp, 0);
3641 while (piece) {
3642 if (*piece != '\0') {
3643 switch(i) {
3644 case 0: { np++; break; }
3645 case 1: {
3646 numcheckcpd = atoi(piece);
3647 if (numcheckcpd < 1) {
3648 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3649 return 1;
3651 checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry));
3652 if (!checkcpdtable) return 1;
3653 np++;
3654 break;
3656 default: break;
3658 i++;
3660 piece = mystrsep(&tp, 0);
3662 if (np != 2) {
3663 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3664 return 1;
3667 /* now parse the numcheckcpd lines to read in the remainder of the table */
3668 char * nl;
3669 for (int j=0; j < numcheckcpd; j++) {
3670 if (!(nl = af->getline())) return 1;
3671 mychomp(nl);
3672 tp = nl;
3673 i = 0;
3674 checkcpdtable[j].pattern = NULL;
3675 checkcpdtable[j].pattern2 = NULL;
3676 checkcpdtable[j].pattern3 = NULL;
3677 checkcpdtable[j].cond = FLAG_NULL;
3678 checkcpdtable[j].cond2 = FLAG_NULL;
3679 piece = mystrsep(&tp, 0);
3680 while (piece) {
3681 if (*piece != '\0') {
3682 switch(i) {
3683 case 0: {
3684 if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {
3685 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3686 numcheckcpd = 0;
3687 return 1;
3689 break;
3691 case 1: {
3692 checkcpdtable[j].pattern = mystrdup(piece);
3693 char * p = strchr(checkcpdtable[j].pattern, '/');
3694 if (p) {
3695 *p = '\0';
3696 checkcpdtable[j].cond = pHMgr->decode_flag(p + 1);
3698 break; }
3699 case 2: {
3700 checkcpdtable[j].pattern2 = mystrdup(piece);
3701 char * p = strchr(checkcpdtable[j].pattern2, '/');
3702 if (p) {
3703 *p = '\0';
3704 checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1);
3706 break;
3708 case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; }
3709 default: break;
3711 i++;
3713 piece = mystrsep(&tp, 0);
3715 if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {
3716 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3717 numcheckcpd = 0;
3718 return 1;
3721 return 0;
3724 /* parse in the compound rule table */
3725 int AffixMgr::parse_defcpdtable(char * line, FileMgr * af)
3727 if (numdefcpd != 0) {
3728 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3729 return 1;
3731 char * tp = line;
3732 char * piece;
3733 int i = 0;
3734 int np = 0;
3735 piece = mystrsep(&tp, 0);
3736 while (piece) {
3737 if (*piece != '\0') {
3738 switch(i) {
3739 case 0: { np++; break; }
3740 case 1: {
3741 numdefcpd = atoi(piece);
3742 if (numdefcpd < 1) {
3743 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3744 return 1;
3746 defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));
3747 if (!defcpdtable) return 1;
3748 np++;
3749 break;
3751 default: break;
3753 i++;
3755 piece = mystrsep(&tp, 0);
3757 if (np != 2) {
3758 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3759 return 1;
3762 /* now parse the numdefcpd lines to read in the remainder of the table */
3763 char * nl;
3764 for (int j=0; j < numdefcpd; j++) {
3765 if (!(nl = af->getline())) return 1;
3766 mychomp(nl);
3767 tp = nl;
3768 i = 0;
3769 defcpdtable[j].def = NULL;
3770 piece = mystrsep(&tp, 0);
3771 while (piece) {
3772 if (*piece != '\0') {
3773 switch(i) {
3774 case 0: {
3775 if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {
3776 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3777 numdefcpd = 0;
3778 return 1;
3780 break;
3782 case 1: { // handle parenthesized flags
3783 if (strchr(piece, '(')) {
3784 defcpdtable[j].def = (FLAG *) malloc(sizeof(piece) * sizeof(FLAG));
3785 defcpdtable[j].len = 0;
3786 int end = 0;
3787 FLAG * conv;
3788 while (!end) {
3789 char * par = piece + 1;
3790 while (*par != '(' && *par != ')' && *par != '\0') par++;
3791 if (*par == '\0') end = 1; else *par = '\0';
3792 if (*piece == '(') piece++;
3793 if (*piece == '*' || *piece == '?') {
3794 defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece;
3795 } else if (*piece != '\0') {
3796 int l = pHMgr->decode_flags(&conv, piece, af);
3797 for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k];
3798 free(conv);
3800 piece = par + 1;
3802 } else {
3803 defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece, af);
3805 break;
3807 default: break;
3809 i++;
3811 piece = mystrsep(&tp, 0);
3813 if (!defcpdtable[j].len) {
3814 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3815 numdefcpd = 0;
3816 return 1;
3819 return 0;
3823 /* parse in the character map table */
3824 int AffixMgr::parse_maptable(char * line, FileMgr * af)
3826 if (nummap != 0) {
3827 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3828 return 1;
3830 char * tp = line;
3831 char * piece;
3832 int i = 0;
3833 int np = 0;
3834 piece = mystrsep(&tp, 0);
3835 while (piece) {
3836 if (*piece != '\0') {
3837 switch(i) {
3838 case 0: { np++; break; }
3839 case 1: {
3840 nummap = atoi(piece);
3841 if (nummap < 1) {
3842 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3843 return 1;
3845 maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
3846 if (!maptable) return 1;
3847 np++;
3848 break;
3850 default: break;
3852 i++;
3854 piece = mystrsep(&tp, 0);
3856 if (np != 2) {
3857 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3858 return 1;
3861 /* now parse the nummap lines to read in the remainder of the table */
3862 char * nl;
3863 for (int j=0; j < nummap; j++) {
3864 if (!(nl = af->getline())) return 1;
3865 mychomp(nl);
3866 tp = nl;
3867 i = 0;
3868 maptable[j].set = NULL;
3869 maptable[j].len = 0;
3870 piece = mystrsep(&tp, 0);
3871 while (piece) {
3872 if (*piece != '\0') {
3873 switch(i) {
3874 case 0: {
3875 if (strncmp(piece,"MAP",3) != 0) {
3876 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3877 nummap = 0;
3878 return 1;
3880 break;
3882 case 1: {
3883 maptable[j].len = 0;
3884 maptable[j].set = NULL;
3885 maptable[j].set_utf16 = NULL;
3886 if (!utf8) {
3887 maptable[j].set = mystrdup(piece);
3888 maptable[j].len = strlen(maptable[j].set);
3889 } else {
3890 w_char w[MAXWORDLEN];
3891 int n = u8_u16(w, MAXWORDLEN, piece);
3892 if (n > 0) {
3893 flag_qsort((unsigned short *) w, 0, n);
3894 maptable[j].set_utf16 = (w_char *) malloc(n * sizeof(w_char));
3895 if (!maptable[j].set_utf16) return 1;
3896 memcpy(maptable[j].set_utf16, w, n * sizeof(w_char));
3898 maptable[j].len = n;
3900 break; }
3901 default: break;
3903 i++;
3905 piece = mystrsep(&tp, 0);
3907 if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) {
3908 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3909 nummap = 0;
3910 return 1;
3913 return 0;
3916 /* parse in the word breakpoint table */
3917 int AffixMgr::parse_breaktable(char * line, FileMgr * af)
3919 if (numbreak != 0) {
3920 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3921 return 1;
3923 char * tp = line;
3924 char * piece;
3925 int i = 0;
3926 int np = 0;
3927 piece = mystrsep(&tp, 0);
3928 while (piece) {
3929 if (*piece != '\0') {
3930 switch(i) {
3931 case 0: { np++; break; }
3932 case 1: {
3933 numbreak = atoi(piece);
3934 if (numbreak < 1) {
3935 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3936 return 1;
3938 breaktable = (char **) malloc(numbreak * sizeof(char *));
3939 if (!breaktable) return 1;
3940 np++;
3941 break;
3943 default: break;
3945 i++;
3947 piece = mystrsep(&tp, 0);
3949 if (np != 2) {
3950 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3951 return 1;
3954 /* now parse the numbreak lines to read in the remainder of the table */
3955 char * nl;
3956 for (int j=0; j < numbreak; j++) {
3957 if (!(nl = af->getline())) return 1;
3958 mychomp(nl);
3959 tp = nl;
3960 i = 0;
3961 piece = mystrsep(&tp, 0);
3962 while (piece) {
3963 if (*piece != '\0') {
3964 switch(i) {
3965 case 0: {
3966 if (strncmp(piece,"BREAK",5) != 0) {
3967 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3968 numbreak = 0;
3969 return 1;
3971 break;
3973 case 1: {
3974 breaktable[j] = mystrdup(piece);
3975 break;
3977 default: break;
3979 i++;
3981 piece = mystrsep(&tp, 0);
3983 if (!breaktable) {
3984 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3985 numbreak = 0;
3986 return 1;
3989 return 0;
3992 void AffixMgr::reverse_condition(char * piece) {
3993 int neg = 0;
3994 for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {
3995 switch(*k) {
3996 case '[': {
3997 if (neg) *(k+1) = '['; else *k = ']';
3998 break;
4000 case ']': {
4001 *k = '[';
4002 if (neg) *(k+1) = '^';
4003 neg = 0;
4004 break;
4006 case '^': {
4007 if (*(k+1) == ']') neg = 1; else *(k+1) = *k;
4008 break;
4010 default: {
4011 if (neg) *(k+1) = *k;
4017 int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags)
4019 int numents = 0; // number of affentry structures to parse
4021 unsigned short aflag = 0; // affix char identifier
4023 char ff=0;
4024 struct affentry * ptr= NULL;
4025 struct affentry * nptr= NULL;
4027 char * tp = line;
4028 char * nl = line;
4029 char * piece;
4030 int i = 0;
4032 // checking lines with bad syntax
4033 #ifdef DEBUG
4034 int basefieldnum = 0;
4035 #endif
4037 // split affix header line into pieces
4039 int np = 0;
4041 piece = mystrsep(&tp, 0);
4042 while (piece) {
4043 if (*piece != '\0') {
4044 switch(i) {
4045 // piece 1 - is type of affix
4046 case 0: { np++; break; }
4048 // piece 2 - is affix char
4049 case 1: {
4050 np++;
4051 aflag = pHMgr->decode_flag(piece);
4052 if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
4053 ((at == 'P') && (dupflags[aflag] & dupPFX))) {
4054 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n",
4055 af->getlinenum());
4056 // return 1; XXX permissive mode for bad dictionaries
4058 dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX);
4059 break;
4061 // piece 3 - is cross product indicator
4062 case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
4064 // piece 4 - is number of affentries
4065 case 3: {
4066 np++;
4067 numents = atoi(piece);
4068 if (numents == 0) {
4069 char * err = pHMgr->encode_flag(aflag);
4070 if (err) {
4071 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4072 af->getlinenum());
4073 free(err);
4075 return 1;
4077 ptr = (struct affentry *) malloc(numents * sizeof(struct affentry));
4078 if (!ptr) return 1;
4079 ptr->opts = ff;
4080 if (utf8) ptr->opts += aeUTF8;
4081 if (pHMgr->is_aliasf()) ptr->opts += aeALIASF;
4082 if (pHMgr->is_aliasm()) ptr->opts += aeALIASM;
4083 ptr->aflag = aflag;
4086 default: break;
4088 i++;
4090 piece = mystrsep(&tp, 0);
4092 // check to make sure we parsed enough pieces
4093 if (np != 4) {
4094 char * err = pHMgr->encode_flag(aflag);
4095 if (err) {
4096 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4097 free(err);
4099 free(ptr);
4100 return 1;
4103 // store away ptr to first affentry
4104 nptr = ptr;
4106 // now parse numents affentries for this affix
4107 for (int j=0; j < numents; j++) {
4108 if (!(nl = af->getline())) return 1;
4109 mychomp(nl);
4110 tp = nl;
4111 i = 0;
4112 np = 0;
4114 // split line into pieces
4115 piece = mystrsep(&tp, 0);
4116 while (piece) {
4117 if (*piece != '\0') {
4118 switch(i) {
4119 // piece 1 - is type
4120 case 0: {
4121 np++;
4122 if (nptr != ptr) nptr->opts = ptr->opts &
4123 (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM);
4124 break;
4127 // piece 2 - is affix char
4128 case 1: {
4129 np++;
4130 if (pHMgr->decode_flag(piece) != aflag) {
4131 char * err = pHMgr->encode_flag(aflag);
4132 if (err) {
4133 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4134 af->getlinenum(), err);
4135 free(err);
4137 return 1;
4140 if (nptr != ptr) nptr->aflag = ptr->aflag;
4141 break;
4144 // piece 3 - is string to strip or 0 for null
4145 case 2: {
4146 np++;
4147 if (complexprefixes) {
4148 if (utf8) reverseword_utf(piece); else reverseword(piece);
4150 nptr->strip = mystrdup(piece);
4151 nptr->stripl = (unsigned char) strlen(nptr->strip);
4152 if (strcmp(nptr->strip,"0") == 0) {
4153 free(nptr->strip);
4154 nptr->strip=mystrdup("");
4155 nptr->stripl = 0;
4157 break;
4160 // piece 4 - is affix string or 0 for null
4161 case 3: {
4162 char * dash;
4163 nptr->morphcode = NULL;
4164 nptr->contclass = NULL;
4165 nptr->contclasslen = 0;
4166 np++;
4167 dash = strchr(piece, '/');
4168 if (dash) {
4169 *dash = '\0';
4171 if (ignorechars) {
4172 if (utf8) {
4173 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
4174 } else {
4175 remove_ignored_chars(piece,ignorechars);
4179 if (complexprefixes) {
4180 if (utf8) reverseword_utf(piece); else reverseword(piece);
4182 nptr->appnd = mystrdup(piece);
4184 if (pHMgr->is_aliasf()) {
4185 int index = atoi(dash + 1);
4186 nptr->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(nptr->contclass), af);
4187 if (!nptr->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1);
4188 } else {
4189 nptr->contclasslen = (unsigned short) pHMgr->decode_flags(&(nptr->contclass), dash + 1, af);
4190 flag_qsort(nptr->contclass, 0, nptr->contclasslen);
4192 *dash = '/';
4194 havecontclass = 1;
4195 for (unsigned short _i = 0; _i < nptr->contclasslen; _i++) {
4196 contclasses[(nptr->contclass)[_i]] = 1;
4198 } else {
4199 if (ignorechars) {
4200 if (utf8) {
4201 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
4202 } else {
4203 remove_ignored_chars(piece,ignorechars);
4207 if (complexprefixes) {
4208 if (utf8) reverseword_utf(piece); else reverseword(piece);
4210 nptr->appnd = mystrdup(piece);
4213 nptr->appndl = (unsigned char) strlen(nptr->appnd);
4214 if (strcmp(nptr->appnd,"0") == 0) {
4215 free(nptr->appnd);
4216 nptr->appnd=mystrdup("");
4217 nptr->appndl = 0;
4219 break;
4222 // piece 5 - is the conditions descriptions
4223 case 4: {
4224 np++;
4225 if (complexprefixes) {
4226 if (utf8) reverseword_utf(piece); else reverseword(piece);
4227 reverse_condition(piece);
4229 if (nptr->stripl && (strcmp(piece, ".") != 0) &&
4230 redundant_condition(at, nptr->strip, nptr->stripl, piece, af->getlinenum()))
4231 strcpy(piece, ".");
4232 if (at == 'S') {
4233 reverseword(piece);
4234 reverse_condition(piece);
4236 if (encodeit(nptr, piece)) return 1;
4237 break;
4240 case 5: {
4241 np++;
4242 if (pHMgr->is_aliasm()) {
4243 int index = atoi(piece);
4244 nptr->morphcode = pHMgr->get_aliasm(index);
4245 } else {
4246 if (complexprefixes) { // XXX - fix me for morph. gen.
4247 if (utf8) reverseword_utf(piece); else reverseword(piece);
4249 // add the remaining of the line
4250 if (*tp) {
4251 *(tp - 1) = ' ';
4252 tp = tp + strlen(tp);
4254 nptr->morphcode = mystrdup(piece);
4255 if (!nptr->morphcode) return 1;
4257 break;
4259 default: break;
4261 i++;
4263 piece = mystrsep(&tp, 0);
4265 // check to make sure we parsed enough pieces
4266 if (np < 4) {
4267 char * err = pHMgr->encode_flag(aflag);
4268 if (err) {
4269 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4270 af->getlinenum(), err);
4271 free(err);
4273 free(ptr);
4274 return 1;
4277 #ifdef DEBUG
4278 // detect unnecessary fields, excepting comments
4279 if (basefieldnum) {
4280 int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
4281 if (fieldnum != basefieldnum)
4282 HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum());
4283 } else {
4284 basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
4286 #endif
4287 nptr++;
4290 // now create SfxEntry or PfxEntry objects and use links to
4291 // build an ordered (sorted by affix string) list
4292 nptr = ptr;
4293 for (int k = 0; k < numents; k++) {
4294 if (at == 'P') {
4295 PfxEntry * pfxptr = new PfxEntry(this,nptr);
4296 build_pfxtree((AffEntry *)pfxptr);
4297 } else {
4298 SfxEntry * sfxptr = new SfxEntry(this,nptr);
4299 build_sfxtree((AffEntry *)sfxptr);
4301 nptr++;
4303 free(ptr);
4304 return 0;
4307 int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) {
4308 int condl = strlen(cond);
4309 int i;
4310 int j;
4311 int neg;
4312 int in;
4313 if (ft == 'P') { // prefix
4314 if (strncmp(strip, cond, condl) == 0) return 1;
4315 if (utf8) {
4316 } else {
4317 for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4318 if (cond[j] != '[') {
4319 if (cond[j] != strip[i]) {
4320 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4321 return 0;
4323 } else {
4324 neg = (cond[j+1] == '^') ? 1 : 0;
4325 in = 0;
4326 do {
4327 j++;
4328 if (strip[i] == cond[j]) in = 1;
4329 } while ((j < (condl - 1)) && (cond[j] != ']'));
4330 if (j == (condl - 1) && (cond[j] != ']')) {
4331 HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum);
4332 return 0;
4334 if ((!neg && !in) || (neg && in)) {
4335 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4336 return 0;
4340 if (j >= condl) return 1;
4342 } else { // suffix
4343 if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;
4344 if (utf8) {
4345 } else {
4346 for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4347 if (cond[j] != ']') {
4348 if (cond[j] != strip[i]) {
4349 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4350 return 0;
4352 } else {
4353 in = 0;
4354 do {
4355 j--;
4356 if (strip[i] == cond[j]) in = 1;
4357 } while ((j > 0) && (cond[j] != '['));
4358 if ((j == 0) && (cond[j] != '[')) {
4359 HUNSPELL_WARNING(stderr, "error: error: %d: missing ] in condition:\n%s\n", linenum);
4360 return 0;
4362 neg = (cond[j+1] == '^') ? 1 : 0;
4363 if ((!neg && !in) || (neg && in)) {
4364 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4365 return 0;
4369 if (j < 0) return 1;
4372 return 0;