Check-in hunspell source code.
[chromium-blink-merge.git] / third_party / hunspell_new / src / hunspell / affixmgr.cxx
blob 776cab152409ba6a4f028f6c3e53315748eb63a9
1 #include "license.hunspell"
2 #include "license.myspell"
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdio.h>
7 #include <ctype.h>
9 #include <vector>
11 #include "affixmgr.hxx"
12 #include "affentry.hxx"
13 #include "langnum.hxx"
15 #include "csutil.hxx"
17 #ifdef HUNSPELL_CHROME_CLIENT
18 AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md)
20 bdict_reader = reader;
21 #else
22 AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
24 #endif
25 // register hash manager and load affix data from aff file
26 pHMgr = ptr[0];
27 alldic = ptr;
28 maxdic = md;
29 keystring = NULL;
30 trystring = NULL;
31 encoding=NULL;
32 csconv=NULL;
33 utf8 = 0;
34 complexprefixes = 0;
35 maptable = NULL;
36 nummap = 0;
37 breaktable = NULL;
38 numbreak = -1;
39 reptable = NULL;
40 numrep = 0;
41 iconvtable = NULL;
42 oconvtable = NULL;
43 checkcpdtable = NULL;
44 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
45 simplifiedcpd = 0;
46 numcheckcpd = 0;
47 defcpdtable = NULL;
48 numdefcpd = 0;
49 phone = NULL;
50 compoundflag = FLAG_NULL; // permits word in compound forms
51 compoundbegin = FLAG_NULL; // may be first word in compound forms
52 compoundmiddle = FLAG_NULL; // may be middle word in compound forms
53 compoundend = FLAG_NULL; // may be last word in compound forms
54 compoundroot = FLAG_NULL; // compound word signing flag
55 compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
56 compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
57 compoundmoresuffixes = 0; // allow more suffixes within compound words
58 checkcompounddup = 0; // forbid double words in compounds
59 checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
60 checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
61 checkcompoundtriple = 0; // forbid compounds with triple letters
62 simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
63 forbiddenword = FORBIDDENWORD; // forbidden word signing flag
64 nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
65 nongramsuggest = FLAG_NULL;
66 lang = NULL; // language
67 langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
68 needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
69 cpdwordmax = -1; // default: unlimited wordcount in compound words
70 cpdmin = -1; // undefined
71 cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
72 cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
73 cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
74 cpdvowels_utf16_len=0; // vowels
75 pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
76 sfxappnd=NULL; // previous suffix for counting a special syllables BUG
77 cpdsyllablenum=NULL; // syllable count incrementing flag
78 checknum=0; // checking numbers, and word with numbers
79 wordchars=NULL; // letters + spec. word characters
80 wordchars_utf16=NULL; // letters + spec. word characters
81 wordchars_utf16_len=0; // letters + spec. word characters
82 ignorechars=NULL; // letters + spec. word characters
83 ignorechars_utf16=NULL; // letters + spec. word characters
84 ignorechars_utf16_len=0; // letters + spec. word characters
85 version=NULL; // affix and dictionary file version string
86 havecontclass=0; // flags of possible continuing classes (double affix)
87 // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
88 // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
89 lemma_present = FLAG_NULL;
90 circumfix = FLAG_NULL;
91 onlyincompound = FLAG_NULL;
92 maxngramsugs = -1; // undefined
93 maxdiff = -1; // undefined
94 onlymaxdiff = 0;
95 maxcpdsugs = -1; // undefined
96 nosplitsugs = 0;
97 sugswithdots = 0;
98 keepcase = 0;
99 forceucase = 0;
100 warn = 0;
101 forbidwarn = 0;
102 checksharps = 0;
103 substandard = FLAG_NULL;
104 fullstrip = 0;
106 sfx = NULL;
107 pfx = NULL;
109 for (int i=0; i < SETSIZE; i++) {
110 pStart[i] = NULL;
111 sStart[i] = NULL;
112 pFlag[i] = NULL;
113 sFlag[i] = NULL;
116 #ifdef HUNSPELL_CHROME_CLIENT
117 // Define dummy parameters for parse_file() to avoid changing the parameters
118 // of parse_file(). This may make it easier to merge the changes of the
119 // original hunspell.
120 const char* affpath = NULL;
121 const char* key = NULL;
122 #else
123 for (int j=0; j < CONTSIZE; j++) {
124 contclasses[j] = 0;
126 #endif
128 if (parse_file(affpath, key)) {
129 HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
132 if (cpdmin == -1) cpdmin = MINCPDLEN;
137 AffixMgr::~AffixMgr()
139 // pass through linked prefix entries and clean up
140 for (int i=0; i < SETSIZE ;i++) {
141 pFlag[i] = NULL;
142 PfxEntry * ptr = pStart[i];
143 PfxEntry * nptr = NULL;
144 while (ptr) {
145 nptr = ptr->getNext();
146 delete(ptr);
147 ptr = nptr;
148 nptr = NULL;
152 // pass through linked suffix entries and clean up
153 for (int j=0; j < SETSIZE ; j++) {
154 sFlag[j] = NULL;
155 SfxEntry * ptr = sStart[j];
156 SfxEntry * nptr = NULL;
157 while (ptr) {
158 nptr = ptr->getNext();
159 delete(ptr);
160 ptr = nptr;
161 nptr = NULL;
163 sStart[j] = NULL;
166 if (keystring) free(keystring);
167 keystring=NULL;
168 if (trystring) free(trystring);
169 trystring=NULL;
170 if (encoding) free(encoding);
171 encoding=NULL;
172 if (maptable) {
173 for (int j=0; j < nummap; j++) {
174 for (int k=0; k < maptable[j].len; k++) {
175 if (maptable[j].set[k]) free(maptable[j].set[k]);
177 free(maptable[j].set);
178 maptable[j].set = NULL;
179 maptable[j].len = 0;
181 free(maptable);
182 maptable = NULL;
184 nummap = 0;
185 if (breaktable) {
186 for (int j=0; j < numbreak; j++) {
187 if (breaktable[j]) free(breaktable[j]);
188 breaktable[j] = NULL;
190 free(breaktable);
191 breaktable = NULL;
193 numbreak = 0;
194 if (reptable) {
195 for (int j=0; j < numrep; j++) {
196 free(reptable[j].pattern);
197 free(reptable[j].pattern2);
199 free(reptable);
200 reptable = NULL;
202 if (iconvtable) delete iconvtable;
203 if (oconvtable) delete oconvtable;
204 if (phone && phone->rules) {
205 for (int j=0; j < phone->num + 1; j++) {
206 free(phone->rules[j * 2]);
207 free(phone->rules[j * 2 + 1]);
209 free(phone->rules);
210 free(phone);
211 phone = NULL;
214 if (defcpdtable) {
215 for (int j=0; j < numdefcpd; j++) {
216 free(defcpdtable[j].def);
217 defcpdtable[j].def = NULL;
219 free(defcpdtable);
220 defcpdtable = NULL;
222 numrep = 0;
223 if (checkcpdtable) {
224 for (int j=0; j < numcheckcpd; j++) {
225 free(checkcpdtable[j].pattern);
226 free(checkcpdtable[j].pattern2);
227 free(checkcpdtable[j].pattern3);
228 checkcpdtable[j].pattern = NULL;
229 checkcpdtable[j].pattern2 = NULL;
230 checkcpdtable[j].pattern3 = NULL;
232 free(checkcpdtable);
233 checkcpdtable = NULL;
235 numcheckcpd = 0;
236 FREE_FLAG(compoundflag);
237 FREE_FLAG(compoundbegin);
238 FREE_FLAG(compoundmiddle);
239 FREE_FLAG(compoundend);
240 FREE_FLAG(compoundpermitflag);
241 FREE_FLAG(compoundforbidflag);
242 FREE_FLAG(compoundroot);
243 FREE_FLAG(forbiddenword);
244 FREE_FLAG(nosuggest);
245 FREE_FLAG(nongramsuggest);
246 FREE_FLAG(needaffix);
247 FREE_FLAG(lemma_present);
248 FREE_FLAG(circumfix);
249 FREE_FLAG(onlyincompound);
251 cpdwordmax = 0;
252 pHMgr = NULL;
253 cpdmin = 0;
254 cpdmaxsyllable = 0;
255 if (cpdvowels) free(cpdvowels);
256 if (cpdvowels_utf16) free(cpdvowels_utf16);
257 if (cpdsyllablenum) free(cpdsyllablenum);
258 free_utf_tbl();
259 if (lang) free(lang);
260 if (wordchars) free(wordchars);
261 if (wordchars_utf16) free(wordchars_utf16);
262 if (ignorechars) free(ignorechars);
263 if (ignorechars_utf16) free(ignorechars_utf16);
264 if (version) free(version);
265 checknum=0;
266 #ifdef MOZILLA_CLIENT
267 delete [] csconv;
268 #endif
271 void AffixMgr::finishFileMgr(FileMgr *afflst)
273 delete afflst;
275 // convert affix trees to sorted list
276 process_pfx_tree_to_list();
277 process_sfx_tree_to_list();
280 // read in aff file and build up prefix and suffix entry objects
281 int AffixMgr::parse_file(const char * affpath, const char * key)
283 char * line; // io buffers
284 char ft; // affix type
286 #ifdef HUNSPELL_CHROME_CLIENT
287 // open the affix file
288 // We're always UTF-8
289 utf8 = 1;
291 // A BDICT file stores PFX and SFX lines in a special section and it provides
292 // a special line iterator for reading PFX and SFX lines.
293 // We create a FileMgr object from this iterator and parse PFX and SFX lines
294 // before parsing other lines.
295 hunspell::LineIterator affix_iterator = bdict_reader->GetAffixLineIterator();
296 FileMgr* iterator = new FileMgr(&affix_iterator);
297 if (!iterator) {
298 HUNSPELL_WARNING(stderr,
299 "error: could not create a FileMgr from an affix line iterator.\n");
300 return 1;
303 while ((line = iterator->getline())) {
304 ft = ' ';
305 if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
306 if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
307 if (ft != ' ')
308 parse_affix(line, ft, iterator, NULL);
310 delete iterator;
312 // Create a FileMgr object for reading lines except PFX and SFX lines.
313 // We don't need to change the loop below since our FileMgr emulates the
314 // original one.
315 hunspell::LineIterator other_iterator = bdict_reader->GetOtherLineIterator();
316 FileMgr * afflst = new FileMgr(&other_iterator);
317 if (!afflst) {
318 HUNSPELL_WARNING(stderr,
319 "error: could not create a FileMgr from an other line iterator.\n");
320 return 1;
322 #else
323 // checking flag duplication
324 char dupflags[CONTSIZE];
325 char dupflags_ini = 1;
327 // first line indicator for removing byte order mark
328 int firstline = 1;
330 // open the affix file
331 FileMgr * afflst = new FileMgr(affpath, key);
332 if (!afflst) {
333 HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
334 return 1;
336 #endif
338 // step one is to parse the affix file building up the internal
339 // affix data structures
341 // read in each line ignoring any that do not
342 // start with a known line type indicator
343 while ((line = afflst->getline()) != NULL) {
344 mychomp(line);
346 #ifndef HUNSPELL_CHROME_CLIENT
347 /* remove byte order mark */
348 if (firstline) {
349 firstline = 0;
350 // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions
351 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
352 memmove(line, line+3, strlen(line+3)+1);
355 #endif
357 /* parse in the keyboard string */
358 if (strncmp(line,"KEY",3) == 0) {
359 if (parse_string(line, &keystring, afflst->getlinenum())) {
360 finishFileMgr(afflst);
361 return 1;
365 /* parse in the try string */
366 if (strncmp(line,"TRY",3) == 0) {
367 if (parse_string(line, &trystring, afflst->getlinenum())) {
368 finishFileMgr(afflst);
369 return 1;
373 /* parse in the name of the character set used by the .dict and .aff */
374 if (strncmp(line,"SET",3) == 0) {
375 if (parse_string(line, &encoding, afflst->getlinenum())) {
376 finishFileMgr(afflst);
377 return 1;
379 if (strcmp(encoding, "UTF-8") == 0) {
380 utf8 = 1;
381 #ifndef OPENOFFICEORG
382 #ifndef MOZILLA_CLIENT
383 if (initialize_utf_tbl()) return 1;
384 #endif
385 #endif
389 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
390 if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
391 complexprefixes = 1;
393 /* parse in the flag used by the controlled compound words */
394 if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
395 if (parse_flag(line, &compoundflag, afflst)) {
396 finishFileMgr(afflst);
397 return 1;
401 /* parse in the flag used by compound words */
402 if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
403 if (complexprefixes) {
404 if (parse_flag(line, &compoundend, afflst)) {
405 finishFileMgr(afflst);
406 return 1;
408 } else {
409 if (parse_flag(line, &compoundbegin, afflst)) {
410 finishFileMgr(afflst);
411 return 1;
416 /* parse in the flag used by compound words */
417 if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
418 if (parse_flag(line, &compoundmiddle, afflst)) {
419 finishFileMgr(afflst);
420 return 1;
423 /* parse in the flag used by compound words */
424 if (strncmp(line,"COMPOUNDEND",11) == 0) {
425 if (complexprefixes) {
426 if (parse_flag(line, &compoundbegin, afflst)) {
427 finishFileMgr(afflst);
428 return 1;
430 } else {
431 if (parse_flag(line, &compoundend, afflst)) {
432 finishFileMgr(afflst);
433 return 1;
438 /* parse in the data used by compound_check() method */
439 if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
440 if (parse_num(line, &cpdwordmax, afflst)) {
441 finishFileMgr(afflst);
442 return 1;
446 /* parse in the flag sign compounds in dictionary */
447 if (strncmp(line,"COMPOUNDROOT",12) == 0) {
448 if (parse_flag(line, &compoundroot, afflst)) {
449 finishFileMgr(afflst);
450 return 1;
454 /* parse in the flag used by compound_check() method */
455 if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
456 if (parse_flag(line, &compoundpermitflag, afflst)) {
457 finishFileMgr(afflst);
458 return 1;
462 /* parse in the flag used by compound_check() method */
463 if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
464 if (parse_flag(line, &compoundforbidflag, afflst)) {
465 finishFileMgr(afflst);
466 return 1;
470 if (strncmp(line,"COMPOUNDMORESUFFIXES",20) == 0) {
471 compoundmoresuffixes = 1;
474 if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
475 checkcompounddup = 1;
478 if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
479 checkcompoundrep = 1;
482 if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
483 checkcompoundtriple = 1;
486 if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) {
487 simplifiedtriple = 1;
490 if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
491 checkcompoundcase = 1;
494 if (strncmp(line,"NOSUGGEST",9) == 0) {
495 if (parse_flag(line, &nosuggest, afflst)) {
496 finishFileMgr(afflst);
497 return 1;
501 if (strncmp(line,"NONGRAMSUGGEST",14) == 0) {
502 if (parse_flag(line, &nongramsuggest, afflst)) {
503 finishFileMgr(afflst);
504 return 1;
508 /* parse in the flag used by forbidden words */
509 if (strncmp(line,"FORBIDDENWORD",13) == 0) {
510 if (parse_flag(line, &forbiddenword, afflst)) {
511 finishFileMgr(afflst);
512 return 1;
516 /* parse in the flag used by forbidden words */
517 if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
518 if (parse_flag(line, &lemma_present, afflst)) {
519 finishFileMgr(afflst);
520 return 1;
524 /* parse in the flag used by circumfixes */
525 if (strncmp(line,"CIRCUMFIX",9) == 0) {
526 if (parse_flag(line, &circumfix, afflst)) {
527 finishFileMgr(afflst);
528 return 1;
532 /* parse in the flag used by fogemorphemes */
533 if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
534 if (parse_flag(line, &onlyincompound, afflst)) {
535 finishFileMgr(afflst);
536 return 1;
540 /* parse in the flag used by `needaffixs' */
541 if (strncmp(line,"PSEUDOROOT",10) == 0) {
542 if (parse_flag(line, &needaffix, afflst)) {
543 finishFileMgr(afflst);
544 return 1;
548 /* parse in the flag used by `needaffixs' */
549 if (strncmp(line,"NEEDAFFIX",9) == 0) {
550 if (parse_flag(line, &needaffix, afflst)) {
551 finishFileMgr(afflst);
552 return 1;
556 /* parse in the minimal length for words in compounds */
557 if (strncmp(line,"COMPOUNDMIN",11) == 0) {
558 if (parse_num(line, &cpdmin, afflst)) {
559 finishFileMgr(afflst);
560 return 1;
562 if (cpdmin < 1) cpdmin = 1;
565 /* parse in the max. words and syllables in compounds */
566 if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
567 if (parse_cpdsyllable(line, afflst)) {
568 finishFileMgr(afflst);
569 return 1;
573 /* parse in the flag used by compound_check() method */
574 if (strncmp(line,"SYLLABLENUM",11) == 0) {
575 if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) {
576 finishFileMgr(afflst);
577 return 1;
581 /* parse in the flag used by the controlled compound words */
582 if (strncmp(line,"CHECKNUM",8) == 0) {
583 checknum=1;
586 /* parse in the extra word characters */
587 if (strncmp(line,"WORDCHARS",9) == 0) {
588 if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) {
589 finishFileMgr(afflst);
590 return 1;
594 /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
595 if (strncmp(line,"IGNORE",6) == 0) {
596 if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
597 finishFileMgr(afflst);
598 return 1;
602 #ifndef HUNSPELL_CHROME_CLIENT
603 /* parse in the typical fault correcting table */
604 if (strncmp(line,"REP",3) == 0) {
605 if (parse_reptable(line, afflst)) {
606 finishFileMgr(afflst);
607 return 1;
610 #endif
612 /* parse in the input conversion table */
613 if (strncmp(line,"ICONV",5) == 0) {
614 if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {
615 finishFileMgr(afflst);
616 return 1;
620 /* parse in the input conversion table */
621 if (strncmp(line,"OCONV",5) == 0) {
622 if (parse_convtable(line, afflst, &oconvtable, "OCONV")) {
623 finishFileMgr(afflst);
624 return 1;
628 /* parse in the phonetic translation table */
629 if (strncmp(line,"PHONE",5) == 0) {
630 if (parse_phonetable(line, afflst)) {
631 finishFileMgr(afflst);
632 return 1;
636 /* parse in the checkcompoundpattern table */
637 if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
638 if (parse_checkcpdtable(line, afflst)) {
639 finishFileMgr(afflst);
640 return 1;
644 /* parse in the defcompound table */
645 if (strncmp(line,"COMPOUNDRULE",12) == 0) {
646 if (parse_defcpdtable(line, afflst)) {
647 finishFileMgr(afflst);
648 return 1;
652 /* parse in the related character map table */
653 if (strncmp(line,"MAP",3) == 0) {
654 if (parse_maptable(line, afflst)) {
655 finishFileMgr(afflst);
656 return 1;
660 /* parse in the word breakpoints table */
661 if (strncmp(line,"BREAK",5) == 0) {
662 if (parse_breaktable(line, afflst)) {
663 finishFileMgr(afflst);
664 return 1;
668 /* parse in the language for language specific codes */
669 if (strncmp(line,"LANG",4) == 0) {
670 if (parse_string(line, &lang, afflst->getlinenum())) {
671 finishFileMgr(afflst);
672 return 1;
674 langnum = get_lang_num(lang);
677 if (strncmp(line,"VERSION",7) == 0) {
678 for(line = line + 7; *line == ' ' || *line == '\t'; line++);
679 version = mystrdup(line);
682 if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
683 if (parse_num(line, &maxngramsugs, afflst)) {
684 finishFileMgr(afflst);
685 return 1;
689 if (strncmp(line,"ONLYMAXDIFF", 11) == 0)
690 onlymaxdiff = 1;
692 if (strncmp(line,"MAXDIFF",7) == 0) {
693 if (parse_num(line, &maxdiff, afflst)) {
694 finishFileMgr(afflst);
695 return 1;
699 if (strncmp(line,"MAXCPDSUGS",10) == 0) {
700 if (parse_num(line, &maxcpdsugs, afflst)) {
701 finishFileMgr(afflst);
702 return 1;
706 if (strncmp(line,"NOSPLITSUGS",11) == 0) {
707 nosplitsugs=1;
710 if (strncmp(line,"FULLSTRIP",9) == 0) {
711 fullstrip=1;
714 if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
715 sugswithdots=1;
718 /* parse in the flag used by forbidden words */
719 if (strncmp(line,"KEEPCASE",8) == 0) {
720 if (parse_flag(line, &keepcase, afflst)) {
721 finishFileMgr(afflst);
722 return 1;
726 /* parse in the flag used by `forceucase' */
727 if (strncmp(line,"FORCEUCASE",10) == 0) {
728 if (parse_flag(line, &forceucase, afflst)) {
729 finishFileMgr(afflst);
730 return 1;
734 /* parse in the flag used by `warn' */
735 if (strncmp(line,"WARN",4) == 0) {
736 if (parse_flag(line, &warn, afflst)) {
737 finishFileMgr(afflst);
738 return 1;
742 if (strncmp(line,"FORBIDWARN",10) == 0) {
743 forbidwarn=1;
746 /* parse in the flag used by the affix generator */
747 if (strncmp(line,"SUBSTANDARD",11) == 0) {
748 if (parse_flag(line, &substandard, afflst)) {
749 finishFileMgr(afflst);
750 return 1;
754 if (strncmp(line,"CHECKSHARPS",11) == 0) {
755 checksharps=1;
758 #ifndef HUNSPELL_CHROME_CLIENT
759 /* parse this affix: P - prefix, S - suffix */
760 ft = ' ';
761 if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
762 if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
763 if (ft != ' ') {
764 if (dupflags_ini) {
765 memset(dupflags, 0, sizeof(dupflags));
766 dupflags_ini = 0;
768 if (parse_affix(line, ft, afflst, dupflags)) {
769 finishFileMgr(afflst);
770 return 1;
773 #endif
776 finishFileMgr(afflst);
777 // affix trees are sorted now
779 // now we can speed up performance greatly taking advantage of the
780 // relationship between the affixes and the idea of "subsets".
782 // View each prefix as a potential leading subset of another and view
783 // each suffix (reversed) as a potential trailing subset of another.
785 // To illustrate this relationship if we know the prefix "ab" is found in the
786 // word to examine, only prefixes that "ab" is a leading subset of need be examined.
787 // Furthermore is "ab" is not present then none of the prefixes that "ab" is
788 // is a subset need be examined.
789 // The same argument goes for suffix string that are reversed.
791 // Then to top this off why not examine the first char of the word to quickly
792 // limit the set of prefixes to examine (i.e. the prefixes to examine must
793 // be leading supersets of the first character of the word (if they exist)
795 // To take advantage of this "subset" relationship, we need to add two links
796 // from entry. One to take next if the current prefix is found (call it nexteq)
797 // and one to take next if the current prefix is not found (call it nextne).
799 // Since we have built ordered lists, all that remains is to properly initialize
800 // the nextne and nexteq pointers that relate them
802 process_pfx_order();
803 process_sfx_order();
805 /* get encoding for CHECKCOMPOUNDCASE */
806 if (!utf8) {
807 char * enc = get_encoding();
808 csconv = get_current_cs(enc);
809 free(enc);
810 enc = NULL;
812 char expw[MAXLNLEN];
813 if (wordchars) {
814 strcpy(expw, wordchars);
815 free(wordchars);
816 } else *expw = '\0';
818 for (int i = 0; i <= 255; i++) {
819 if ( (csconv[i].cupper != csconv[i].clower) &&
820 (! strchr(expw, (char) i))) {
821 *(expw + strlen(expw) + 1) = '\0';
822 *(expw + strlen(expw)) = (char) i;
826 wordchars = mystrdup(expw);
829 // default BREAK definition
830 if (numbreak == -1) {
831 breaktable = (char **) malloc(sizeof(char *) * 3);
832 if (!breaktable) return 1;
833 breaktable[0] = mystrdup("-");
834 breaktable[1] = mystrdup("^-");
835 breaktable[2] = mystrdup("-$");
836 if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3;
838 return 0;
842 // we want to be able to quickly access prefix information
843 // both by prefix flag, and sorted by prefix string itself
844 // so we need to set up two indexes
846 int AffixMgr::build_pfxtree(PfxEntry* pfxptr)
848 PfxEntry * ptr;
849 PfxEntry * pptr;
850 PfxEntry * ep = pfxptr;
852 // get the right starting points
853 const char * key = ep->getKey();
854 const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
856 // first index by flag which must exist
857 ptr = pFlag[flg];
858 ep->setFlgNxt(ptr);
859 pFlag[flg] = ep;
862 // handle the special case of null affix string
863 if (strlen(key) == 0) {
864 // always inset them at head of list at element 0
865 ptr = pStart[0];
866 ep->setNext(ptr);
867 pStart[0] = ep;
868 return 0;
871 // now handle the normal case
872 ep->setNextEQ(NULL);
873 ep->setNextNE(NULL);
875 unsigned char sp = *((const unsigned char *)key);
876 ptr = pStart[sp];
878 // handle the first insert
879 if (!ptr) {
880 pStart[sp] = ep;
881 return 0;
885 // otherwise use binary tree insertion so that a sorted
886 // list can easily be generated later
887 pptr = NULL;
888 for (;;) {
889 pptr = ptr;
890 if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
891 ptr = ptr->getNextEQ();
892 if (!ptr) {
893 pptr->setNextEQ(ep);
894 break;
896 } else {
897 ptr = ptr->getNextNE();
898 if (!ptr) {
899 pptr->setNextNE(ep);
900 break;
904 return 0;
907 // we want to be able to quickly access suffix information
908 // both by suffix flag, and sorted by the reverse of the
909 // suffix string itself; so we need to set up two indexes
910 int AffixMgr::build_sfxtree(SfxEntry* sfxptr)
912 SfxEntry * ptr;
913 SfxEntry * pptr;
914 SfxEntry * ep = sfxptr;
916 /* get the right starting point */
917 const char * key = ep->getKey();
918 const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
920 // first index by flag which must exist
921 ptr = sFlag[flg];
922 ep->setFlgNxt(ptr);
923 sFlag[flg] = ep;
925 // next index by affix string
927 // handle the special case of null affix string
928 if (strlen(key) == 0) {
929 // always inset them at head of list at element 0
930 ptr = sStart[0];
931 ep->setNext(ptr);
932 sStart[0] = ep;
933 return 0;
936 // now handle the normal case
937 ep->setNextEQ(NULL);
938 ep->setNextNE(NULL);
940 unsigned char sp = *((const unsigned char *)key);
941 ptr = sStart[sp];
943 // handle the first insert
944 if (!ptr) {
945 sStart[sp] = ep;
946 return 0;
949 // otherwise use binary tree insertion so that a sorted
950 // list can easily be generated later
951 pptr = NULL;
952 for (;;) {
953 pptr = ptr;
954 if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
955 ptr = ptr->getNextEQ();
956 if (!ptr) {
957 pptr->setNextEQ(ep);
958 break;
960 } else {
961 ptr = ptr->getNextNE();
962 if (!ptr) {
963 pptr->setNextNE(ep);
964 break;
968 return 0;
971 // convert from binary tree to sorted list
972 int AffixMgr::process_pfx_tree_to_list()
974 for (int i=1; i< SETSIZE; i++) {
975 pStart[i] = process_pfx_in_order(pStart[i],NULL);
977 return 0;
981 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr)
983 if (ptr) {
984 nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
985 ptr->setNext(nptr);
986 nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
988 return nptr;
992 // convert from binary tree to sorted list
993 int AffixMgr:: process_sfx_tree_to_list()
995 for (int i=1; i< SETSIZE; i++) {
996 sStart[i] = process_sfx_in_order(sStart[i],NULL);
998 return 0;
1001 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr)
1003 if (ptr) {
1004 nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
1005 ptr->setNext(nptr);
1006 nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
1008 return nptr;
1012 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
1013 // using the idea of leading subsets this time
1014 int AffixMgr::process_pfx_order()
1016 PfxEntry* ptr;
1018 // loop through each prefix list starting point
1019 for (int i=1; i < SETSIZE; i++) {
1021 ptr = pStart[i];
1023 // look through the remainder of the list
1024 // and find next entry with affix that
1025 // the current one is not a subset of
1026 // mark that as destination for NextNE
1027 // use next in list that you are a subset
1028 // of as NextEQ
1030 for (; ptr != NULL; ptr = ptr->getNext()) {
1032 PfxEntry * nptr = ptr->getNext();
1033 for (; nptr != NULL; nptr = nptr->getNext()) {
1034 if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
1036 ptr->setNextNE(nptr);
1037 ptr->setNextEQ(NULL);
1038 if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
1039 ptr->setNextEQ(ptr->getNext());
1042 // now clean up by adding smart search termination strings:
1043 // if you are already a superset of the previous prefix
1044 // but not a subset of the next, search can end here
1045 // so set NextNE properly
1047 ptr = pStart[i];
1048 for (; ptr != NULL; ptr = ptr->getNext()) {
1049 PfxEntry * nptr = ptr->getNext();
1050 PfxEntry * mptr = NULL;
1051 for (; nptr != NULL; nptr = nptr->getNext()) {
1052 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
1053 mptr = nptr;
1055 if (mptr) mptr->setNextNE(NULL);
1058 return 0;
1061 // initialize the SfxEntry links NextEQ and NextNE to speed searching
1062 // using the idea of leading subsets this time
1063 int AffixMgr::process_sfx_order()
1065 SfxEntry* ptr;
1067 // loop through each prefix list starting point
1068 for (int i=1; i < SETSIZE; i++) {
1070 ptr = sStart[i];
1072 // look through the remainder of the list
1073 // and find next entry with affix that
1074 // the current one is not a subset of
1075 // mark that as destination for NextNE
1076 // use next in list that you are a subset
1077 // of as NextEQ
1079 for (; ptr != NULL; ptr = ptr->getNext()) {
1080 SfxEntry * nptr = ptr->getNext();
1081 for (; nptr != NULL; nptr = nptr->getNext()) {
1082 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
1084 ptr->setNextNE(nptr);
1085 ptr->setNextEQ(NULL);
1086 if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
1087 ptr->setNextEQ(ptr->getNext());
1091 // now clean up by adding smart search termination strings:
1092 // if you are already a superset of the previous suffix
1093 // but not a subset of the next, search can end here
1094 // so set NextNE properly
1096 ptr = sStart[i];
1097 for (; ptr != NULL; ptr = ptr->getNext()) {
1098 SfxEntry * nptr = ptr->getNext();
1099 SfxEntry * mptr = NULL;
1100 for (; nptr != NULL; nptr = nptr->getNext()) {
1101 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
1102 mptr = nptr;
1104 if (mptr) mptr->setNextNE(NULL);
1107 return 0;
1110 // add flags to the result for dictionary debugging
1111 void AffixMgr::debugflag(char * result, unsigned short flag) {
1112 char * st = encode_flag(flag);
1113 mystrcat(result, " ", MAXLNLEN);
1114 mystrcat(result, MORPH_FLAG, MAXLNLEN);
1115 if (st) {
1116 mystrcat(result, st, MAXLNLEN);
1117 free(st);
1121 // calculate the character length of the condition
1122 int AffixMgr::condlen(char * st)
1124 int l = 0;
1125 bool group = false;
1126 for(; *st; st++) {
1127 if (*st == '[') {
1128 group = true;
1129 l++;
1130 } else if (*st == ']') group = false;
1131 else if (!group && (!utf8 ||
1132 (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
1134 return l;
1137 int AffixMgr::encodeit(affentry &entry, char * cs)
1139 if (strcmp(cs,".") != 0) {
1140 entry.numconds = (char) condlen(cs);
1141 strncpy(entry.c.conds, cs, MAXCONDLEN);
1142 // long condition (end of conds padded by strncpy)
1143 if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
1144 entry.opts += aeLONGCOND;
1145 entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
1146 if (!entry.c.l.conds2) return 1;
1148 } else {
1149 entry.numconds = 0;
1150 entry.c.conds[0] = '\0';
1152 return 0;
1155 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
1156 inline int AffixMgr::isSubset(const char * s1, const char * s2)
1158 while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1159 s1++;
1160 s2++;
1162 return (*s1 == '\0');
1166 // check word for prefixes
1167 struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
1168 const FLAG needflag)
1170 struct hentry * rv= NULL;
1172 pfx = NULL;
1173 pfxappnd = NULL;
1174 sfxappnd = NULL;
1176 // first handle the special case of 0 length prefixes
1177 PfxEntry * pe = pStart[0];
1178 while (pe) {
1179 if (
1180 // fogemorpheme
1181 ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
1182 (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1183 // permit prefixes in compounds
1184 ((in_compound != IN_CPD_END) || (pe->getCont() &&
1185 (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
1187 // check prefix
1188 rv = pe->checkword(word, len, in_compound, needflag);
1189 if (rv) {
1190 pfx=pe; // BUG: pfx not stateless
1191 return rv;
1194 pe = pe->getNext();
1197 // now handle the general case
1198 unsigned char sp = *((const unsigned char *)word);
1199 PfxEntry * pptr = pStart[sp];
1201 while (pptr) {
1202 if (isSubset(pptr->getKey(),word)) {
1203 if (
1204 // fogemorpheme
1205 ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
1206 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1207 // permit prefixes in compounds
1208 ((in_compound != IN_CPD_END) || (pptr->getCont() &&
1209 (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
1211 // check prefix
1212 rv = pptr->checkword(word, len, in_compound, needflag);
1213 if (rv) {
1214 pfx=pptr; // BUG: pfx not stateless
1215 return rv;
1218 pptr = pptr->getNextEQ();
1219 } else {
1220 pptr = pptr->getNextNE();
1224 return NULL;
1227 // check word for prefixes
1228 struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
1229 char in_compound, const FLAG needflag)
1231 struct hentry * rv= NULL;
1233 pfx = NULL;
1234 sfxappnd = NULL;
1236 // first handle the special case of 0 length prefixes
1237 PfxEntry * pe = pStart[0];
1239 while (pe) {
1240 rv = pe->check_twosfx(word, len, in_compound, needflag);
1241 if (rv) return rv;
1242 pe = pe->getNext();
1245 // now handle the general case
1246 unsigned char sp = *((const unsigned char *)word);
1247 PfxEntry * pptr = pStart[sp];
1249 while (pptr) {
1250 if (isSubset(pptr->getKey(),word)) {
1251 rv = pptr->check_twosfx(word, len, in_compound, needflag);
1252 if (rv) {
1253 pfx = pptr;
1254 return rv;
1256 pptr = pptr->getNextEQ();
1257 } else {
1258 pptr = pptr->getNextNE();
1262 return NULL;
1265 // check word for prefixes
1266 char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
1267 const FLAG needflag)
1269 char * st;
1271 char result[MAXLNLEN];
1272 result[0] = '\0';
1274 pfx = NULL;
1275 sfxappnd = NULL;
1277 // first handle the special case of 0 length prefixes
1278 PfxEntry * pe = pStart[0];
1279 while (pe) {
1280 st = pe->check_morph(word,len,in_compound, needflag);
1281 if (st) {
1282 mystrcat(result, st, MAXLNLEN);
1283 free(st);
1285 // if (rv) return rv;
1286 pe = pe->getNext();
1289 // now handle the general case
1290 unsigned char sp = *((const unsigned char *)word);
1291 PfxEntry * pptr = pStart[sp];
1293 while (pptr) {
1294 if (isSubset(pptr->getKey(),word)) {
1295 st = pptr->check_morph(word,len,in_compound, needflag);
1296 if (st) {
1297 // fogemorpheme
1298 if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
1299 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
1300 mystrcat(result, st, MAXLNLEN);
1301 pfx = pptr;
1303 free(st);
1305 pptr = pptr->getNextEQ();
1306 } else {
1307 pptr = pptr->getNextNE();
1311 if (*result) return mystrdup(result);
1312 return NULL;
1316 // check word for prefixes
1317 char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
1318 char in_compound, const FLAG needflag)
1320 char * st;
1322 char result[MAXLNLEN];
1323 result[0] = '\0';
1325 pfx = NULL;
1326 sfxappnd = NULL;
1328 // first handle the special case of 0 length prefixes
1329 PfxEntry * pe = pStart[0];
1330 while (pe) {
1331 st = pe->check_twosfx_morph(word,len,in_compound, needflag);
1332 if (st) {
1333 mystrcat(result, st, MAXLNLEN);
1334 free(st);
1336 pe = pe->getNext();
1339 // now handle the general case
1340 unsigned char sp = *((const unsigned char *)word);
1341 PfxEntry * pptr = pStart[sp];
1343 while (pptr) {
1344 if (isSubset(pptr->getKey(),word)) {
1345 st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1346 if (st) {
1347 mystrcat(result, st, MAXLNLEN);
1348 free(st);
1349 pfx = pptr;
1351 pptr = pptr->getNextEQ();
1352 } else {
1353 pptr = pptr->getNextNE();
1357 if (*result) return mystrdup(result);
1358 return NULL;
1361 // Is word a non compound with a REP substitution (see checkcompoundrep)?
1362 int AffixMgr::cpdrep_check(const char * word, int wl)
1364 char candidate[MAXLNLEN];
1365 const char * r;
1366 int lenr, lenp;
1368 #ifdef HUNSPELL_CHROME_CLIENT
1369 const char *pattern, *pattern2;
1370 hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator();
1371 while (iterator.GetNext(&pattern, &pattern2)) {
1372 r = word;
1373 lenr = strlen(pattern2);
1374 lenp = strlen(pattern);
1376 // search every occurence of the pattern in the word
1377 while ((r=strstr(r, pattern)) != NULL) {
1378 strcpy(candidate, word);
1379 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
1380 strcpy(candidate+(r-word), pattern2);
1381 strcpy(candidate+(r-word)+lenr, r+lenp);
1382 if (candidate_check(candidate,strlen(candidate))) return 1;
1383 r++; // search for the next letter
1387 #else
1388 if ((wl < 2) || !numrep) return 0;
1390 for (int i=0; i < numrep; i++ ) {
1391 r = word;
1392 lenr = strlen(reptable[i].pattern2);
1393 lenp = strlen(reptable[i].pattern);
1394 // search every occurence of the pattern in the word
1395 while ((r=strstr(r, reptable[i].pattern)) != NULL) {
1396 strcpy(candidate, word);
1397 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
1398 strcpy(candidate+(r-word),reptable[i].pattern2);
1399 strcpy(candidate+(r-word)+lenr, r+lenp);
1400 if (candidate_check(candidate,strlen(candidate))) return 1;
1401 r++; // search for the next letter
1404 #endif
1405 return 0;
1408 // forbid compoundings when there are special patterns at word bound
1409 int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char /*affixed*/)
1411 int len;
1412 for (int i = 0; i < numcheckcpd; i++) {
1413 if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
1414 (!r1 || !checkcpdtable[i].cond ||
1415 (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
1416 (!r2 || !checkcpdtable[i].cond2 ||
1417 (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
1418 // zero length pattern => only TESTAFF
1419 // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1420 (!*(checkcpdtable[i].pattern) || (
1421 (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
1422 (*(checkcpdtable[i].pattern)!='0' && ((len = strlen(checkcpdtable[i].pattern)) != 0) &&
1423 strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) {
1424 return 1;
1427 return 0;
1430 // forbid compounding with neighbouring upper and lower case characters at word bounds
1431 int AffixMgr::cpdcase_check(const char * word, int pos)
1433 if (utf8) {
1434 w_char u, w;
1435 const char * p;
1436 u8_u16(&u, 1, word + pos);
1437 for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
1438 u8_u16(&w, 1, p);
1439 unsigned short a = (u.h << 8) + u.l;
1440 unsigned short b = (w.h << 8) + w.l;
1441 if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&
1442 (a != '-') && (b != '-')) return 1;
1443 } else {
1444 unsigned char a = *(word + pos - 1);
1445 unsigned char b = *(word + pos);
1446 if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
1448 return 0;
1451 // check compound patterns
1452 int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
1454 signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
1455 signed short btwp[MAXWORDLEN]; // word positions for metacharacters
1456 int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
1457 short bt = 0;
1458 int i, j;
1459 int ok;
1460 int w = 0;
1462 if (!*words) {
1463 w = 1;
1464 *words = def;
1467 if (!*words) {
1468 return 0;
1471 (*words)[wnum] = rv;
1473 // has the last word COMPOUNDRULE flag?
1474 if (rv->alen == 0) {
1475 (*words)[wnum] = NULL;
1476 if (w) *words = NULL;
1477 return 0;
1479 ok = 0;
1480 for (i = 0; i < numdefcpd; i++) {
1481 for (j = 0; j < defcpdtable[i].len; j++) {
1482 if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
1483 TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1;
1486 if (ok == 0) {
1487 (*words)[wnum] = NULL;
1488 if (w) *words = NULL;
1489 return 0;
1492 for (i = 0; i < numdefcpd; i++) {
1493 signed short pp = 0; // pattern position
1494 signed short wp = 0; // "words" position
1495 int ok2;
1496 ok = 1;
1497 ok2 = 1;
1498 do {
1499 while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
1500 if (((pp+1) < defcpdtable[i].len) &&
1501 ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
1502 int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
1503 ok2 = 1;
1504 pp+=2;
1505 btpp[bt] = pp;
1506 btwp[bt] = wp;
1507 while (wp <= wend) {
1508 if (!(*words)[wp]->alen ||
1509 !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
1510 ok2 = 0;
1511 break;
1513 wp++;
1515 if (wp <= wnum) ok2 = 0;
1516 btnum[bt] = wp - btwp[bt];
1517 if (btnum[bt] > 0) bt++;
1518 if (ok2) break;
1519 } else {
1520 ok2 = 1;
1521 if (!(*words)[wp] || !(*words)[wp]->alen ||
1522 !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
1523 ok = 0;
1524 break;
1526 pp++;
1527 wp++;
1528 if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
1531 if (ok && ok2) {
1532 int r = pp;
1533 while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
1534 ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
1535 if (defcpdtable[i].len <= r) return 1;
1537 // backtrack
1538 if (bt) do {
1539 ok = 1;
1540 btnum[bt - 1]--;
1541 pp = btpp[bt - 1];
1542 wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
1543 } while ((btnum[bt - 1] < 0) && --bt);
1544 } while (bt);
1546 if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
1548 // check zero ending
1549 while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
1550 ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
1551 if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
1553 (*words)[wnum] = NULL;
1554 if (w) *words = NULL;
1555 return 0;
1558 inline int AffixMgr::candidate_check(const char * word, int len)
1560 struct hentry * rv=NULL;
1562 rv = lookup(word);
1563 if (rv) return 1;
1565 // rv = prefix_check(word,len,1);
1566 // if (rv) return 1;
1568 rv = affix_check(word,len);
1569 if (rv) return 1;
1570 return 0;
1573 // calculate number of syllable for compound-checking
1574 short AffixMgr::get_syllable(const char * word, int wlen)
1576 if (cpdmaxsyllable==0) return 0;
1578 short num=0;
1580 if (!utf8) {
1581 for (int i=0; i<wlen; i++) {
1582 if (strchr(cpdvowels, word[i])) num++;
1584 } else if (cpdvowels_utf16) {
1585 w_char w[MAXWORDUTF8LEN];
1586 int i = u8_u16(w, MAXWORDUTF8LEN, word);
1587 for (; i > 0; i--) {
1588 if (flag_bsearch((unsigned short *) cpdvowels_utf16,
1589 ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
1592 return num;
1595 void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) {
1596 if (utf8) {
1597 int i;
1598 for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
1599 for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++);
1601 for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
1602 for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--);
1604 } else {
1605 *cmin = cpdmin;
1606 *cmax = len - cpdmin + 1;
1611 // check if compound word is correctly spelled
1612 // hu_mov_rule = spec. Hungarian rule (XXX)
// Tries the split positions i starting at cmin and below cmax (both computed
// by setcminmax). For each position the text before i is checked as the
// first compound member and the remainder as a root, as an affixed form or,
// through a recursive call, as a further compound.
// Returns the entry found for the first member (rv_first) when a split is
// accepted and NULL otherwise.
// wordnum and numsyllable are the running counts handed on to the recursive
// call; words/wnum hold the members passed to defcpd_check().
// is_sug: when set, entries carrying the nosuggest flag are rejected.
// info: when non-NULL, *info is tested against SPELL_ORIGCAP in the
// forceucase checks.
1613 struct hentry * AffixMgr::compound_check(const char * word, int len,
1614 short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
1615 char hu_mov_rule = 0, char is_sug = 0, int * info = NULL)
1617 int i;
1618 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1619 struct hentry * rv = NULL;
1620 struct hentry * rv_first;
1621 struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
1622 char st [MAXWORDUTF8LEN + 4];
1623 char ch = '\0';
1624 int cmin;
1625 int cmax;
1626 int striple = 0;
1627 int scpd = 0;
1628 int soldi = 0;
1629 int oldcmin = 0;
1630 int oldcmax = 0;
1631 int oldlen = 0;
1632 int checkedstriple = 0;
1633 int onlycpdrule;
1634 char affixed = 0;
1635 hentry ** oldwords = words;
1637 int checked_prefix;
// cmin/cmax bound the split positions tried by the loop below
1639 setcminmax(&cmin, &cmax, word, len);
// st is a writable working copy of word
1641 strcpy(st, word);
1643 for (i = cmin; i < cmax; i++) {
1644 // go to end of the UTF-8 character
1645 if (utf8) {
1646 for (; (st[i] & 0xc0) == 0x80; i++);
1647 if (i >= cmax) return NULL;
1650 words = oldwords;
1651 onlycpdrule = (words) ? 1 : 0;
1653 do { // onlycpdrule loop
1655 oldnumsyllable = numsyllable;
1656 oldwordnum = wordnum;
1657 checked_prefix = 0;
1660 do { // simplified checkcompoundpattern loop
1662 if (scpd > 0) {
1663 for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 ||
1664 strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++);
1666 if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop
1667 strcpy(st + i, checkcpdtable[scpd-1].pattern);
1668 soldi = i;
1669 i += strlen(checkcpdtable[scpd-1].pattern);
1670 strcpy(st + i, checkcpdtable[scpd-1].pattern2);
1671 strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3));
1673 oldlen = len;
1674 len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3);
1675 oldcmin = cmin;
1676 oldcmax = cmax;
1677 setcminmax(&cmin, &cmax, st, len);
1679 cmax = len - cpdmin + 1;
// cut st at the split position; ch keeps the overwritten character
1682 ch = st[i];
1683 st[i] = '\0';
1685 sfx = NULL;
1686 pfx = NULL;
1688 // FIRST WORD
1690 affixed = 1;
1691 rv = lookup(st); // perhaps without prefix
1693 // search homonym with compound flag
1694 while ((rv) && !hu_mov_rule &&
1695 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1696 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1697 (compoundbegin && !wordnum && !onlycpdrule &&
1698 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1699 (compoundmiddle && wordnum && !words && !onlycpdrule &&
1700 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1701 (numdefcpd && onlycpdrule &&
1702 ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
1703 (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) ||
1704 (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL &&
1705 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)))
1707 rv = rv->next_homonym;
1710 if (rv) affixed = 0;
1712 if (!rv) {
1713 if (onlycpdrule) break;
1714 if (compoundflag &&
1715 !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
1716 if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
1717 FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1718 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) && !hu_mov_rule &&
1719 sfx->getCont() &&
1720 ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,
1721 sfx->getContLen())) || (compoundend &&
1722 TESTAFF(sfx->getCont(), compoundend,
1723 sfx->getContLen())))) {
1724 rv = NULL;
1728 if (rv ||
1729 (((wordnum == 0) && compoundbegin &&
1730 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1731 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundbegin))) || // twofold suffixes + compound
1732 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
1733 ((wordnum > 0) && compoundmiddle &&
1734 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1735 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundmiddle))) || // twofold suffixes + compound
1736 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
1737 ) checked_prefix = 1;
1738 // else check forbiddenwords and needaffix
1739 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1740 TESTAFF(rv->astr, needaffix, rv->alen) ||
1741 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1742 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
1743 )) {
1744 st[i] = ch;
1745 //continue;
1746 break;
1749 // check non_compound flag in suffix and prefix
1750 if ((rv) && !hu_mov_rule &&
1751 ((pfx && pfx->getCont() &&
1752 TESTAFF(pfx->getCont(), compoundforbidflag,
1753 pfx->getContLen())) ||
1754 (sfx && sfx->getCont() &&
1755 TESTAFF(sfx->getCont(), compoundforbidflag,
1756 sfx->getContLen())))) {
1757 rv = NULL;
1760 // check compoundend flag in suffix and prefix
1761 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1762 ((pfx && pfx->getCont() &&
1763 TESTAFF(pfx->getCont(), compoundend,
1764 pfx->getContLen())) ||
1765 (sfx && sfx->getCont() &&
1766 TESTAFF(sfx->getCont(), compoundend,
1767 sfx->getContLen())))) {
1768 rv = NULL;
1771 // check compoundmiddle flag in suffix and prefix
1772 if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
1773 ((pfx && pfx->getCont() &&
1774 TESTAFF(pfx->getCont(), compoundmiddle,
1775 pfx->getContLen())) ||
1776 (sfx && sfx->getCont() &&
1777 TESTAFF(sfx->getCont(), compoundmiddle,
1778 sfx->getContLen())))) {
1779 rv = NULL;
1782 // check forbiddenwords
1783 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1784 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1785 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1786 return NULL;
1789 // increment word number, if the second root has a compoundroot flag
1790 if ((rv) && compoundroot &&
1791 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1792 wordnum++;
1795 // first word is acceptable in compound words?
1796 if (((rv) &&
1797 ( checked_prefix || (words && words[wnum]) ||
1798 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1799 ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1800 ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
1801 // (numdefcpd && )
1803 // LANG_hu section: spec. Hungarian rule
1804 || ((langnum == LANG_hu) && hu_mov_rule && (
1805 TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
1806 TESTAFF(rv->astr, 'G', rv->alen) ||
1807 TESTAFF(rv->astr, 'H', rv->alen)
1810 // END of LANG_hu section
1811 ) &&
1813 // test CHECKCOMPOUNDPATTERN conditions
1814 scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL ||
1815 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)
1817 && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters
1818 (word[i-1]==word[i]) && (
1819 ((i>1) && (word[i-1]==word[i-2])) ||
1820 ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
1822 ) ||
1824 checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i)
1827 // LANG_hu section: spec. Hungarian rule
1828 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
1829 (sfx && sfx->getCont() && ( // XXX hardwired Hungarian dic. codes
1830 TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||
1831 TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())
1835 ) { // first word is ok condition
1837 // LANG_hu section: spec. Hungarian rule
1838 if (langnum == LANG_hu) {
1839 // calculate syllable number of the word
1840 numsyllable += get_syllable(st, i);
1841 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1842 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
1844 // END of LANG_hu section
1846 // NEXT WORD(S)
1847 rv_first = rv;
// put back the character that was overwritten at the split position
1848 st[i] = ch;
1850 do { // striple loop
1852 // check simplifiedtriple
1853 if (simplifiedtriple) {
1854 if (striple) {
1855 checkedstriple = 1;
1856 i--; // check "fahrt" instead of "ahrt" in "Schiffahrt"
1857 } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1;
1860 rv = lookup((st+i)); // perhaps without prefix
1862 // search homonym with compound flag
1863 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1864 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1865 (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
1866 (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) ||
1867 (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL &&
1868 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
1869 )) {
1870 rv = rv->next_homonym;
1873 // check FORCEUCASE
1874 if (rv && forceucase && (rv) &&
1875 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;
1877 if (rv && words && words[wnum + 1]) return rv_first;
1879 oldnumsyllable2 = numsyllable;
1880 oldwordnum2 = wordnum;
1883 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
1884 if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
1885 numsyllable--;
1887 // END of LANG_hu section
1889 // increment word number, if the second root has a compoundroot flag
1890 if ((rv) && (compoundroot) &&
1891 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1892 wordnum++;
1895 // check forbiddenwords
1896 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1897 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1898 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1900 // second word is acceptable, as a root?
1901 // hungarian conventions: compounding is acceptable,
1902 // when compound forms consist of 2 words, or if more,
1903 // then the syllable number of root words must be 6, or lesser.
1905 if ((rv) && (
1906 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1907 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
1909 && (
1910 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
1911 ((cpdmaxsyllable!=0) &&
1912 (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable))
1913 ) &&
1915 // test CHECKCOMPOUNDPATTERN
1916 !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv, 0)
1917 ) &&
1919 (!checkcompounddup || (rv != rv_first))
1921 // test CHECKCOMPOUNDPATTERN conditions
1922 && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
1923 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
1926 // forbid compound word, if it is a non compound word with typical fault
1927 if (checkcompoundrep && cpdrep_check(word,len)) return NULL;
1928 return rv_first;
1931 numsyllable = oldnumsyllable2;
1932 wordnum = oldwordnum2;
1934 // perhaps second word has prefix or/and suffix
1935 sfx = NULL;
1936 sfxflag = FLAG_NULL;
1937 rv = (compoundflag && !onlycpdrule) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;
1938 if (!rv && compoundend && !onlycpdrule) {
1939 sfx = NULL;
1940 pfx = NULL;
1941 rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);
1944 if (!rv && numdefcpd && words) {
1945 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
1946 if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first;
1947 rv = NULL;
1950 // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1951 if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
1952 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL;
1954 // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1955 if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) rv = NULL;
1957 // check non_compound flag in suffix and prefix
1958 if ((rv) &&
1959 ((pfx && pfx->getCont() &&
1960 TESTAFF(pfx->getCont(), compoundforbidflag,
1961 pfx->getContLen())) ||
1962 (sfx && sfx->getCont() &&
1963 TESTAFF(sfx->getCont(), compoundforbidflag,
1964 sfx->getContLen())))) {
1965 rv = NULL;
1968 // check FORCEUCASE
1969 if (rv && forceucase && (rv) &&
1970 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;
1972 // check forbiddenwords
1973 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1974 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1975 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1977 // pfxappnd = prefix of word+i, or NULL
1978 // calculate syllable number of prefix.
1979 // hungarian convention: when syllable number of prefix is more,
1980 // than 1, the prefix+word counts as two words.
1982 if (langnum == LANG_hu) {
1983 // calculate syllable number of the word
1984 numsyllable += get_syllable(word + i, strlen(word + i));
1986 // - affix syllable num.
1987 // XXX only second suffix (inflections, not derivations)
1988 if (sfxappnd) {
1989 char * tmp = myrevstrdup(sfxappnd);
1990 numsyllable -= get_syllable(tmp, strlen(tmp));
1991 free(tmp);
1994 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1995 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
1997 // increment syllable num, if last word has a SYLLABLENUM flag
1998 // and the suffix is beginning `s'
2000 if (cpdsyllablenum) {
2001 switch (sfxflag) {
2002 case 'c': { numsyllable+=2; break; }
2003 case 'J': { numsyllable += 1; break; }
2004 case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
2009 // increment word number, if the second word has a compoundroot flag
2010 if ((rv) && (compoundroot) &&
2011 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2012 wordnum++;
2015 // second word is acceptable, as a word with prefix or/and suffix?
2016 // hungarian conventions: compounding is acceptable,
2017 // when compound forms consist 2 word, otherwise
2018 // the syllable number of root words is 6, or lesser.
2019 if ((rv) &&
2021 ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2022 ((cpdmaxsyllable != 0) &&
2023 (numsyllable <= cpdmaxsyllable))
2025 && (
2026 (!checkcompounddup || (rv != rv_first))
2027 )) {
2028 // forbid compound word, if it is a non compound word with typical fault
2029 if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
2030 return rv_first;
2033 numsyllable = oldnumsyllable2;
2034 wordnum = oldwordnum2;
2036 // perhaps second word is a compound word (recursive call)
2037 if (wordnum < maxwordnum) {
2038 rv = compound_check((st+i),strlen(st+i), wordnum+1,
2039 numsyllable, maxwordnum, wnum + 1, words, 0, is_sug, info);
2041 if (rv && numcheckcpd && ((scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) ||
2042 (scpd != 0 && !cpdpat_check(word, i, rv_first, rv, affixed)))) rv = NULL;
2043 } else {
2044 rv=NULL;
2046 if (rv) {
2047 // forbid compound word, if it is a non compound word with typical fault
2048 if (checkcompoundrep || forbiddenword) {
2049 struct hentry * rv2 = NULL;
2051 if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
2053 // check first part
2054 if (strncmp(rv->word, word + i, rv->blen) == 0) {
// temporarily end st after the first rv->blen bytes of the second part
2055 char r = *(st + i + rv->blen);
2056 *(st + i + rv->blen) = '\0';
2058 if (checkcompoundrep && cpdrep_check(st, i + rv->blen)) {
2059 *(st + i + rv->blen) = r;
2060 continue;
2063 if (forbiddenword) {
2064 rv2 = lookup(word);
2065 if (!rv2) rv2 = affix_check(word, len);
2066 if (rv2 && rv2->astr && TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
2067 (strncmp(rv2->word, st, i + rv->blen) == 0)) {
2068 return NULL;
2071 *(st + i + rv->blen) = r;
2074 return rv_first;
2076 } while (striple && !checkedstriple); // end of striple loop
2078 if (checkedstriple) {
2079 i++;
2080 checkedstriple = 0;
2081 striple = 0;
2084 } // first word is ok condition
// undo the changes made for a simplified CHECKCOMPOUNDPATTERN attempt
2086 if (soldi != 0) {
2087 i = soldi;
2088 soldi = 0;
2089 len = oldlen;
2090 cmin = oldcmin;
2091 cmax = oldcmax;
2093 scpd++;
2096 } while (!onlycpdrule && simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop
2098 scpd = 0;
2099 wordnum = oldwordnum;
2100 numsyllable = oldnumsyllable;
2102 if (soldi != 0) {
2103 i = soldi;
2104 strcpy(st, word); // XXX add more optim.
2105 soldi = 0;
2106 } else st[i] = ch;
2108 } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop
2112 return NULL;
2115 // check if compound word is correctly spelled
2116 // hu_mov_rule = spec. Hungarian rule (XXX)
2117 int AffixMgr::compound_check_morph(const char * word, int len,
2118 short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
2119 char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)
2121 int i;
2122 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
2123 int ok = 0;
2125 struct hentry * rv = NULL;
2126 struct hentry * rv_first;
2127 struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
2128 char st [MAXWORDUTF8LEN + 4];
2129 char ch;
2131 int checked_prefix;
2132 char presult[MAXLNLEN];
2134 int cmin;
2135 int cmax;
2137 int onlycpdrule;
2138 char affixed = 0;
2139 hentry ** oldwords = words;
2141 setcminmax(&cmin, &cmax, word, len);
2143 strcpy(st, word);
2145 for (i = cmin; i < cmax; i++) {
2146 oldnumsyllable = numsyllable;
2147 oldwordnum = wordnum;
2148 checked_prefix = 0;
2150 // go to end of the UTF-8 character
2151 if (utf8) {
2152 for (; (st[i] & 0xc0) == 0x80; i++);
2153 if (i >= cmax) return 0;
2156 words = oldwords;
2157 onlycpdrule = (words) ? 1 : 0;
2159 do { // onlycpdrule loop
2161 oldnumsyllable = numsyllable;
2162 oldwordnum = wordnum;
2163 checked_prefix = 0;
2165 ch = st[i];
2166 st[i] = '\0';
2167 sfx = NULL;
2169 // FIRST WORD
2171 affixed = 1;
2173 *presult = '\0';
2174 if (partresult) mystrcat(presult, partresult, MAXLNLEN);
2176 rv = lookup(st); // perhaps without prefix
2178 // search homonym with compound flag
2179 while ((rv) && !hu_mov_rule &&
2180 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2181 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2182 (compoundbegin && !wordnum && !onlycpdrule &&
2183 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2184 (compoundmiddle && wordnum && !words && !onlycpdrule &&
2185 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
2186 (numdefcpd && onlycpdrule &&
2187 ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
2188 (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
2189 ))) {
2190 rv = rv->next_homonym;
2193 if (rv) affixed = 0;
2195 if (rv) {
2196 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st);
2197 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2198 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st);
2200 // store the pointer of the hash entry
2201 // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv);
2202 if (HENTRY_DATA(rv)) {
2203 sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv));
2207 if (!rv) {
2208 if (onlycpdrule) break;
2209 if (compoundflag &&
2210 !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
2211 if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
2212 FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2213 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) && !hu_mov_rule &&
2214 sfx->getCont() &&
2215 ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,
2216 sfx->getContLen())) || (compoundend &&
2217 TESTAFF(sfx->getCont(), compoundend,
2218 sfx->getContLen())))) {
2219 rv = NULL;
2223 if (rv ||
2224 (((wordnum == 0) && compoundbegin &&
2225 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2226 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundbegin))) || // twofold suffix+compound
2227 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
2228 ((wordnum > 0) && compoundmiddle &&
2229 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2230 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundmiddle))) || // twofold suffix+compound
2231 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
2233 // char * p = prefix_check_morph(st, i, 0, compound);
2234 char * p = NULL;
2235 if (compoundflag) p = affix_check_morph(st, i, compoundflag);
2236 if (!p || (*p == '\0')) {
2237 if (p) free(p);
2238 p = NULL;
2239 if ((wordnum == 0) && compoundbegin) {
2240 p = affix_check_morph(st, i, compoundbegin);
2241 } else if ((wordnum > 0) && compoundmiddle) {
2242 p = affix_check_morph(st, i, compoundmiddle);
2245 if (p && (*p != '\0')) {
2246 sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD,
2247 MORPH_PART, st, line_uniq_app(&p, MSEP_REC));
2249 if (p) free(p);
2250 checked_prefix = 1;
2252 // else check forbiddenwords
2253 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2254 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2255 TESTAFF(rv->astr, needaffix, rv->alen))) {
2256 st[i] = ch;
2257 continue;
2260 // check non_compound flag in suffix and prefix
2261 if ((rv) && !hu_mov_rule &&
2262 ((pfx && pfx->getCont() &&
2263 TESTAFF(pfx->getCont(), compoundforbidflag,
2264 pfx->getContLen())) ||
2265 (sfx && sfx->getCont() &&
2266 TESTAFF(sfx->getCont(), compoundforbidflag,
2267 sfx->getContLen())))) {
2268 continue;
2271 // check compoundend flag in suffix and prefix
2272 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2273 ((pfx && pfx->getCont() &&
2274 TESTAFF(pfx->getCont(), compoundend,
2275 pfx->getContLen())) ||
2276 (sfx && sfx->getCont() &&
2277 TESTAFF(sfx->getCont(), compoundend,
2278 sfx->getContLen())))) {
2279 continue;
2282 // check compoundmiddle flag in suffix and prefix
2283 if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
2284 ((pfx && pfx->getCont() &&
2285 TESTAFF(pfx->getCont(), compoundmiddle,
2286 pfx->getContLen())) ||
2287 (sfx && sfx->getCont() &&
2288 TESTAFF(sfx->getCont(), compoundmiddle,
2289 sfx->getContLen())))) {
2290 rv = NULL;
2293 // check forbiddenwords
2294 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen)
2295 || TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) continue;
2297 // increment word number, if the second root has a compoundroot flag
2298 if ((rv) && (compoundroot) &&
2299 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2300 wordnum++;
2303 // first word is acceptable in compound words?
2304 if (((rv) &&
2305 ( checked_prefix || (words && words[wnum]) ||
2306 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2307 ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2308 ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))
2309 // LANG_hu section: spec. Hungarian rule
2310 || ((langnum == LANG_hu) && // hu_mov_rule
2311 hu_mov_rule && (
2312 TESTAFF(rv->astr, 'F', rv->alen) ||
2313 TESTAFF(rv->astr, 'G', rv->alen) ||
2314 TESTAFF(rv->astr, 'H', rv->alen)
2317 // END of LANG_hu section
2319 && ! (( checkcompoundtriple && !words && // test triple letters
2320 (word[i-1]==word[i]) && (
2321 ((i>1) && (word[i-1]==word[i-2])) ||
2322 ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
2324 ) ||
2326 // test CHECKCOMPOUNDPATTERN
2327 numcheckcpd && !words && cpdpat_check(word, i, rv, NULL, affixed)
2328 ) ||
2330 checkcompoundcase && !words && cpdcase_check(word, i)
2333 // LANG_hu section: spec. Hungarian rule
2334 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
2335 (sfx && sfx->getCont() && (
2336 TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||
2337 TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())
2341 // END of LANG_hu section
2344 // LANG_hu section: spec. Hungarian rule
2345 if (langnum == LANG_hu) {
2346 // calculate syllable number of the word
2347 numsyllable += get_syllable(st, i);
2349 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2350 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
2352 // END of LANG_hu section
2354 // NEXT WORD(S)
2355 rv_first = rv;
2356 rv = lookup((word+i)); // perhaps without prefix
2358 // search homonym with compound flag
2359 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2360 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2361 (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
2362 (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
2363 rv = rv->next_homonym;
2366 if (rv && words && words[wnum + 1]) {
2367 mystrcat(*result, presult, MAXLNLEN);
2368 mystrcat(*result, " ", MAXLNLEN);
2369 mystrcat(*result, MORPH_PART, MAXLNLEN);
2370 mystrcat(*result, word+i, MAXLNLEN);
2371 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2372 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2373 mystrcat(*result, " ", MAXLNLEN);
2374 mystrcat(*result, MORPH_STEM, MAXLNLEN);
2375 mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);
2377 // store the pointer of the hash entry
2378 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2379 if (!complexprefixes && HENTRY_DATA(rv)) {
2380 mystrcat(*result, " ", MAXLNLEN);
2381 mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2383 mystrcat(*result, "\n", MAXLNLEN);
2384 ok = 1;
2385 return 0;
2388 oldnumsyllable2 = numsyllable;
2389 oldwordnum2 = wordnum;
2391 // LANG_hu section: spec. Hungarian rule
2392 if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
2393 numsyllable--;
2395 // END of LANG_hu section
2396 // increment word number, if the second root has a compoundroot flag
2397 if ((rv) && (compoundroot) &&
2398 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2399 wordnum++;
2402 // check forbiddenwords
2403 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2404 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
2405 st[i] = ch;
2406 continue;
2409 // second word is acceptable, as a root?
2410 // hungarian conventions: compounding is acceptable,
2411 // when compound forms consist of 2 words, or if more,
2412 // then the syllable number of root words must be 6, or lesser.
2413 if ((rv) && (
2414 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2415 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
2417 && (
2418 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2419 ((cpdmaxsyllable!=0) &&
2420 (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable))
2422 && (
2423 (!checkcompounddup || (rv != rv_first))
2427 // bad compound word
2428 mystrcat(*result, presult, MAXLNLEN);
2429 mystrcat(*result, " ", MAXLNLEN);
2430 mystrcat(*result, MORPH_PART, MAXLNLEN);
2431 mystrcat(*result, word+i, MAXLNLEN);
2433 if (HENTRY_DATA(rv)) {
2434 if (complexprefixes) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2435 if (! HENTRY_FIND(rv, MORPH_STEM)) {
2436 mystrcat(*result, " ", MAXLNLEN);
2437 mystrcat(*result, MORPH_STEM, MAXLNLEN);
2438 mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);
2440 // store the pointer of the hash entry
2441 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2442 if (!complexprefixes) {
2443 mystrcat(*result, " ", MAXLNLEN);
2444 mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2447 mystrcat(*result, "\n", MAXLNLEN);
2448 ok = 1;
2451 numsyllable = oldnumsyllable2 ;
2452 wordnum = oldwordnum2;
2454 // perhaps second word has prefix or/and suffix
2455 sfx = NULL;
2456 sfxflag = FLAG_NULL;
2458 if (compoundflag && !onlycpdrule) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;
2460 if (!rv && compoundend && !onlycpdrule) {
2461 sfx = NULL;
2462 pfx = NULL;
2463 rv = affix_check((word+i),strlen(word+i), compoundend);
2466 if (!rv && numdefcpd && words) {
2467 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
2468 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
2469 char * m = NULL;
2470 if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2471 if ((!m || *m == '\0') && compoundend) {
2472 if (m) free(m);
2473 m = affix_check_morph((word+i),strlen(word+i), compoundend);
2475 mystrcat(*result, presult, MAXLNLEN);
2476 if (m || (*m != '\0')) {
2477 sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
2478 MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
2480 if (m) free(m);
2481 mystrcat(*result, "\n", MAXLNLEN);
2482 ok = 1;
2486 // check non_compound flag in suffix and prefix
2487 if ((rv) &&
2488 ((pfx && pfx->getCont() &&
2489 TESTAFF(pfx->getCont(), compoundforbidflag,
2490 pfx->getContLen())) ||
2491 (sfx && sfx->getCont() &&
2492 TESTAFF(sfx->getCont(), compoundforbidflag,
2493 sfx->getContLen())))) {
2494 rv = NULL;
2497 // check forbiddenwords
2498 if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen) ||
2499 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))
2500 && (! TESTAFF(rv->astr, needaffix, rv->alen))) {
2501 st[i] = ch;
2502 continue;
2505 if (langnum == LANG_hu) {
2506 // calculate syllable number of the word
2507 numsyllable += get_syllable(word + i, strlen(word + i));
2509 // - affix syllable num.
2510 // XXX only second suffix (inflections, not derivations)
2511 if (sfxappnd) {
2512 char * tmp = myrevstrdup(sfxappnd);
2513 numsyllable -= get_syllable(tmp, strlen(tmp));
2514 free(tmp);
2517 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2518 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
2520 // increment syllable num, if last word has a SYLLABLENUM flag
2521 // and the suffix is beginning `s'
2523 if (cpdsyllablenum) {
2524 switch (sfxflag) {
2525 case 'c': { numsyllable+=2; break; }
2526 case 'J': { numsyllable += 1; break; }
2527 case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
2532 // increment word number, if the second word has a compoundroot flag
2533 if ((rv) && (compoundroot) &&
2534 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2535 wordnum++;
2537 // second word is acceptable, as a word with prefix or/and suffix?
2538 // hungarian conventions: compounding is acceptable,
2539 // when compound forms consist 2 word, otherwise
2540 // the syllable number of root words is 6, or lesser.
2541 if ((rv) &&
2543 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2544 ((cpdmaxsyllable!=0) &&
2545 (numsyllable <= cpdmaxsyllable))
2547 && (
2548 (!checkcompounddup || (rv != rv_first))
2549 )) {
2550 char * m = NULL;
2551 if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2552 if ((!m || *m == '\0') && compoundend) {
2553 if (m) free(m);
2554 m = affix_check_morph((word+i),strlen(word+i), compoundend);
2556 mystrcat(*result, presult, MAXLNLEN);
2557 if (m && (*m != '\0')) {
2558 sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
2559 MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
2561 if (m) free(m);
2562 sprintf(*result + strlen(*result), "%c", MSEP_REC);
2563 ok = 1;
2566 numsyllable = oldnumsyllable2;
2567 wordnum = oldwordnum2;
2569 // perhaps second word is a compound word (recursive call)
2570 if ((wordnum < maxwordnum) && (ok == 0)) {
2571 compound_check_morph((word+i),strlen(word+i), wordnum+1,
2572 numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);
2573 } else {
2574 rv=NULL;
2577 st[i] = ch;
2578 wordnum = oldwordnum;
2579 numsyllable = oldnumsyllable;
2581 } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop
2584 return 0;
2587 // return 1 if s1 (reversed) is a leading subset of end of s2
2588 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2590 while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
2591 s1++;
2592 end_of_s2--;
2593 len--;
2595 return (*s1 == '\0');
2599 inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2601 while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
2602 s1++;
2603 end_of_s2--;
2604 len--;
2606 return (*s1 == '\0');
2609 // check word for suffixes
//
// Walks the suffix entries and returns the first non-NULL result of
// SfxEntry::checkword() for `word` (length `len`), or NULL when no entry
// accepts the word.
//
//   sfxopts, wlst, maxSug, ns, needflag
//                 forwarded unchanged to checkword()
//   ppfx          prefix entry forwarded to checkword(); its continuation
//                 flags are also consulted for the circumfix and needaffix
//                 tests below (may be NULL)
//   cclass        when non-zero, zero-length entries without continuation
//                 flags are skipped and the needaffix test is bypassed
//   in_compound   compound position; IN_CPD_BEGIN and IN_CPD_END restrict
//                 which entries may be used, any non-zero value lifts the
//                 onlyincompound restriction
//
// Side effects: on success the member `sfx` is set to the matching entry;
// in the general (non-empty key) case `sfxflag` and, for entries without
// continuation flags, `sfxappnd` are updated as well.
2611 struct hentry * AffixMgr::suffix_check (const char * word, int len,
2612 int sfxopts, PfxEntry * ppfx, char ** wlst, int maxSug, int * ns,
2613 const FLAG cclass, const FLAG needflag, char in_compound)
2615 struct hentry * rv = NULL;
2616 PfxEntry* ep = ppfx;
2618 // first handle the special case of 0 length suffixes
// (the chain starting at sStart[0], linked through getNext())
2619 SfxEntry * se = sStart[0];
2621 while (se) {
2622 if (!cclass || se->getCont()) {
2623 // suffixes are not allowed in beginning of compounds
2624 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2625 // except when signed with compoundpermitflag flag
2626 (se->getCont() && compoundpermitflag &&
2627 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
2628 // no circumfix flag in prefix and suffix
2629 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2630 circumfix, ep->getContLen())) &&
2631 (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
2632 // circumfix flag in prefix AND suffix
2633 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2634 circumfix, ep->getContLen())) &&
2635 (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
2636 // fogemorpheme
// (entries flagged onlyincompound are usable only when in_compound is non-zero)
2637 (in_compound ||
2638 !(se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
2639 // needaffix on prefix or first suffix
2640 (cclass ||
2641 !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2642 (ppfx && !((ep->getCont()) &&
2643 TESTAFF(ep->getCont(), needaffix,
2644 ep->getContLen())))
2645 )) {
// onlyincompound is passed to checkword() only outside of compounds
2646 rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass,
2647 needflag, (in_compound ? 0 : onlyincompound));
2648 if (rv) {
2649 sfx=se; // BUG: sfx not stateless
2650 return rv;
2654 se = se->getNext();
2657 // now handle the general case
// The entry chain is selected by the last byte of the word, so an empty
// word cannot be looked up.
2658 if (len == 0) return NULL; // FULLSTRIP
2659 unsigned char sp= *((const unsigned char *)(word + len - 1));
2660 SfxEntry * sptr = sStart[sp];
2662 while (sptr) {
// the entry key is compared against the end of the word by isRevSubset()
2663 if (isRevSubset(sptr->getKey(), word + len - 1, len)
2665 // suffixes are not allowed in beginning of compounds
2666 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2667 // except when signed with compoundpermitflag flag
2668 (sptr->getCont() && compoundpermitflag &&
2669 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
2670 // no circumfix flag in prefix and suffix
2671 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2672 circumfix, ep->getContLen())) &&
2673 (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
2674 // circumfix flag in prefix AND suffix
2675 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2676 circumfix, ep->getContLen())) &&
2677 (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
2678 // fogemorpheme
2679 (in_compound ||
2680 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
2681 // needaffix on prefix or first suffix
2682 (cclass ||
2683 !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
2684 (ppfx && !((ep->getCont()) &&
2685 TESTAFF(ep->getCont(), needaffix,
2686 ep->getContLen())))
// extra filter for IN_CPD_END: without a prefix, entries flagged
// onlyincompound are rejected
2688 ) if (in_compound != IN_CPD_END || ppfx || !(sptr->getCont() && TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
2689 rv = sptr->checkword(word,len, sfxopts, ppfx, wlst,
2690 maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound));
2691 if (rv) {
2692 sfx=sptr; // BUG: sfx not stateless
2693 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2694 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2695 return rv;
// key matched: continue with getNextEQ(); key mismatch: skip via getNextNE()
2698 sptr = sptr->getNextEQ();
2699 } else {
2700 sptr = sptr->getNextNE();
2704 return NULL;
2707 // check word for two-level suffixes
2709 struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,
2710 int sfxopts, PfxEntry * ppfx, const FLAG needflag)
2712 struct hentry * rv = NULL;
2714 // first handle the special case of 0 length suffixes
2715 SfxEntry * se = sStart[0];
2716 while (se) {
2717 if (contclasses[se->getFlag()])
2719 rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);
2720 if (rv) return rv;
2722 se = se->getNext();
2725 // now handle the general case
2726 if (len == 0) return NULL; // FULLSTRIP
2727 unsigned char sp = *((const unsigned char *)(word + len - 1));
2728 SfxEntry * sptr = sStart[sp];
2730 while (sptr) {
2731 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2732 if (contclasses[sptr->getFlag()])
2734 rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);
2735 if (rv) {
2736 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2737 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2738 return rv;
2741 sptr = sptr->getNextEQ();
2742 } else {
2743 sptr = sptr->getNextNE();
2747 return NULL;
2750 char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,
2751 int sfxopts, PfxEntry * ppfx, const FLAG needflag)
2753 char result[MAXLNLEN];
2754 char result2[MAXLNLEN];
2755 char result3[MAXLNLEN];
2757 char * st;
2759 result[0] = '\0';
2760 result2[0] = '\0';
2761 result3[0] = '\0';
2763 // first handle the special case of 0 length suffixes
2764 SfxEntry * se = sStart[0];
2765 while (se) {
2766 if (contclasses[se->getFlag()])
2768 st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2769 if (st) {
2770 if (ppfx) {
2771 if (ppfx->getMorph()) {
2772 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
2773 mystrcat(result, " ", MAXLNLEN);
2774 } else debugflag(result, ppfx->getFlag());
2776 mystrcat(result, st, MAXLNLEN);
2777 free(st);
2778 if (se->getMorph()) {
2779 mystrcat(result, " ", MAXLNLEN);
2780 mystrcat(result, se->getMorph(), MAXLNLEN);
2781 } else debugflag(result, se->getFlag());
2782 mystrcat(result, "\n", MAXLNLEN);
2785 se = se->getNext();
2788 // now handle the general case
2789 if (len == 0) return NULL; // FULLSTRIP
2790 unsigned char sp = *((const unsigned char *)(word + len - 1));
2791 SfxEntry * sptr = sStart[sp];
2793 while (sptr) {
2794 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2795 if (contclasses[sptr->getFlag()])
2797 st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2798 if (st) {
2799 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2800 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2801 strcpy(result2, st);
2802 free(st);
2804 result3[0] = '\0';
2806 if (sptr->getMorph()) {
2807 mystrcat(result3, " ", MAXLNLEN);
2808 mystrcat(result3, sptr->getMorph(), MAXLNLEN);
2809 } else debugflag(result3, sptr->getFlag());
2810 strlinecat(result2, result3);
2811 mystrcat(result2, "\n", MAXLNLEN);
2812 mystrcat(result, result2, MAXLNLEN);
2815 sptr = sptr->getNextEQ();
2816 } else {
2817 sptr = sptr->getNextNE();
2820 if (*result) return mystrdup(result);
2821 return NULL;
// Morphological variant of suffix_check().
//
// For every suffix entry that passes the same compound / circumfix /
// onlyincompound / needaffix filters, checkword() is called and one text
// line is appended to `result` for the returned entry and for each further
// entry delivered by get_next_homonym(). A line consists of the prefix
// description (or its flag via debugflag()), the stem fields of the
// dictionary entry, and the suffix description (or its flag), terminated
// by "\n".
//
// Returns a mystrdup() copy of the collected text, or NULL when nothing was
// collected. An empty word (len == 0) returns NULL after the zero-length
// pass, discarding anything that pass collected.
//
// NOTE(review): unlike suffix_check(), checkword() is called here without
// the trailing onlyincompound argument, and the general-case needaffix test
// has no ppfx clause - confirm that both differences are intended.
2824 char * AffixMgr::suffix_check_morph(const char * word, int len,
2825 int sfxopts, PfxEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)
2827 char result[MAXLNLEN];
2829 struct hentry * rv = NULL;
2831 result[0] = '\0';
2833 PfxEntry* ep = ppfx;
2835 // first handle the special case of 0 length suffixes
2836 SfxEntry * se = sStart[0];
2837 while (se) {
2838 if (!cclass || se->getCont()) {
2839 // suffixes are not allowed in beginning of compounds
2840 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2841 // except when signed with compoundpermitflag flag
2842 (se->getCont() && compoundpermitflag &&
2843 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
2844 // no circumfix flag in prefix and suffix
2845 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2846 circumfix, ep->getContLen())) &&
2847 (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
2848 // circumfix flag in prefix AND suffix
2849 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2850 circumfix, ep->getContLen())) &&
2851 (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
2852 // fogemorpheme
2853 (in_compound ||
2854 !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2855 // needaffix on prefix or first suffix
2856 (cclass ||
2857 !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2858 (ppfx && !((ep->getCont()) &&
2859 TESTAFF(ep->getCont(), needaffix,
2860 ep->getContLen())))
2863 rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
// one output line per accepted entry / homonym
2864 while (rv) {
2865 if (ppfx) {
2866 if (ppfx->getMorph()) {
2867 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
2868 mystrcat(result, " ", MAXLNLEN);
2869 } else debugflag(result, ppfx->getFlag());
// with complexprefixes the entry data is written before the stem field,
// otherwise after it
2871 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2872 if (! HENTRY_FIND(rv, MORPH_STEM)) {
2873 mystrcat(result, " ", MAXLNLEN);
2874 mystrcat(result, MORPH_STEM, MAXLNLEN);
2875 mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
2877 // store the pointer of the hash entry
2878 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
2880 if (!complexprefixes && HENTRY_DATA(rv)) {
2881 mystrcat(result, " ", MAXLNLEN);
2882 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2884 if (se->getMorph()) {
2885 mystrcat(result, " ", MAXLNLEN);
2886 mystrcat(result, se->getMorph(), MAXLNLEN);
2887 } else debugflag(result, se->getFlag());
2888 mystrcat(result, "\n", MAXLNLEN);
2889 rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
2892 se = se->getNext();
2895 // now handle the general case
// The entry chain is selected by the last byte of the word, so an empty
// word cannot be looked up.
2896 if (len == 0) return NULL; // FULLSTRIP
2897 unsigned char sp = *((const unsigned char *)(word + len - 1));
2898 SfxEntry * sptr = sStart[sp];
2900 while (sptr) {
2901 if (isRevSubset(sptr->getKey(), word + len - 1, len)
2903 // suffixes are not allowed in beginning of compounds
2904 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2905 // except when signed with compoundpermitflag flag
2906 (sptr->getCont() && compoundpermitflag &&
2907 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
2908 // no circumfix flag in prefix and suffix
2909 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2910 circumfix, ep->getContLen())) &&
2911 (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
2912 // circumfix flag in prefix AND suffix
2913 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2914 circumfix, ep->getContLen())) &&
2915 (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
2916 // fogemorpheme
2917 (in_compound ||
2918 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
2919 // needaffix on first suffix
2920 (cclass || !(sptr->getCont() &&
2921 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))
// the assignment below is the whole body of the `if`; the homonym loop that
// follows runs on whatever rv currently holds
2922 )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
2923 while (rv) {
2924 if (ppfx) {
2925 if (ppfx->getMorph()) {
2926 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
2927 mystrcat(result, " ", MAXLNLEN);
2928 } else debugflag(result, ppfx->getFlag());
2930 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2931 if (! HENTRY_FIND(rv, MORPH_STEM)) {
2932 mystrcat(result, " ", MAXLNLEN);
2933 mystrcat(result, MORPH_STEM, MAXLNLEN);
2934 mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
2936 // store the pointer of the hash entry
2937 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
2939 if (!complexprefixes && HENTRY_DATA(rv)) {
2940 mystrcat(result, " ", MAXLNLEN);
2941 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2944 if (sptr->getMorph()) {
2945 mystrcat(result, " ", MAXLNLEN);
2946 mystrcat(result, sptr->getMorph(), MAXLNLEN);
2947 } else debugflag(result, sptr->getFlag());
2948 mystrcat(result, "\n", MAXLNLEN);
2949 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
// key matched: continue with getNextEQ(); key mismatch: skip via getNextNE()
2951 sptr = sptr->getNextEQ();
2952 } else {
2953 sptr = sptr->getNextNE();
// hand back a heap copy only when something was collected
2957 if (*result) return mystrdup(result);
2958 return NULL;
2961 // check if word with affixes is correctly spelled
2962 struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)
2964 struct hentry * rv= NULL;
2966 // check all prefixes (also crossed with suffixes if allowed)
2967 rv = prefix_check(word, len, in_compound, needflag);
2968 if (rv) return rv;
2970 // if still not found check all suffixes
2971 rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);
2973 if (havecontclass) {
2974 sfx = NULL;
2975 pfx = NULL;
2977 if (rv) return rv;
2978 // if still not found check all two-level suffixes
2979 rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
2981 if (rv) return rv;
2982 // if still not found check all two-level suffixes
2983 rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
2986 return rv;
2989 // check if word with affixes is correctly spelled
2990 char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)
2992 char result[MAXLNLEN];
2993 char * st = NULL;
2995 *result = '\0';
2997 // check all prefixes (also crossed with suffixes if allowed)
2998 st = prefix_check_morph(word, len, in_compound);
2999 if (st) {
3000 mystrcat(result, st, MAXLNLEN);
3001 free(st);
3004 // if still not found check all suffixes
3005 st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
3006 if (st) {
3007 mystrcat(result, st, MAXLNLEN);
3008 free(st);
3011 if (havecontclass) {
3012 sfx = NULL;
3013 pfx = NULL;
3014 // if still not found check all two-level suffixes
3015 st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
3016 if (st) {
3017 mystrcat(result, st, MAXLNLEN);
3018 free(st);
3021 // if still not found check all two-level suffixes
3022 st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
3023 if (st) {
3024 mystrcat(result, st, MAXLNLEN);
3025 free(st);
3029 return mystrdup(result);
// Try to generate a word form from `ts` (length `wl`) by adding one of the
// suffixes named in the flag list `ap` (length `al`), so that the resulting
// morphological description compares equal to `targetmorph`.
//
// Returns NULL when `morph` is NULL, when `ap` contains the substandard
// flag, or when no suffix produces a match. Returns a mystrdup() copy of
// `ts` when `morph` already matches `targetmorph`. Otherwise returns the
// string produced by SfxEntry::add() (this function releases such strings
// with free() when it discards them).
//
// `level` limits the recursion: secondary suffixes are only tried when
// level == 0, and the nested call is made with level 1.
3032 char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap,
3033 unsigned short al, char * morph, char * targetmorph, int level)
3035 // handle suffixes
3036 char * stemmorph;
3037 char * stemmorphcatpos;
3038 char mymorph[MAXLNLEN];
3040 if (!morph) return NULL;
3042 // check substandard flag
3043 if (TESTAFF(ap, substandard, al)) return NULL;
3045 if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts);
3047 // int targetcount = get_sfxcount(targetmorph);
3049 // use input suffix fields, if exist
// In that case the description is built in mymorph as "<morph> <suffix
// morph>", with stemmorphcatpos marking where the suffix part is written.
// NOTE(review): both strcpy() calls into mymorph are unbounded; this
// assumes morph plus any suffix description fits in MAXLNLEN - confirm
// against callers.
3050 if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
3051 stemmorph = mymorph;
3052 strcpy(stemmorph, morph);
3053 mystrcat(stemmorph, " ", MAXLNLEN);
3054 stemmorphcatpos = stemmorph + strlen(stemmorph);
3055 } else {
3056 stemmorph = morph;
3057 stemmorphcatpos = NULL;
// sFlag[] is indexed by the low byte of the flag; the chain is then
// filtered by the full flag value
3060 for (int i = 0; i < al; i++) {
3061 const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
3062 SfxEntry * sptr = sFlag[c];
3063 while (sptr) {
3064 if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) ||
3065 // don't generate forms with substandard affixes
3066 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
3068 if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph());
3069 else stemmorph = (char *) sptr->getMorph();
3071 int cmp = morphcmp(stemmorph, targetmorph);
// cmp == 0: this suffix gives the target description
3073 if (cmp == 0) {
3074 char * newword = sptr->add(ts, wl);
3075 if (newword) {
// reject generated forms that the dictionary marks as forbidden or
// ONLYUPCASE
3076 hentry * check = pHMgr->lookup(newword); // XXX extra dic
3077 if (!check || !check->astr ||
3078 !(TESTAFF(check->astr, forbiddenword, check->alen) ||
3079 TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
3080 return newword;
3082 free(newword);
3086 // recursive call for secondary suffixes
// (the nested call uses this entry's continuation flags as its flag list)
3087 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
3088 // (get_sfxcount(stemmorph) < targetcount) &&
3089 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
3090 char * newword = sptr->add(ts, wl);
3091 if (newword) {
3092 char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(),
3093 sptr->getContLen(), stemmorph, targetmorph, 1);
3095 if (newword2) {
3096 free(newword);
3097 return newword2;
3099 free(newword);
3100 newword = NULL;
3104 sptr = sptr->getFlgNxt();
3107 return NULL;
3111 int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,
3112 int wl, const unsigned short * ap, unsigned short al, char * bad, int badl,
3113 char * phon)
3115 int nh=0;
3116 // first add root word to list
3117 if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
3118 (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
3119 wlst[nh].word = mystrdup(ts);
3120 if (!wlst[nh].word) return 0;
3121 wlst[nh].allow = (1 == 0);
3122 wlst[nh].orig = NULL;
3123 nh++;
3124 // add special phonetic version
3125 if (phon && (nh < maxn)) {
3126 wlst[nh].word = mystrdup(phon);
3127 if (!wlst[nh].word) return nh - 1;
3128 wlst[nh].allow = (1 == 0);
3129 wlst[nh].orig = mystrdup(ts);
3130 if (!wlst[nh].orig) return nh - 1;
3131 nh++;
3135 // handle suffixes
3136 for (int i = 0; i < al; i++) {
3137 const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
3138 SfxEntry * sptr = sFlag[c];
3139 while (sptr) {
3140 if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&
3141 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
3142 // check needaffix flag
3143 !(sptr->getCont() && ((needaffix &&
3144 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
3145 (circumfix &&
3146 TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
3147 (onlyincompound &&
3148 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))
3150 char * newword = sptr->add(ts, wl);
3151 if (newword) {
3152 if (nh < maxn) {
3153 wlst[nh].word = newword;
3154 wlst[nh].allow = sptr->allowCross();
3155 wlst[nh].orig = NULL;
3156 nh++;
3157 // add special phonetic version
3158 if (phon && (nh < maxn)) {
3159 char st[MAXWORDUTF8LEN];
3160 strcpy(st, phon);
3161 strcat(st, sptr->getKey());
3162 reverseword(st + strlen(phon));
3163 wlst[nh].word = mystrdup(st);
3164 if (!wlst[nh].word) return nh - 1;
3165 wlst[nh].allow = (1 == 0);
3166 wlst[nh].orig = mystrdup(newword);
3167 if (!wlst[nh].orig) return nh - 1;
3168 nh++;
3170 } else {
3171 free(newword);
3175 sptr = sptr->getFlgNxt();
3179 int n = nh;
3181 // handle cross products of prefixes and suffixes
3182 for (int j=1;j<n ;j++)
3183 if (wlst[j].allow) {
3184 for (int k = 0; k < al; k++) {
3185 const unsigned char c = (unsigned char) (ap[k] & 0x00FF);
3186 PfxEntry * cptr = pFlag[c];
3187 while (cptr) {
3188 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&
3189 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3190 int l1 = strlen(wlst[j].word);
3191 char * newword = cptr->add(wlst[j].word, l1);
3192 if (newword) {
3193 if (nh < maxn) {
3194 wlst[nh].word = newword;
3195 wlst[nh].allow = cptr->allowCross();
3196 wlst[nh].orig = NULL;
3197 nh++;
3198 } else {
3199 free(newword);
3203 cptr = cptr->getFlgNxt();
3209 // now handle pure prefixes
3210 for (int m = 0; m < al; m ++) {
3211 const unsigned char c = (unsigned char) (ap[m] & 0x00FF);
3212 PfxEntry * ptr = pFlag[c];
3213 while (ptr) {
3214 if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&
3215 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3216 // check needaffix flag
3217 !(ptr->getCont() && ((needaffix &&
3218 TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
3219 (circumfix &&
3220 TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
3221 (onlyincompound &&
3222 TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))
3224 char * newword = ptr->add(ts, wl);
3225 if (newword) {
3226 if (nh < maxn) {
3227 wlst[nh].word = newword;
3228 wlst[nh].allow = ptr->allowCross();
3229 wlst[nh].orig = NULL;
3230 nh++;
3231 } else {
3232 free(newword);
3236 ptr = ptr->getFlgNxt();
3240 return nh;
3243 // return length of replacing table
3244 int AffixMgr::get_numrep() const
3246 return numrep;
3249 // return replacing table
3250 struct replentry * AffixMgr::get_reptable() const
3252 if (! reptable ) return NULL;
3253 return reptable;
3256 // return iconv table
3257 RepList * AffixMgr::get_iconvtable() const
3259 if (! iconvtable ) return NULL;
3260 return iconvtable;
3263 // return oconv table
3264 RepList * AffixMgr::get_oconvtable() const
3266 if (! oconvtable ) return NULL;
3267 return oconvtable;
3270 // return replacing table
3271 struct phonetable * AffixMgr::get_phonetable() const
3273 if (! phone ) return NULL;
3274 return phone;
3277 // return length of character map table
3278 int AffixMgr::get_nummap() const
3280 return nummap;
3283 // return character map table
3284 struct mapentry * AffixMgr::get_maptable() const
3286 if (! maptable ) return NULL;
3287 return maptable;
3290 // return length of word break table
3291 int AffixMgr::get_numbreak() const
3293 return numbreak;
3296 // return character map table
3297 char ** AffixMgr::get_breaktable() const
3299 if (! breaktable ) return NULL;
3300 return breaktable;
3303 // return text encoding of dictionary
3304 char * AffixMgr::get_encoding()
3306 if (! encoding ) encoding = mystrdup(SPELL_ENCODING);
3307 return mystrdup(encoding);
3310 // return text encoding of dictionary
3311 int AffixMgr::get_langnum() const
3313 return langnum;
3316 // return double prefix option
3317 int AffixMgr::get_complexprefixes() const
3319 return complexprefixes;
3322 // return FULLSTRIP option
3323 int AffixMgr::get_fullstrip() const
3325 return fullstrip;
3328 FLAG AffixMgr::get_keepcase() const
3330 return keepcase;
3333 FLAG AffixMgr::get_forceucase() const
3335 return forceucase;
3338 FLAG AffixMgr::get_warn() const
3340 return warn;
3343 int AffixMgr::get_forbidwarn() const
3345 return forbidwarn;
3348 int AffixMgr::get_checksharps() const
3350 return checksharps;
3353 char * AffixMgr::encode_flag(unsigned short aflag) const
3355 return pHMgr->encode_flag(aflag);
3359 // return the preferred ignore string for suggestions
3360 char * AffixMgr::get_ignore() const
3362 if (!ignorechars) return NULL;
3363 return ignorechars;
3366 // return the preferred ignore string for suggestions
3367 unsigned short * AffixMgr::get_ignore_utf16(int * len) const
3369 *len = ignorechars_utf16_len;
3370 return ignorechars_utf16;
3373 // return the keyboard string for suggestions
3374 char * AffixMgr::get_key_string()
3376 if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING);
3377 return mystrdup(keystring);
3380 // return the preferred try string for suggestions
3381 char * AffixMgr::get_try_string() const
3383 if (! trystring ) return NULL;
3384 return mystrdup(trystring);
3387 // return the preferred try string for suggestions
3388 const char * AffixMgr::get_wordchars() const
3390 return wordchars;
3393 unsigned short * AffixMgr::get_wordchars_utf16(int * len) const
3395 *len = wordchars_utf16_len;
3396 return wordchars_utf16;
3399 // is there compounding?
3400 int AffixMgr::get_compound() const
3402 return compoundflag || compoundbegin || numdefcpd;
3405 // return the compound words control flag
3406 FLAG AffixMgr::get_compoundflag() const
3408 return compoundflag;
3411 // return the forbidden words control flag
3412 FLAG AffixMgr::get_forbiddenword() const
3414 return forbiddenword;
3417 // return the forbidden words control flag
3418 FLAG AffixMgr::get_nosuggest() const
3420 return nosuggest;
3423 // return the forbidden words control flag
3424 FLAG AffixMgr::get_nongramsuggest() const
3426 return nongramsuggest;
3429 // return the forbidden words flag modify flag
3430 FLAG AffixMgr::get_needaffix() const
3432 return needaffix;
3435 // return the onlyincompound flag
3436 FLAG AffixMgr::get_onlyincompound() const
3438 return onlyincompound;
3441 // return the compound word signal flag
3442 FLAG AffixMgr::get_compoundroot() const
3444 return compoundroot;
3447 // return the compound begin signal flag
3448 FLAG AffixMgr::get_compoundbegin() const
3450 return compoundbegin;
3453 // return the value of checknum
3454 int AffixMgr::get_checknum() const
3456 return checknum;
3459 // return the value of prefix
3460 const char * AffixMgr::get_prefix() const
3462 if (pfx) return pfx->getKey();
3463 return NULL;
3466 // return the value of suffix
3467 const char * AffixMgr::get_suffix() const
3469 return sfxappnd;
3472 // return the value of suffix
3473 const char * AffixMgr::get_version() const
3475 return version;
3478 // return lemma_present flag
3479 FLAG AffixMgr::get_lemma_present() const
3481 return lemma_present;
3484 // utility method to look up root words in hash table
3485 struct hentry * AffixMgr::lookup(const char * word)
3487 int i;
3488 struct hentry * he = NULL;
3489 for (i = 0; i < *maxdic && !he; i++) {
3490 he = (alldic[i])->lookup(word);
3492 return he;
3495 // return the value of suffix
3496 int AffixMgr::have_contclass() const
3498 return havecontclass;
3501 // return utf8
3502 int AffixMgr::get_utf8() const
3504 return utf8;
3507 int AffixMgr::get_maxngramsugs(void) const
3509 return maxngramsugs;
3512 int AffixMgr::get_maxcpdsugs(void) const
3514 return maxcpdsugs;
3517 int AffixMgr::get_maxdiff(void) const
3519 return maxdiff;
3522 int AffixMgr::get_onlymaxdiff(void) const
3524 return onlymaxdiff;
3527 // return nosplitsugs
3528 int AffixMgr::get_nosplitsugs(void) const
3530 return nosplitsugs;
3533 // return sugswithdots
3534 int AffixMgr::get_sugswithdots(void) const
3536 return sugswithdots;
3539 /* parse flag */
3540 int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) {
3541 char * s = NULL;
3542 if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
3543 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
3544 return 1;
3546 if (parse_string(line, &s, af->getlinenum())) return 1;
3547 *out = pHMgr->decode_flag(s);
3548 free(s);
3549 return 0;
3552 /* parse num */
3553 int AffixMgr::parse_num(char * line, int * out, FileMgr * af) {
3554 char * s = NULL;
3555 if (*out != -1) {
3556 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
3557 return 1;
3559 if (parse_string(line, &s, af->getlinenum())) return 1;
3560 *out = atoi(s);
3561 free(s);
3562 return 0;
3565 /* parse in the max syllablecount of compound words and */
3566 int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af)
3568 char * tp = line;
3569 char * piece;
3570 int i = 0;
3571 int np = 0;
3572 w_char w[MAXWORDLEN];
3573 piece = mystrsep(&tp, 0);
3574 while (piece) {
3575 if (*piece != '\0') {
3576 switch(i) {
3577 case 0: { np++; break; }
3578 case 1: { cpdmaxsyllable = atoi(piece); np++; break; }
3579 case 2: {
3580 if (!utf8) {
3581 cpdvowels = mystrdup(piece);
3582 } else {
3583 int n = u8_u16(w, MAXWORDLEN, piece);
3584 if (n > 0) {
3585 flag_qsort((unsigned short *) w, 0, n);
3586 cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));
3587 if (!cpdvowels_utf16) return 1;
3588 memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
3590 cpdvowels_utf16_len = n;
3592 np++;
3593 break;
3595 default: break;
3597 i++;
3599 piece = mystrsep(&tp, 0);
3601 if (np < 2) {
3602 HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum());
3603 return 1;
3605 if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");
3606 return 0;
3609 /* parse in the typical fault correcting table */
3610 int AffixMgr::parse_reptable(char * line, FileMgr * af)
3612 if (numrep != 0) {
3613 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3614 return 1;
3616 char * tp = line;
3617 char * piece;
3618 int i = 0;
3619 int np = 0;
3620 piece = mystrsep(&tp, 0);
3621 while (piece) {
3622 if (*piece != '\0') {
3623 switch(i) {
3624 case 0: { np++; break; }
3625 case 1: {
3626 numrep = atoi(piece);
3627 if (numrep < 1) {
3628 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
3629 return 1;
3631 reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
3632 if (!reptable) return 1;
3633 np++;
3634 break;
3636 default: break;
3638 i++;
3640 piece = mystrsep(&tp, 0);
3642 if (np != 2) {
3643 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3644 return 1;
3647 /* now parse the numrep lines to read in the remainder of the table */
3648 char * nl;
3649 for (int j=0; j < numrep; j++) {
3650 if ((nl = af->getline()) == NULL) return 1;
3651 mychomp(nl);
3652 tp = nl;
3653 i = 0;
3654 reptable[j].pattern = NULL;
3655 reptable[j].pattern2 = NULL;
3656 piece = mystrsep(&tp, 0);
3657 while (piece) {
3658 if (*piece != '\0') {
3659 switch(i) {
3660 case 0: {
3661 if (strncmp(piece,"REP",3) != 0) {
3662 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3663 numrep = 0;
3664 return 1;
3666 break;
3668 case 1: {
3669 if (*piece == '^') reptable[j].start = true; else reptable[j].start = false;
3670 reptable[j].pattern = mystrrep(mystrdup(piece + int(reptable[j].start)),"_"," ");
3671 int lr = strlen(reptable[j].pattern) - 1;
3672 if (reptable[j].pattern[lr] == '$') {
3673 reptable[j].end = true;
3674 reptable[j].pattern[lr] = '\0';
3675 } else reptable[j].end = false;
3676 break;
3678 case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; }
3679 default: break;
3681 i++;
3683 piece = mystrsep(&tp, 0);
3685 if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
3686 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3687 numrep = 0;
3688 return 1;
3691 return 0;
3694 /* parse in the typical fault correcting table */
3695 int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword)
3697 if (*rl) {
3698 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3699 return 1;
3701 char * tp = line;
3702 char * piece;
3703 int i = 0;
3704 int np = 0;
3705 int numrl = 0;
3706 piece = mystrsep(&tp, 0);
3707 while (piece) {
3708 if (*piece != '\0') {
3709 switch(i) {
3710 case 0: { np++; break; }
3711 case 1: {
3712 numrl = atoi(piece);
3713 if (numrl < 1) {
3714 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
3715 return 1;
3717 *rl = new RepList(numrl);
3718 if (!*rl) return 1;
3719 np++;
3720 break;
3722 default: break;
3724 i++;
3726 piece = mystrsep(&tp, 0);
3728 if (np != 2) {
3729 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3730 return 1;
3733 /* now parse the num lines to read in the remainder of the table */
3734 char * nl;
3735 for (int j=0; j < numrl; j++) {
3736 if (!(nl = af->getline())) return 1;
3737 mychomp(nl);
3738 tp = nl;
3739 i = 0;
3740 char * pattern = NULL;
3741 char * pattern2 = NULL;
3742 piece = mystrsep(&tp, 0);
3743 while (piece) {
3744 if (*piece != '\0') {
3745 switch(i) {
3746 case 0: {
3747 if (strncmp(piece, keyword, strlen(keyword)) != 0) {
3748 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3749 delete *rl;
3750 *rl = NULL;
3751 return 1;
3753 break;
3755 case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; }
3756 case 2: {
3757 pattern2 = mystrrep(mystrdup(piece),"_"," ");
3758 break;
3760 default: break;
3762 i++;
3764 piece = mystrsep(&tp, 0);
3766 if (!pattern || !pattern2) {
3767 if (pattern)
3768 free(pattern);
3769 if (pattern2)
3770 free(pattern2);
3771 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3772 return 1;
3774 (*rl)->add(pattern, pattern2);
3776 return 0;
3780 /* parse in the typical fault correcting table */
3781 int AffixMgr::parse_phonetable(char * line, FileMgr * af)
3783 if (phone) {
3784 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3785 return 1;
3787 char * tp = line;
3788 char * piece;
3789 int i = 0;
3790 int np = 0;
3791 piece = mystrsep(&tp, 0);
3792 while (piece) {
3793 if (*piece != '\0') {
3794 switch(i) {
3795 case 0: { np++; break; }
3796 case 1: {
3797 phone = (phonetable *) malloc(sizeof(struct phonetable));
3798 if (!phone) return 1;
3799 phone->num = atoi(piece);
3800 phone->rules = NULL;
3801 phone->utf8 = (char) utf8;
3802 if (phone->num < 1) {
3803 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3804 return 1;
3806 phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *));
3807 if (!phone->rules) {
3808 free(phone);
3809 phone = NULL;
3810 return 1;
3812 np++;
3813 break;
3815 default: break;
3817 i++;
3819 piece = mystrsep(&tp, 0);
3821 if (np != 2) {
3822 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3823 return 1;
3826 /* now parse the phone->num lines to read in the remainder of the table */
3827 char * nl;
3828 for (int j=0; j < phone->num; j++) {
3829 if (!(nl = af->getline())) return 1;
3830 mychomp(nl);
3831 tp = nl;
3832 i = 0;
3833 phone->rules[j * 2] = NULL;
3834 phone->rules[j * 2 + 1] = NULL;
3835 piece = mystrsep(&tp, 0);
3836 while (piece) {
3837 if (*piece != '\0') {
3838 switch(i) {
3839 case 0: {
3840 if (strncmp(piece,"PHONE",5) != 0) {
3841 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3842 phone->num = 0;
3843 return 1;
3845 break;
3847 case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; }
3848 case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; }
3849 default: break;
3851 i++;
3853 piece = mystrsep(&tp, 0);
3855 if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) {
3856 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3857 phone->num = 0;
3858 return 1;
3861 phone->rules[phone->num * 2] = mystrdup("");
3862 phone->rules[phone->num * 2 + 1] = mystrdup("");
3863 init_phonet_hash(*phone);
3864 return 0;
3867 /* parse in the checkcompoundpattern table */
3868 int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af)
3870 if (numcheckcpd != 0) {
3871 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3872 return 1;
3874 char * tp = line;
3875 char * piece;
3876 int i = 0;
3877 int np = 0;
3878 piece = mystrsep(&tp, 0);
3879 while (piece) {
3880 if (*piece != '\0') {
3881 switch(i) {
3882 case 0: { np++; break; }
3883 case 1: {
3884 numcheckcpd = atoi(piece);
3885 if (numcheckcpd < 1) {
3886 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3887 return 1;
3889 checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry));
3890 if (!checkcpdtable) return 1;
3891 np++;
3892 break;
3894 default: break;
3896 i++;
3898 piece = mystrsep(&tp, 0);
3900 if (np != 2) {
3901 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3902 return 1;
3905 /* now parse the numcheckcpd lines to read in the remainder of the table */
3906 char * nl;
3907 for (int j=0; j < numcheckcpd; j++) {
3908 if (!(nl = af->getline())) return 1;
3909 mychomp(nl);
3910 tp = nl;
3911 i = 0;
3912 checkcpdtable[j].pattern = NULL;
3913 checkcpdtable[j].pattern2 = NULL;
3914 checkcpdtable[j].pattern3 = NULL;
3915 checkcpdtable[j].cond = FLAG_NULL;
3916 checkcpdtable[j].cond2 = FLAG_NULL;
3917 piece = mystrsep(&tp, 0);
3918 while (piece) {
3919 if (*piece != '\0') {
3920 switch(i) {
3921 case 0: {
3922 if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {
3923 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3924 numcheckcpd = 0;
3925 return 1;
3927 break;
3929 case 1: {
3930 checkcpdtable[j].pattern = mystrdup(piece);
3931 char * p = strchr(checkcpdtable[j].pattern, '/');
3932 if (p) {
3933 *p = '\0';
3934 checkcpdtable[j].cond = pHMgr->decode_flag(p + 1);
3936 break; }
3937 case 2: {
3938 checkcpdtable[j].pattern2 = mystrdup(piece);
3939 char * p = strchr(checkcpdtable[j].pattern2, '/');
3940 if (p) {
3941 *p = '\0';
3942 checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1);
3944 break;
3946 case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; }
3947 default: break;
3949 i++;
3951 piece = mystrsep(&tp, 0);
3953 if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {
3954 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3955 numcheckcpd = 0;
3956 return 1;
3959 return 0;
3962 /* parse in the compound rule table */
3963 int AffixMgr::parse_defcpdtable(char * line, FileMgr * af)
3965 if (numdefcpd != 0) {
3966 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3967 return 1;
3969 char * tp = line;
3970 char * piece;
3971 int i = 0;
3972 int np = 0;
3973 piece = mystrsep(&tp, 0);
3974 while (piece) {
3975 if (*piece != '\0') {
3976 switch(i) {
3977 case 0: { np++; break; }
3978 case 1: {
3979 numdefcpd = atoi(piece);
3980 if (numdefcpd < 1) {
3981 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3982 return 1;
3984 defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));
3985 if (!defcpdtable) return 1;
3986 np++;
3987 break;
3989 default: break;
3991 i++;
3993 piece = mystrsep(&tp, 0);
3995 if (np != 2) {
3996 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3997 return 1;
4000 /* now parse the numdefcpd lines to read in the remainder of the table */
4001 char * nl;
4002 for (int j=0; j < numdefcpd; j++) {
4003 if (!(nl = af->getline())) return 1;
4004 mychomp(nl);
4005 tp = nl;
4006 i = 0;
4007 defcpdtable[j].def = NULL;
4008 piece = mystrsep(&tp, 0);
4009 while (piece) {
4010 if (*piece != '\0') {
4011 switch(i) {
4012 case 0: {
4013 if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {
4014 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4015 numdefcpd = 0;
4016 return 1;
4018 break;
4020 case 1: { // handle parenthesized flags
4021 if (strchr(piece, '(')) {
4022 defcpdtable[j].def = (FLAG *) malloc(strlen(piece) * sizeof(FLAG));
4023 defcpdtable[j].len = 0;
4024 int end = 0;
4025 FLAG * conv;
4026 while (!end) {
4027 char * par = piece + 1;
4028 while (*par != '(' && *par != ')' && *par != '\0') par++;
4029 if (*par == '\0') end = 1; else *par = '\0';
4030 if (*piece == '(') piece++;
4031 if (*piece == '*' || *piece == '?') {
4032 defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece;
4033 } else if (*piece != '\0') {
4034 int l = pHMgr->decode_flags(&conv, piece, af);
4035 for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k];
4036 free(conv);
4038 piece = par + 1;
4040 } else {
4041 defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece, af);
4043 break;
4045 default: break;
4047 i++;
4049 piece = mystrsep(&tp, 0);
4051 if (!defcpdtable[j].len) {
4052 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4053 numdefcpd = 0;
4054 return 1;
4057 return 0;
4061 /* parse in the character map table */
4062 int AffixMgr::parse_maptable(char * line, FileMgr * af)
4064 if (nummap != 0) {
4065 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
4066 return 1;
4068 char * tp = line;
4069 char * piece;
4070 int i = 0;
4071 int np = 0;
4072 piece = mystrsep(&tp, 0);
4073 while (piece) {
4074 if (*piece != '\0') {
4075 switch(i) {
4076 case 0: { np++; break; }
4077 case 1: {
4078 nummap = atoi(piece);
4079 if (nummap < 1) {
4080 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
4081 return 1;
4083 maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
4084 if (!maptable) return 1;
4085 np++;
4086 break;
4088 default: break;
4090 i++;
4092 piece = mystrsep(&tp, 0);
4094 if (np != 2) {
4095 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4096 return 1;
4099 /* now parse the nummap lines to read in the remainder of the table */
4100 char * nl;
4101 for (int j=0; j < nummap; j++) {
4102 if (!(nl = af->getline())) return 1;
4103 mychomp(nl);
4104 tp = nl;
4105 i = 0;
4106 maptable[j].set = NULL;
4107 maptable[j].len = 0;
4108 piece = mystrsep(&tp, 0);
4109 while (piece) {
4110 if (*piece != '\0') {
4111 switch(i) {
4112 case 0: {
4113 if (strncmp(piece,"MAP",3) != 0) {
4114 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4115 nummap = 0;
4116 return 1;
4118 break;
4120 case 1: {
4121 int setn = 0;
4122 maptable[j].len = strlen(piece);
4123 maptable[j].set = (char **) malloc(maptable[j].len * sizeof(char*));
4124 if (!maptable[j].set) return 1;
4125 for (int k = 0; k < maptable[j].len; k++) {
4126 int chl = 1;
4127 int chb = k;
4128 if (piece[k] == '(') {
4129 char * parpos = strchr(piece + k, ')');
4130 if (parpos != NULL) {
4131 chb = k + 1;
4132 chl = (int)(parpos - piece) - k - 1;
4133 k = k + chl + 1;
4135 } else {
4136 if (utf8 && (piece[k] & 0xc0) == 0xc0) {
4137 for (k++; utf8 && (piece[k] & 0xc0) == 0x80; k++);
4138 chl = k - chb;
4139 k--;
4142 maptable[j].set[setn] = (char *) malloc(chl + 1);
4143 if (!maptable[j].set[setn]) return 1;
4144 strncpy(maptable[j].set[setn], piece + chb, chl);
4145 maptable[j].set[setn][chl] = '\0';
4146 setn++;
4148 maptable[j].len = setn;
4149 break; }
4150 default: break;
4152 i++;
4154 piece = mystrsep(&tp, 0);
4156 if (!maptable[j].set || !maptable[j].len) {
4157 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4158 nummap = 0;
4159 return 1;
4162 return 0;
4165 /* parse in the word breakpoint table */
4166 int AffixMgr::parse_breaktable(char * line, FileMgr * af)
4168 if (numbreak > -1) {
4169 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
4170 return 1;
4172 char * tp = line;
4173 char * piece;
4174 int i = 0;
4175 int np = 0;
4176 piece = mystrsep(&tp, 0);
4177 while (piece) {
4178 if (*piece != '\0') {
4179 switch(i) {
4180 case 0: { np++; break; }
4181 case 1: {
4182 numbreak = atoi(piece);
4183 if (numbreak < 0) {
4184 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
4185 return 1;
4187 if (numbreak == 0) return 0;
4188 breaktable = (char **) malloc(numbreak * sizeof(char *));
4189 if (!breaktable) return 1;
4190 np++;
4191 break;
4193 default: break;
4195 i++;
4197 piece = mystrsep(&tp, 0);
4199 if (np != 2) {
4200 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4201 return 1;
4204 /* now parse the numbreak lines to read in the remainder of the table */
4205 char * nl;
4206 for (int j=0; j < numbreak; j++) {
4207 if (!(nl = af->getline())) return 1;
4208 mychomp(nl);
4209 tp = nl;
4210 i = 0;
4211 piece = mystrsep(&tp, 0);
4212 while (piece) {
4213 if (*piece != '\0') {
4214 switch(i) {
4215 case 0: {
4216 if (strncmp(piece,"BREAK",5) != 0) {
4217 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4218 numbreak = 0;
4219 return 1;
4221 break;
4223 case 1: {
4224 breaktable[j] = mystrdup(piece);
4225 break;
4227 default: break;
4229 i++;
4231 piece = mystrsep(&tp, 0);
4233 if (!breaktable) {
4234 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4235 numbreak = 0;
4236 return 1;
4239 return 0;
4242 void AffixMgr::reverse_condition(char * piece) {
4243 int neg = 0;
4244 for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {
4245 switch(*k) {
4246 case '[': {
4247 if (neg) *(k+1) = '['; else *k = ']';
4248 break;
4250 case ']': {
4251 *k = '[';
4252 if (neg) *(k+1) = '^';
4253 neg = 0;
4254 break;
4256 case '^': {
4257 if (*(k+1) == ']') neg = 1; else *(k+1) = *k;
4258 break;
4260 default: {
4261 if (neg) *(k+1) = *k;
4267 int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags)
4269 int numents = 0; // number of affentry structures to parse
4271 unsigned short aflag = 0; // affix char identifier
4273 char ff=0;
4274 std::vector<affentry> affentries;
4276 char * tp = line;
4277 char * nl = line;
4278 char * piece;
4279 int i = 0;
4281 // checking lines with bad syntax
4282 #ifdef DEBUG
4283 int basefieldnum = 0;
4284 #endif
4286 // split affix header line into pieces
4288 int np = 0;
4290 piece = mystrsep(&tp, 0);
4291 while (piece) {
4292 if (*piece != '\0') {
4293 switch(i) {
4294 // piece 1 - is type of affix
4295 case 0: { np++; break; }
4297 // piece 2 - is affix char
4298 case 1: {
4299 np++;
4300 aflag = pHMgr->decode_flag(piece);
4301 #ifndef HUNSPELL_CHROME_CLIENT // We don't check for duplicates.
4302 if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
4303 ((at == 'P') && (dupflags[aflag] & dupPFX))) {
4304 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n",
4305 af->getlinenum());
4306 // return 1; XXX permissive mode for bad dictionaries
4308 dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX);
4309 #endif
4310 break;
4312 // piece 3 - is cross product indicator
4313 case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
4315 // piece 4 - is number of affentries
4316 case 3: {
4317 np++;
4318 numents = atoi(piece);
4319 if (numents == 0) {
4320 char * err = pHMgr->encode_flag(aflag);
4321 if (err) {
4322 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4323 af->getlinenum());
4324 free(err);
4326 return 1;
4328 affentries.resize(numents);
4329 affentries[0].opts = ff;
4330 if (utf8) affentries[0].opts += aeUTF8;
4331 if (pHMgr->is_aliasf()) affentries[0].opts += aeALIASF;
4332 if (pHMgr->is_aliasm()) affentries[0].opts += aeALIASM;
4333 affentries[0].aflag = aflag;
4336 default: break;
4338 i++;
4340 piece = mystrsep(&tp, 0);
4342 // check to make sure we parsed enough pieces
4343 if (np != 4) {
4344 char * err = pHMgr->encode_flag(aflag);
4345 if (err) {
4346 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4347 free(err);
4349 return 1;
4352 // now parse numents affentries for this affix
4353 std::vector<affentry>::iterator start = affentries.begin();
4354 std::vector<affentry>::iterator end = affentries.end();
4355 for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {
4356 if ((nl = af->getline()) == NULL) return 1;
4357 mychomp(nl);
4358 tp = nl;
4359 i = 0;
4360 np = 0;
4362 // split line into pieces
4363 piece = mystrsep(&tp, 0);
4364 while (piece) {
4365 if (*piece != '\0') {
4366 switch(i) {
4367 // piece 1 - is type
4368 case 0: {
4369 np++;
4370 if (entry != start) entry->opts = start->opts &
4371 (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM);
4372 break;
4375 // piece 2 - is affix char
4376 case 1: {
4377 np++;
4378 if (pHMgr->decode_flag(piece) != aflag) {
4379 char * err = pHMgr->encode_flag(aflag);
4380 if (err) {
4381 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4382 af->getlinenum(), err);
4383 free(err);
4385 return 1;
4388 if (entry != start) entry->aflag = start->aflag;
4389 break;
4392 // piece 3 - is string to strip or 0 for null
4393 case 2: {
4394 np++;
4395 if (complexprefixes) {
4396 if (utf8) reverseword_utf(piece); else reverseword(piece);
4398 entry->strip = mystrdup(piece);
4399 entry->stripl = (unsigned char) strlen(entry->strip);
4400 if (strcmp(entry->strip,"0") == 0) {
4401 free(entry->strip);
4402 entry->strip=mystrdup("");
4403 entry->stripl = 0;
4405 break;
4408 // piece 4 - is affix string or 0 for null
4409 case 3: {
4410 char * dash;
4411 entry->morphcode = NULL;
4412 entry->contclass = NULL;
4413 entry->contclasslen = 0;
4414 np++;
4415 dash = strchr(piece, '/');
4416 if (dash) {
4417 *dash = '\0';
4419 if (ignorechars) {
4420 if (utf8) {
4421 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
4422 } else {
4423 remove_ignored_chars(piece,ignorechars);
4427 if (complexprefixes) {
4428 if (utf8) reverseword_utf(piece); else reverseword(piece);
4430 entry->appnd = mystrdup(piece);
4432 if (pHMgr->is_aliasf()) {
4433 int index = atoi(dash + 1);
4434 entry->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(entry->contclass), af);
4435 if (!entry->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1);
4436 } else {
4437 entry->contclasslen = (unsigned short) pHMgr->decode_flags(&(entry->contclass), dash + 1, af);
4438 flag_qsort(entry->contclass, 0, entry->contclasslen);
4440 *dash = '/';
4442 havecontclass = 1;
4443 for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
4444 contclasses[(entry->contclass)[_i]] = 1;
4446 } else {
4447 if (ignorechars) {
4448 if (utf8) {
4449 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
4450 } else {
4451 remove_ignored_chars(piece,ignorechars);
4455 if (complexprefixes) {
4456 if (utf8) reverseword_utf(piece); else reverseword(piece);
4458 entry->appnd = mystrdup(piece);
4461 entry->appndl = (unsigned char) strlen(entry->appnd);
4462 if (strcmp(entry->appnd,"0") == 0) {
4463 free(entry->appnd);
4464 entry->appnd=mystrdup("");
4465 entry->appndl = 0;
4467 break;
4470 // piece 5 - is the conditions descriptions
4471 case 4: {
4472 np++;
4473 if (complexprefixes) {
4474 if (utf8) reverseword_utf(piece); else reverseword(piece);
4475 reverse_condition(piece);
4477 if (entry->stripl && (strcmp(piece, ".") != 0) &&
4478 redundant_condition(at, entry->strip, entry->stripl, piece, af->getlinenum()))
4479 strcpy(piece, ".");
4480 if (at == 'S') {
4481 reverseword(piece);
4482 reverse_condition(piece);
4484 if (encodeit(*entry, piece)) return 1;
4485 break;
4488 case 5: {
4489 np++;
4490 if (pHMgr->is_aliasm()) {
4491 int index = atoi(piece);
4492 entry->morphcode = pHMgr->get_aliasm(index);
4493 } else {
4494 if (complexprefixes) { // XXX - fix me for morph. gen.
4495 if (utf8) reverseword_utf(piece); else reverseword(piece);
4497 // add the remaining of the line
4498 if (*tp) {
4499 *(tp - 1) = ' ';
4500 tp = tp + strlen(tp);
4502 entry->morphcode = mystrdup(piece);
4503 if (!entry->morphcode) return 1;
4505 break;
4507 default: break;
4509 i++;
4511 piece = mystrsep(&tp, 0);
4513 // check to make sure we parsed enough pieces
4514 if (np < 4) {
4515 char * err = pHMgr->encode_flag(aflag);
4516 if (err) {
4517 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4518 af->getlinenum(), err);
4519 free(err);
4521 return 1;
4524 #ifdef DEBUG
4525 // detect unnecessary fields, excepting comments
4526 if (basefieldnum) {
4527 int fieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);
4528 if (fieldnum != basefieldnum)
4529 HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum());
4530 } else {
4531 basefieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);
4533 #endif
4536 // now create SfxEntry or PfxEntry objects and use links to
4537 // build an ordered (sorted by affix string) list
4538 for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {
4539 if (at == 'P') {
4540 PfxEntry * pfxptr = new PfxEntry(this,&(*entry));
4541 build_pfxtree(pfxptr);
4542 } else {
4543 SfxEntry * sfxptr = new SfxEntry(this,&(*entry));
4544 build_sfxtree(sfxptr);
4547 return 0;
4550 int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) {
4551 int condl = strlen(cond);
4552 int i;
4553 int j;
4554 int neg;
4555 int in;
4556 if (ft == 'P') { // prefix
4557 if (strncmp(strip, cond, condl) == 0) return 1;
4558 if (utf8) {
4559 } else {
4560 for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4561 if (cond[j] != '[') {
4562 if (cond[j] != strip[i]) {
4563 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4564 return 0;
4566 } else {
4567 neg = (cond[j+1] == '^') ? 1 : 0;
4568 in = 0;
4569 do {
4570 j++;
4571 if (strip[i] == cond[j]) in = 1;
4572 } while ((j < (condl - 1)) && (cond[j] != ']'));
4573 if (j == (condl - 1) && (cond[j] != ']')) {
4574 HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum, cond);
4575 return 0;
4577 if ((!neg && !in) || (neg && in)) {
4578 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4579 return 0;
4583 if (j >= condl) return 1;
4585 } else { // suffix
4586 if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;
4587 if (utf8) {
4588 } else {
4589 for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4590 if (cond[j] != ']') {
4591 if (cond[j] != strip[i]) {
4592 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4593 return 0;
4595 } else {
4596 in = 0;
4597 do {
4598 j--;
4599 if (strip[i] == cond[j]) in = 1;
4600 } while ((j > 0) && (cond[j] != '['));
4601 if ((j == 0) && (cond[j] != '[')) {
4602 HUNSPELL_WARNING(stderr, "error: line: %d: missing ] in condition:\n%s\n", linenum, cond);
4603 return 0;
4605 neg = (cond[j+1] == '^') ? 1 : 0;
4606 if ((!neg && !in) || (neg && in)) {
4607 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4608 return 0;
4612 if (j < 0) return 1;
4615 return 0;