1 /******* BEGIN LICENSE BLOCK *******
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 * and László Németh (Hunspell). Portions created by the Initial Developers
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 * David Einstein (deinst@world.std.com)
20 * László Németh (nemethl@gyorsposta.hu)
43 * Alternatively, the contents of this file may be used under the terms of
44 * either the GNU General Public License Version 2 or later (the "GPL"), or
45 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
46 * in which case the provisions of the GPL or the LGPL are applicable instead
47 * of those above. If you wish to allow use of your version of this file only
48 * under the terms of either the GPL or the LGPL, and not to allow others to
49 * use your version of this file under the terms of the MPL, indicate your
50 * decision by deleting the provisions above and replace them with the notice
51 * and other provisions required by the GPL or the LGPL. If you do not delete
52 * the provisions above, a recipient may use your version of this file under
53 * the terms of any one of the MPL, the GPL or the LGPL.
55 ******* END LICENSE BLOCK *******/
57 #ifndef MOZILLA_CLIENT
69 #include "affentry.hxx"
70 #include "affixmgr.hxx"
72 #include "langnum.hxx"
74 #ifndef MOZILLA_CLIENT
80 AffixMgr::AffixMgr(const char * affpath
, HashMgr
** ptr
, int * md
, const char * key
)
82 // register hash manager and load affix data from aff file
100 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
106 compoundflag
= FLAG_NULL
; // permits word in compound forms
107 compoundbegin
= FLAG_NULL
; // may be first word in compound forms
108 compoundmiddle
= FLAG_NULL
; // may be middle word in compound forms
109 compoundend
= FLAG_NULL
; // may be last word in compound forms
110 compoundroot
= FLAG_NULL
; // compound word signing flag
111 compoundpermitflag
= FLAG_NULL
; // compound permitting flag for suffixed word
112 compoundforbidflag
= FLAG_NULL
; // compound forbidden flag for suffixed word
113 checkcompounddup
= 0; // forbid double words in compounds
114 checkcompoundrep
= 0; // forbid bad compounds (may be non compound word with a REP substitution)
115 checkcompoundcase
= 0; // forbid upper and lowercase combinations at word bounds
116 checkcompoundtriple
= 0; // forbid compounds with triple letters
117 simplifiedtriple
= 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
118 forbiddenword
= FORBIDDENWORD
; // forbidden word signing flag
119 nosuggest
= FLAG_NULL
; // don't suggest words signed with NOSUGGEST flag
120 lang
= NULL
; // language
121 langnum
= 0; // language code (see http://l10n.openoffice.org/languages.html)
122 needaffix
= FLAG_NULL
; // forbidden root, allowed only with suffixes
123 cpdwordmax
= -1; // default: unlimited wordcount in compound words
124 cpdmin
= -1; // undefined
125 cpdmaxsyllable
= 0; // default: unlimited syllablecount in compound words
126 cpdvowels
=NULL
; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
127 cpdvowels_utf16
=NULL
; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
128 cpdvowels_utf16_len
=0; // vowels
129 pfxappnd
=NULL
; // previous prefix for counting the syllables of prefix BUG
130 sfxappnd
=NULL
; // previous suffix for counting a special syllables BUG
131 cpdsyllablenum
=NULL
; // syllable count incrementing flag
132 checknum
=0; // checking numbers, and word with numbers
133 wordchars
=NULL
; // letters + spec. word characters
134 wordchars_utf16
=NULL
; // letters + spec. word characters
135 wordchars_utf16_len
=0; // letters + spec. word characters
136 ignorechars
=NULL
; // ignored characters
137 ignorechars_utf16
=NULL
; // ignored characters (UTF-16 form)
138 ignorechars_utf16_len
=0; // length of ignorechars_utf16
139 version
=NULL
; // affix and dictionary file version string
140 havecontclass
=0; // flags of possible continuing classes (double affix)
141 // LEMMA_PRESENT: do not put the root into the morphological output. The lemma is
142 // present in the morphological description in the dictionary file. It's often combined with PSEUDOROOT.
143 lemma_present
= FLAG_NULL
;
144 circumfix
= FLAG_NULL
;
145 onlyincompound
= FLAG_NULL
;
146 maxngramsugs
= -1; // undefined
151 substandard
= FLAG_NULL
;
157 for (int i
=0; i
< SETSIZE
; i
++) {
164 for (int j
=0; j
< CONTSIZE
; j
++) {
168 if (parse_file(affpath
, key
)) {
169 HUNSPELL_WARNING(stderr
, "Failure loading aff file %s\n",affpath
);
172 if (cpdmin
== -1) cpdmin
= MINCPDLEN
;
177 AffixMgr::~AffixMgr()
180 // pass through linked prefix entries and clean up
181 for (int i
=0; i
< SETSIZE
;i
++) {
183 PfxEntry
* ptr
= (PfxEntry
*)pStart
[i
];
184 PfxEntry
* nptr
= NULL
;
186 nptr
= ptr
->getNext();
193 // pass through linked suffix entries and clean up
194 for (int j
=0; j
< SETSIZE
; j
++) {
196 SfxEntry
* ptr
= (SfxEntry
*)sStart
[j
];
197 SfxEntry
* nptr
= NULL
;
199 nptr
= ptr
->getNext();
207 if (keystring
) free(keystring
);
209 if (trystring
) free(trystring
);
211 if (encoding
) free(encoding
);
214 for (int j
=0; j
< nummap
; j
++) {
215 if (maptable
[j
].set
) free(maptable
[j
].set
);
216 if (maptable
[j
].set_utf16
) free(maptable
[j
].set_utf16
);
217 maptable
[j
].set
= NULL
;
225 for (int j
=0; j
< numbreak
; j
++) {
226 if (breaktable
[j
]) free(breaktable
[j
]);
227 breaktable
[j
] = NULL
;
234 for (int j
=0; j
< numrep
; j
++) {
235 free(reptable
[j
].pattern
);
236 free(reptable
[j
].pattern2
);
241 if (iconvtable
) delete iconvtable
;
242 if (oconvtable
) delete oconvtable
;
243 if (phone
&& phone
->rules
) {
244 for (int j
=0; j
< phone
->num
+ 1; j
++) {
245 free(phone
->rules
[j
* 2]);
246 free(phone
->rules
[j
* 2 + 1]);
254 for (int j
=0; j
< numdefcpd
; j
++) {
255 free(defcpdtable
[j
].def
);
256 defcpdtable
[j
].def
= NULL
;
263 for (int j
=0; j
< numcheckcpd
; j
++) {
264 free(checkcpdtable
[j
].pattern
);
265 free(checkcpdtable
[j
].pattern2
);
266 free(checkcpdtable
[j
].pattern3
);
267 checkcpdtable
[j
].pattern
= NULL
;
268 checkcpdtable
[j
].pattern2
= NULL
;
269 checkcpdtable
[j
].pattern3
= NULL
;
272 checkcpdtable
= NULL
;
275 FREE_FLAG(compoundflag
);
276 FREE_FLAG(compoundbegin
);
277 FREE_FLAG(compoundmiddle
);
278 FREE_FLAG(compoundend
);
279 FREE_FLAG(compoundpermitflag
);
280 FREE_FLAG(compoundforbidflag
);
281 FREE_FLAG(compoundroot
);
282 FREE_FLAG(forbiddenword
);
283 FREE_FLAG(nosuggest
);
284 FREE_FLAG(needaffix
);
285 FREE_FLAG(lemma_present
);
286 FREE_FLAG(circumfix
);
287 FREE_FLAG(onlyincompound
);
293 if (cpdvowels
) free(cpdvowels
);
294 if (cpdvowels_utf16
) free(cpdvowels_utf16
);
295 if (cpdsyllablenum
) free(cpdsyllablenum
);
297 if (lang
) free(lang
);
298 if (wordchars
) free(wordchars
);
299 if (wordchars_utf16
) free(wordchars_utf16
);
300 if (ignorechars
) free(ignorechars
);
301 if (ignorechars_utf16
) free(ignorechars_utf16
);
302 if (version
) free(version
);
307 // read in aff file and build up prefix and suffix entry objects
308 int AffixMgr::parse_file(const char * affpath
, const char * key
)
310 char * line
; // io buffers
311 char ft
; // affix type
313 // checking flag duplication
314 char dupflags
[CONTSIZE
];
315 char dupflags_ini
= 1;
317 // first line indicator for removing byte order mark
320 // open the affix file
321 FileMgr
* afflst
= new FileMgr(affpath
, key
);
323 HUNSPELL_WARNING(stderr
, "error: could not open affix description file %s\n",affpath
);
327 // step one is to parse the affix file building up the internal
328 // affix data structures
330 // read in each line ignoring any that do not
331 // start with a known line type indicator
332 while ((line
= afflst
->getline())) {
335 /* remove byte order mark */
338 if (strncmp(line
,"\xEF\xBB\xBF",3) == 0) {
339 memmove(line
, line
+3, strlen(line
+3)+1);
340 HUNSPELL_WARNING(stderr
, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
344 /* parse in the keyboard string */
345 if (strncmp(line
,"KEY",3) == 0) {
346 if (parse_string(line
, &keystring
, afflst
->getlinenum())) {
352 /* parse in the try string */
353 if (strncmp(line
,"TRY",3) == 0) {
354 if (parse_string(line
, &trystring
, afflst
->getlinenum())) {
360 /* parse in the name of the character set used by the .dict and .aff */
361 if (strncmp(line
,"SET",3) == 0) {
362 if (parse_string(line
, &encoding
, afflst
->getlinenum())) {
366 if (strcmp(encoding
, "UTF-8") == 0) {
368 #ifndef OPENOFFICEORG
369 #ifndef MOZILLA_CLIENT
370 if (initialize_utf_tbl()) return 1;
376 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
377 if (strncmp(line
,"COMPLEXPREFIXES",15) == 0)
380 /* parse in the flag used by the controlled compound words */
381 if (strncmp(line
,"COMPOUNDFLAG",12) == 0) {
382 if (parse_flag(line
, &compoundflag
, afflst
)) {
388 /* parse in the flag used by compound words */
389 if (strncmp(line
,"COMPOUNDBEGIN",13) == 0) {
390 if (complexprefixes
) {
391 if (parse_flag(line
, &compoundend
, afflst
)) {
396 if (parse_flag(line
, &compoundbegin
, afflst
)) {
403 /* parse in the flag used by compound words */
404 if (strncmp(line
,"COMPOUNDMIDDLE",14) == 0) {
405 if (parse_flag(line
, &compoundmiddle
, afflst
)) {
410 /* parse in the flag used by compound words */
411 if (strncmp(line
,"COMPOUNDEND",11) == 0) {
412 if (complexprefixes
) {
413 if (parse_flag(line
, &compoundbegin
, afflst
)) {
418 if (parse_flag(line
, &compoundend
, afflst
)) {
425 /* parse in the data used by compound_check() method */
426 if (strncmp(line
,"COMPOUNDWORDMAX",15) == 0) {
427 if (parse_num(line
, &cpdwordmax
, afflst
)) {
433 /* parse in the flag that signs compound words in the dictionary */
434 if (strncmp(line
,"COMPOUNDROOT",12) == 0) {
435 if (parse_flag(line
, &compoundroot
, afflst
)) {
441 /* parse in the flag used by compound_check() method */
442 if (strncmp(line
,"COMPOUNDPERMITFLAG",18) == 0) {
443 if (parse_flag(line
, &compoundpermitflag
, afflst
)) {
449 /* parse in the flag used by compound_check() method */
450 if (strncmp(line
,"COMPOUNDFORBIDFLAG",18) == 0) {
451 if (parse_flag(line
, &compoundforbidflag
, afflst
)) {
457 if (strncmp(line
,"CHECKCOMPOUNDDUP",16) == 0) {
458 checkcompounddup
= 1;
461 if (strncmp(line
,"CHECKCOMPOUNDREP",16) == 0) {
462 checkcompoundrep
= 1;
465 if (strncmp(line
,"CHECKCOMPOUNDTRIPLE",19) == 0) {
466 checkcompoundtriple
= 1;
469 if (strncmp(line
,"SIMPLIFIEDTRIPLE",16) == 0) {
470 simplifiedtriple
= 1;
473 if (strncmp(line
,"CHECKCOMPOUNDCASE",17) == 0) {
474 checkcompoundcase
= 1;
477 if (strncmp(line
,"NOSUGGEST",9) == 0) {
478 if (parse_flag(line
, &nosuggest
, afflst
)) {
484 /* parse in the flag used by forbidden words */
485 if (strncmp(line
,"FORBIDDENWORD",13) == 0) {
486 if (parse_flag(line
, &forbiddenword
, afflst
)) {
492 /* parse in the LEMMA_PRESENT flag */
493 if (strncmp(line
,"LEMMA_PRESENT",13) == 0) {
494 if (parse_flag(line
, &lemma_present
, afflst
)) {
500 /* parse in the flag used by circumfixes */
501 if (strncmp(line
,"CIRCUMFIX",9) == 0) {
502 if (parse_flag(line
, &circumfix
, afflst
)) {
508 /* parse in the flag used by fogemorphemes */
509 if (strncmp(line
,"ONLYINCOMPOUND",14) == 0) {
510 if (parse_flag(line
, &onlyincompound
, afflst
)) {
516 /* parse in the flag used by `needaffixs' */
517 if (strncmp(line
,"PSEUDOROOT",10) == 0) {
518 if (parse_flag(line
, &needaffix
, afflst
)) {
524 /* parse in the flag used by `needaffixs' */
525 if (strncmp(line
,"NEEDAFFIX",9) == 0) {
526 if (parse_flag(line
, &needaffix
, afflst
)) {
532 /* parse in the minimal length for words in compounds */
533 if (strncmp(line
,"COMPOUNDMIN",11) == 0) {
534 if (parse_num(line
, &cpdmin
, afflst
)) {
538 if (cpdmin
< 1) cpdmin
= 1;
541 /* parse in the max. words and syllables in compounds */
542 if (strncmp(line
,"COMPOUNDSYLLABLE",16) == 0) {
543 if (parse_cpdsyllable(line
, afflst
)) {
549 /* parse in the flag used by compound_check() method */
550 if (strncmp(line
,"SYLLABLENUM",11) == 0) {
551 if (parse_string(line
, &cpdsyllablenum
, afflst
->getlinenum())) {
557 /* handle the CHECKNUM option (checking numbers and words with numbers) */
558 if (strncmp(line
,"CHECKNUM",8) == 0) {
562 /* parse in the extra word characters */
563 if (strncmp(line
,"WORDCHARS",9) == 0) {
564 if (parse_array(line
, &wordchars
, &wordchars_utf16
, &wordchars_utf16_len
, utf8
, afflst
->getlinenum())) {
570 /* parse in the ignored characters (for example, Arabic optional diacritic characters) */
571 if (strncmp(line
,"IGNORE",6) == 0) {
572 if (parse_array(line
, &ignorechars
, &ignorechars_utf16
, &ignorechars_utf16_len
, utf8
, afflst
->getlinenum())) {
578 /* parse in the typical fault correcting table */
579 if (strncmp(line
,"REP",3) == 0) {
580 if (parse_reptable(line
, afflst
)) {
586 /* parse in the input conversion table */
587 if (strncmp(line
,"ICONV",5) == 0) {
588 if (parse_convtable(line
, afflst
, &iconvtable
, "ICONV")) {
594 /* parse in the output conversion table */
595 if (strncmp(line
,"OCONV",5) == 0) {
596 if (parse_convtable(line
, afflst
, &oconvtable
, "OCONV")) {
602 /* parse in the phonetic translation table */
603 if (strncmp(line
,"PHONE",5) == 0) {
604 if (parse_phonetable(line
, afflst
)) {
610 /* parse in the checkcompoundpattern table */
611 if (strncmp(line
,"CHECKCOMPOUNDPATTERN",20) == 0) {
612 if (parse_checkcpdtable(line
, afflst
)) {
618 /* parse in the defcompound table */
619 if (strncmp(line
,"COMPOUNDRULE",12) == 0) {
620 if (parse_defcpdtable(line
, afflst
)) {
626 /* parse in the related character map table */
627 if (strncmp(line
,"MAP",3) == 0) {
628 if (parse_maptable(line
, afflst
)) {
634 /* parse in the word breakpoints table */
635 if (strncmp(line
,"BREAK",5) == 0) {
636 if (parse_breaktable(line
, afflst
)) {
642 /* parse in the language for language specific codes */
643 if (strncmp(line
,"LANG",4) == 0) {
644 if (parse_string(line
, &lang
, afflst
->getlinenum())) {
648 langnum
= get_lang_num(lang
);
651 if (strncmp(line
,"VERSION",7) == 0) {
652 for(line
= line
+ 7; *line
== ' ' || *line
== '\t'; line
++);
653 version
= mystrdup(line
);
656 if (strncmp(line
,"MAXNGRAMSUGS",12) == 0) {
657 if (parse_num(line
, &maxngramsugs
, afflst
)) {
663 if (strncmp(line
,"NOSPLITSUGS",11) == 0) {
667 if (strncmp(line
,"FULLSTRIP",9) == 0) {
671 if (strncmp(line
,"SUGSWITHDOTS",12) == 0) {
675 /* parse in the KEEPCASE flag */
676 if (strncmp(line
,"KEEPCASE",8) == 0) {
677 if (parse_flag(line
, &keepcase
, afflst
)) {
683 /* parse in the flag used by the affix generator */
684 if (strncmp(line
,"SUBSTANDARD",11) == 0) {
685 if (parse_flag(line
, &substandard
, afflst
)) {
691 if (strncmp(line
,"CHECKSHARPS",11) == 0) {
695 /* parse this affix: P - prefix, S - suffix */
697 if (strncmp(line
,"PFX",3) == 0) ft
= complexprefixes
? 'S' : 'P';
698 if (strncmp(line
,"SFX",3) == 0) ft
= complexprefixes
? 'P' : 'S';
701 for (int i
= 0; i
< CONTSIZE
; i
++) dupflags
[i
] = 0;
704 if (parse_affix(line
, ft
, afflst
, dupflags
)) {
706 process_pfx_tree_to_list();
707 process_sfx_tree_to_list();
715 // convert affix trees to sorted list
716 process_pfx_tree_to_list();
717 process_sfx_tree_to_list();
719 // now we can speed up performance greatly taking advantage of the
720 // relationship between the affixes and the idea of "subsets".
722 // View each prefix as a potential leading subset of another and view
723 // each suffix (reversed) as a potential trailing subset of another.
725 // To illustrate this relationship if we know the prefix "ab" is found in the
726 // word to examine, only prefixes that "ab" is a leading subset of need be examined.
727 // Furthermore, if "ab" is not present then none of the prefixes that "ab"
728 // is a leading subset of need be examined.
729 // The same argument goes for suffix string that are reversed.
731 // Then to top this off why not examine the first char of the word to quickly
732 // limit the set of prefixes to examine (i.e. the prefixes to examine must
733 // be leading supersets of the first character of the word, if they exist).
735 // To take advantage of this "subset" relationship, we need to add two links
736 // from entry. One to take next if the current prefix is found (call it nexteq)
737 // and one to take next if the current prefix is not found (call it nextne).
739 // Since we have built ordered lists, all that remains is to properly initialize
740 // the nextne and nexteq pointers that relate them
745 /* get encoding for CHECKCOMPOUNDCASE */
747 char * enc
= get_encoding();
748 csconv
= get_current_cs(enc
);
754 strcpy(expw
, wordchars
);
758 for (int i
= 0; i
<= 255; i
++) {
759 if ( (csconv
[i
].cupper
!= csconv
[i
].clower
) &&
760 (! strchr(expw
, (char) i
))) {
761 *(expw
+ strlen(expw
) + 1) = '\0';
762 *(expw
+ strlen(expw
)) = (char) i
;
766 wordchars
= mystrdup(expw
);
769 // default BREAK definition
771 breaktable
= (char **) malloc(sizeof(char *) * 3);
772 if (!breaktable
) return 1;
773 breaktable
[0] = mystrdup("-");
774 breaktable
[1] = mystrdup("^-");
775 breaktable
[2] = mystrdup("-$");
776 if (breaktable
[0] && breaktable
[1] && breaktable
[2]) numbreak
= 3;
782 // we want to be able to quickly access prefix information
783 // both by prefix flag, and sorted by prefix string itself
784 // so we need to set up two indexes
786 int AffixMgr::build_pfxtree(AffEntry
* pfxptr
)
790 PfxEntry
* ep
= (PfxEntry
*) pfxptr
;
792 // get the right starting points
793 const char * key
= ep
->getKey();
794 const unsigned char flg
= (unsigned char) (ep
->getFlag() & 0x00FF);
796 // first index by flag which must exist
797 ptr
= (PfxEntry
*)pFlag
[flg
];
799 pFlag
[flg
] = (AffEntry
*) ep
;
802 // handle the special case of null affix string
803 if (strlen(key
) == 0) {
804 // always insert them at the head of the list at element 0
805 ptr
= (PfxEntry
*)pStart
[0];
807 pStart
[0] = (AffEntry
*)ep
;
811 // now handle the normal case
815 unsigned char sp
= *((const unsigned char *)key
);
816 ptr
= (PfxEntry
*)pStart
[sp
];
818 // handle the first insert
820 pStart
[sp
] = (AffEntry
*)ep
;
825 // otherwise use binary tree insertion so that a sorted
826 // list can easily be generated later
830 if (strcmp(ep
->getKey(), ptr
->getKey() ) <= 0) {
831 ptr
= ptr
->getNextEQ();
837 ptr
= ptr
->getNextNE();
847 // we want to be able to quickly access suffix information
848 // both by suffix flag, and sorted by the reverse of the
849 // suffix string itself; so we need to set up two indexes
850 int AffixMgr::build_sfxtree(AffEntry
* sfxptr
)
854 SfxEntry
* ep
= (SfxEntry
*) sfxptr
;
856 /* get the right starting point */
857 const char * key
= ep
->getKey();
858 const unsigned char flg
= (unsigned char) (ep
->getFlag() & 0x00FF);
860 // first index by flag which must exist
861 ptr
= (SfxEntry
*)sFlag
[flg
];
863 sFlag
[flg
] = (AffEntry
*) ep
;
865 // next index by affix string
867 // handle the special case of null affix string
868 if (strlen(key
) == 0) {
869 // always insert them at the head of the list at element 0
870 ptr
= (SfxEntry
*)sStart
[0];
872 sStart
[0] = (AffEntry
*)ep
;
876 // now handle the normal case
880 unsigned char sp
= *((const unsigned char *)key
);
881 ptr
= (SfxEntry
*)sStart
[sp
];
883 // handle the first insert
885 sStart
[sp
] = (AffEntry
*)ep
;
889 // otherwise use binary tree insertion so that a sorted
890 // list can easily be generated later
894 if (strcmp(ep
->getKey(), ptr
->getKey() ) <= 0) {
895 ptr
= ptr
->getNextEQ();
901 ptr
= ptr
->getNextNE();
911 // convert from binary tree to sorted list
912 int AffixMgr::process_pfx_tree_to_list()
914 for (int i
=1; i
< SETSIZE
; i
++) {
915 pStart
[i
] = process_pfx_in_order(pStart
[i
],NULL
);
921 AffEntry
* AffixMgr::process_pfx_in_order(AffEntry
* ptr
, AffEntry
* nptr
)
924 nptr
= process_pfx_in_order(((PfxEntry
*) ptr
)->getNextNE(), nptr
);
925 ((PfxEntry
*) ptr
)->setNext((PfxEntry
*) nptr
);
926 nptr
= process_pfx_in_order(((PfxEntry
*) ptr
)->getNextEQ(), ptr
);
932 // convert from binary tree to sorted list
933 int AffixMgr:: process_sfx_tree_to_list()
935 for (int i
=1; i
< SETSIZE
; i
++) {
936 sStart
[i
] = process_sfx_in_order(sStart
[i
],NULL
);
941 AffEntry
* AffixMgr::process_sfx_in_order(AffEntry
* ptr
, AffEntry
* nptr
)
944 nptr
= process_sfx_in_order(((SfxEntry
*) ptr
)->getNextNE(), nptr
);
945 ((SfxEntry
*) ptr
)->setNext((SfxEntry
*) nptr
);
946 nptr
= process_sfx_in_order(((SfxEntry
*) ptr
)->getNextEQ(), ptr
);
952 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
953 // using the idea of leading subsets this time
954 int AffixMgr::process_pfx_order()
958 // loop through each prefix list starting point
959 for (int i
=1; i
< SETSIZE
; i
++) {
961 ptr
= (PfxEntry
*)pStart
[i
];
963 // look through the remainder of the list
964 // and find next entry with affix that
965 // the current one is not a subset of
966 // mark that as destination for NextNE
967 // use next in list that you are a subset
970 for (; ptr
!= NULL
; ptr
= ptr
->getNext()) {
972 PfxEntry
* nptr
= ptr
->getNext();
973 for (; nptr
!= NULL
; nptr
= nptr
->getNext()) {
974 if (! isSubset( ptr
->getKey() , nptr
->getKey() )) break;
976 ptr
->setNextNE(nptr
);
977 ptr
->setNextEQ(NULL
);
978 if ((ptr
->getNext()) && isSubset(ptr
->getKey() , (ptr
->getNext())->getKey()))
979 ptr
->setNextEQ(ptr
->getNext());
982 // now clean up by adding smart search termination strings:
983 // if you are already a superset of the previous prefix
984 // but not a subset of the next, search can end here
985 // so set NextNE properly
987 ptr
= (PfxEntry
*) pStart
[i
];
988 for (; ptr
!= NULL
; ptr
= ptr
->getNext()) {
989 PfxEntry
* nptr
= ptr
->getNext();
990 PfxEntry
* mptr
= NULL
;
991 for (; nptr
!= NULL
; nptr
= nptr
->getNext()) {
992 if (! isSubset(ptr
->getKey(),nptr
->getKey())) break;
995 if (mptr
) mptr
->setNextNE(NULL
);
1001 // initialize the SfxEntry links NextEQ and NextNE to speed searching
1002 // using the idea of leading subsets this time
1003 int AffixMgr::process_sfx_order()
1007 // loop through each suffix list starting point
1008 for (int i
=1; i
< SETSIZE
; i
++) {
1010 ptr
= (SfxEntry
*) sStart
[i
];
1012 // look through the remainder of the list
1013 // and find next entry with affix that
1014 // the current one is not a subset of
1015 // mark that as destination for NextNE
1016 // use next in list that you are a subset
1019 for (; ptr
!= NULL
; ptr
= ptr
->getNext()) {
1020 SfxEntry
* nptr
= ptr
->getNext();
1021 for (; nptr
!= NULL
; nptr
= nptr
->getNext()) {
1022 if (! isSubset(ptr
->getKey(),nptr
->getKey())) break;
1024 ptr
->setNextNE(nptr
);
1025 ptr
->setNextEQ(NULL
);
1026 if ((ptr
->getNext()) && isSubset(ptr
->getKey(),(ptr
->getNext())->getKey()))
1027 ptr
->setNextEQ(ptr
->getNext());
1031 // now clean up by adding smart search termination strings:
1032 // if you are already a superset of the previous suffix
1033 // but not a subset of the next, search can end here
1034 // so set NextNE properly
1036 ptr
= (SfxEntry
*) sStart
[i
];
1037 for (; ptr
!= NULL
; ptr
= ptr
->getNext()) {
1038 SfxEntry
* nptr
= ptr
->getNext();
1039 SfxEntry
* mptr
= NULL
;
1040 for (; nptr
!= NULL
; nptr
= nptr
->getNext()) {
1041 if (! isSubset(ptr
->getKey(),nptr
->getKey())) break;
1044 if (mptr
) mptr
->setNextNE(NULL
);
1050 // add flags to the result for dictionary debugging
1051 void AffixMgr::debugflag(char * result
, unsigned short flag
) {
1052 char * st
= encode_flag(flag
);
1053 mystrcat(result
, " ", MAXLNLEN
);
1054 mystrcat(result
, MORPH_FLAG
, MAXLNLEN
);
1056 mystrcat(result
, st
, MAXLNLEN
);
1061 // calculate the character length of the condition
1062 int AffixMgr::condlen(char * st
)
1070 } else if (*st
== ']') group
= false;
1071 else if (!group
&& (!utf8
||
1072 (!(*st
& 0x80) || ((*st
& 0xc0) == 0x80)))) l
++;
1077 int AffixMgr::encodeit(struct affentry
* ptr
, char * cs
)
1079 if (strcmp(cs
,".") != 0) {
1080 ptr
->numconds
= (char) condlen(cs
);
1081 strncpy(ptr
->c
.conds
, cs
, MAXCONDLEN
);
1082 // long condition (end of conds padded by strncpy)
1083 if (ptr
->c
.conds
[MAXCONDLEN
- 1] && cs
[MAXCONDLEN
]) {
1084 ptr
->opts
+= aeLONGCOND
;
1085 ptr
->c
.l
.conds2
= mystrdup(cs
+ MAXCONDLEN_1
);
1086 if (!ptr
->c
.l
.conds2
) return 1;
1090 ptr
->c
.conds
[0] = '\0';
1095 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
1096 inline int AffixMgr::isSubset(const char * s1
, const char * s2
)
1098 while (((*s1
== *s2
) || (*s1
== '.')) && (*s1
!= '\0')) {
1102 return (*s1
== '\0');
1106 // check word for prefixes
1107 struct hentry
* AffixMgr::prefix_check(const char * word
, int len
, char in_compound
,
1108 const FLAG needflag
)
1110 struct hentry
* rv
= NULL
;
1116 // first handle the special case of 0 length prefixes
1117 PfxEntry
* pe
= (PfxEntry
*) pStart
[0];
1121 ((in_compound
!= IN_CPD_NOT
) || !(pe
->getCont() &&
1122 (TESTAFF(pe
->getCont(), onlyincompound
, pe
->getContLen())))) &&
1123 // permit prefixes in compounds
1124 ((in_compound
!= IN_CPD_END
) || (pe
->getCont() &&
1125 (TESTAFF(pe
->getCont(), compoundpermitflag
, pe
->getContLen()))))
1128 rv
= pe
->checkword(word
, len
, in_compound
, needflag
);
1130 pfx
=(AffEntry
*)pe
; // BUG: pfx not stateless
1137 // now handle the general case
1138 unsigned char sp
= *((const unsigned char *)word
);
1139 PfxEntry
* pptr
= (PfxEntry
*)pStart
[sp
];
1142 if (isSubset(pptr
->getKey(),word
)) {
1145 ((in_compound
!= IN_CPD_NOT
) || !(pptr
->getCont() &&
1146 (TESTAFF(pptr
->getCont(), onlyincompound
, pptr
->getContLen())))) &&
1147 // permit prefixes in compounds
1148 ((in_compound
!= IN_CPD_END
) || (pptr
->getCont() &&
1149 (TESTAFF(pptr
->getCont(), compoundpermitflag
, pptr
->getContLen()))))
1152 rv
= pptr
->checkword(word
, len
, in_compound
, needflag
);
1154 pfx
=(AffEntry
*)pptr
; // BUG: pfx not stateless
1158 pptr
= pptr
->getNextEQ();
1160 pptr
= pptr
->getNextNE();
1167 // check word for prefixes
1168 struct hentry
* AffixMgr::prefix_check_twosfx(const char * word
, int len
,
1169 char in_compound
, const FLAG needflag
)
1171 struct hentry
* rv
= NULL
;
1176 // first handle the special case of 0 length prefixes
1177 PfxEntry
* pe
= (PfxEntry
*) pStart
[0];
1180 rv
= pe
->check_twosfx(word
, len
, in_compound
, needflag
);
1185 // now handle the general case
1186 unsigned char sp
= *((const unsigned char *)word
);
1187 PfxEntry
* pptr
= (PfxEntry
*)pStart
[sp
];
1190 if (isSubset(pptr
->getKey(),word
)) {
1191 rv
= pptr
->check_twosfx(word
, len
, in_compound
, needflag
);
1193 pfx
= (AffEntry
*)pptr
;
1196 pptr
= pptr
->getNextEQ();
1198 pptr
= pptr
->getNextNE();
1205 // check word for prefixes
1206 char * AffixMgr::prefix_check_morph(const char * word
, int len
, char in_compound
,
1207 const FLAG needflag
)
1211 char result
[MAXLNLEN
];
1217 // first handle the special case of 0 length prefixes
1218 PfxEntry
* pe
= (PfxEntry
*) pStart
[0];
1220 st
= pe
->check_morph(word
,len
,in_compound
, needflag
);
1222 mystrcat(result
, st
, MAXLNLEN
);
1225 // if (rv) return rv;
1229 // now handle the general case
1230 unsigned char sp
= *((const unsigned char *)word
);
1231 PfxEntry
* pptr
= (PfxEntry
*)pStart
[sp
];
1234 if (isSubset(pptr
->getKey(),word
)) {
1235 st
= pptr
->check_morph(word
,len
,in_compound
, needflag
);
1238 if ((in_compound
!= IN_CPD_NOT
) || !((pptr
->getCont() &&
1239 (TESTAFF(pptr
->getCont(), onlyincompound
, pptr
->getContLen()))))) {
1240 mystrcat(result
, st
, MAXLNLEN
);
1241 pfx
= (AffEntry
*)pptr
;
1245 pptr
= pptr
->getNextEQ();
1247 pptr
= pptr
->getNextNE();
1251 if (*result
) return mystrdup(result
);
1256 // check word for prefixes
1257 char * AffixMgr::prefix_check_twosfx_morph(const char * word
, int len
,
1258 char in_compound
, const FLAG needflag
)
1262 char result
[MAXLNLEN
];
1268 // first handle the special case of 0 length prefixes
1269 PfxEntry
* pe
= (PfxEntry
*) pStart
[0];
1271 st
= pe
->check_twosfx_morph(word
,len
,in_compound
, needflag
);
1273 mystrcat(result
, st
, MAXLNLEN
);
1279 // now handle the general case
1280 unsigned char sp
= *((const unsigned char *)word
);
1281 PfxEntry
* pptr
= (PfxEntry
*)pStart
[sp
];
1284 if (isSubset(pptr
->getKey(),word
)) {
1285 st
= pptr
->check_twosfx_morph(word
, len
, in_compound
, needflag
);
1287 mystrcat(result
, st
, MAXLNLEN
);
1289 pfx
= (AffEntry
*)pptr
;
1291 pptr
= pptr
->getNextEQ();
1293 pptr
= pptr
->getNextNE();
1297 if (*result
) return mystrdup(result
);
1301 // Is word a non compound with a REP substitution (see checkcompoundrep)?
1302 int AffixMgr::cpdrep_check(const char * word
, int wl
)
1304 char candidate
[MAXLNLEN
];
1308 if ((wl
< 2) || !numrep
) return 0;
1310 for (int i
=0; i
< numrep
; i
++ ) {
1312 lenr
= strlen(reptable
[i
].pattern2
);
1313 lenp
= strlen(reptable
[i
].pattern
);
1314 // search for every occurrence of the pattern in the word
1315 while ((r
=strstr(r
, reptable
[i
].pattern
)) != NULL
) {
1316 strcpy(candidate
, word
);
1317 if (r
-word
+ lenr
+ strlen(r
+lenp
) >= MAXLNLEN
) break;
1318 strcpy(candidate
+(r
-word
),reptable
[i
].pattern2
);
1319 strcpy(candidate
+(r
-word
)+lenr
, r
+lenp
);
1320 if (candidate_check(candidate
,strlen(candidate
))) return 1;
1321 r
++; // search for the next letter
1327 // forbid compoundings when there are special patterns at word bound
1328 int AffixMgr::cpdpat_check(const char * word
, int pos
, hentry
* r1
, hentry
* r2
)
1331 for (int i
= 0; i
< numcheckcpd
; i
++) {
1332 if (isSubset(checkcpdtable
[i
].pattern2
, word
+ pos
) &&
1333 (!r1
|| !checkcpdtable
[i
].cond
||
1334 (r1
->astr
&& TESTAFF(r1
->astr
, checkcpdtable
[i
].cond
, r1
->alen
))) &&
1335 (!r2
|| !checkcpdtable
[i
].cond2
||
1336 (r2
->astr
&& TESTAFF(r2
->astr
, checkcpdtable
[i
].cond2
, r2
->alen
))) &&
1337 (len
= strlen(checkcpdtable
[i
].pattern
)) && (pos
> len
) &&
1338 (strncmp(word
+ pos
- len
, checkcpdtable
[i
].pattern
, len
) == 0)) return 1;
1343 // forbid compounding with neighbouring upper and lower case characters at word bounds
1344 int AffixMgr::cpdcase_check(const char * word
, int pos
)
1349 u8_u16(&u
, 1, word
+ pos
);
1350 for (p
= word
+ pos
- 1; (*p
& 0xc0) == 0x80; p
--);
1352 unsigned short a
= (u
.h
<< 8) + u
.l
;
1353 unsigned short b
= (w
.h
<< 8) + w
.l
;
1354 if (((unicodetoupper(a
, langnum
) == a
) || (unicodetoupper(b
, langnum
) == b
)) &&
1355 (a
!= '-') && (b
!= '-')) return 1;
1357 unsigned char a
= *(word
+ pos
- 1);
1358 unsigned char b
= *(word
+ pos
);
1359 if ((csconv
[a
].ccase
|| csconv
[b
].ccase
) && (a
!= '-') && (b
!= '-')) return 1;
1364 // check compound patterns
// defcpd_check: records rv as component number wnum in *words and matches the
// recorded components against the flag patterns stored in defcpdtable
// (numdefcpd entries; each entry has a def[] array of length len in which
// '*' and '?' act as metacharacters applying to the preceding element).
//   words - address of the component buffer; (*words)[wnum] is written
//   wnum  - index of the component being added
//   rv    - dictionary entry of that component
//   def   - not referenced in the visible lines
//   all   - when zero, a match of the components seen so far is enough to
//           return 1; when non-zero the whole pattern must be consumed
// Returns 1 on the visible success paths.
// NOTE(review): many lines are missing from this listing (braces, the
// declarations/initialisation of i, j, w, ok, ok2, bt, r, several increments
// of pp/wp and the failure return), so the control flow is only partly
// visible; the notes below are limited to what the remaining lines show.
1365 int AffixMgr::defcpd_check(hentry
*** words
, short wnum
, hentry
* rv
, hentry
** def
, char all
)
1367 signed short btpp
[MAXWORDLEN
]; // metacharacter (*, ?) positions for backtracking
1368 signed short btwp
[MAXWORDLEN
]; // word positions for metacharacters
1369 int btnum
[MAXWORDLEN
]; // number of matched characters in metacharacter positions
// tentatively store rv as the newest component
1379 (*words
)[wnum
] = rv
;
1381 // has the last word COMPOUNDRULE flag?
1382 if (rv
->alen
== 0) {
// rv has no flags at all: undo the tentative store
1383 (*words
)[wnum
] = NULL
;
// when w is set (assigned in lines not shown here) the caller's buffer
// pointer itself is cleared as well
1384 if (w
) *words
= NULL
;
// first pass: set ok if rv carries at least one non-metacharacter flag that
// occurs in any pattern
1388 for (i
= 0; i
< numdefcpd
; i
++) {
1389 for (j
= 0; j
< defcpdtable
[i
].len
; j
++) {
1390 if (defcpdtable
[i
].def
[j
] != '*' && defcpdtable
[i
].def
[j
] != '?' &&
1391 TESTAFF(rv
->astr
, defcpdtable
[i
].def
[j
], rv
->alen
)) ok
= 1;
// same undo as above (the guarding condition is not part of this listing)
1395 (*words
)[wnum
] = NULL
;
1396 if (w
) *words
= NULL
;
// second pass: try to match the components 0..wnum against each pattern
1400 for (i
= 0; i
< numdefcpd
; i
++) {
1401 signed short pp
= 0; // pattern position
1402 signed short wp
= 0; // "words" position
1407 while ((pp
< defcpdtable
[i
].len
) && (wp
<= wnum
)) {
// is the element at pp followed by a '*' or '?' metacharacter?
1408 if (((pp
+1) < defcpdtable
[i
].len
) &&
1409 ((defcpdtable
[i
].def
[pp
+1] == '*') || (defcpdtable
[i
].def
[pp
+1] == '?'))) {
// '?' limits the run to the current word, '*' lets it extend up to wnum
1410 int wend
= (defcpdtable
[i
].def
[pp
+1] == '?') ? wp
: wnum
;
1415 while (wp
<= wend
) {
// NOTE(review): the index pp - 2 implies pp was advanced past the
// metacharacter in a line that is not part of this listing -- confirm
1416 if (!(*words
)[wp
]->alen
||
1417 !TESTAFF((*words
)[wp
]->astr
, defcpdtable
[i
].def
[pp
-2], (*words
)[wp
]->alen
)) {
1423 if (wp
<= wnum
) ok2
= 0;
// remember how many words this metacharacter consumed; only non-empty runs
// are kept as backtracking points
1424 btnum
[bt
] = wp
- btwp
[bt
];
1425 if (btnum
[bt
] > 0) bt
++;
// plain element: the word at wp must exist, have flags and carry def[pp]
1429 if (!(*words
)[wp
] || !(*words
)[wp
]->alen
||
1430 !TESTAFF((*words
)[wp
]->astr
, defcpdtable
[i
].def
[pp
], (*words
)[wp
]->alen
)) {
// pattern exhausted while words remain: not a match
1436 if ((defcpdtable
[i
].len
== pp
) && !(wp
> wnum
)) ok
= 0;
// skip trailing "element + metacharacter" pairs starting at r; if that
// reaches the end of the pattern the match succeeds
1441 while ((defcpdtable
[i
].len
> r
) && ((r
+1) < defcpdtable
[i
].len
) &&
1442 ((defcpdtable
[i
].def
[r
+1] == '*') || (defcpdtable
[i
].def
[r
+1] == '?'))) r
+=2;
1443 if (defcpdtable
[i
].len
<= r
) return 1;
// backtracking: resume at the word position stored for the most recent
// metacharacter, offset by its (adjusted) match count
1450 wp
= btwp
[bt
- 1] + (signed short) btnum
[bt
- 1];
1451 } while ((btnum
[bt
- 1] < 0) && --bt
);
// accept a partial match unless the caller asked for a complete one (all)
1454 if (ok
&& ok2
&& (!all
|| (defcpdtable
[i
].len
<= pp
))) return 1;
1456 // check zero ending
1457 while (ok
&& ok2
&& (defcpdtable
[i
].len
> pp
) && ((pp
+1) < defcpdtable
[i
].len
) &&
1458 ((defcpdtable
[i
].def
[pp
+1] == '*') || (defcpdtable
[i
].def
[pp
+1] == '?'))) pp
+=2;
1459 if (ok
&& ok2
&& (defcpdtable
[i
].len
<= pp
)) return 1;
// no pattern matched: undo the tentative store
1461 (*words
)[wnum
] = NULL
;
1462 if (w
) *words
= NULL
;
// candidate_check: runs affix_check on the first len bytes of word.
//   word - candidate word
//   len  - its length
// NOTE(review): lines are missing from this listing (braces, any earlier
// lookup and the return statements), so the returned value is not visible.
1466 inline int AffixMgr::candidate_check(const char * word
, int len
)
1468 struct hentry
* rv
=NULL
;
// a separate prefix-only check was disabled by the original authors
1473 // rv = prefix_check(word,len,1);
1474 // if (rv) return 1;
1476 rv
= affix_check(word
,len
);
1481 // calculate number of syllable for compound-checking
// get_syllable: counts the characters of word that are listed as compound
// vowels and accumulates the count in num.
//   word - text to scan
//   wlen - number of bytes scanned by the byte-wise variant
// Returns 0 immediately when cpdmaxsyllable is 0.
// NOTE(review): lines are missing from this listing (braces, the declaration
// of num, the condition of the first branch and the final return).
1482 short AffixMgr::get_syllable(const char * word
, int wlen
)
1484 if (cpdmaxsyllable
==0) return 0;
// byte-wise variant: one count per byte found in the cpdvowels string
1489 for (int i
=0; i
<wlen
; i
++) {
1490 if (strchr(cpdvowels
, word
[i
])) num
++;
// UTF-16 variant: convert word with u8_u16 and look every unit up in the
// cpdvowels_utf16 table (flag_bsearch -- presumably a binary search, which
// would require the table to be sorted; TODO confirm)
1492 } else if (cpdvowels_utf16
) {
1493 w_char w
[MAXWORDUTF8LEN
];
1494 int i
= u8_u16(w
, MAXWORDUTF8LEN
, word
);
// i holds the value returned by u8_u16; the loop walks from i - 1 down to 0
1495 for (; i
> 0; i
--) {
1496 if (flag_bsearch((unsigned short *) cpdvowels_utf16
,
1497 ((unsigned short *) w
)[i
- 1], cpdvowels_utf16_len
)) num
++;
// setcminmax: computes the range of byte offsets at which word may be split
// into compound parts, based on cpdmin.
//   cmin - out: offset reached after skipping cpdmin characters from the start
//   cmax - out: offset reached after stepping cpdmin - 1 characters back from
//          len (UTF-8 variant), or len - cpdmin + 1 (other variant)
//   word - word to be split
//   len  - length of word in bytes
// NOTE(review): lines are missing from this listing (the declaration of i, the
// condition choosing between the variants, and the non-UTF-8 assignment of
// *cmin), so only the visible statements are described.
1503 void AffixMgr::setcminmax(int * cmin
, int * cmax
, const char * word
, int len
) {
// advance *cmin one character at a time, stopping early at the terminating NUL
1506 for (*cmin
= 0, i
= 0; (i
< cpdmin
) && word
[*cmin
]; i
++) {
// move to the next byte that is not a UTF-8 continuation byte (10xxxxxx)
1507 for ((*cmin
)++; (word
[*cmin
] & 0xc0) == 0x80; (*cmin
)++);
// walk *cmax backwards from the end, stopping early at offset 0
1509 for (*cmax
= len
, i
= 0; (i
< (cpdmin
- 1)) && *cmax
; i
++) {
1510 for ((*cmax
)--; (word
[*cmax
] & 0xc0) == 0x80; (*cmax
)--);
// other variant: one byte per character, so plain arithmetic is enough
1514 *cmax
= len
- cpdmin
+ 1;
1518 // check if compound word is correctly spelled
1519 // hu_mov_rule = spec. Hungarian rule (XXX)
// compound_check: tries every break position i in [cmin, cmax) (computed by
// setcminmax). For each position the first part is looked up in the working
// copy st (lookup, then prefix_check / suffix_check with the compound flags);
// if it is acceptable, the remainder is tested as a root (lookup), as an
// affixed form (affix_check) and finally, recursively, as another compound.
//   word, len    - word to analyse and its length in bytes
//   wordnum      - number of parts accepted so far; compared with cpdwordmax
//                  and maxwordnum
//   numsyllable  - running syllable count used together with cpdmaxsyllable
//   maxwordnum   - the recursive call is made only while wordnum < maxwordnum
//   wnum, words  - component index and buffer handed to defcpd_check
//   hu_mov_rule  - enables the special LANG_hu branches
//   is_sug       - when set, entries carrying the nosuggest flag are rejected
// Returns an hentry pointer (rv_first on the visible success paths) or NULL.
// NOTE(review): a large number of lines are missing from this listing (braces,
// declarations of i, cmin, cmax, scpd, soldi, striple, checked_prefix and
// others, many `if (` headers, and most return/continue statements). The
// comments below therefore describe the visible statements only.
1520 struct hentry
* AffixMgr::compound_check(const char * word
, int len
,
1521 short wordnum
, short numsyllable
, short maxwordnum
, short wnum
, hentry
** words
= NULL
,
1522 char hu_mov_rule
= 0, char is_sug
= 0)
// saved counters: restored before the next alternative / break position
1525 short oldnumsyllable
, oldnumsyllable2
, oldwordnum
, oldwordnum2
;
1526 struct hentry
* rv
= NULL
;
1527 struct hentry
* rv_first
;
1528 struct hentry
* rwords
[MAXWORDLEN
]; // buffer for COMPOUND pattern checking
// st: modifiable working copy of word
1529 char st
[MAXWORDUTF8LEN
+ 4];
1539 int checkedstriple
= 0;
1543 setcminmax(&cmin
, &cmax
, word
, len
);
// main loop over candidate break positions
1547 for (i
= cmin
; i
< cmax
; i
++) {
1549 oldnumsyllable
= numsyllable
;
1550 oldwordnum
= wordnum
;
1553 // go to end of the UTF-8 character
1555 for (; (st
[i
] & 0xc0) == 0x80; i
++);
1556 if (i
>= cmax
) return NULL
;
1559 do { // simplified checkcompoundpattern loop
// find the next table entry (1-based index scpd) that has a pattern3 which
// matches the text at word + i
1562 for (; scpd
<= numcheckcpd
&& (!checkcpdtable
[scpd
-1].pattern3
||
1563 strncmp(word
+ i
, checkcpdtable
[scpd
-1].pattern3
, strlen(checkcpdtable
[scpd
-1].pattern3
)) != 0); scpd
++);
1565 if (scpd
> numcheckcpd
) break; // break simplified checkcompoundpattern loop
// rewrite st at the break: pattern, then pattern2, then the rest of word
// that follows pattern3 (soldi is assigned in a line not shown here)
// NOTE(review): no bounds check against the size of st is visible here
1566 strcpy(st
+ i
, checkcpdtable
[scpd
-1].pattern
);
1568 i
+= strlen(checkcpdtable
[scpd
-1].pattern
);
1569 strcpy(st
+ i
, checkcpdtable
[scpd
-1].pattern2
);
1570 strcpy(st
+ i
+ strlen(checkcpdtable
[scpd
-1].pattern2
), word
+ soldi
+ strlen(checkcpdtable
[scpd
-1].pattern3
));
// adjust len by the size difference of the replacement and recompute limits
1573 len
+= strlen(checkcpdtable
[scpd
-1].pattern
) + strlen(checkcpdtable
[scpd
-1].pattern2
) - strlen(checkcpdtable
[scpd
-1].pattern3
);
1576 setcminmax(&cmin
, &cmax
, st
, len
);
1578 cmax
= len
- cpdmin
+ 1;
// ---- FIRST PART ----
1590 rv
= lookup(st
); // perhaps without prefix
1592 // search homonym with compound flag
// skip homonyms that need an affix, that carry none of the usable compound
// flags / COMPOUNDRULE matches, or that fail the cond flag of the active
// CHECKCOMPOUNDPATTERN entry
1593 while ((rv
) && !hu_mov_rule
&&
1594 ((needaffix
&& TESTAFF(rv
->astr
, needaffix
, rv
->alen
)) ||
1595 !((compoundflag
&& !words
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
1596 (compoundbegin
&& !wordnum
&&
1597 TESTAFF(rv
->astr
, compoundbegin
, rv
->alen
)) ||
1598 (compoundmiddle
&& wordnum
&& !words
&&
1599 TESTAFF(rv
->astr
, compoundmiddle
, rv
->alen
)) ||
1601 ((!words
&& !wordnum
&& defcpd_check(&words
, wnum
, rv
, (hentry
**) &rwords
, 0)) ||
1602 (words
&& defcpd_check(&words
, wnum
, rv
, (hentry
**) &rwords
, 0))))) ||
1603 (scpd
!= 0 && checkcpdtable
[scpd
-1].cond
!= FLAG_NULL
&&
1604 !TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond
, rv
->alen
)))
1606 rv
= rv
->next_homonym
;
// no usable dictionary entry: try the first part as an affixed form,
// first with compoundflag ...
1611 !(rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundflag
))) {
// the suffix result is examined for compoundforbidflag / compoundend in the
// continuation flags of the matched suffix (sfx)
1612 if ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
,
1613 FLAG_NULL
, compoundflag
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) && !hu_mov_rule
&&
1614 ((SfxEntry
*)sfx
)->getCont() &&
1615 ((compoundforbidflag
&& TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundforbidflag
,
1616 ((SfxEntry
*)sfx
)->getContLen())) || (compoundend
&&
1617 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundend
,
1618 ((SfxEntry
*)sfx
)->getContLen())))) {
// ... then with compoundbegin (first part) or compoundmiddle (later parts)
1624 (((wordnum
== 0) && compoundbegin
&&
1625 ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, compoundbegin
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) ||
1626 (rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundbegin
)))) ||
1627 ((wordnum
> 0) && compoundmiddle
&&
1628 ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, compoundmiddle
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) ||
1629 (rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundmiddle
)))))
1630 ) checked_prefix
= 1;
1631 // else check forbiddenwords and needaffix
1632 } else if (rv
->astr
&& (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
1633 TESTAFF(rv
->astr
, needaffix
, rv
->alen
) ||
1634 (is_sug
&& nosuggest
&& TESTAFF(rv
->astr
, nosuggest
, rv
->alen
))
1640 // check non_compound flag in suffix and prefix
1641 if ((rv
) && !hu_mov_rule
&&
1642 ((pfx
&& ((PfxEntry
*)pfx
)->getCont() &&
1643 TESTAFF(((PfxEntry
*)pfx
)->getCont(), compoundforbidflag
,
1644 ((PfxEntry
*)pfx
)->getContLen())) ||
1645 (sfx
&& ((SfxEntry
*)sfx
)->getCont() &&
1646 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundforbidflag
,
1647 ((SfxEntry
*)sfx
)->getContLen())))) {
1651 // check compoundend flag in suffix and prefix
1652 if ((rv
) && !checked_prefix
&& compoundend
&& !hu_mov_rule
&&
1653 ((pfx
&& ((PfxEntry
*)pfx
)->getCont() &&
1654 TESTAFF(((PfxEntry
*)pfx
)->getCont(), compoundend
,
1655 ((PfxEntry
*)pfx
)->getContLen())) ||
1656 (sfx
&& ((SfxEntry
*)sfx
)->getCont() &&
1657 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundend
,
1658 ((SfxEntry
*)sfx
)->getContLen())))) {
1662 // check compoundmiddle flag in suffix and prefix
1663 if ((rv
) && !checked_prefix
&& (wordnum
==0) && compoundmiddle
&& !hu_mov_rule
&&
1664 ((pfx
&& ((PfxEntry
*)pfx
)->getCont() &&
1665 TESTAFF(((PfxEntry
*)pfx
)->getCont(), compoundmiddle
,
1666 ((PfxEntry
*)pfx
)->getContLen())) ||
1667 (sfx
&& ((SfxEntry
*)sfx
)->getCont() &&
1668 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundmiddle
,
1669 ((SfxEntry
*)sfx
)->getContLen())))) {
1673 // check forbiddenwords
1674 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
1675 (is_sug
&& nosuggest
&& TESTAFF(rv
->astr
, nosuggest
, rv
->alen
)))) {
1679 // increment word number, if the second root has a compoundroot flag
1680 if ((rv
) && compoundroot
&&
1681 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
1685 // first word is acceptable in compound words?
// accepted when an affixed form was found, a COMPOUNDRULE component was
// recorded, or rv carries the compound flag that fits its position
// (oldwordnum is the part count saved at the top of this iteration)
1687 ( checked_prefix
|| (words
&& words
[wnum
]) ||
1688 (compoundflag
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
1689 ((oldwordnum
== 0) && compoundbegin
&& TESTAFF(rv
->astr
, compoundbegin
, rv
->alen
)) ||
1690 ((oldwordnum
> 0) && compoundmiddle
&& TESTAFF(rv
->astr
, compoundmiddle
, rv
->alen
))// ||
1693 // LANG_hu section: spec. Hungarian rule
1694 || ((langnum
== LANG_hu
) && hu_mov_rule
&& (
1695 TESTAFF(rv
->astr
, 'F', rv
->alen
) || // XXX hardwired Hungarian dictionary codes
1696 TESTAFF(rv
->astr
, 'G', rv
->alen
) ||
1697 TESTAFF(rv
->astr
, 'H', rv
->alen
)
1700 // END of LANG_hu section
1703 // test CHECKCOMPOUNDPATTERN conditions
1704 scpd
== 0 || checkcpdtable
[scpd
-1].cond
== FLAG_NULL
||
1705 TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond
, rv
->alen
)
// reject when the same byte would appear three times in a row across the
// boundary (compares bytes, not characters)
1707 && ! (( checkcompoundtriple
&& scpd
== 0 && !words
&& // test triple letters
1708 (word
[i
-1]==word
[i
]) && (
1709 ((i
>1) && (word
[i
-1]==word
[i
-2])) ||
1710 ((word
[i
-1]==word
[i
+1])) // may be word[i+1] == '\0'
1714 checkcompoundcase
&& scpd
== 0 && !words
&& cpdcase_check(word
, i
)
1717 // LANG_hu section: spec. Hungarian rule
1718 || ((!rv
) && (langnum
== LANG_hu
) && hu_mov_rule
&& (rv
= affix_check(st
,i
)) &&
1719 (sfx
&& ((SfxEntry
*)sfx
)->getCont() && ( // XXX hardwired Hungarian dic. codes
1720 TESTAFF(((SfxEntry
*)sfx
)->getCont(), (unsigned short) 'x', ((SfxEntry
*)sfx
)->getContLen()) ||
1721 TESTAFF(((SfxEntry
*)sfx
)->getCont(), (unsigned short) '%', ((SfxEntry
*)sfx
)->getContLen())
1725 ) { // first word is ok condition
1727 // LANG_hu section: spec. Hungarian rule
1728 if (langnum
== LANG_hu
) {
1729 // calculate syllable number of the word
1730 numsyllable
+= get_syllable(st
, i
);
1732 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1733 if (pfx
&& (get_syllable(((PfxEntry
*)pfx
)->getKey(),strlen(((PfxEntry
*)pfx
)->getKey())) > 1)) wordnum
++;
1735 // END of LANG_hu section
// ---- SECOND PART ----
1742 do { // striple loop
1744 // check simplifiedtriple
1745 if (simplifiedtriple
) {
1748 i
--; // check "fahrt" instead of "ahrt" in "Schiffahrt"
// a doubled letter right before the boundary marks a candidate for the
// simplified-triple retry
1749 } else if (i
> 2 && *(word
+i
- 1) == *(word
+ i
- 2)) striple
= 1;
1752 rv
= lookup((st
+i
)); // perhaps without prefix
1754 // search homonym with compound flag
1755 while ((rv
) && ((needaffix
&& TESTAFF(rv
->astr
, needaffix
, rv
->alen
)) ||
1756 !((compoundflag
&& !words
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
1757 (compoundend
&& !words
&& TESTAFF(rv
->astr
, compoundend
, rv
->alen
)) ||
1758 (numdefcpd
&& words
&& defcpd_check(&words
, wnum
+ 1, rv
, NULL
,1))) ||
1759 (scpd
!= 0 && checkcpdtable
[scpd
-1].cond2
!= FLAG_NULL
&&
1760 !TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond2
, rv
->alen
))
1762 rv
= rv
->next_homonym
;
// a COMPOUNDRULE match for the second part (recorded by defcpd_check) is
// accepted straight away
1765 if (rv
&& words
&& words
[wnum
+ 1]) return rv_first
;
1767 oldnumsyllable2
= numsyllable
;
1768 oldwordnum2
= wordnum
;
1770 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
1771 if ((rv
) && (langnum
== LANG_hu
) && (TESTAFF(rv
->astr
, 'I', rv
->alen
)) && !(TESTAFF(rv
->astr
, 'J', rv
->alen
))) {
1774 // END of LANG_hu section
1776 // increment word number, if the second root has a compoundroot flag
1777 if ((rv
) && (compoundroot
) &&
1778 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
1782 // check forbiddenwords
1783 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
1784 (is_sug
&& nosuggest
&& TESTAFF(rv
->astr
, nosuggest
, rv
->alen
)))) return NULL
;
1786 // second word is acceptable, as a root?
1787 // hungarian conventions: compounding is acceptable,
1788 // when compound forms consist of 2 words, or if more,
1789 // then the syllable number of root words must be 6, or lesser.
1792 (compoundflag
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
1793 (compoundend
&& TESTAFF(rv
->astr
, compoundend
, rv
->alen
))
// size limits: cpdwordmax == -1 disables the word-count limit; otherwise
// the syllable limit may still allow the compound
1796 ((cpdwordmax
==-1) || (wordnum
+1<cpdwordmax
)) ||
1797 ((cpdmaxsyllable
!=0) &&
1798 (numsyllable
+ get_syllable(HENTRY_WORD(rv
), rv
->clen
)<=cpdmaxsyllable
))
1801 // test CHECKCOMPOUNDPATTERN
1802 !numcheckcpd
|| scpd
!= 0 || !cpdpat_check(word
, i
, rv_first
, rv
)
// CHECKCOMPOUNDDUP: the two parts must not be the same dictionary entry
1805 (!checkcompounddup
|| (rv
!= rv_first
))
1807 // test CHECKCOMPOUNDPATTERN conditions
1808 && (scpd
== 0 || checkcpdtable
[scpd
-1].cond2
== FLAG_NULL
||
1809 TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond2
, rv
->alen
))
1812 // forbid compound word, if it is a non compound word with typical fault
1813 if (checkcompoundrep
&& cpdrep_check(word
,len
)) return NULL
;
// restore the counters saved before the root test
1817 numsyllable
= oldnumsyllable2
;
1818 wordnum
= oldwordnum2
;
1820 // perhaps second word has prefix or/and suffix
1822 sfxflag
= FLAG_NULL
;
1823 rv
= (compoundflag
) ? affix_check((word
+i
),strlen(word
+i
), compoundflag
, IN_CPD_END
) : NULL
;
1824 if (!rv
&& compoundend
) {
1827 rv
= affix_check((word
+i
),strlen(word
+i
), compoundend
, IN_CPD_END
);
// COMPOUNDRULE path: affixed second part without a required flag
1830 if (!rv
&& numdefcpd
&& words
) {
1831 rv
= affix_check((word
+i
),strlen(word
+i
), 0, IN_CPD_END
);
1832 if (rv
&& defcpd_check(&words
, wnum
+ 1, rv
, NULL
, 1)) return rv_first
;
1836 // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1837 if (rv
&& !(scpd
== 0 || checkcpdtable
[scpd
-1].cond2
== FLAG_NULL
||
1838 TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond2
, rv
->alen
))) rv
= NULL
;
1840 // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1841 if (rv
&& numcheckcpd
&& scpd
== 0 && cpdpat_check(word
, i
, rv_first
, rv
)) rv
= NULL
;
1843 // check non_compound flag in suffix and prefix
1845 ((pfx
&& ((PfxEntry
*)pfx
)->getCont() &&
1846 TESTAFF(((PfxEntry
*)pfx
)->getCont(), compoundforbidflag
,
1847 ((PfxEntry
*)pfx
)->getContLen())) ||
1848 (sfx
&& ((SfxEntry
*)sfx
)->getCont() &&
1849 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundforbidflag
,
1850 ((SfxEntry
*)sfx
)->getContLen())))) {
1854 // check forbiddenwords
1855 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
1856 (is_sug
&& nosuggest
&& TESTAFF(rv
->astr
, nosuggest
, rv
->alen
)))) return NULL
;
1858 // pfxappnd = prefix of word+i, or NULL
1859 // calculate syllable number of prefix.
1860 // hungarian convention: when syllable number of prefix is more,
1861 // than 1, the prefix+word counts as two words.
1863 if (langnum
== LANG_hu
) {
1864 // calculate syllable number of the word
1865 numsyllable
+= get_syllable(word
+ i
, strlen(word
+ i
));
1867 // - affix syllable num.
1868 // XXX only second suffix (inflections, not derivations)
// tmp comes from myrevstrdup(sfxappnd); NOTE(review): the matching release
// of tmp is not visible in this listing -- confirm it is freed
1870 char * tmp
= myrevstrdup(sfxappnd
);
1871 numsyllable
-= get_syllable(tmp
, strlen(tmp
));
1875 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1876 if (pfx
&& (get_syllable(((PfxEntry
*)pfx
)->getKey(),strlen(((PfxEntry
*)pfx
)->getKey())) > 1)) wordnum
++;
1878 // increment syllable num, if last word has a SYLLABLENUM flag
1879 // and the suffix is beginning `s'
1881 if (cpdsyllablenum
) {
1883 case 'c': { numsyllable
+=2; break; }
1884 case 'J': { numsyllable
+= 1; break; }
1885 case 'I': { if (TESTAFF(rv
->astr
, 'J', rv
->alen
)) numsyllable
+= 1; break; }
1890 // increment word number, if the second word has a compoundroot flag
1891 if ((rv
) && (compoundroot
) &&
1892 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
1896 // second word is acceptable, as a word with prefix or/and suffix?
1897 // hungarian conventions: compounding is acceptable,
1898 // when compound forms consist 2 word, otherwise
1899 // the syllable number of root words is 6, or lesser.
1902 ((cpdwordmax
== -1) || (wordnum
+ 1 < cpdwordmax
)) ||
1903 ((cpdmaxsyllable
!= 0) &&
1904 (numsyllable
<= cpdmaxsyllable
))
1907 (!checkcompounddup
|| (rv
!= rv_first
))
1909 // forbid compound word, if it is a non compound word with typical fault
1910 if (checkcompoundrep
&& cpdrep_check(word
, len
)) return NULL
;
// restore the counters saved before the affixed-form test
1914 numsyllable
= oldnumsyllable2
;
1915 wordnum
= oldwordnum2
;
1917 // perhaps second word is a compound word (recursive call)
1918 if (wordnum
< maxwordnum
) {
1919 rv
= compound_check((st
+i
),strlen(st
+i
), wordnum
+1,
1920 numsyllable
, maxwordnum
, wnum
+ 1, words
, 0, is_sug
);
// with no replacement active (scpd == 0) a cpdpat_check hit forbids the
// compound; with a replacement active (scpd != 0) a hit is required
1921 if (rv
&& numcheckcpd
&& (scpd
== 0 && cpdpat_check(word
, i
, rv_first
, rv
) ||
1922 scpd
!= 0 && !cpdpat_check(word
, i
, rv_first
, rv
))) rv
= NULL
;
1927 // forbid compound word, if it is a non compound word with typical fault
1928 if (checkcompoundrep
&& cpdrep_check(word
, len
)) return NULL
;
1931 } while (striple
&& !checkedstriple
); // end of striple loop
1933 if (checkedstriple
) {
1939 } // first word is ok condition
1950 } while (simplifiedcpd
&& scpd
<= numcheckcpd
); // end of simplifiedcpd loop
// reset the working copy and the counters before the next break position
1954 strcpy(st
, word
); // XXX add more optim.
1959 wordnum
= oldwordnum
;
1960 numsyllable
= oldnumsyllable
;
1966 // check if compound word is correctly spelled
1967 // hu_mov_rule = spec. Hungarian rule (XXX)
// compound_check_morph: walks the same break positions as compound_check but,
// instead of returning the matching entry, appends a morphological
// description of every accepted split to *result.
//   word, len    - word to analyse and its length in bytes
//   wordnum      - number of parts accepted so far
//   numsyllable  - running syllable count used together with cpdmaxsyllable
//   maxwordnum   - recursion is attempted only while wordnum < maxwordnum
//   wnum, words  - component index and buffer handed to defcpd_check
//   hu_mov_rule  - enables the special LANG_hu branches
//   result       - address of the output string; extended with strcat/sprintf
//   partresult   - description of the parts found by the caller; copied to
//                  the front of the local presult buffer when non-NULL
// NOTE(review): a large number of lines are missing from this listing (braces,
// declarations of i, cmin, cmax, ok, p, m, checked_prefix, many `if (`
// headers and most return/continue statements). Only `return 0` is visible.
// NOTE(review): presult (MAXLNLEN bytes) and *result are extended with
// strcat/sprintf and no length check is visible in this listing -- confirm
// the callers guarantee sufficient room.
1968 int AffixMgr::compound_check_morph(const char * word
, int len
,
1969 short wordnum
, short numsyllable
, short maxwordnum
, short wnum
, hentry
** words
,
1970 char hu_mov_rule
= 0, char ** result
= NULL
, char * partresult
= NULL
)
1973 short oldnumsyllable
, oldnumsyllable2
, oldwordnum
, oldwordnum2
;
1976 struct hentry
* rv
= NULL
;
1977 struct hentry
* rv_first
;
1978 struct hentry
* rwords
[MAXWORDLEN
]; // buffer for COMPOUND pattern checking
1979 char st
[MAXWORDUTF8LEN
+ 4];
// presult: description of the parts accepted so far on this path
1983 char presult
[MAXLNLEN
];
1988 setcminmax(&cmin
, &cmax
, word
, len
);
// main loop over candidate break positions
1992 for (i
= cmin
; i
< cmax
; i
++) {
1993 oldnumsyllable
= numsyllable
;
1994 oldwordnum
= wordnum
;
1997 // go to end of the UTF-8 character
1999 for (; (st
[i
] & 0xc0) == 0x80; i
++);
2000 if (i
>= cmax
) return 0;
// start from the caller's partial description, if any
2009 if (partresult
) strcat(presult
, partresult
);
// ---- FIRST PART ----
2011 rv
= lookup(st
); // perhaps without prefix
2013 // search homonym with compound flag
2014 while ((rv
) && !hu_mov_rule
&&
2015 ((needaffix
&& TESTAFF(rv
->astr
, needaffix
, rv
->alen
)) ||
2016 !((compoundflag
&& !words
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
2017 (compoundbegin
&& !wordnum
&&
2018 TESTAFF(rv
->astr
, compoundbegin
, rv
->alen
)) ||
2019 (compoundmiddle
&& wordnum
&& !words
&&
2020 TESTAFF(rv
->astr
, compoundmiddle
, rv
->alen
)) ||
2022 ((!words
&& !wordnum
&& defcpd_check(&words
, wnum
, rv
, (hentry
**) &rwords
, 0)) ||
2023 (words
&& defcpd_check(&words
, wnum
, rv
, (hentry
**) &rwords
, 0))))
2025 rv
= rv
->next_homonym
;
// describe the first part: MORPH_PART, then MORPH_STEM unless the entry
// already provides one, then the entry's own data
2029 sprintf(presult
+ strlen(presult
), "%c%s%s", MSEP_FLD
, MORPH_PART
, st
);
2030 if (!HENTRY_FIND(rv
, MORPH_STEM
)) {
2031 sprintf(presult
+ strlen(presult
), "%c%s%s", MSEP_FLD
, MORPH_STEM
, st
);
2033 // store the pointer of the hash entry
2034 // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv);
2035 if (HENTRY_DATA(rv
)) {
2036 sprintf(presult
+ strlen(presult
), "%c%s", MSEP_FLD
, HENTRY_DATA2(rv
));
// no usable dictionary entry: try the first part as an affixed form
2041 !(rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundflag
))) {
2042 if ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
,
2043 FLAG_NULL
, compoundflag
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) && !hu_mov_rule
&&
2044 ((SfxEntry
*)sfx
)->getCont() &&
2045 ((compoundforbidflag
&& TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundforbidflag
,
2046 ((SfxEntry
*)sfx
)->getContLen())) || (compoundend
&&
2047 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundend
,
2048 ((SfxEntry
*)sfx
)->getContLen())))) {
2054 (((wordnum
== 0) && compoundbegin
&&
2055 ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, compoundbegin
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) ||
2056 (rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundbegin
)))) ||
2057 ((wordnum
> 0) && compoundmiddle
&&
2058 ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, compoundmiddle
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) ||
2059 (rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundmiddle
)))))
2061 // char * p = prefix_check_morph(st, i, 0, compound);
// fetch the affix analysis of the first part, trying compoundflag first and
// then the flag that fits its position
2063 if (compoundflag
) p
= affix_check_morph(st
, i
, compoundflag
);
2064 if (!p
|| (*p
== '\0')) {
2067 if ((wordnum
== 0) && compoundbegin
) {
2068 p
= affix_check_morph(st
, i
, compoundbegin
);
2069 } else if ((wordnum
> 0) && compoundmiddle
) {
2070 p
= affix_check_morph(st
, i
, compoundmiddle
);
2073 if (p
&& (*p
!= '\0')) {
2074 sprintf(presult
+ strlen(presult
), "%c%s%s%s", MSEP_FLD
,
2075 MORPH_PART
, st
, line_uniq_app(&p
, MSEP_REC
));
2080 // else check forbiddenwords
2081 } else if (rv
->astr
&& (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
2082 TESTAFF(rv
->astr
, needaffix
, rv
->alen
))) {
2087 // check non_compound flag in suffix and prefix
2088 if ((rv
) && !hu_mov_rule
&&
2089 ((pfx
&& ((PfxEntry
*)pfx
)->getCont() &&
2090 TESTAFF(((PfxEntry
*)pfx
)->getCont(), compoundforbidflag
,
2091 ((PfxEntry
*)pfx
)->getContLen())) ||
2092 (sfx
&& ((SfxEntry
*)sfx
)->getCont() &&
2093 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundforbidflag
,
2094 ((SfxEntry
*)sfx
)->getContLen())))) {
2098 // check compoundend flag in suffix and prefix
2099 if ((rv
) && !checked_prefix
&& compoundend
&& !hu_mov_rule
&&
2100 ((pfx
&& ((PfxEntry
*)pfx
)->getCont() &&
2101 TESTAFF(((PfxEntry
*)pfx
)->getCont(), compoundend
,
2102 ((PfxEntry
*)pfx
)->getContLen())) ||
2103 (sfx
&& ((SfxEntry
*)sfx
)->getCont() &&
2104 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundend
,
2105 ((SfxEntry
*)sfx
)->getContLen())))) {
2109 // check compoundmiddle flag in suffix and prefix
2110 if ((rv
) && !checked_prefix
&& (wordnum
==0) && compoundmiddle
&& !hu_mov_rule
&&
2111 ((pfx
&& ((PfxEntry
*)pfx
)->getCont() &&
2112 TESTAFF(((PfxEntry
*)pfx
)->getCont(), compoundmiddle
,
2113 ((PfxEntry
*)pfx
)->getContLen())) ||
2114 (sfx
&& ((SfxEntry
*)sfx
)->getCont() &&
2115 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundmiddle
,
2116 ((SfxEntry
*)sfx
)->getContLen())))) {
2120 // check forbiddenwords
2121 if ((rv
) && (rv
->astr
) && TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
)) continue;
2123 // increment word number, if the second root has a compoundroot flag
2124 if ((rv
) && (compoundroot
) &&
2125 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
2129 // first word is acceptable in compound words?
2131 ( checked_prefix
|| (words
&& words
[wnum
]) ||
2132 (compoundflag
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
2133 ((oldwordnum
== 0) && compoundbegin
&& TESTAFF(rv
->astr
, compoundbegin
, rv
->alen
)) ||
2134 ((oldwordnum
> 0) && compoundmiddle
&& TESTAFF(rv
->astr
, compoundmiddle
, rv
->alen
))
2135 // LANG_hu section: spec. Hungarian rule
2136 || ((langnum
== LANG_hu
) && // hu_mov_rule
2138 TESTAFF(rv
->astr
, 'F', rv
->alen
) ||
2139 TESTAFF(rv
->astr
, 'G', rv
->alen
) ||
2140 TESTAFF(rv
->astr
, 'H', rv
->alen
)
2143 // END of LANG_hu section
// reject when the same byte would appear three times in a row across the
// boundary (compares bytes, not characters)
2145 && ! (( checkcompoundtriple
&& !words
&& // test triple letters
2146 (word
[i
-1]==word
[i
]) && (
2147 ((i
>1) && (word
[i
-1]==word
[i
-2])) ||
2148 ((word
[i
-1]==word
[i
+1])) // may be word[i+1] == '\0'
2152 // test CHECKCOMPOUNDPATTERN
2153 numcheckcpd
&& !words
&& cpdpat_check(word
, i
, rv
, NULL
)
2156 checkcompoundcase
&& !words
&& cpdcase_check(word
, i
)
2159 // LANG_hu section: spec. Hungarian rule
2160 || ((!rv
) && (langnum
== LANG_hu
) && hu_mov_rule
&& (rv
= affix_check(st
,i
)) &&
2161 (sfx
&& ((SfxEntry
*)sfx
)->getCont() && (
2162 TESTAFF(((SfxEntry
*)sfx
)->getCont(), (unsigned short) 'x', ((SfxEntry
*)sfx
)->getContLen()) ||
2163 TESTAFF(((SfxEntry
*)sfx
)->getCont(), (unsigned short) '%', ((SfxEntry
*)sfx
)->getContLen())
2167 // END of LANG_hu section
2170 // LANG_hu section: spec. Hungarian rule
2171 if (langnum
== LANG_hu
) {
2172 // calculate syllable number of the word
2173 numsyllable
+= get_syllable(st
, i
);
2175 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2176 if (pfx
&& (get_syllable(((PfxEntry
*)pfx
)->getKey(),strlen(((PfxEntry
*)pfx
)->getKey())) > 1)) wordnum
++;
2178 // END of LANG_hu section
// ---- SECOND PART ----
2182 rv
= lookup((word
+i
)); // perhaps without prefix
2184 // search homonym with compound flag
// NOTE(review): the matching loop in compound_check guards this
// defcpd_check call with `words &&`; here it is called even when words is
// NULL -- confirm defcpd_check tolerates that
2185 while ((rv
) && ((needaffix
&& TESTAFF(rv
->astr
, needaffix
, rv
->alen
)) ||
2186 !((compoundflag
&& !words
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
2187 (compoundend
&& !words
&& TESTAFF(rv
->astr
, compoundend
, rv
->alen
)) ||
2188 (numdefcpd
&& defcpd_check(&words
, wnum
+ 1, rv
, NULL
,1))))) {
2189 rv
= rv
->next_homonym
;
// COMPOUNDRULE match for the second part: emit the complete description
2192 if (rv
&& words
&& words
[wnum
+ 1]) {
2193 strcat(*result
, presult
);
2194 strcat(*result
, " ");
2195 strcat(*result
, MORPH_PART
);
2196 strcat(*result
, word
+i
);
// the entry's data goes before the stem field with complexprefixes, and
// after it otherwise
2197 if (complexprefixes
&& HENTRY_DATA(rv
)) strcat(*result
, HENTRY_DATA2(rv
));
2198 if (!HENTRY_FIND(rv
, MORPH_STEM
)) {
2199 strcat(*result
, " ");
2200 strcat(*result
, MORPH_STEM
);
2201 strcat(*result
, HENTRY_WORD(rv
));
2203 // store the pointer of the hash entry
2204 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2205 if (!complexprefixes
&& HENTRY_DATA(rv
)) {
2206 strcat(*result
, " ");
2207 strcat(*result
, HENTRY_DATA2(rv
));
2209 strcat(*result
, "\n");
2214 oldnumsyllable2
= numsyllable
;
2215 oldwordnum2
= wordnum
;
2217 // LANG_hu section: spec. Hungarian rule
2218 if ((rv
) && (langnum
== LANG_hu
) && (TESTAFF(rv
->astr
, 'I', rv
->alen
)) && !(TESTAFF(rv
->astr
, 'J', rv
->alen
))) {
2221 // END of LANG_hu section
2222 // increment word number, if the second root has a compoundroot flag
2223 if ((rv
) && (compoundroot
) &&
2224 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
2228 // check forbiddenwords
2229 if ((rv
) && (rv
->astr
) && TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
)) {
2234 // second word is acceptable, as a root?
2235 // hungarian conventions: compounding is acceptable,
2236 // when compound forms consist of 2 words, or if more,
2237 // then the syllable number of root words must be 6, or lesser.
2239 (compoundflag
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
2240 (compoundend
&& TESTAFF(rv
->astr
, compoundend
, rv
->alen
))
2243 ((cpdwordmax
==-1) || (wordnum
+1<cpdwordmax
)) ||
2244 ((cpdmaxsyllable
!=0) &&
// NOTE(review): compound_check passes rv->clen at the corresponding place,
// this function passes rv->blen -- confirm which length is intended
2245 (numsyllable
+get_syllable(HENTRY_WORD(rv
),rv
->blen
)<=cpdmaxsyllable
))
2248 (!checkcompounddup
|| (rv
!= rv_first
))
2252 // bad compound word
2253 strcat(*result
, presult
);
2254 strcat(*result
, " ");
2255 strcat(*result
, MORPH_PART
);
2256 strcat(*result
, word
+i
);
2258 if (HENTRY_DATA(rv
)) {
2259 if (complexprefixes
) strcat(*result
, HENTRY_DATA2(rv
));
2260 if (! HENTRY_FIND(rv
, MORPH_STEM
)) {
2261 strcat(*result
, " ");
2262 strcat(*result
, MORPH_STEM
);
2263 strcat(*result
, HENTRY_WORD(rv
));
2265 // store the pointer of the hash entry
2266 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2267 if (!complexprefixes
) {
2268 strcat(*result
, " ");
2269 strcat(*result
, HENTRY_DATA2(rv
));
2272 strcat(*result
, "\n");
// restore the counters saved before the root test
2276 numsyllable
= oldnumsyllable2
;
2277 wordnum
= oldwordnum2
;
2279 // perhaps second word has prefix or/and suffix
2281 sfxflag
= FLAG_NULL
;
2283 if (compoundflag
) rv
= affix_check((word
+i
),strlen(word
+i
), compoundflag
); else rv
= NULL
;
2285 if (!rv
&& compoundend
) {
2288 rv
= affix_check((word
+i
),strlen(word
+i
), compoundend
);
// COMPOUNDRULE path: affixed second part without a required flag
2291 if (!rv
&& numdefcpd
&& words
) {
2292 rv
= affix_check((word
+i
),strlen(word
+i
), 0, IN_CPD_END
);
2293 if (rv
&& words
&& defcpd_check(&words
, wnum
+ 1, rv
, NULL
, 1)) {
2295 if (compoundflag
) m
= affix_check_morph((word
+i
),strlen(word
+i
), compoundflag
);
2296 if ((!m
|| *m
== '\0') && compoundend
) {
2298 m
= affix_check_morph((word
+i
),strlen(word
+i
), compoundend
);
2300 strcat(*result
, presult
);
// NOTE(review): `m || (*m != '\0')` dereferences m when it is NULL; the
// parallel test further down in this function uses `m && (*m != '\0')`,
// which is most likely what was intended here as well
2301 if (m
|| (*m
!= '\0')) {
2302 sprintf(*result
+ strlen(*result
), "%c%s%s%s", MSEP_FLD
,
2303 MORPH_PART
, word
+ i
, line_uniq_app(&m
, MSEP_REC
));
2306 strcat(*result
, "\n");
2311 // check non_compound flag in suffix and prefix
2313 ((pfx
&& ((PfxEntry
*)pfx
)->getCont() &&
2314 TESTAFF(((PfxEntry
*)pfx
)->getCont(), compoundforbidflag
,
2315 ((PfxEntry
*)pfx
)->getContLen())) ||
2316 (sfx
&& ((SfxEntry
*)sfx
)->getCont() &&
2317 TESTAFF(((SfxEntry
*)sfx
)->getCont(), compoundforbidflag
,
2318 ((SfxEntry
*)sfx
)->getContLen())))) {
2322 // check forbiddenwords
2323 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
,forbiddenword
,rv
->alen
))
2324 && (! TESTAFF(rv
->astr
, needaffix
, rv
->alen
))) {
2329 if (langnum
== LANG_hu
) {
2330 // calculate syllable number of the word
2331 numsyllable
+= get_syllable(word
+ i
, strlen(word
+ i
));
2333 // - affix syllable num.
2334 // XXX only second suffix (inflections, not derivations)
// tmp comes from myrevstrdup(sfxappnd); NOTE(review): the matching release
// of tmp is not visible in this listing -- confirm it is freed
2336 char * tmp
= myrevstrdup(sfxappnd
);
2337 numsyllable
-= get_syllable(tmp
, strlen(tmp
));
2341 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2342 if (pfx
&& (get_syllable(((PfxEntry
*)pfx
)->getKey(),strlen(((PfxEntry
*)pfx
)->getKey())) > 1)) wordnum
++;
2344 // increment syllable num, if last word has a SYLLABLENUM flag
2345 // and the suffix is beginning `s'
2347 if (cpdsyllablenum
) {
2349 case 'c': { numsyllable
+=2; break; }
2350 case 'J': { numsyllable
+= 1; break; }
2351 case 'I': { if (rv
&& TESTAFF(rv
->astr
, 'J', rv
->alen
)) numsyllable
+= 1; break; }
2356 // increment word number, if the second word has a compoundroot flag
2357 if ((rv
) && (compoundroot
) &&
2358 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
2361 // second word is acceptable, as a word with prefix or/and suffix?
2362 // hungarian conventions: compounding is acceptable,
2363 // when compound forms consist 2 word, otherwise
2364 // the syllable number of root words is 6, or lesser.
2367 ((cpdwordmax
==-1) || (wordnum
+1<cpdwordmax
)) ||
2368 ((cpdmaxsyllable
!=0) &&
2369 (numsyllable
<= cpdmaxsyllable
))
2372 (!checkcompounddup
|| (rv
!= rv_first
))
// emit the description of the affixed second part
2375 if (compoundflag
) m
= affix_check_morph((word
+i
),strlen(word
+i
), compoundflag
);
2376 if ((!m
|| *m
== '\0') && compoundend
) {
2378 m
= affix_check_morph((word
+i
),strlen(word
+i
), compoundend
);
2380 strcat(*result
, presult
);
2381 if (m
&& (*m
!= '\0')) {
2382 sprintf(*result
+ strlen(*result
), "%c%s%s%s", MSEP_FLD
,
2383 MORPH_PART
, word
+ i
, line_uniq_app(&m
, MSEP_REC
));
2386 sprintf(*result
+ strlen(*result
), "%c", MSEP_REC
);
// restore the counters saved before the affixed-form test
2390 numsyllable
= oldnumsyllable2
;
2391 wordnum
= oldwordnum2
;
2393 // perhaps second word is a compound word (recursive call)
// the recursion receives presult as its partresult, so deeper levels
// continue the description built so far
2394 if ((wordnum
< maxwordnum
) && (ok
== 0)) {
2395 compound_check_morph((word
+i
),strlen(word
+i
), wordnum
+1,
2396 numsyllable
, maxwordnum
, wnum
+ 1, words
, 0, result
, presult
);
// reset the counters before the next break position
2402 wordnum
= oldwordnum
;
2403 numsyllable
= oldnumsyllable
;
2408 // return 1 if s1 (reversed) is a leading subset of end of s2
//   s1        - NUL-terminated key to match
//   end_of_s2 - pointer to the character of s2 where the comparison starts
//   len       - the loop continues only while len > 0
// The result is non-zero exactly when the loop stopped on the terminating NUL
// of s1.
// NOTE(review): lines are missing from this listing (the loop body that
// advances s1 / end_of_s2 / len, the braces, and the closing of the
// commented-out version below), so the stepping direction is not visible.
// The next three lines are the retired version, kept inside a block comment
// by the original authors: it required exact character equality.
2409 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2411 while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
2416 return (*s1 == '\0');
// active version: besides equal characters, a '.' in s1 matches any character
2420 inline int AffixMgr::isRevSubset(const char * s1
, const char * end_of_s2
, int len
)
2422 while ((len
> 0) && (*s1
!= '\0') && ((*s1
== *end_of_s2
) || (*s1
== '.'))) {
2427 return (*s1
== '\0');
2430 // check word for suffixes
// Two passes are visible: first the zero-length suffix list sStart[0], then
// the list selected by the last byte of the word (sStart[word[len-1]]).
// For each candidate the same guard is evaluated before calling
// SfxEntry::checkword():
//   - not at the beginning of a compound, unless the suffix continuation
//     class carries compoundpermitflag;
//   - circumfix flag must be either absent from both prefix (ppfx) and
//     suffix, or present on both;
//   - the suffix continuation class must not carry onlyincompound
//     (NOTE(review): part of this clause is not visible here -- confirm);
//   - needaffix handling on the suffix / prefix continuation classes.
// Side effects visible here: members sfx, sfxflag and sfxappnd are
// overwritten (see the BUG notes below).
// NOTE(review): word[len - 1] is read, so callers must pass len >= 1.
2432 struct hentry
* AffixMgr::suffix_check (const char * word
, int len
,
2433 int sfxopts
, AffEntry
* ppfx
, char ** wlst
, int maxSug
, int * ns
,
2434 const FLAG cclass
, const FLAG needflag
, char in_compound
)
2436 struct hentry
* rv
= NULL
;
// ep is ppfx viewed as a PfxEntry; it may be NULL, every use below tests ppfx first
2437 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
2439 // first handle the special case of 0 length suffixes
2440 SfxEntry
* se
= (SfxEntry
*) sStart
[0];
// when a continuation class is requested only suffixes that have one qualify
2443 if (!cclass
|| se
->getCont()) {
2444 // suffixes are not allowed in beginning of compounds
2445 if ((((in_compound
!= IN_CPD_BEGIN
)) || // && !cclass
2446 // except when signed with compoundpermitflag flag
2447 (se
->getCont() && compoundpermitflag
&&
2448 TESTAFF(se
->getCont(),compoundpermitflag
,se
->getContLen()))) && (!circumfix
||
2449 // no circumfix flag in prefix and suffix
2450 ((!ppfx
|| !(ep
->getCont()) || !TESTAFF(ep
->getCont(),
2451 circumfix
, ep
->getContLen())) &&
2452 (!se
->getCont() || !(TESTAFF(se
->getCont(),circumfix
,se
->getContLen())))) ||
2453 // circumfix flag in prefix AND suffix
2454 ((ppfx
&& (ep
->getCont()) && TESTAFF(ep
->getCont(),
2455 circumfix
, ep
->getContLen())) &&
2456 (se
->getCont() && (TESTAFF(se
->getCont(),circumfix
,se
->getContLen()))))) &&
2459 !((se
->getCont() && (TESTAFF(se
->getCont(), onlyincompound
, se
->getContLen()))))) &&
2460 // needaffix on prefix or first suffix
2462 !(se
->getCont() && TESTAFF(se
->getCont(), needaffix
, se
->getContLen())) ||
2463 (ppfx
&& !((ep
->getCont()) &&
2464 TESTAFF(ep
->getCont(), needaffix
,
// last argument: onlyincompound is passed only when we are not inside a compound
2468 rv
= se
->checkword(word
,len
, sfxopts
, ppfx
, wlst
, maxSug
, ns
, (FLAG
) cclass
,
2469 needflag
, (in_compound
? 0 : onlyincompound
));
2471 sfx
=(AffEntry
*)se
; // BUG: sfx not stateless
2479 // now handle the general case
// sp is the last byte of the word, used as index into the suffix start table
2480 unsigned char sp
= *((const unsigned char *)(word
+ len
- 1));
2481 SfxEntry
* sptr
= (SfxEntry
*) sStart
[sp
];
// candidate key must match the end of the word (compared backwards)
2484 if (isRevSubset(sptr
->getKey(), word
+ len
- 1, len
)
2486 // suffixes are not allowed in beginning of compounds
2487 if ((((in_compound
!= IN_CPD_BEGIN
)) || // && !cclass
2488 // except when signed with compoundpermitflag flag
2489 (sptr
->getCont() && compoundpermitflag
&&
2490 TESTAFF(sptr
->getCont(),compoundpermitflag
,sptr
->getContLen()))) && (!circumfix
||
2491 // no circumfix flag in prefix and suffix
2492 ((!ppfx
|| !(ep
->getCont()) || !TESTAFF(ep
->getCont(),
2493 circumfix
, ep
->getContLen())) &&
2494 (!sptr
->getCont() || !(TESTAFF(sptr
->getCont(),circumfix
,sptr
->getContLen())))) ||
2495 // circumfix flag in prefix AND suffix
2496 ((ppfx
&& (ep
->getCont()) && TESTAFF(ep
->getCont(),
2497 circumfix
, ep
->getContLen())) &&
2498 (sptr
->getCont() && (TESTAFF(sptr
->getCont(),circumfix
,sptr
->getContLen()))))) &&
2501 !((sptr
->getCont() && (TESTAFF(sptr
->getCont(), onlyincompound
, sptr
->getContLen()))))) &&
2502 // needaffix on prefix or first suffix
2504 !(sptr
->getCont() && TESTAFF(sptr
->getCont(), needaffix
, sptr
->getContLen())) ||
2505 (ppfx
&& !((ep
->getCont()) &&
2506 TESTAFF(ep
->getCont(), needaffix
,
2510 rv
= sptr
->checkword(word
,len
, sfxopts
, ppfx
, wlst
,
2511 maxSug
, ns
, cclass
, needflag
, (in_compound
? 0 : onlyincompound
));
// remember which suffix matched; these are object members, hence the BUG notes
2513 sfx
=(AffEntry
*)sptr
; // BUG: sfx not stateless
2514 sfxflag
= sptr
->getFlag(); // BUG: sfxflag not stateless
2515 if (!sptr
->getCont()) sfxappnd
=sptr
->getKey(); // BUG: sfxappnd not stateless
// key matched: continue with the next entry on the "equal" chain
2519 sptr
= sptr
->getNextEQ();
// key did not match: continue on the "not equal" chain
2521 sptr
= sptr
->getNextNE();
2528 // check word for two-level suffixes
// Same traversal as suffix_check(): first the zero-length list sStart[0],
// then the list indexed by the last byte of the word.  A candidate is only
// tried when contclasses[] is set for its flag, i.e. when its flag is marked
// in the contclasses table; the actual test is delegated to
// SfxEntry::check_twosfx().
// Side effects visible here: members sfxflag and sfxappnd are overwritten.
// NOTE(review): word[len - 1] is read, so callers must pass len >= 1.
2530 struct hentry
* AffixMgr::suffix_check_twosfx(const char * word
, int len
,
2531 int sfxopts
, AffEntry
* ppfx
, const FLAG needflag
)
2533 struct hentry
* rv
= NULL
;
2535 // first handle the special case of 0 length suffixes
2536 SfxEntry
* se
= (SfxEntry
*) sStart
[0];
2538 if (contclasses
[se
->getFlag()])
2540 rv
= se
->check_twosfx(word
,len
, sfxopts
, ppfx
, needflag
);
2546 // now handle the general case
// sp is the last byte of the word, used as index into the suffix start table
2547 unsigned char sp
= *((const unsigned char *)(word
+ len
- 1));
2548 SfxEntry
* sptr
= (SfxEntry
*) sStart
[sp
];
2551 if (isRevSubset(sptr
->getKey(), word
+ len
- 1, len
)) {
2552 if (contclasses
[sptr
->getFlag()])
2554 rv
= sptr
->check_twosfx(word
,len
, sfxopts
, ppfx
, needflag
);
2556 sfxflag
= sptr
->getFlag(); // BUG: sfxflag not stateless
2557 if (!sptr
->getCont()) sfxappnd
=sptr
->getKey(); // BUG: sfxappnd not stateless
// key matched: follow the "equal" chain
2561 sptr
= sptr
->getNextEQ();
// key did not match: follow the "not equal" chain
2563 sptr
= sptr
->getNextNE();
// Morphological-analysis variant of suffix_check_twosfx().
// For every suffix whose flag is marked in contclasses[] the analysis string
// returned by SfxEntry::check_twosfx_morph() is combined with the prefix and
// suffix morphological fields (or, when a field is missing, with the output
// of debugflag()) and appended to `result`, one analysis per line.
// Returns a mystrdup() copy of `result` when it is non-empty.
// NOTE(review): the value returned for an empty result and the release of
// `st` are not visible here -- confirm.
// NOTE(review): ppfx is dereferenced below; the guard that protects these
// uses is not visible here -- confirm it exists.
2570 char * AffixMgr::suffix_check_twosfx_morph(const char * word
, int len
,
2571 int sfxopts
, AffEntry
* ppfx
, const FLAG needflag
)
// fixed-size work buffers; all mystrcat() calls below are bounded by MAXLNLEN
2573 char result
[MAXLNLEN
];
2574 char result2
[MAXLNLEN
];
2575 char result3
[MAXLNLEN
];
2583 // first handle the special case of 0 length suffixes
2584 SfxEntry
* se
= (SfxEntry
*) sStart
[0];
2586 if (contclasses
[se
->getFlag()])
2588 st
= se
->check_twosfx_morph(word
,len
, sfxopts
, ppfx
, needflag
);
// prefix description first: its morphological field, else its flag via debugflag()
2591 if (((PfxEntry
*) ppfx
)->getMorph()) {
2592 mystrcat(result
, ((PfxEntry
*) ppfx
)->getMorph(), MAXLNLEN
);
2593 mystrcat(result
, " ", MAXLNLEN
);
2594 } else debugflag(result
, ((PfxEntry
*) ppfx
)->getFlag());
2596 mystrcat(result
, st
, MAXLNLEN
);
// then the suffix description
2598 if (se
->getMorph()) {
2599 mystrcat(result
, " ", MAXLNLEN
);
2600 mystrcat(result
, se
->getMorph(), MAXLNLEN
);
2601 } else debugflag(result
, se
->getFlag());
2602 mystrcat(result
, "\n", MAXLNLEN
);
2608 // now handle the general case
// sp is the last byte of the word, used as index into the suffix start table
2609 unsigned char sp
= *((const unsigned char *)(word
+ len
- 1));
2610 SfxEntry
* sptr
= (SfxEntry
*) sStart
[sp
];
2613 if (isRevSubset(sptr
->getKey(), word
+ len
- 1, len
)) {
2614 if (contclasses
[sptr
->getFlag()])
2616 st
= sptr
->check_twosfx_morph(word
,len
, sfxopts
, ppfx
, needflag
);
2618 sfxflag
= sptr
->getFlag(); // BUG: sfxflag not stateless
2619 if (!sptr
->getCont()) sfxappnd
=sptr
->getKey(); // BUG: sfxappnd not stateless
// NOTE(review): unbounded strcpy into a MAXLNLEN buffer -- relies on st being shorter
2620 strcpy(result2
, st
);
// result3 collects this suffix's own description before it is merged per line
2625 if (sptr
->getMorph()) {
2626 mystrcat(result3
, " ", MAXLNLEN
);
2627 mystrcat(result3
, sptr
->getMorph(), MAXLNLEN
);
2628 } else debugflag(result3
, sptr
->getFlag());
2629 strlinecat(result2
, result3
);
2630 mystrcat(result2
, "\n", MAXLNLEN
);
2631 mystrcat(result
, result2
, MAXLNLEN
);
// key matched: follow the "equal" chain
2634 sptr
= sptr
->getNextEQ();
// key did not match: follow the "not equal" chain
2636 sptr
= sptr
->getNextNE();
2639 if (*result
) return mystrdup(result
);
// Morphological-analysis variant of suffix_check().
// The candidate selection mirrors suffix_check() (zero-length list first,
// then the list indexed by the last byte of the word, with the same
// compound / circumfix / onlyincompound guards).  For every dictionary entry
// found by SfxEntry::checkword() -- and for each further homonym delivered
// by get_next_homonym() -- one line is appended to `result`, built from the
// prefix field, the entry's own morphological data, a MORPH_STEM field when
// the entry has none, and the suffix field.
// Returns a mystrdup() copy of `result` when it is non-empty.
// NOTE(review): the value returned for an empty result is not visible here.
// NOTE(review): ppfx is dereferenced in the output code; the guard that
// protects these uses is not visible here -- confirm it exists.
2643 char * AffixMgr::suffix_check_morph(const char * word
, int len
,
2644 int sfxopts
, AffEntry
* ppfx
, const FLAG cclass
, const FLAG needflag
, char in_compound
)
2646 char result
[MAXLNLEN
];
2648 struct hentry
* rv
= NULL
;
// ep is ppfx viewed as a PfxEntry; the guards below test ppfx before using it
2652 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
2654 // first handle the special case of 0 length suffixes
2655 SfxEntry
* se
= (SfxEntry
*) sStart
[0];
2657 if (!cclass
|| se
->getCont()) {
2658 // suffixes are not allowed in beginning of compounds
2659 if (((((in_compound
!= IN_CPD_BEGIN
)) || // && !cclass
2660 // except when signed with compoundpermitflag flag
2661 (se
->getCont() && compoundpermitflag
&&
2662 TESTAFF(se
->getCont(),compoundpermitflag
,se
->getContLen()))) && (!circumfix
||
2663 // no circumfix flag in prefix and suffix
2664 ((!ppfx
|| !(ep
->getCont()) || !TESTAFF(ep
->getCont(),
2665 circumfix
, ep
->getContLen())) &&
2666 (!se
->getCont() || !(TESTAFF(se
->getCont(),circumfix
,se
->getContLen())))) ||
2667 // circumfix flag in prefix AND suffix
2668 ((ppfx
&& (ep
->getCont()) && TESTAFF(ep
->getCont(),
2669 circumfix
, ep
->getContLen())) &&
2670 (se
->getCont() && (TESTAFF(se
->getCont(),circumfix
,se
->getContLen()))))) &&
2673 !((se
->getCont() && (TESTAFF(se
->getCont(), onlyincompound
, se
->getContLen()))))) &&
2674 // needaffix on prefix or first suffix
2676 !(se
->getCont() && TESTAFF(se
->getCont(), needaffix
, se
->getContLen())) ||
2677 (ppfx
&& !((ep
->getCont()) &&
2678 TESTAFF(ep
->getCont(), needaffix
,
// no suggestion list is requested here: wlst/maxSug/ns are passed as NULL/0/0
2682 rv
= se
->checkword(word
,len
, sfxopts
, ppfx
, NULL
, 0, 0, cclass
, needflag
);
// prefix description: morphological field, else the flag via debugflag()
2685 if (((PfxEntry
*) ppfx
)->getMorph()) {
2686 mystrcat(result
, ((PfxEntry
*) ppfx
)->getMorph(), MAXLNLEN
);
2687 mystrcat(result
, " ", MAXLNLEN
);
2688 } else debugflag(result
, ((PfxEntry
*) ppfx
)->getFlag());
// with complexprefixes the entry data is emitted before the stem field
2690 if (complexprefixes
&& HENTRY_DATA(rv
)) mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
// add an explicit stem field only when the entry does not already carry one
2691 if (! HENTRY_FIND(rv
, MORPH_STEM
)) {
2692 mystrcat(result
, " ", MAXLNLEN
);
2693 mystrcat(result
, MORPH_STEM
, MAXLNLEN
);
2694 mystrcat(result
, HENTRY_WORD(rv
), MAXLNLEN
);
2696 // store the pointer of the hash entry
2697 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
// without complexprefixes the entry data follows the stem field
2699 if (!complexprefixes
&& HENTRY_DATA(rv
)) {
2700 mystrcat(result
, " ", MAXLNLEN
);
2701 mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
// suffix description
2703 if (se
->getMorph()) {
2704 mystrcat(result
, " ", MAXLNLEN
);
2705 mystrcat(result
, se
->getMorph(), MAXLNLEN
);
2706 } else debugflag(result
, se
->getFlag());
2707 mystrcat(result
, "\n", MAXLNLEN
);
// continue with the next homonym of the same root, if any
2708 rv
= se
->get_next_homonym(rv
, sfxopts
, ppfx
, cclass
, needflag
);
2714 // now handle the general case
// sp is the last byte of the word, used as index into the suffix start table
2715 unsigned char sp
= *((const unsigned char *)(word
+ len
- 1));
2716 SfxEntry
* sptr
= (SfxEntry
*) sStart
[sp
];
2719 if (isRevSubset(sptr
->getKey(), word
+ len
- 1, len
)
2721 // suffixes are not allowed in beginning of compounds
2722 if (((((in_compound
!= IN_CPD_BEGIN
)) || // && !cclass
2723 // except when signed with compoundpermitflag flag
2724 (sptr
->getCont() && compoundpermitflag
&&
2725 TESTAFF(sptr
->getCont(),compoundpermitflag
,sptr
->getContLen()))) && (!circumfix
||
2726 // no circumfix flag in prefix and suffix
2727 ((!ppfx
|| !(ep
->getCont()) || !TESTAFF(ep
->getCont(),
2728 circumfix
, ep
->getContLen())) &&
2729 (!sptr
->getCont() || !(TESTAFF(sptr
->getCont(),circumfix
,sptr
->getContLen())))) ||
2730 // circumfix flag in prefix AND suffix
2731 ((ppfx
&& (ep
->getCont()) && TESTAFF(ep
->getCont(),
2732 circumfix
, ep
->getContLen())) &&
2733 (sptr
->getCont() && (TESTAFF(sptr
->getCont(),circumfix
,sptr
->getContLen()))))) &&
2736 !((sptr
->getCont() && (TESTAFF(sptr
->getCont(), onlyincompound
, sptr
->getContLen()))))) &&
2737 // needaffix on first suffix
2738 (cclass
|| !(sptr
->getCont() &&
2739 TESTAFF(sptr
->getCont(), needaffix
, sptr
->getContLen())))
2740 )) rv
= sptr
->checkword(word
,len
, sfxopts
, ppfx
, NULL
, 0, 0, cclass
, needflag
);
// output code below parallels the zero-length case above
2743 if (((PfxEntry
*) ppfx
)->getMorph()) {
2744 mystrcat(result
, ((PfxEntry
*) ppfx
)->getMorph(), MAXLNLEN
);
2745 mystrcat(result
, " ", MAXLNLEN
);
2746 } else debugflag(result
, ((PfxEntry
*) ppfx
)->getFlag());
2748 if (complexprefixes
&& HENTRY_DATA(rv
)) mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
2749 if (! HENTRY_FIND(rv
, MORPH_STEM
)) {
2750 mystrcat(result
, " ", MAXLNLEN
);
2751 mystrcat(result
, MORPH_STEM
, MAXLNLEN
);
2752 mystrcat(result
, HENTRY_WORD(rv
), MAXLNLEN
);
2754 // store the pointer of the hash entry
2755 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
2757 if (!complexprefixes
&& HENTRY_DATA(rv
)) {
2758 mystrcat(result
, " ", MAXLNLEN
);
2759 mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
2762 if (sptr
->getMorph()) {
2763 mystrcat(result
, " ", MAXLNLEN
);
2764 mystrcat(result
, sptr
->getMorph(), MAXLNLEN
);
2765 } else debugflag(result
, sptr
->getFlag());
2766 mystrcat(result
, "\n", MAXLNLEN
);
2767 rv
= sptr
->get_next_homonym(rv
, sfxopts
, ppfx
, cclass
, needflag
);
// key matched: follow the "equal" chain
2769 sptr
= sptr
->getNextEQ();
// key did not match: follow the "not equal" chain
2771 sptr
= sptr
->getNextNE();
2775 if (*result
) return mystrdup(result
);
2779 // check if word with affixes is correctly spelled
// Tries, in this order: prefix_check(), suffix_check(), and -- only when
// havecontclass is set -- suffix_check_twosfx() and prefix_check_twosfx().
// NOTE(review): the early returns between the steps are not visible here;
// the "if still not found" comments imply each step runs only when the
// previous one yielded nothing -- confirm.
2780 struct hentry
* AffixMgr::affix_check (const char * word
, int len
, const FLAG needflag
, char in_compound
)
2782 struct hentry
* rv
= NULL
;
2784 // check all prefixes (also crossed with suffixes if allowed)
2785 rv
= prefix_check(word
, len
, in_compound
, needflag
);
2788 // if still not found check all suffixes
2789 rv
= suffix_check(word
, len
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, needflag
, in_compound
);
// two-level affixes exist only when some affix has a continuation class
2791 if (havecontclass
) {
2795 // if still not found check all two-level suffixes
2796 rv
= suffix_check_twosfx(word
, len
, 0, NULL
, needflag
);
2798 // if still not found check all two-level suffixes
// NOTE(review): IN_CPD_NOT is passed here instead of in_compound -- confirm intent
2799 rv
= prefix_check_twosfx(word
, len
, IN_CPD_NOT
, needflag
);
2804 // check if word with affixes is correctly spelled
// Morphological-analysis counterpart of affix_check(): the strings returned
// by prefix_check_morph(), suffix_check_morph() and, when havecontclass is
// set, suffix_check_twosfx_morph() and prefix_check_twosfx_morph() are
// concatenated into `result` (bounded by MAXLNLEN) and a mystrdup() copy of
// the buffer is returned.
// NOTE(review): the NULL checks on `st` and its release are not visible
// here -- confirm.
2805 char * AffixMgr::affix_check_morph(const char * word
, int len
, const FLAG needflag
, char in_compound
)
2807 char result
[MAXLNLEN
];
2812 // check all prefixes (also crossed with suffixes if allowed)
2813 st
= prefix_check_morph(word
, len
, in_compound
);
2815 mystrcat(result
, st
, MAXLNLEN
);
2819 // if still not found check all suffixes
// '\0' is passed for the cclass parameter, i.e. no continuation class is requested
2820 st
= suffix_check_morph(word
, len
, 0, NULL
, '\0', needflag
, in_compound
);
2822 mystrcat(result
, st
, MAXLNLEN
);
2826 if (havecontclass
) {
2829 // if still not found check all two-level suffixes
2830 st
= suffix_check_twosfx_morph(word
, len
, 0, NULL
, needflag
);
2832 mystrcat(result
, st
, MAXLNLEN
);
2836 // if still not found check all two-level suffixes
2837 st
= prefix_check_twosfx_morph(word
, len
, IN_CPD_NOT
, needflag
);
2839 mystrcat(result
, st
, MAXLNLEN
);
2844 return mystrdup(result
);
// Generate a word form of stem `ts` (length `wl`) whose morphological
// description matches `targetmorph`.
//   ap/al  - affix flag vector of the stem and its length
//   morph  - morphological description of the stem
//   level  - recursion depth; a second suffix level is tried only at level 0
// Visible behaviour: returns NULL when both morph and targetmorph are NULL
// or when the stem carries the substandard flag; returns a copy of `ts`
// when morph already matches targetmorph.  Otherwise every suffix entry
// reachable through the stem's flags is tried, skipping entries without a
// morphological field and entries whose continuation class is substandard.
// NOTE(review): only "both NULL" is rejected up front; morph alone being
// NULL would reach morphcmp()/strstr() -- confirm callers never do that.
2847 char * AffixMgr::morphgen(char * ts
, int wl
, const unsigned short * ap
,
2848 unsigned short al
, char * morph
, char * targetmorph
, int level
)
2852 char * stemmorphcatpos
;
2853 char mymorph
[MAXLNLEN
];
2855 if (!morph
&& !targetmorph
) return NULL
;
2857 // check substandard flag
2858 if (TESTAFF(ap
, substandard
, al
)) return NULL
;
2860 if (morphcmp(morph
, targetmorph
) == 0) return mystrdup(ts
);
2862 // int targetcount = get_sfxcount(targetmorph);
2864 // use input suffix fields, if exist
2865 if (strstr(morph
, MORPH_INFL_SFX
) || strstr(morph
, MORPH_DERI_SFX
)) {
// build "<morph> " in the local buffer; suffix fields are appended at stemmorphcatpos
// NOTE(review): unbounded strcpy/strcat into a MAXLNLEN buffer -- relies on morph being shorter
2866 stemmorph
= mymorph
;
2867 strcpy(stemmorph
, morph
);
2868 strcat(stemmorph
, " ");
2869 stemmorphcatpos
= stemmorph
+ strlen(stemmorph
);
// no suffix fields in the input: stemmorph will point at each suffix's own field
2872 stemmorphcatpos
= NULL
;
2875 for (int i
= 0; i
< al
; i
++) {
// the flag table is indexed by the low byte of the flag
2876 const unsigned char c
= (unsigned char) (ap
[i
] & 0x00FF);
2877 SfxEntry
* sptr
= (SfxEntry
*)sFlag
[c
];
2879 if (sptr
->getFlag() == ap
[i
] && sptr
->getMorph() && ((sptr
->getContLen() == 0) ||
2880 // don't generate forms with substandard affixes
2881 !TESTAFF(sptr
->getCont(), substandard
, sptr
->getContLen()))) {
2883 if (stemmorphcatpos
) strcpy(stemmorphcatpos
, sptr
->getMorph());
2884 else stemmorph
= (char *) sptr
->getMorph();
2886 int cmp
= morphcmp(stemmorph
, targetmorph
);
2889 char * newword
= sptr
->add(ts
, wl
);
// reject generated forms that the dictionary marks as forbidden
2891 hentry
* check
= pHMgr
->lookup(newword
); // XXX extra dic
2892 if (!check
|| !check
->astr
||
2893 !TESTAFF(check
->astr
, forbiddenword
, check
->alen
)) {
2900 // recursive call for secondary suffixes
2901 if ((level
== 0) && (cmp
== 1) && (sptr
->getContLen() > 0) &&
2902 // (get_sfxcount(stemmorph) < targetcount) &&
2903 !TESTAFF(sptr
->getCont(), substandard
, sptr
->getContLen())) {
2904 char * newword
= sptr
->add(ts
, wl
);
// second level: the suffix's continuation class becomes the flag vector
2906 char * newword2
= morphgen(newword
, strlen(newword
), sptr
->getCont(),
2907 sptr
->getContLen(), stemmorph
, targetmorph
, 1);
// next entry sharing the same low flag byte
2918 sptr
= (SfxEntry
*)sptr
->getFlgNxt();
// Expand a root word into its affixed forms and store them in wlst[].
//   wlst/maxn - output array of guessword records and its capacity
//   ts/wl     - root word and its length
//   ap/al     - affix flag vector of the root and its length
//   bad/badl  - the misspelled word and its length; an affix with a
//               non-empty key is only applied when `bad` ends (suffix) or
//               begins (prefix) with it
//   phon      - optional phonetic form of the root (NOTE(review): its
//               declaration is not visible here -- confirm)
// Four stages are visible: the root itself, all suffixed forms, the cross
// products prefix x suffixed form (only for entries with allow set), and
// the pure prefixed forms.  Affixes whose continuation class carries
// needaffix, circumfix or onlyincompound are skipped.
// NOTE(review): on allocation failure in the phonetic branches the function
// returns nh - 1; the normal return value is not visible here.
2925 int AffixMgr::expand_rootword(struct guessword
* wlst
, int maxn
, const char * ts
,
2926 int wl
, const unsigned short * ap
, unsigned short al
, char * bad
, int badl
,
2930 // first add root word to list
// the bare root is skipped when it is flagged needaffix or onlyincompound
2931 if ((nh
< maxn
) && !(al
&& ((needaffix
&& TESTAFF(ap
, needaffix
, al
)) ||
2932 (onlyincompound
&& TESTAFF(ap
, onlyincompound
, al
))))) {
2933 wlst
[nh
].word
= mystrdup(ts
);
2934 if (!wlst
[nh
].word
) return 0;
// (1 == 0) is simply "false": the root entry does not take part in cross products
2935 wlst
[nh
].allow
= (1 == 0);
2936 wlst
[nh
].orig
= NULL
;
2938 // add special phonetic version
2939 if (phon
&& (nh
< maxn
)) {
2940 wlst
[nh
].word
= mystrdup(phon
);
2941 if (!wlst
[nh
].word
) return nh
- 1;
2942 wlst
[nh
].allow
= (1 == 0);
// orig keeps the real spelling that belongs to the phonetic form
2943 wlst
[nh
].orig
= mystrdup(ts
);
2944 if (!wlst
[nh
].orig
) return nh
- 1;
// ---- suffixed forms ----
2950 for (int i
= 0; i
< al
; i
++) {
// the flag table is indexed by the low byte of the flag
2951 const unsigned char c
= (unsigned char) (ap
[i
] & 0x00FF);
2952 SfxEntry
* sptr
= (SfxEntry
*)sFlag
[c
];
2954 if ((sptr
->getFlag() == ap
[i
]) && (!sptr
->getKeyLen() || ((badl
> sptr
->getKeyLen()) &&
2955 (strcmp(sptr
->getAffix(), bad
+ badl
- sptr
->getKeyLen()) == 0))) &&
2956 // check needaffix flag
2957 !(sptr
->getCont() && ((needaffix
&&
2958 TESTAFF(sptr
->getCont(), needaffix
, sptr
->getContLen())) ||
2960 TESTAFF(sptr
->getCont(), circumfix
, sptr
->getContLen())) ||
2962 TESTAFF(sptr
->getCont(), onlyincompound
, sptr
->getContLen()))))
2964 char * newword
= sptr
->add(ts
, wl
);
2967 wlst
[nh
].word
= newword
;
2968 wlst
[nh
].allow
= sptr
->allowCross();
2969 wlst
[nh
].orig
= NULL
;
2971 // add special phonetic version
2972 if (phon
&& (nh
< maxn
)) {
// NOTE(review): st is a fixed MAXWORDUTF8LEN buffer filled with strcat -- relies on phon + key fitting
2973 char st
[MAXWORDUTF8LEN
];
2975 strcat(st
, sptr
->getKey());
// the appended key part (after the phon prefix) is reversed in place
2976 reverseword(st
+ strlen(phon
));
2977 wlst
[nh
].word
= mystrdup(st
);
2978 if (!wlst
[nh
].word
) return nh
- 1;
2979 wlst
[nh
].allow
= (1 == 0);
2980 wlst
[nh
].orig
= mystrdup(newword
);
2981 if (!wlst
[nh
].orig
) return nh
- 1;
2989 sptr
= (SfxEntry
*)sptr
->getFlgNxt();
2995 // handle cross products of prefixes and suffixes
// starts at index 1, i.e. the first list entry is never crossed
2996 for (int j
=1;j
<n
;j
++)
2997 if (wlst
[j
].allow
) {
2998 for (int k
= 0; k
< al
; k
++) {
2999 const unsigned char c
= (unsigned char) (ap
[k
] & 0x00FF);
3000 PfxEntry
* cptr
= (PfxEntry
*) pFlag
[c
];
3002 if ((cptr
->getFlag() == ap
[k
]) && cptr
->allowCross() && (!cptr
->getKeyLen() || ((badl
> cptr
->getKeyLen()) &&
3003 (strncmp(cptr
->getKey(), bad
, cptr
->getKeyLen()) == 0)))) {
3004 int l1
= strlen(wlst
[j
].word
);
3005 char * newword
= cptr
->add(wlst
[j
].word
, l1
);
3008 wlst
[nh
].word
= newword
;
3009 wlst
[nh
].allow
= cptr
->allowCross();
3010 wlst
[nh
].orig
= NULL
;
3017 cptr
= (PfxEntry
*)cptr
->getFlgNxt();
3023 // now handle pure prefixes
3024 for (int m
= 0; m
< al
; m
++) {
3025 const unsigned char c
= (unsigned char) (ap
[m
] & 0x00FF);
3026 PfxEntry
* ptr
= (PfxEntry
*) pFlag
[c
];
3028 if ((ptr
->getFlag() == ap
[m
]) && (!ptr
->getKeyLen() || ((badl
> ptr
->getKeyLen()) &&
3029 (strncmp(ptr
->getKey(), bad
, ptr
->getKeyLen()) == 0))) &&
3030 // check needaffix flag
3031 !(ptr
->getCont() && ((needaffix
&&
3032 TESTAFF(ptr
->getCont(), needaffix
, ptr
->getContLen())) ||
3034 TESTAFF(ptr
->getCont(), circumfix
, ptr
->getContLen())) ||
3036 TESTAFF(ptr
->getCont(), onlyincompound
, ptr
->getContLen()))))
3038 char * newword
= ptr
->add(ts
, wl
);
3041 wlst
[nh
].word
= newword
;
3042 wlst
[nh
].allow
= ptr
->allowCross();
3043 wlst
[nh
].orig
= NULL
;
3050 ptr
= (PfxEntry
*)ptr
->getFlgNxt();
3057 // return length of replacing table
// NOTE(review): body not visible here; presumably returns the numrep member.
3058 int AffixMgr::get_numrep()
3063 // return replacing table
// Yields NULL when no replacement table has been loaded.
3064 struct replentry
* AffixMgr::get_reptable()
3066 if (! reptable
) return NULL
;
3070 // return iconv table
// Yields NULL when no input conversion table has been loaded.
3071 RepList
* AffixMgr::get_iconvtable()
3073 if (! iconvtable
) return NULL
;
3077 // return oconv table
// Yields NULL when no output conversion table has been loaded.
3078 RepList
* AffixMgr::get_oconvtable()
3080 if (! oconvtable
) return NULL
;
3084 // return phonetic table
// Yields NULL when no phonetic table (member `phone`) has been loaded.
3085 struct phonetable
* AffixMgr::get_phonetable()
3087 if (! phone
) return NULL
;
3091 // return length of character map table
// NOTE(review): body not visible here; presumably returns the nummap member.
3092 int AffixMgr::get_nummap()
3097 // return character map table
// Yields NULL when no map table has been loaded.
3098 struct mapentry
* AffixMgr::get_maptable()
3100 if (! maptable
) return NULL
;
3104 // return length of word break table
// NOTE(review): body not visible here; presumably returns the numbreak member.
3105 int AffixMgr::get_numbreak()
3110 // return word break table
// Yields NULL when no break table has been loaded.
3111 char ** AffixMgr::get_breaktable()
3113 if (! breaktable
) return NULL
;
3117 // return text encoding of dictionary
// When no encoding was set, the member is first initialised to a copy of
// SPELL_ENCODING.  The result is a fresh mystrdup() copy on every call.
// NOTE(review): the copy is presumably owned by the caller -- confirm.
3118 char * AffixMgr::get_encoding()
3120 if (! encoding
) encoding
= mystrdup(SPELL_ENCODING
);
3121 return mystrdup(encoding
);
3124 // return language number of dictionary
// NOTE(review): body not visible here; presumably returns the langnum member.
3125 int AffixMgr::get_langnum()
3130 // return double prefix option
// Plain accessor for the complexprefixes member.
3131 int AffixMgr::get_complexprefixes()
3133 return complexprefixes
;
3136 // return FULLSTRIP option
// NOTE(review): body not visible here; presumably returns the fullstrip member.
3137 int AffixMgr::get_fullstrip()
// Accessor for the keep-case flag.
// NOTE(review): body not visible here; presumably returns the keepcase member.
3142 FLAG
AffixMgr::get_keepcase()
// Accessor for the checksharps option.
// NOTE(review): body not visible here; presumably returns the checksharps member.
3147 int AffixMgr::get_checksharps()
// Convert a numeric affix flag to its textual form by delegating to the
// hash manager (pHMgr->encode_flag).
// NOTE(review): ownership of the returned string is decided by the callee -- confirm.
3152 char * AffixMgr::encode_flag(unsigned short aflag
)
3154 return pHMgr
->encode_flag(aflag
);
3158 // return the preferred ignore string for suggestions
// Yields NULL when no ignore characters were configured.
3159 char * AffixMgr::get_ignore()
3161 if (!ignorechars
) return NULL
;
3165 // return the ignore characters as a UTF-16 array
// The element count is written to *len; the returned pointer is the
// internal ignorechars_utf16 member (no copy is made).
3166 unsigned short * AffixMgr::get_ignore_utf16(int * len
)
3168 *len
= ignorechars_utf16_len
;
3169 return ignorechars_utf16
;
3172 // return the keyboard string for suggestions
// When no keyboard string was set, the member is first initialised to a
// copy of SPELL_KEYSTRING.  The result is a fresh mystrdup() copy on every call.
// NOTE(review): the copy is presumably owned by the caller -- confirm.
3173 char * AffixMgr::get_key_string()
3175 if (! keystring
) keystring
= mystrdup(SPELL_KEYSTRING
);
3176 return mystrdup(keystring
);
3179 // return the preferred try string for suggestions
// Yields NULL when no try string was configured, otherwise a fresh
// mystrdup() copy of it.
3180 char * AffixMgr::get_try_string()
3182 if (! trystring
) return NULL
;
3183 return mystrdup(trystring
);
3186 // return the extra word characters
// NOTE(review): body not visible here; presumably returns the wordchars member.
3187 const char * AffixMgr::get_wordchars()
// Return the extra word characters as a UTF-16 array.
// The element count is written to *len; the returned pointer is the
// internal wordchars_utf16 member (no copy is made).
3192 unsigned short * AffixMgr::get_wordchars_utf16(int * len
)
3194 *len
= wordchars_utf16_len
;
3195 return wordchars_utf16
;
3198 // is there compounding?
// Non-zero when any of compoundflag, compoundbegin or numdefcpd is set.
3199 int AffixMgr::get_compound()
3201 return compoundflag
|| compoundbegin
|| numdefcpd
;
3204 // return the compound words control flag
// Plain accessor for the compoundflag member.
3205 FLAG
AffixMgr::get_compoundflag()
3207 return compoundflag
;
3210 // return the forbidden words control flag
// Plain accessor for the forbiddenword member.
3211 FLAG
AffixMgr::get_forbiddenword()
3213 return forbiddenword
;
3216 // return the no-suggest control flag
// NOTE(review): body not visible here; presumably returns the nosuggest member.
3217 FLAG
AffixMgr::get_nosuggest()
3222 // return the needaffix flag
// NOTE(review): body not visible here; presumably returns the needaffix member.
3223 FLAG
AffixMgr::get_needaffix()
3228 // return the onlyincompound flag
// Plain accessor for the onlyincompound member.
3229 FLAG
AffixMgr::get_onlyincompound()
3231 return onlyincompound
;
3234 // return the compound word signal flag
// Plain accessor for the compoundroot member.
3235 FLAG
AffixMgr::get_compoundroot()
3237 return compoundroot
;
3240 // return the compound begin signal flag
// Plain accessor for the compoundbegin member.
3241 FLAG
AffixMgr::get_compoundbegin()
3243 return compoundbegin
;
3246 // return the value of checknum
// NOTE(review): body not visible here; presumably returns the checknum member.
3247 int AffixMgr::get_checknum()
3252 // return the value of prefix
// When the pfx member is set, yields the key of that prefix entry.
// NOTE(review): the value returned when pfx is unset is not visible here.
3253 const char * AffixMgr::get_prefix()
3255 if (pfx
) return ((PfxEntry
*)pfx
)->getKey();
3259 // return the value of suffix
// NOTE(review): body not visible here; presumably derived from the members
// that the suffix checks record (sfx / sfxappnd) -- confirm.
3260 const char * AffixMgr::get_suffix()
3265 // return the version string
// NOTE(review): body not visible here; presumably returns the version member.
3266 const char * AffixMgr::get_version()
3271 // return lemma_present flag
// Plain accessor for the lemma_present member.
3272 FLAG
AffixMgr::get_lemma_present()
3274 return lemma_present
;
3277 // utility method to look up root words in hash table
// Queries the dictionaries alldic[0 .. *maxdic-1] in order and stops at the
// first one that knows the word; `he` stays NULL when none does.
// NOTE(review): the return statement is not visible here; presumably `he`.
3278 struct hentry
* AffixMgr::lookup(const char * word
)
3281 struct hentry
* he
= NULL
;
3282 for (i
= 0; i
< *maxdic
&& !he
; i
++) {
3283 he
= (alldic
[i
])->lookup(word
);
3288 // return the havecontclass option
// Plain accessor for the havecontclass member, which affix_check() tests
// before trying two-level affixes.
3289 const int AffixMgr::have_contclass()
3291 return havecontclass
;
// Accessor for the UTF-8 option.
// NOTE(review): body not visible here; presumably returns the utf8 member.
3295 int AffixMgr::get_utf8()
3300 // return maxngramsugs
// Plain accessor for the maxngramsugs member.
3301 int AffixMgr::get_maxngramsugs(void)
3303 return maxngramsugs
;
3306 // return nosplitsugs
// NOTE(review): body not visible here; presumably returns the nosplitsugs member.
3307 int AffixMgr::get_nosplitsugs(void)
3312 // return sugswithdots
// Plain accessor for the sugswithdots member.
3313 int AffixMgr::get_sugswithdots(void)
3315 return sugswithdots
;
// Parse a single-flag parameter line of the affix file into *out.
// A warning is issued when *out already holds a value that is neither
// FLAG_NULL nor >= DEFAULTFLAGS, i.e. when the parameter is defined twice.
// The flag text is extracted with parse_string() and decoded through
// pHMgr->decode_flag().  Returns 1 when parse_string() fails.
// NOTE(review): the return after the warning, the release of `s` and the
// success return are not visible here -- confirm.
3319 int AffixMgr::parse_flag(char * line
, unsigned short * out
, FileMgr
* af
) {
3321 if (*out
!= FLAG_NULL
&& !(*out
>= DEFAULTFLAGS
)) {
3322 HUNSPELL_WARNING(stderr
, "error: line %d: multiple definitions of an affix file parameter\n", af
->getlinenum());
3325 if (parse_string(line
, &s
, af
->getlinenum())) return 1;
3326 *out
= pHMgr
->decode_flag(s
);
// Parse a numeric parameter line of the affix file into *out.
// Warns about a repeated definition and returns 1 when parse_string() fails.
// NOTE(review): the duplicate-definition test, the numeric conversion and
// the success return are not visible here -- confirm.
3332 int AffixMgr::parse_num(char * line
, int * out
, FileMgr
* af
) {
3335 HUNSPELL_WARNING(stderr
, "error: line %d: multiple definitions of an affix file parameter\n", af
->getlinenum());
3338 if (parse_string(line
, &s
, af
->getlinenum())) return 1;
3344 /* parse in the max syllable count of compound words and the vowel set */
// Fields of the line, split with mystrsep(): field 0 is the keyword,
// field 1 is stored in cpdmaxsyllable via atoi(), the next field is the
// vowel list.  The vowel list is kept as a string copy (cpdvowels) and/or
// as a sorted UTF-16 array (cpdvowels_utf16 with cpdvowels_utf16_len).
// When only the two leading fields are present (np == 2) the vowel list
// defaults to "aeiouAEIOU".
// Returns 1 when the UTF-16 array cannot be allocated.
// NOTE(review): the condition selecting the 8-bit or UTF-16 path and the
// other return statements are not visible here -- confirm.
3345 int AffixMgr::parse_cpdsyllable(char * line
, FileMgr
* af
)
3351 w_char w
[MAXWORDLEN
];
3352 piece
= mystrsep(&tp
, 0);
3354 if (*piece
!= '\0') {
3356 case 0: { np
++; break; }
3357 case 1: { cpdmaxsyllable
= atoi(piece
); np
++; break; }
3360 cpdvowels
= mystrdup(piece
);
// convert to UTF-16 and sort so the set can be searched later
3362 int n
= u8_u16(w
, MAXWORDLEN
, piece
);
3364 flag_qsort((unsigned short *) w
, 0, n
);
3365 cpdvowels_utf16
= (w_char
*) malloc(n
* sizeof(w_char
));
3366 if (!cpdvowels_utf16
) return 1;
3367 memcpy(cpdvowels_utf16
, w
, n
* sizeof(w_char
));
3369 cpdvowels_utf16_len
= n
;
3378 piece
= mystrsep(&tp
, 0);
3381 HUNSPELL_WARNING(stderr
, "error: line %d: missing compoundsyllable information\n", af
->getlinenum());
// vowel list omitted in the affix file: fall back to the built-in default
3384 if (np
== 2) cpdvowels
= mystrdup("aeiouAEIOU");
3388 /* parse in the typical fault correcting table */
// `line` is the table header: keyword plus the number of entries, which is
// stored in numrep and used to size reptable[].  The following numrep lines
// are read through af->getline(); each must start with "REP" and supply two
// further fields, stored in pattern and pattern2 after mystrrep(..., "_", " ").
// Returns 1 on allocation failure or when the input ends prematurely.
// NOTE(review): the returns after the warnings are not visible here;
// presumably each error path returns 1 -- confirm.
3389 int AffixMgr::parse_reptable(char * line
, FileMgr
* af
)
3392 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3399 piece
= mystrsep(&tp
, 0);
3401 if (*piece
!= '\0') {
3403 case 0: { np
++; break; }
3405 numrep
= atoi(piece
);
3407 HUNSPELL_WARNING(stderr
, "error: line %d: incorrect entry number\n", af
->getlinenum());
3410 reptable
= (replentry
*) malloc(numrep
* sizeof(struct replentry
));
3411 if (!reptable
) return 1;
3419 piece
= mystrsep(&tp
, 0);
3422 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3426 /* now parse the numrep lines to read in the remainder of the table */
3428 for (int j
=0; j
< numrep
; j
++) {
3429 if (!(nl
= af
->getline())) return 1;
// both fields start out NULL so that a short line is detected after the scan
3433 reptable
[j
].pattern
= NULL
;
3434 reptable
[j
].pattern2
= NULL
;
3435 piece
= mystrsep(&tp
, 0);
3437 if (*piece
!= '\0') {
3440 if (strncmp(piece
,"REP",3) != 0) {
3441 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3447 case 1: { reptable
[j
].pattern
= mystrrep(mystrdup(piece
),"_"," "); break; }
3448 case 2: { reptable
[j
].pattern2
= mystrrep(mystrdup(piece
),"_"," "); break; }
3453 piece
= mystrsep(&tp
, 0);
3455 if ((!(reptable
[j
].pattern
)) || (!(reptable
[j
].pattern2
))) {
3456 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3464 /* parse in the typical fault correcting table */
3465 int AffixMgr::parse_convtable(char * line
, FileMgr
* af
, RepList
** rl
, const char * keyword
)
3468 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3476 piece
= mystrsep(&tp
, 0);
3478 if (*piece
!= '\0') {
3480 case 0: { np
++; break; }
3482 numrl
= atoi(piece
);
3484 HUNSPELL_WARNING(stderr
, "error: line %d: incorrect entry number\n", af
->getlinenum());
3487 *rl
= new RepList(numrl
);
3496 piece
= mystrsep(&tp
, 0);
3499 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3503 /* now parse the num lines to read in the remainder of the table */
3505 for (int j
=0; j
< numrl
; j
++) {
3506 if (!(nl
= af
->getline())) return 1;
3510 char * pattern
= NULL
;
3511 char * pattern2
= NULL
;
3512 piece
= mystrsep(&tp
, 0);
3514 if (*piece
!= '\0') {
3517 if (strncmp(piece
, keyword
, sizeof(keyword
)) != 0) {
3518 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3525 case 1: { pattern
= mystrrep(mystrdup(piece
),"_"," "); break; }
3527 pattern2
= mystrrep(mystrdup(piece
),"_"," ");
3534 piece
= mystrsep(&tp
, 0);
3536 if (!pattern
|| !pattern2
) {
3537 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3540 (*rl
)->add(pattern
, pattern2
);
3546 /* parse in the typical fault correcting table */
3547 int AffixMgr::parse_phonetable(char * line
, FileMgr
* af
)
3550 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3557 piece
= mystrsep(&tp
, 0);
3559 if (*piece
!= '\0') {
3561 case 0: { np
++; break; }
3563 phone
= (phonetable
*) malloc(sizeof(struct phonetable
));
3564 phone
->num
= atoi(piece
);
3565 phone
->rules
= NULL
;
3566 phone
->utf8
= (char) utf8
;
3567 if (!phone
) return 1;
3568 if (phone
->num
< 1) {
3569 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
3572 phone
->rules
= (char * *) malloc(2 * (phone
->num
+ 1) * sizeof(char *));
3573 if (!phone
->rules
) return 1;
3581 piece
= mystrsep(&tp
, 0);
3584 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3588 /* now parse the phone->num lines to read in the remainder of the table */
3590 for (int j
=0; j
< phone
->num
; j
++) {
3591 if (!(nl
= af
->getline())) return 1;
3595 phone
->rules
[j
* 2] = NULL
;
3596 phone
->rules
[j
* 2 + 1] = NULL
;
3597 piece
= mystrsep(&tp
, 0);
3599 if (*piece
!= '\0') {
3602 if (strncmp(piece
,"PHONE",5) != 0) {
3603 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3609 case 1: { phone
->rules
[j
* 2] = mystrrep(mystrdup(piece
),"_",""); break; }
3610 case 2: { phone
->rules
[j
* 2 + 1] = mystrrep(mystrdup(piece
),"_",""); break; }
3615 piece
= mystrsep(&tp
, 0);
3617 if ((!(phone
->rules
[j
* 2])) || (!(phone
->rules
[j
* 2 + 1]))) {
3618 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3623 phone
->rules
[phone
->num
* 2] = mystrdup("");
3624 phone
->rules
[phone
->num
* 2 + 1] = mystrdup("");
3625 init_phonet_hash(*phone
);
3629 /* parse in the checkcompoundpattern table */
// `line` is the table header: keyword plus the number of entries, stored in
// numcheckcpd and used to size checkcpdtable[].  Every following line must
// start with "CHECKCOMPOUNDPATTERN".  Its next two fields become pattern and
// pattern2; when a field contains '/', the text after the slash is decoded
// as a flag into cond / cond2.  An optional further field becomes pattern3
// and switches simplifiedcpd on.
// Returns 1 on allocation failure or when the input ends prematurely.
// NOTE(review): the returns after the warnings and the handling of the '/'
// separator itself are not visible here -- confirm.
3630 int AffixMgr::parse_checkcpdtable(char * line
, FileMgr
* af
)
3632 if (numcheckcpd
!= 0) {
3633 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3640 piece
= mystrsep(&tp
, 0);
3642 if (*piece
!= '\0') {
3644 case 0: { np
++; break; }
3646 numcheckcpd
= atoi(piece
);
3647 if (numcheckcpd
< 1) {
3648 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
3651 checkcpdtable
= (patentry
*) malloc(numcheckcpd
* sizeof(struct patentry
));
3652 if (!checkcpdtable
) return 1;
3660 piece
= mystrsep(&tp
, 0);
3663 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3667 /* now parse the numcheckcpd lines to read in the remainder of the table */
3669 for (int j
=0; j
< numcheckcpd
; j
++) {
3670 if (!(nl
= af
->getline())) return 1;
// reset the whole entry before the fields of this line are scanned
3674 checkcpdtable
[j
].pattern
= NULL
;
3675 checkcpdtable
[j
].pattern2
= NULL
;
3676 checkcpdtable
[j
].pattern3
= NULL
;
3677 checkcpdtable
[j
].cond
= FLAG_NULL
;
3678 checkcpdtable
[j
].cond2
= FLAG_NULL
;
3679 piece
= mystrsep(&tp
, 0);
3681 if (*piece
!= '\0') {
3684 if (strncmp(piece
,"CHECKCOMPOUNDPATTERN",20) != 0) {
3685 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// first pattern, optionally followed by "/flag"
3692 checkcpdtable
[j
].pattern
= mystrdup(piece
);
3693 char * p
= strchr(checkcpdtable
[j
].pattern
, '/');
3696 checkcpdtable
[j
].cond
= pHMgr
->decode_flag(p
+ 1);
// second pattern, optionally followed by "/flag"
3700 checkcpdtable
[j
].pattern2
= mystrdup(piece
);
3701 char * p
= strchr(checkcpdtable
[j
].pattern2
, '/');
3704 checkcpdtable
[j
].cond2
= pHMgr
->decode_flag(p
+ 1);
3708 case 3: { checkcpdtable
[j
].pattern3
= mystrdup(piece
); simplifiedcpd
= 1; break; }
3713 piece
= mystrsep(&tp
, 0);
// pattern and pattern2 are mandatory, pattern3 is optional
3715 if ((!(checkcpdtable
[j
].pattern
)) || (!(checkcpdtable
[j
].pattern2
))) {
3716 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3724 /* parse in the compound rule table */
// Reads the compound rule table. `line` is the table's header line; the
// entry count taken from it with atoi() is stored in numdefcpd and sizes the
// defcpdtable allocation. Each of the following numdefcpd lines is fetched
// with af->getline() and must begin with "COMPOUNDRULE".
// Returns 1 when an allocation fails or af->getline() yields no line.
// NOTE(review): this listing is an excerpt - the numeric prefixes are the
// original line numbers and the gaps between them are lines not shown (local
// declarations, switch headers, closing braces and other returns).
3725 int AffixMgr::parse_defcpdtable(char * line
, FileMgr
* af
)
// A non-zero numdefcpd means a table has already been read.
3727 if (numdefcpd
!= 0) {
3728 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3735 piece
= mystrsep(&tp
, 0);
3737 if (*piece
!= '\0') {
// First field: only counted (np++), its text is not examined here.
3739 case 0: { np
++; break; }
// Entry count announced by the header.
3741 numdefcpd
= atoi(piece
);
3742 if (numdefcpd
< 1) {
3743 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
// One flagentry per announced rule.
3746 defcpdtable
= (flagentry
*) malloc(numdefcpd
* sizeof(flagentry
));
3747 if (!defcpdtable
) return 1;
3755 piece
= mystrsep(&tp
, 0);
3758 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3762 /* now parse the numdefcpd lines to read in the remainder of the table */
3764 for (int j
=0; j
< numdefcpd
; j
++) {
3765 if (!(nl
= af
->getline())) return 1;
3769 defcpdtable
[j
].def
= NULL
;
3770 piece
= mystrsep(&tp
, 0);
3772 if (*piece
!= '\0') {
// Every rule line has to repeat the COMPOUNDRULE keyword.
3775 if (strncmp(piece
, "COMPOUNDRULE", 12) != 0) {
3776 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3782 case 1: { // handle parenthesized flags
3783 if (strchr(piece
, '(')) {
// Size the buffer from the rule text itself. The former sizeof(piece) was
// the size of a char pointer, so any rule yielding more flags than that
// wrote past the allocation. The loop below stores one FLAG per '*' / '?'
// and whatever decode_flags() returns for each parenthesized part.
// NOTE(review): assumes decode_flags() never yields more flags than it was
// given characters - TODO confirm against its definition.
3784 defcpdtable
[j
].def
= (FLAG
*) malloc(strlen(piece
) * sizeof(FLAG
));
// Same failure convention as the table allocation above.
if (!defcpdtable[j].def) return 1;
3785 defcpdtable
[j
].len
= 0;
// Find the end of the current part: the next '(' or ')' or the end of text.
3789 char * par
= piece
+ 1;
3790 while (*par
!= '(' && *par
!= ')' && *par
!= '\0') par
++;
// Cut the part off at the parenthesis, or note that the text is exhausted.
3791 if (*par
== '\0') end
= 1; else *par
= '\0';
3792 if (*piece
== '(') piece
++;
// '*' and '?' are stored as literal FLAG values.
3793 if (*piece
== '*' || *piece
== '?') {
3794 defcpdtable
[j
].def
[defcpdtable
[j
].len
++] = (FLAG
) *piece
;
3795 } else if (*piece
!= '\0') {
// Anything else is decoded into conv and appended flag by flag.
3796 int l
= pHMgr
->decode_flags(&conv
, piece
, af
);
3797 for (int k
= 0; k
< l
; k
++) defcpdtable
[j
].def
[defcpdtable
[j
].len
++] = conv
[k
];
// Rule without parentheses: decode_flags() fills def directly.
3803 defcpdtable
[j
].len
= pHMgr
->decode_flags(&(defcpdtable
[j
].def
), piece
, af
);
3811 piece
= mystrsep(&tp
, 0);
// A rule that produced no flags is rejected.
3813 if (!defcpdtable
[j
].len
) {
3814 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3823 /* parse in the character map table */
// Reads the character map table. `line` is the table's header line; the
// entry count taken from it with atoi() is stored in nummap and sizes the
// maptable allocation. Each of the following nummap lines is fetched with
// af->getline() and must begin with "MAP".
// Returns 1 when maptable or a set_utf16 buffer cannot be allocated, or when
// af->getline() yields no line.
// NOTE(review): this listing is an excerpt - the numeric prefixes are the
// original line numbers and the gaps between them are lines not shown (local
// declarations, switch headers, closing braces and other returns).
3824 int AffixMgr::parse_maptable(char * line
, FileMgr
* af
)
3827 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3834 piece
= mystrsep(&tp
, 0);
3836 if (*piece
!= '\0') {
// First field: only counted (np++), its text is not examined here.
3838 case 0: { np
++; break; }
// Entry count announced by the header.
3840 nummap
= atoi(piece
);
3842 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
// One mapentry per announced row.
3845 maptable
= (mapentry
*) malloc(nummap
* sizeof(struct mapentry
));
3846 if (!maptable
) return 1;
3854 piece
= mystrsep(&tp
, 0);
3857 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3861 /* now parse the nummap lines to read in the remainder of the table */
3863 for (int j
=0; j
< nummap
; j
++) {
3864 if (!(nl
= af
->getline())) return 1;
// Start each row from an empty entry.
3868 maptable
[j
].set
= NULL
;
3869 maptable
[j
].len
= 0;
3870 piece
= mystrsep(&tp
, 0);
3872 if (*piece
!= '\0') {
// Every row has to repeat the MAP keyword.
3875 if (strncmp(piece
,"MAP",3) != 0) {
3876 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// The fields are cleared once more (now including set_utf16) before the
// character set is stored.
3883 maptable
[j
].len
= 0;
3884 maptable
[j
].set
= NULL
;
3885 maptable
[j
].set_utf16
= NULL
;
// Byte-string form of the set: a private copy, with len counting its bytes.
3887 maptable
[j
].set
= mystrdup(piece
);
3888 maptable
[j
].len
= strlen(maptable
[j
].set
);
// Wide form of the set: convert into a stack buffer with u8_u16(), sort it,
// then keep a heap copy; len is then the count n returned by u8_u16().
// NOTE(review): the branch that chooses between the two forms and any check
// of n (original line 3892) are not visible here - confirm n > 0 is enforced
// before flag_qsort / malloc / memcpy use it.
3890 w_char w
[MAXWORDLEN
];
3891 int n
= u8_u16(w
, MAXWORDLEN
, piece
);
3893 flag_qsort((unsigned short *) w
, 0, n
);
3894 maptable
[j
].set_utf16
= (w_char
*) malloc(n
* sizeof(w_char
));
3895 if (!maptable
[j
].set_utf16
) return 1;
3896 memcpy(maptable
[j
].set_utf16
, w
, n
* sizeof(w_char
));
3898 maptable
[j
].len
= n
;
3905 piece
= mystrsep(&tp
, 0);
// A row is rejected when it stored neither form of the set, or an empty one.
3907 if ((!(maptable
[j
].set
|| maptable
[j
].set_utf16
)) || (!(maptable
[j
].len
))) {
3908 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3916 /* parse in the word breakpoint table */
// Reads the word breakpoint table. `line` is the table's header line; the
// entry count taken from it with atoi() is stored in numbreak and sizes the
// breaktable array of string pointers. Each of the following numbreak lines
// is fetched with af->getline() and must begin with "BREAK"; the stored
// value is a mystrdup() copy of a field of that line.
// Returns 1 when breaktable cannot be allocated or af->getline() yields no
// line.
// NOTE(review): this listing is an excerpt - the numeric prefixes are the
// original line numbers and the gaps between them are lines not shown (local
// declarations, switch headers, closing braces and other returns).
3917 int AffixMgr::parse_breaktable(char * line
, FileMgr
* af
)
// A non-zero numbreak means a table has already been read.
3919 if (numbreak
!= 0) {
3920 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3927 piece
= mystrsep(&tp
, 0);
3929 if (*piece
!= '\0') {
// First field: only counted (np++), its text is not examined here.
3931 case 0: { np
++; break; }
// Entry count announced by the header.
3933 numbreak
= atoi(piece
);
3935 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
// One string slot per announced row.
3938 breaktable
= (char **) malloc(numbreak
* sizeof(char *));
3939 if (!breaktable
) return 1;
3947 piece
= mystrsep(&tp
, 0);
3950 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3954 /* now parse the numbreak lines to read in the remainder of the table */
3956 for (int j
=0; j
< numbreak
; j
++) {
3957 if (!(nl
= af
->getline())) return 1;
3961 piece
= mystrsep(&tp
, 0);
3963 if (*piece
!= '\0') {
// Every row has to repeat the BREAK keyword.
3966 if (strncmp(piece
,"BREAK",5) != 0) {
3967 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// The table keeps its own copy of the field.
// NOTE(review): the result of mystrdup() is not checked in the visible
// lines - confirm the elided check at original lines 3982-3983.
3974 breaktable
[j
] = mystrdup(piece
);
3981 piece
= mystrsep(&tp
, 0);
3984 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// Rewrites the condition string `piece` in place. The scan runs from the
// last character back to the first; the visible writes either replace *k or
// store into the slot to its right, *(k+1), depending on `neg`.
// NOTE(review): the per-character case labels (original lines 3995-4010)
// are not part of this excerpt, so which input character triggers each
// assignment cannot be read off here. The numeric prefixes are the original
// line numbers.
3992 void AffixMgr::reverse_condition(char * piece
) {
// k starts on the last character and walks towards the start of the string.
3994 for (char * k
= piece
+ strlen(piece
) - 1; k
>= piece
; k
--) {
// With neg set the '[' goes one slot to the right; otherwise *k becomes ']'.
3997 if (neg
) *(k
+1) = '['; else *k
= ']';
4002 if (neg
) *(k
+1) = '^';
// neg is switched on when the character to the right of k is ']'; otherwise
// the current character is copied one slot to the right.
4007 if (*(k
+1) == ']') neg
= 1; else *(k
+1) = *k
;
4011 if (neg
) *(k
+1) = *k
;
// Parses one affix class. `line` is the class header, whose pieces are the
// affix type, the affix flag, the cross product indicator and the number of
// entries; the numents entry lines that follow are fetched with
// af->getline(). `at` is compared with 'S' and 'P' when recording duplicate
// definitions in dupflags, which is indexed by the decoded flag.
// For every entry the strip string, the affix string (optionally followed
// by '/' and continuation flags), the condition and the morphological field
// are stored in an affentry; afterwards each affentry is wrapped in a
// PfxEntry or SfxEntry and handed to build_pfxtree() / build_sfxtree().
// Returns 1 when af->getline() yields no line, when encodeit() fails, or
// when the morphological field cannot be copied.
// NOTE(review): this listing is an excerpt - the numeric prefixes are the
// original line numbers and the gaps between them are lines not shown (local
// declarations, switch headers, closing braces and other returns).
4017 int AffixMgr::parse_affix(char * line
, const char at
, FileMgr
* af
, char * dupflags
)
4019 int numents
= 0; // number of affentry structures to parse
4021 unsigned short aflag
= 0; // affix char identifier
4024 struct affentry
* ptr
= NULL
;
4025 struct affentry
* nptr
= NULL
;
4032 // checking lines with bad syntax
4034 int basefieldnum
= 0;
4037 // split affix header line into pieces
4041 piece
= mystrsep(&tp
, 0);
4043 if (*piece
!= '\0') {
4045 // piece 1 - is type of affix
4046 case 0: { np
++; break; }
4048 // piece 2 - is affix char
4051 aflag
= pHMgr
->decode_flag(piece
);
// dupflags[aflag] is tested against the dupSFX / dupPFX masks to spot a
// flag that already has a definition of the same kind.
4052 if (((at
== 'S') && (dupflags
[aflag
] & dupSFX
)) ||
4053 ((at
== 'P') && (dupflags
[aflag
] & dupPFX
))) {
4054 HUNSPELL_WARNING(stderr
, "error: line %d: multiple definitions of an affix flag\n",
4056 // return 1; XXX permissive mode for bad dictionaries
// NOTE(review): the mark is added with += although it is tested with &
// above; with the early return commented out, a repeated definition adds
// the same value a second time - confirm whether |= was intended.
4058 dupflags
[aflag
] += (char) ((at
== 'S') ? dupSFX
: dupPFX
);
4061 // piece 3 - is cross product indicator
4062 case 2: { np
++; if (*piece
== 'Y') ff
= aeXPRODUCT
; break; }
4064 // piece 4 - is number of affentries
4067 numents
= atoi(piece
);
4069 char * err
= pHMgr
->encode_flag(aflag
);
4071 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n",
// One affentry per announced entry. The first element also carries the
// option bits and flag that later entries copy from it.
4077 ptr
= (struct affentry
*) malloc(numents
* sizeof(struct affentry
));
4080 if (utf8
) ptr
->opts
+= aeUTF8
;
4081 if (pHMgr
->is_aliasf()) ptr
->opts
+= aeALIASF
;
4082 if (pHMgr
->is_aliasm()) ptr
->opts
+= aeALIASM
;
4090 piece
= mystrsep(&tp
, 0);
4092 // check to make sure we parsed enough pieces
4094 char * err
= pHMgr
->encode_flag(aflag
);
4096 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
4103 // store away ptr to first affentry
4106 // now parse numents affentries for this affix
4107 for (int j
=0; j
< numents
; j
++) {
4108 if (!(nl
= af
->getline())) return 1;
4114 // split line into pieces
4115 piece
= mystrsep(&tp
, 0);
4117 if (*piece
!= '\0') {
4119 // piece 1 - is type
// Entries after the first take over only the class-wide option bits of the
// first entry.
4122 if (nptr
!= ptr
) nptr
->opts
= ptr
->opts
&
4123 (char) (aeXPRODUCT
+ aeUTF8
+ aeALIASF
+ aeALIASM
);
4127 // piece 2 - is affix char
// The flag on every entry line has to match the flag from the header.
4130 if (pHMgr
->decode_flag(piece
) != aflag
) {
4131 char * err
= pHMgr
->encode_flag(aflag
);
4133 HUNSPELL_WARNING(stderr
, "error: line %d: affix %s is corrupt\n",
4134 af
->getlinenum(), err
);
4140 if (nptr
!= ptr
) nptr
->aflag
= ptr
->aflag
;
4144 // piece 3 - is string to strip or 0 for null
// With complexprefixes the field is reversed before it is stored.
4147 if (complexprefixes
) {
4148 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
4150 nptr
->strip
= mystrdup(piece
);
4151 nptr
->stripl
= (unsigned char) strlen(nptr
->strip
);
// A literal "0" is replaced by the empty string.
4152 if (strcmp(nptr
->strip
,"0") == 0) {
4154 nptr
->strip
=mystrdup("");
4160 // piece 4 - is affix string or 0 for null
4163 nptr
->morphcode
= NULL
;
4164 nptr
->contclass
= NULL
;
4165 nptr
->contclasslen
= 0;
// A '/' separates the affix text from the continuation flags read from
// dash + 1 below.
4167 dash
= strchr(piece
, '/');
4173 remove_ignored_chars_utf(piece
, ignorechars_utf16
, ignorechars_utf16_len
);
4175 remove_ignored_chars(piece
,ignorechars
);
4179 if (complexprefixes
) {
4180 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
4182 nptr
->appnd
= mystrdup(piece
);
// The text after the slash is either an alias index or a flag list that is
// decoded and sorted.
4184 if (pHMgr
->is_aliasf()) {
4185 int index
= atoi(dash
+ 1);
4186 nptr
->contclasslen
= (unsigned short) pHMgr
->get_aliasf(index
, &(nptr
->contclass
), af
);
4187 if (!nptr
->contclasslen
) HUNSPELL_WARNING(stderr
, "error: bad affix flag alias: \"%s\"\n", dash
+1);
4189 nptr
->contclasslen
= (unsigned short) pHMgr
->decode_flags(&(nptr
->contclass
), dash
+ 1, af
);
4190 flag_qsort(nptr
->contclass
, 0, nptr
->contclasslen
);
// Record every flag that occurs as a continuation class.
4195 for (unsigned short _i
= 0; _i
< nptr
->contclasslen
; _i
++) {
4196 contclasses
[(nptr
->contclass
)[_i
]] = 1;
4201 remove_ignored_chars_utf(piece
, ignorechars_utf16
, ignorechars_utf16_len
);
4203 remove_ignored_chars(piece
,ignorechars
);
4207 if (complexprefixes
) {
4208 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
4210 nptr
->appnd
= mystrdup(piece
);
4213 nptr
->appndl
= (unsigned char) strlen(nptr
->appnd
);
// A literal "0" is replaced by the empty string.
4214 if (strcmp(nptr
->appnd
,"0") == 0) {
4216 nptr
->appnd
=mystrdup("");
4222 // piece 5 - is the conditions descriptions
// With complexprefixes the condition text is reversed and then passed
// through reverse_condition().
4225 if (complexprefixes
) {
4226 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
4227 reverse_condition(piece
);
// Only a non-empty strip string with a condition other than "." is checked
// for redundancy.
4229 if (nptr
->stripl
&& (strcmp(piece
, ".") != 0) &&
4230 redundant_condition(at
, nptr
->strip
, nptr
->stripl
, piece
, af
->getlinenum()))
4234 reverse_condition(piece
);
4236 if (encodeit(nptr
, piece
)) return 1;
// Morphological field: looked up by alias index, or stored as a private copy
// of the text.
4242 if (pHMgr
->is_aliasm()) {
4243 int index
= atoi(piece
);
4244 nptr
->morphcode
= pHMgr
->get_aliasm(index
);
4246 if (complexprefixes
) { // XXX - fix me for morph. gen.
4247 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
4249 // add the remaining of the line
4252 tp
= tp
+ strlen(tp
);
4254 nptr
->morphcode
= mystrdup(piece
);
4255 if (!nptr
->morphcode
) return 1;
4263 piece
= mystrsep(&tp
, 0);
4265 // check to make sure we parsed enough pieces
4267 char * err
= pHMgr
->encode_flag(aflag
);
4269 HUNSPELL_WARNING(stderr
, "error: line %d: affix %s is corrupt\n",
4270 af
->getlinenum(), err
);
4278 // detect unnecessary fields, excepting comments
// An entry counts as 6 fields when it has a morphcode that does not start
// with '#', and as 5 otherwise; the same expression feeds basefieldnum.
4280 int fieldnum
= !(nptr
->morphcode
) ? 5 : ((*(nptr
->morphcode
)=='#') ? 5 : 6);
4281 if (fieldnum
!= basefieldnum
)
4282 HUNSPELL_WARNING(stderr
, "warning: line %d: bad field number\n", af
->getlinenum());
4284 basefieldnum
= !(nptr
->morphcode
) ? 5 : ((*(nptr
->morphcode
)=='#') ? 5 : 6);
4290 // now create SfxEntry or PfxEntry objects and use links to
4291 // build an ordered (sorted by affix string) list
4293 for (int k
= 0; k
< numents
; k
++) {
4295 PfxEntry
* pfxptr
= new PfxEntry(this,nptr
);
4296 build_pfxtree((AffEntry
*)pfxptr
);
4298 SfxEntry
* sfxptr
= new SfxEntry(this,nptr
);
4299 build_sfxtree((AffEntry
*)sfxptr
);
// Checks an affix condition against the characters the affix strips.
//   ft      - 'P' selects the prefix branch (match from the start); any
//             other value takes the branch that matches from the end
//   strip   - stripping characters, stripl of them
//   cond    - condition text; '[' ... ']' encloses a character class and a
//             '^' right after '[' negates it
//   linenum - line number printed in the diagnostics
// Returns 1 when the condition is fully covered by the strip characters: a
// direct string match, or the scan ran through the whole condition (j >=
// condl in the prefix branch, j < 0 in the other) with every literal or
// class accepting its strip character. A warning is printed when a strip
// character contradicts the condition, and an error when a class is left
// unterminated.
// NOTE(review): this listing is an excerpt - the numeric prefixes are the
// original line numbers and the gaps between them are lines not shown
// (including the declarations of i, j, neg, in and the returns that follow
// the diagnostics).
4307 int AffixMgr::redundant_condition(char ft
, char * strip
, int stripl
, const char * cond
, int linenum
) {
4308 int condl
= strlen(cond
);
4313 if (ft
== 'P') { // prefix
// Strip string starts with the whole condition: nothing left to check.
4314 if (strncmp(strip
, cond
, condl
) == 0) return 1;
4317 for (i
= 0, j
= 0; (i
< stripl
) && (j
< condl
); i
++, j
++) {
4318 if (cond
[j
] != '[') {
// Literal condition character: it has to equal the strip character.
4319 if (cond
[j
] != strip
[i
]) {
4320 HUNSPELL_WARNING(stderr
, "warning: line %d: incompatible stripping characters and condition\n", linenum
);
// Character class: note negation, then look for strip[i] among its members.
4324 neg
= (cond
[j
+1] == '^') ? 1 : 0;
4328 if (strip
[i
] == cond
[j
]) in
= 1;
4329 } while ((j
< (condl
- 1)) && (cond
[j
] != ']'));
4330 if (j
== (condl
- 1) && (cond
[j
] != ']')) {
// The format has a %s after the %d, so the condition text is passed too;
// previously the %s had no matching argument.
4331 HUNSPELL_WARNING(stderr
, "error: line %d: missing ] in condition:\n%s\n", linenum
, cond
);
4334 if ((!neg
&& !in
) || (neg
&& in
)) {
4335 HUNSPELL_WARNING(stderr
, "warning: line %d: incompatible stripping characters and condition\n", linenum
);
4340 if (j
>= condl
) return 1;
// Strip string ends with the whole condition: nothing left to check.
4343 if ((stripl
>= condl
) && strcmp(strip
+ stripl
- condl
, cond
) == 0) return 1;
4346 for (i
= stripl
- 1, j
= condl
- 1; (i
>= 0) && (j
>= 0); i
--, j
--) {
4347 if (cond
[j
] != ']') {
// Literal condition character: it has to equal the strip character.
4348 if (cond
[j
] != strip
[i
]) {
4349 HUNSPELL_WARNING(stderr
, "warning: line %d: incompatible stripping characters and condition\n", linenum
);
// Character class, scanned backwards from ']' towards its '['.
4356 if (strip
[i
] == cond
[j
]) in
= 1;
4357 } while ((j
> 0) && (cond
[j
] != '['));
4358 if ((j
== 0) && (cond
[j
] != '[')) {
// The backward scan ran out of text without meeting '[', so that is the
// bracket reported. The prefix now reads "error: line" like the other
// diagnostics, and cond is supplied for the %s.
4359 HUNSPELL_WARNING(stderr
, "error: line %d: missing [ in condition:\n%s\n", linenum
, cond
);
4362 neg
= (cond
[j
+1] == '^') ? 1 : 0;
4363 if ((!neg
&& !in
) || (neg
&& in
)) {
4364 HUNSPELL_WARNING(stderr
, "warning: line %d: incompatible stripping characters and condition\n", linenum
);
4369 if (j
< 0) return 1;