1 /******* BEGIN LICENSE BLOCK *******
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 * and László Németh (Hunspell). Portions created by the Initial Developers
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 * David Einstein (deinst@world.std.com)
20 * László Németh (nemethl@gyorsposta.hu)
43 * Alternatively, the contents of this file may be used under the terms of
44 * either the GNU General Public License Version 2 or later (the "GPL"), or
45 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
46 * in which case the provisions of the GPL or the LGPL are applicable instead
47 * of those above. If you wish to allow use of your version of this file only
48 * under the terms of either the GPL or the LGPL, and not to allow others to
49 * use your version of this file under the terms of the MPL, indicate your
50 * decision by deleting the provisions above and replace them with the notice
51 * and other provisions required by the GPL or the LGPL. If you do not delete
52 * the provisions above, a recipient may use your version of this file under
53 * the terms of any one of the MPL, the GPL or the LGPL.
55 ******* END LICENSE BLOCK *******/
57 #ifndef MOZILLA_CLIENT
69 #include "hunspell.hxx"
71 #ifndef MOZILLA_CLIENT
77 Hunspell::Hunspell(const char * affpath
, const char * dpath
, const char * key
)
83 affixpath
= mystrdup(affpath
);
86 /* first set up the hash manager */
87 pHMgr
[0] = new HashMgr(dpath
, affpath
, key
);
88 if (pHMgr
[0]) maxdic
= 1;
90 /* next set up the affix manager */
91 /* it needs access to the hash manager lookup methods */
92 pAMgr
= new AffixMgr(affpath
, pHMgr
, &maxdic
, key
);
94 /* get the preferred try string and the dictionary */
95 /* encoding from the Affix Manager for that dictionary */
96 char * try_string
= pAMgr
->get_try_string();
97 encoding
= pAMgr
->get_encoding();
98 csconv
= get_current_cs(encoding
);
99 langnum
= pAMgr
->get_langnum();
100 utf8
= pAMgr
->get_utf8();
101 complexprefixes
= pAMgr
->get_complexprefixes();
102 wordbreak
= pAMgr
->get_breaktable();
104 /* and finally set up the suggestion manager */
105 pSMgr
= new SuggestMgr(try_string
, MAXSUGGESTION
, pAMgr
);
106 if (try_string
) free(try_string
);
109 Hunspell::~Hunspell()
111 if (pSMgr
) delete pSMgr
;
112 if (pAMgr
) delete pAMgr
;
113 for (int i
= 0; i
< maxdic
; i
++) delete pHMgr
[i
];
117 #ifdef MOZILLA_CLIENT
121 if (encoding
) free(encoding
);
123 if (affixpath
) free(affixpath
);
127 // load extra dictionaries
128 int Hunspell::add_dic(const char * dpath
, const char * key
) {
129 if (maxdic
== MAXDIC
|| !affixpath
) return 1;
130 pHMgr
[maxdic
] = new HashMgr(dpath
, affixpath
, key
);
131 if (pHMgr
[maxdic
]) maxdic
++; else return 1;
135 // make a copy of src at destination while removing all leading
136 // blanks and removing any trailing periods after recording
137 // their presence with the abbreviation flag
138 // also since already going through character by character,
139 // set the capitalization type
140 // return the length of the "cleaned" (and UTF-8 encoded) word
142 int Hunspell::cleanword2(char * dest
, const char * src
,
143 w_char
* dest_utf
, int * nc
, int * pcaptype
, int * pabbrev
)
145 unsigned char * p
= (unsigned char *) dest
;
146 const unsigned char * q
= (const unsigned char * ) src
;
148 // first skip over any leading blanks
149 while ((*q
!= '\0') && (*q
== ' ')) q
++;
151 // now strip off any trailing periods (recording their presence)
153 int nl
= strlen((const char *)q
);
154 while ((nl
> 0) && (*(q
+nl
-1)=='.')) {
159 // if no characters are left it can't be capitalized
166 strncpy(dest
, (char *) q
, nl
);
170 *nc
= u8_u16(dest_utf
, MAXWORDLEN
, dest
);
171 // don't check too long words
172 if (*nc
>= MAXWORDLEN
) return 0;
173 if (*nc
== -1) { // big Unicode character (non BMP area)
177 *pcaptype
= get_captype_utf8(dest_utf
, *nc
, langnum
);
179 *pcaptype
= get_captype(dest
, nl
, csconv
);
185 int Hunspell::cleanword(char * dest
, const char * src
,
186 int * pcaptype
, int * pabbrev
)
188 unsigned char * p
= (unsigned char *) dest
;
189 const unsigned char * q
= (const unsigned char * ) src
;
192 // first skip over any leading blanks
193 while ((*q
!= '\0') && (*q
== ' ')) q
++;
195 // now strip off any trailing periods (recording their presence)
197 int nl
= strlen((const char *)q
);
198 while ((nl
> 0) && (*(q
+nl
-1)=='.')) {
203 // if no characters are left it can't be capitalized
210 // now determine the capitalization type of the first nl letters
218 if (csconv
[(*q
)].ccase
) ncap
++;
219 if (csconv
[(*q
)].cupper
== csconv
[(*q
)].clower
) nneutral
++;
223 // remember to terminate the destination string
225 firstcap
= csconv
[(unsigned char)(*dest
)].ccase
;
228 w_char t
[MAXWORDLEN
];
229 nc
= u8_u16(t
, MAXWORDLEN
, src
);
230 for (int i
= 0; i
< nc
; i
++) {
231 idx
= (t
[i
].h
<< 8) + t
[i
].l
;
232 unsigned short low
= unicodetolower(idx
, langnum
);
233 if (idx
!= low
) ncap
++;
234 if (unicodetoupper(idx
, langnum
) == low
) nneutral
++;
236 u16_u8(dest
, MAXWORDUTF8LEN
, t
, nc
);
238 idx
= (t
[0].h
<< 8) + t
[0].l
;
239 firstcap
= (idx
!= unicodetolower(idx
, langnum
));
243 // now finally set the captype
246 } else if ((ncap
== 1) && firstcap
) {
248 } else if ((ncap
== nc
) || ((ncap
+ nneutral
) == nc
)){
250 } else if ((ncap
> 1) && firstcap
) {
251 *pcaptype
= HUHINITCAP
;
258 void Hunspell::mkallcap(char * p
)
261 w_char u
[MAXWORDLEN
];
262 int nc
= u8_u16(u
, MAXWORDLEN
, p
);
264 for (int i
= 0; i
< nc
; i
++) {
265 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
266 if (idx
!= unicodetoupper(idx
, langnum
)) {
267 u
[i
].h
= (unsigned char) (unicodetoupper(idx
, langnum
) >> 8);
268 u
[i
].l
= (unsigned char) (unicodetoupper(idx
, langnum
) & 0x00FF);
271 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
274 *p
= csconv
[((unsigned char) *p
)].cupper
;
280 int Hunspell::mkallcap2(char * p
, w_char
* u
, int nc
)
284 for (int i
= 0; i
< nc
; i
++) {
285 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
286 unsigned short up
= unicodetoupper(idx
, langnum
);
288 u
[i
].h
= (unsigned char) (up
>> 8);
289 u
[i
].l
= (unsigned char) (up
& 0x00FF);
292 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
296 *p
= csconv
[((unsigned char) *p
)].cupper
;
304 void Hunspell::mkallsmall(char * p
)
307 *p
= csconv
[((unsigned char) *p
)].clower
;
312 int Hunspell::mkallsmall2(char * p
, w_char
* u
, int nc
)
316 for (int i
= 0; i
< nc
; i
++) {
317 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
318 unsigned short low
= unicodetolower(idx
, langnum
);
320 u
[i
].h
= (unsigned char) (low
>> 8);
321 u
[i
].l
= (unsigned char) (low
& 0x00FF);
324 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
328 *p
= csconv
[((unsigned char) *p
)].clower
;
335 // convert UTF-8 sharp S codes to latin 1
336 char * Hunspell::sharps_u8_l1(char * dest
, char * source
) {
339 for (p
++, source
++; *(source
- 1); p
++, source
++) {
341 if (*source
== '\x9F') *--p
= '\xDF';
346 // recursive search for right ss - sharp s permutations
347 hentry
* Hunspell::spellsharps(char * base
, char * pos
, int n
,
348 int repnum
, char * tmp
, int * info
, char **root
) {
349 pos
= strstr(pos
, "ss");
350 if (pos
&& (n
< MAXSHARPS
)) {
353 hentry
* h
= spellsharps(base
, pos
+ 2, n
+ 1, repnum
+ 1, tmp
, info
, root
);
357 h
= spellsharps(base
, pos
+ 2, n
+ 1, repnum
, tmp
, info
, root
);
359 } else if (repnum
> 0) {
360 if (utf8
) return checkword(base
, info
, root
);
361 return checkword(sharps_u8_l1(tmp
, base
), info
, root
);
366 int Hunspell::is_keepcase(const hentry
* rv
) {
367 return pAMgr
&& rv
->astr
&& pAMgr
->get_keepcase() &&
368 TESTAFF(rv
->astr
, pAMgr
->get_keepcase(), rv
->alen
);
371 /* insert a word to the beginning of the suggestion array and return ns */
372 int Hunspell::insert_sug(char ***slst
, char * word
, int ns
) {
373 char * dup
= mystrdup(word
);
375 if (ns
== MAXSUGGESTION
) {
379 for (int k
= ns
; k
> 0; k
--) (*slst
)[k
] = (*slst
)[k
- 1];
384 int Hunspell::spell(const char * word
, int * info
, char ** root
)
386 struct hentry
* rv
=NULL
;
387 // A larger buffer is needed: for example, mkallsmall converts the Turkish
388 // capital letter I into a 2-byte UTF-8 character (dotless i).
389 char cw
[MAXWORDUTF8LEN
];
390 char wspace
[MAXWORDUTF8LEN
];
391 w_char unicw
[MAXWORDLEN
];
392 // Hunspell supports XML input of the simplified API (see manual)
393 if (strcmp(word
, SPELL_XML
) == 0) return 1;
394 int nc
= strlen(word
);
397 if (nc
>= MAXWORDUTF8LEN
) return 0;
399 if (nc
>= MAXWORDLEN
) return 0;
406 RepList
* rl
= (pAMgr
) ? pAMgr
->get_iconvtable() : NULL
;
407 if (rl
&& rl
->conv(word
, wspace
)) wl
= cleanword2(cw
, wspace
, unicw
, &nc
, &captype
, &abbv
);
408 else wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
411 if (wl
== 0 || maxdic
== 0) return 1;
412 if (root
) *root
= NULL
;
414 // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.)
415 enum { NBEGIN
, NNUM
, NSEP
};
419 for (i
= 0; (i
< wl
); i
++) {
420 if ((cw
[i
] <= '9') && (cw
[i
] >= '0')) {
422 } else if ((cw
[i
] == ',') || (cw
[i
] == '.') || (cw
[i
] == '-')) {
423 if ((nstate
== NSEP
) || (i
== 0)) break;
427 if ((i
== wl
) && (nstate
== NNUM
)) return 1;
428 if (!info
) info
= &info2
; else *info
= 0;
434 rv
= checkword(cw
, info
, root
);
435 if ((abbv
) && !(rv
)) {
436 memcpy(wspace
,cw
,wl
);
438 *(wspace
+wl
+1) = '\0';
439 rv
= checkword(wspace
, info
, root
);
444 rv
= checkword(cw
, info
, root
);
447 memcpy(wspace
,cw
,wl
);
449 *(wspace
+wl
+1) = '\0';
450 rv
= checkword(wspace
, info
, root
);
453 // Spec. prefix handling for Catalan, French, Italian:
454 // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
455 if (pAMgr
&& strchr(cw
, '\'')) {
456 wl
= mkallsmall2(cw
, unicw
, nc
);
457 char * apostrophe
= strchr(cw
, '\'');
459 w_char tmpword
[MAXWORDLEN
];
461 wl2
= u8_u16(tmpword
, MAXWORDLEN
, cw
);
464 mkinitcap2(apostrophe
+ 1, unicw
+ wl2
+ 1, nc
- wl2
- 1);
465 rv
= checkword(cw
, info
, root
);
469 mkinitcap2(apostrophe
+ 1, unicw
, nc
);
470 rv
= checkword(cw
, info
, root
);
473 mkinitcap2(cw
, unicw
, nc
);
474 rv
= checkword(cw
, info
, root
);
477 if (pAMgr
&& pAMgr
->get_checksharps() && strstr(cw
, "SS")) {
478 char tmpword
[MAXWORDUTF8LEN
];
479 wl
= mkallsmall2(cw
, unicw
, nc
);
480 memcpy(wspace
,cw
,(wl
+1));
481 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
483 wl2
= mkinitcap2(cw
, unicw
, nc
);
484 rv
= spellsharps(cw
, cw
, 0, 0, tmpword
, info
, root
);
486 if ((abbv
) && !(rv
)) {
488 *(wspace
+wl
+1) = '\0';
489 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
491 memcpy(wspace
, cw
, wl2
);
493 *(wspace
+wl2
+1) = '\0';
494 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
501 wl
= mkallsmall2(cw
, unicw
, nc
);
502 memcpy(wspace
,cw
,(wl
+1));
503 wl2
= mkinitcap2(cw
, unicw
, nc
);
504 if (captype
== INITCAP
) *info
+= SPELL_INITCAP
;
505 rv
= checkword(cw
, info
, root
);
506 if (captype
== INITCAP
) *info
-= SPELL_INITCAP
;
507 // forbid bad capitalization
508 // (for example, ijs -> Ijs instead of IJs in Dutch)
509 // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
510 if (*info
& SPELL_FORBIDDEN
) {
514 if (rv
&& is_keepcase(rv
) && (captype
== ALLCAP
)) rv
= NULL
;
517 rv
= checkword(wspace
, info
, root
);
521 *(wspace
+wl
+1) = '\0';
522 rv
= checkword(wspace
, info
, root
);
524 memcpy(wspace
, cw
, wl2
);
526 *(wspace
+wl2
+1) = '\0';
527 if (captype
== INITCAP
) *info
+= SPELL_INITCAP
;
528 rv
= checkword(wspace
, info
, root
);
529 if (captype
== INITCAP
) *info
-= SPELL_INITCAP
;
530 if (rv
&& is_keepcase(rv
) && (captype
== ALLCAP
)) rv
= NULL
;
534 if (rv
&& is_keepcase(rv
) &&
535 ((captype
== ALLCAP
) ||
536 // if CHECKSHARPS: KEEPCASE words with \xDF are allowed
537 // in INITCAP form, too.
538 !(pAMgr
->get_checksharps() &&
539 ((utf8
&& strstr(wspace
, "\xC3\x9F")) ||
540 (!utf8
&& strchr(wspace
, '\xDF')))))) rv
= NULL
;
547 // recursive breaking at break points
553 int numbreak
= pAMgr
? pAMgr
->get_numbreak() : 0;
554 // check boundary patterns (^begin and end$)
555 for (int j
= 0; j
< numbreak
; j
++) {
556 int plen
= strlen(wordbreak
[j
]);
557 if (plen
== 1 || plen
> wl
) continue;
558 if (wordbreak
[j
][0] == '^' && strncmp(cw
, wordbreak
[j
] + 1, plen
- 1) == 0
559 && spell(cw
+ plen
- 1)) return 1;
560 if (wordbreak
[j
][plen
- 1] == '$' &&
561 strncmp(cw
+ wl
- plen
+ 1, wordbreak
[j
], plen
- 1) == 0) {
562 r
= cw
[wl
- plen
+ 1];
563 cw
[wl
- plen
+ 1] = '\0';
564 if (spell(cw
)) return 1;
565 cw
[wl
- plen
+ 1] = r
;
569 for (int j
= 0; j
< numbreak
; j
++) {
571 int plen
= strlen(wordbreak
[j
]);
572 s
=(char *) strstr(cw
, wordbreak
[j
]);
573 if (s
&& (s
> cw
) && (s
< cw
+ wl
- plen
)) {
574 if (!spell(s
+ plen
)) continue;
577 // examine 2 sides of the break point
578 if (spell(cw
)) return 1;
581 // LANG_hu: spec. dash rule
582 if (langnum
== LANG_hu
&& strcmp(wordbreak
[j
], "-") == 0) {
585 if (spell(cw
)) return 1; // check the first part with dash
588 // end of LANG specific region
597 struct hentry
* Hunspell::checkword(const char * w
, int * info
, char ** root
)
599 struct hentry
* he
= NULL
;
601 char w2
[MAXWORDUTF8LEN
];
604 char * ignoredchars
= pAMgr
->get_ignore();
605 if (ignoredchars
!= NULL
) {
608 int ignoredchars_utf16_len
;
609 unsigned short * ignoredchars_utf16
= pAMgr
->get_ignore_utf16(&ignoredchars_utf16_len
);
610 remove_ignored_chars_utf(w2
, ignoredchars_utf16
, ignoredchars_utf16_len
);
612 remove_ignored_chars(w2
,ignoredchars
);
617 // word reversing wrapper for complex prefixes
618 if (complexprefixes
) {
623 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
626 // look word in hash table
627 for (i
= 0; (i
< maxdic
) && !he
; i
++) {
628 he
= (pHMgr
[i
])->lookup(word
);
630 // check forbidden and onlyincompound words
631 if ((he
) && (he
->astr
) && (pAMgr
) && TESTAFF(he
->astr
, pAMgr
->get_forbiddenword(), he
->alen
)) {
632 if (info
) *info
+= SPELL_FORBIDDEN
;
633 // LANG_hu section: set dash information for suggestions
634 if (langnum
== LANG_hu
) {
635 if (pAMgr
->get_compoundflag() &&
636 TESTAFF(he
->astr
, pAMgr
->get_compoundflag(), he
->alen
)) {
637 if (info
) *info
+= SPELL_COMPOUND
;
643 // he = next not needaffix, onlyincompound homonym or onlyupcase word
644 while (he
&& (he
->astr
) &&
645 ((pAMgr
->get_needaffix() && TESTAFF(he
->astr
, pAMgr
->get_needaffix(), he
->alen
)) ||
646 (pAMgr
->get_onlyincompound() && TESTAFF(he
->astr
, pAMgr
->get_onlyincompound(), he
->alen
)) ||
647 (info
&& (*info
& SPELL_INITCAP
) && TESTAFF(he
->astr
, ONLYUPCASEFLAG
, he
->alen
))
648 )) he
= he
->next_homonym
;
651 // check with affixes
653 // try stripping off affixes */
655 he
= pAMgr
->affix_check(word
, len
, 0);
657 // check compound restriction and onlyupcase
658 if (he
&& he
->astr
&& (
659 (pAMgr
->get_onlyincompound() &&
660 TESTAFF(he
->astr
, pAMgr
->get_onlyincompound(), he
->alen
)) ||
661 (info
&& (*info
& SPELL_INITCAP
) &&
662 TESTAFF(he
->astr
, ONLYUPCASEFLAG
, he
->alen
)))) {
667 if ((he
->astr
) && (pAMgr
) && TESTAFF(he
->astr
, pAMgr
->get_forbiddenword(), he
->alen
)) {
668 if (info
) *info
+= SPELL_FORBIDDEN
;
672 *root
= mystrdup(&(he
->word
));
673 if (*root
&& complexprefixes
) {
674 if (utf8
) reverseword_utf(*root
); else reverseword(*root
);
677 // try check compound word
678 } else if (pAMgr
->get_compound()) {
679 he
= pAMgr
->compound_check(word
, len
, 0, 0, 100, 0, NULL
, 0, 0);
680 // LANG_hu section: `moving rule' with last dash
681 if ((!he
) && (langnum
== LANG_hu
) && (word
[len
-1] == '-')) {
682 char * dup
= mystrdup(word
);
683 if (!dup
) return NULL
;
685 he
= pAMgr
->compound_check(dup
, len
-1, -5, 0, 100, 0, NULL
, 1, 0);
688 // end of LANG specific region
691 *root
= mystrdup(&(he
->word
));
692 if (*root
&& complexprefixes
) {
693 if (utf8
) reverseword_utf(*root
); else reverseword(*root
);
696 if (info
) *info
+= SPELL_COMPOUND
;
705 int Hunspell::suggest(char*** slst
, const char * word
)
708 char cw
[MAXWORDUTF8LEN
];
709 char wspace
[MAXWORDUTF8LEN
];
710 if (!pSMgr
|| maxdic
== 0) return 0;
711 w_char unicw
[MAXWORDLEN
];
713 // process XML input of the simplified API (see manual)
714 if (strncmp(word
, SPELL_XML
, sizeof(SPELL_XML
) - 3) == 0) {
715 return spellml(slst
, word
);
717 int nc
= strlen(word
);
719 if (nc
>= MAXWORDUTF8LEN
) return 0;
721 if (nc
>= MAXWORDLEN
) return 0;
728 RepList
* rl
= (pAMgr
) ? pAMgr
->get_iconvtable() : NULL
;
729 if (rl
&& rl
->conv(word
, wspace
)) wl
= cleanword2(cw
, wspace
, unicw
, &nc
, &captype
, &abbv
);
730 else wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
732 if (wl
== 0) return 0;
738 ns
= pSMgr
->suggest(slst
, cw
, ns
, &onlycmpdsug
);
744 ns
= pSMgr
->suggest(slst
, cw
, ns
, &onlycmpdsug
);
746 memcpy(wspace
,cw
,(wl
+1));
747 mkallsmall2(wspace
, unicw
, nc
);
748 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
754 ns
= pSMgr
->suggest(slst
, cw
, ns
, &onlycmpdsug
);
757 // something.The -> something. The
758 char * dot
= strchr(cw
, '.');
759 if (dot
&& (dot
> cw
)) {
762 w_char w_
[MAXWORDLEN
];
763 int wl_
= u8_u16(w_
, MAXWORDLEN
, dot
+ 1);
764 captype_
= get_captype_utf8(w_
, wl_
, langnum
);
765 } else captype_
= get_captype(dot
+1, strlen(dot
+1), csconv
);
766 if (captype_
== INITCAP
) {
767 char * st
= mystrdup(cw
);
768 if (st
) st
= (char *) realloc(st
, wl
+ 2);
770 st
[(dot
- cw
) + 1] = ' ';
771 strcpy(st
+ (dot
- cw
) + 2, dot
+ 1);
772 ns
= insert_sug(slst
, st
, ns
);
777 if (captype
== HUHINITCAP
) {
778 // TheOpenOffice.org -> The OpenOffice.org
779 memcpy(wspace
,cw
,(wl
+1));
780 mkinitsmall2(wspace
, unicw
, nc
);
781 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
783 memcpy(wspace
,cw
,(wl
+1));
784 mkallsmall2(wspace
, unicw
, nc
);
785 if (spell(wspace
)) ns
= insert_sug(slst
, wspace
, ns
);
787 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
788 if (captype
== HUHINITCAP
) {
789 mkinitcap2(wspace
, unicw
, nc
);
790 if (spell(wspace
)) ns
= insert_sug(slst
, wspace
, ns
);
791 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
793 // aNew -> "a New" (instead of "a new")
794 for (int j
= prevns
; j
< ns
; j
++) {
795 char * space
= strchr((*slst
)[j
],' ');
797 int slen
= strlen(space
+ 1);
798 // different case after space (need capitalisation)
799 if ((slen
< wl
) && strcmp(cw
+ wl
- slen
, space
+ 1)) {
800 w_char w
[MAXWORDLEN
];
802 char * r
= (*slst
)[j
];
803 if (utf8
) wc
= u8_u16(w
, MAXWORDLEN
, space
+ 1);
804 mkinitcap2(space
+ 1, w
, wc
);
805 // set as first suggestion
806 for (int k
= j
; k
> 0; k
--) (*slst
)[k
] = (*slst
)[k
- 1];
816 memcpy(wspace
, cw
, (wl
+1));
817 mkallsmall2(wspace
, unicw
, nc
);
818 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
820 if (pAMgr
&& pAMgr
->get_keepcase() && spell(wspace
))
821 ns
= insert_sug(slst
, wspace
, ns
);
822 mkinitcap2(wspace
, unicw
, nc
);
823 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
824 for (int j
=0; j
< ns
; j
++) {
825 mkallcap((*slst
)[j
]);
826 if (pAMgr
&& pAMgr
->get_checksharps()) {
829 pos
= strstr((*slst
)[j
], "\xC3\x9F");
833 pos
= strstr(pos
+2, "\xC3\x9F");
836 pos
= strchr((*slst
)[j
], '\xDF');
838 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 2);
839 mystrrep((*slst
)[j
], "\xDF", "SS");
840 pos
= strchr((*slst
)[j
], '\xDF');
849 // LANG_hu section: replace '-' with ' ' in Hungarian
850 if (langnum
== LANG_hu
) {
851 for (int j
=0; j
< ns
; j
++) {
852 char * pos
= strchr((*slst
)[j
],'-');
855 char w
[MAXWORDUTF8LEN
];
857 strcpy(w
, (*slst
)[j
]);
859 spell(w
, &info
, NULL
);
860 if ((info
& SPELL_COMPOUND
) && (info
& SPELL_FORBIDDEN
)) {
866 // END OF LANG_hu section
868 // try ngram approach since found nothing
869 if ((ns
== 0 || onlycmpdsug
) && pAMgr
&& (pAMgr
->get_maxngramsugs() != 0)) {
872 ns
= pSMgr
->ngsuggest(*slst
, cw
, ns
, pHMgr
, maxdic
);
878 memcpy(wspace
,cw
,(wl
+1));
879 mkallsmall2(wspace
, unicw
, nc
);
880 ns
= pSMgr
->ngsuggest(*slst
, wspace
, ns
, pHMgr
, maxdic
);
885 memcpy(wspace
,cw
,(wl
+1));
886 mkallsmall2(wspace
, unicw
, nc
);
887 ns
= pSMgr
->ngsuggest(*slst
, wspace
, ns
, pHMgr
, maxdic
);
891 memcpy(wspace
,cw
,(wl
+1));
892 mkallsmall2(wspace
, unicw
, nc
);
894 ns
= pSMgr
->ngsuggest(*slst
, wspace
, ns
, pHMgr
, maxdic
);
895 for (int j
= oldns
; j
< ns
; j
++)
896 mkallcap((*slst
)[j
]);
902 // try dash suggestion (Afo-American -> Afro-American)
903 if (strchr(cw
, '-')) {
904 char * pos
= strchr(cw
, '-');
910 for (int j
= 0; j
< ns
&& nodashsug
== 1; j
++) {
911 if (strchr((*slst
)[j
], '-')) nodashsug
= 0;
913 while (nodashsug
&& !last
) {
914 if (*pos
== '\0') last
= 1; else *pos
= '\0';
916 nn
= suggest(&nlst
, ppos
);
917 for (int j
= nn
- 1; j
>= 0; j
--) {
918 strncpy(wspace
, cw
, ppos
- cw
);
919 strcpy(wspace
+ (ppos
- cw
), nlst
[j
]);
922 strcat(wspace
, pos
+ 1);
924 ns
= insert_sug(slst
, wspace
, ns
);
927 if (nlst
!= NULL
) free(nlst
);
933 pos
= strchr(ppos
, '-');
935 if (!pos
) pos
= cw
+ strlen(cw
);
939 // word reversing wrapper for complex prefixes
940 if (complexprefixes
) {
941 for (int j
= 0; j
< ns
; j
++) {
942 if (utf8
) reverseword_utf((*slst
)[j
]); else reverseword((*slst
)[j
]);
947 if (capwords
) for (int j
=0; j
< ns
; j
++) {
948 mkinitcap((*slst
)[j
]);
951 // expand suggestions with dot(s)
952 if (abbv
&& pAMgr
&& pAMgr
->get_sugswithdots()) {
953 for (int j
= 0; j
< ns
; j
++) {
954 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 1 + abbv
);
955 strcat((*slst
)[j
], word
+ strlen(word
) - abbv
);
959 // remove bad capitalized and forbidden forms
960 if (pAMgr
&& (pAMgr
->get_keepcase() || pAMgr
->get_forbiddenword())) {
965 for (int j
=0; j
< ns
; j
++) {
966 if (!strchr((*slst
)[j
],' ') && !spell((*slst
)[j
])) {
971 len
= u8_u16(w
, MAXSWL
, (*slst
)[j
]);
973 strcpy(s
, (*slst
)[j
]);
976 mkallsmall2(s
, w
, len
);
979 (*slst
)[l
] = mystrdup(s
);
982 mkinitcap2(s
, w
, len
);
984 (*slst
)[l
] = mystrdup(s
);
989 (*slst
)[l
] = (*slst
)[j
];
998 // remove duplications
1000 for (int j
= 0; j
< ns
; j
++) {
1001 (*slst
)[l
] = (*slst
)[j
];
1002 for (int k
= 0; k
< l
; k
++) {
1003 if (strcmp((*slst
)[k
], (*slst
)[j
]) == 0) {
1011 // output conversion
1012 rl
= (pAMgr
) ? pAMgr
->get_oconvtable() : NULL
;
1013 for (int j
= 0; rl
&& j
< ns
; j
++) {
1014 if (rl
->conv((*slst
)[j
], wspace
)) {
1016 (*slst
)[j
] = mystrdup(wspace
);
1020 // if suggestions removed by nosuggest, onlyincompound parameters
1021 if (l
== 0 && *slst
) {
1028 void Hunspell::free_list(char *** slst
, int n
) {
1032 char * Hunspell::get_dic_encoding()
1037 #ifdef HUNSPELL_EXPERIMENTAL
1038 // XXX need UTF-8 support
1039 int Hunspell::suggest_auto(char*** slst
, const char * word
)
1041 char cw
[MAXWORDUTF8LEN
];
1042 char wspace
[MAXWORDUTF8LEN
];
1043 if (!pSMgr
|| maxdic
== 0) return 0;
1044 int wl
= strlen(word
);
1046 if (wl
>= MAXWORDUTF8LEN
) return 0;
1048 if (wl
>= MAXWORDLEN
) return 0;
1052 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1053 if (wl
== 0) return 0;
1055 *slst
= NULL
; // HU, nsug in pSMgr->suggest
1059 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
1065 memcpy(wspace
,cw
,(wl
+1));
1067 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
1068 for (int j
=0; j
< ns
; j
++)
1069 mkinitcap((*slst
)[j
]);
1070 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
1077 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
1079 memcpy(wspace
,cw
,(wl
+1));
1081 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
1087 memcpy(wspace
,cw
,(wl
+1));
1089 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
1092 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
1094 for (int j
=0; j
< ns
; j
++)
1095 mkallcap((*slst
)[j
]);
1100 // word reversing wrapper for complex prefixes
1101 if (complexprefixes
) {
1102 for (int j
= 0; j
< ns
; j
++) {
1103 if (utf8
) reverseword_utf((*slst
)[j
]); else reverseword((*slst
)[j
]);
1107 // expand suggestions with dot(s)
1108 if (abbv
&& pAMgr
&& pAMgr
->get_sugswithdots()) {
1109 for (int j
= 0; j
< ns
; j
++) {
1110 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 1 + abbv
);
1111 strcat((*slst
)[j
], word
+ strlen(word
) - abbv
);
1115 // LANG_hu section: replace '-' with ' ' in Hungarian
1116 if (langnum
== LANG_hu
) {
1117 for (int j
=0; j
< ns
; j
++) {
1118 char * pos
= strchr((*slst
)[j
],'-');
1121 char w
[MAXWORDUTF8LEN
];
1123 strcpy(w
, (*slst
)[j
]);
1125 spell(w
, &info
, NULL
);
1126 if ((info
& SPELL_COMPOUND
) && (info
& SPELL_FORBIDDEN
)) {
1132 // END OF LANG_hu section
1137 int Hunspell::stem(char*** slst
, char ** desc
, int n
)
1139 char result
[MAXLNLEN
];
1140 char result2
[MAXLNLEN
];
1142 if (n
== 0) return 0;
1144 for (int i
= 0; i
< n
; i
++) {
1146 // add compound word parts (except the last one)
1147 char * s
= (char *) desc
[i
];
1148 char * part
= strstr(s
, MORPH_PART
);
1150 char * nextpart
= strstr(part
+ 1, MORPH_PART
);
1152 copy_field(result
+ strlen(result
), part
, MORPH_PART
);
1154 nextpart
= strstr(part
+ 1, MORPH_PART
);
1162 char * alt
= strstr(tok
, " | ");
1165 alt
= strstr(alt
, " | ");
1167 int pln
= line_tok(tok
, &pl
, MSEP_ALT
);
1168 for (int k
= 0; k
< pln
; k
++) {
1169 // add derivational suffixes
1170 if (strstr(pl
[k
], MORPH_DERI_SFX
)) {
1171 // remove inflectional suffixes
1172 char * is
= strstr(pl
[k
], MORPH_INFL_SFX
);
1174 char * sg
= pSMgr
->suggest_gen(&(pl
[k
]), 1, pl
[k
]);
1177 int genl
= line_tok(sg
, &gen
, MSEP_REC
);
1179 for (int j
= 0; j
< genl
; j
++) {
1180 sprintf(result2
+ strlen(result2
), "%c%s%s",
1181 MSEP_REC
, result
, gen
[j
]);
1183 freelist(&gen
, genl
);
1186 sprintf(result2
+ strlen(result2
), "%c%s", MSEP_REC
, result
);
1187 if (strstr(pl
[k
], MORPH_SURF_PFX
)) {
1188 copy_field(result2
+ strlen(result2
), pl
[k
], MORPH_SURF_PFX
);
1190 copy_field(result2
+ strlen(result2
), pl
[k
], MORPH_STEM
);
1195 int sln
= line_tok(result2
, slst
, MSEP_REC
);
1196 return uniqlist(*slst
, sln
);
1200 int Hunspell::stem(char*** slst
, const char * word
)
1203 int pln
= analyze(&pl
, word
);
1204 int pln2
= stem(slst
, pl
, pln
);
1209 #ifdef HUNSPELL_EXPERIMENTAL
1210 int Hunspell::suggest_pos_stems(char*** slst
, const char * word
)
1212 char cw
[MAXWORDUTF8LEN
];
1213 char wspace
[MAXWORDUTF8LEN
];
1214 if (! pSMgr
|| maxdic
== 0) return 0;
1215 int wl
= strlen(word
);
1217 if (wl
>= MAXWORDUTF8LEN
) return 0;
1219 if (wl
>= MAXWORDLEN
) return 0;
1223 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1224 if (wl
== 0) return 0;
1226 int ns
= 0; // ns=0 = normalized input
1228 *slst
= NULL
; // HU, nsug in pSMgr->suggest
1233 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1235 if ((abbv
) && (ns
== 0)) {
1236 memcpy(wspace
,cw
,wl
);
1238 *(wspace
+wl
+1) = '\0';
1239 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1247 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1249 if (ns
== 0 || ((*slst
)[0][0] == '#')) {
1250 memcpy(wspace
,cw
,(wl
+1));
1252 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1260 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1263 memcpy(wspace
,cw
,(wl
+1));
1265 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1269 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1277 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1279 const char * Hunspell::get_wordchars()
1281 return pAMgr
->get_wordchars();
1284 unsigned short * Hunspell::get_wordchars_utf16(int * len
)
1286 return pAMgr
->get_wordchars_utf16(len
);
1289 void Hunspell::mkinitcap(char * p
)
1292 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].cupper
;
1295 w_char u
[MAXWORDLEN
];
1296 len
= u8_u16(u
, MAXWORDLEN
, p
);
1297 unsigned short i
= unicodetoupper((u
[0].h
<< 8) + u
[0].l
, langnum
);
1298 u
[0].h
= (unsigned char) (i
>> 8);
1299 u
[0].l
= (unsigned char) (i
& 0x00FF);
1300 u16_u8(p
, MAXWORDUTF8LEN
, u
, len
);
1304 int Hunspell::mkinitcap2(char * p
, w_char
* u
, int nc
)
1307 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].cupper
;
1308 } else if (nc
> 0) {
1309 unsigned short i
= unicodetoupper((u
[0].h
<< 8) + u
[0].l
, langnum
);
1310 u
[0].h
= (unsigned char) (i
>> 8);
1311 u
[0].l
= (unsigned char) (i
& 0x00FF);
1312 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
1318 int Hunspell::mkinitsmall2(char * p
, w_char
* u
, int nc
)
1321 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].clower
;
1322 } else if (nc
> 0) {
1323 unsigned short i
= unicodetolower((u
[0].h
<< 8) + u
[0].l
, langnum
);
1324 u
[0].h
= (unsigned char) (i
>> 8);
1325 u
[0].l
= (unsigned char) (i
& 0x00FF);
1326 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
1332 int Hunspell::add(const char * word
)
1334 if (pHMgr
[0]) return (pHMgr
[0])->add(word
);
1338 int Hunspell::add_with_affix(const char * word
, const char * example
)
1340 if (pHMgr
[0]) return (pHMgr
[0])->add_with_affix(word
, example
);
1344 int Hunspell::remove(const char * word
)
1346 if (pHMgr
[0]) return (pHMgr
[0])->remove(word
);
1350 const char * Hunspell::get_version()
1352 return pAMgr
->get_version();
1355 struct cs_info
* Hunspell::get_csconv()
1360 void Hunspell::cat_result(char * result
, char * st
)
1363 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1364 mystrcat(result
, st
, MAXLNLEN
);
1369 int Hunspell::analyze(char*** slst
, const char * word
)
1371 char cw
[MAXWORDUTF8LEN
];
1372 char wspace
[MAXWORDUTF8LEN
];
1373 w_char unicw
[MAXWORDLEN
];
1376 if (! pSMgr
|| maxdic
== 0) return 0;
1377 int nc
= strlen(word
);
1379 if (nc
>= MAXWORDUTF8LEN
) return 0;
1381 if (nc
>= MAXWORDLEN
) return 0;
1388 RepList
* rl
= (pAMgr
) ? pAMgr
->get_iconvtable() : NULL
;
1389 if (rl
&& rl
->conv(word
, wspace
)) wl
= cleanword2(cw
, wspace
, unicw
, &nc
, &captype
, &abbv
);
1390 else wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
1394 for (wl
= 0; wl
< abbv
; wl
++) cw
[wl
] = '.';
1400 char result
[MAXLNLEN
];
1410 // LANG_hu section: set dash information for suggestions
1411 if (langnum
== LANG_hu
) {
1413 (((cw
[n
] <= '9') && (cw
[n
] >= '0')) || (((cw
[n
] == '.') || (cw
[n
] == ',')) && (n
> 0)))) {
1415 if ((cw
[n
] == '.') || (cw
[n
] == ',')) {
1416 if (((n2
== 0) && (n
> 3)) ||
1417 ((n2
> 0) && ((cw
[n
-1] == '.') || (cw
[n
-1] == ',')))) break;
1423 if ((n
== wl
) && (n3
> 0) && (n
- n3
> 3)) return 0;
1424 if ((n
== wl
) || ((n
>0) && ((cw
[n
]=='%') || (cw
[n
]=='\xB0')) && checkword(cw
+n
, NULL
, NULL
))) {
1425 mystrcat(result
, cw
, MAXLNLEN
);
1426 result
[n
- 1] = '\0';
1427 if (n
== wl
) cat_result(result
, pSMgr
->suggest_morph(cw
+ n
- 1));
1431 cat_result(result
, pSMgr
->suggest_morph(cw
+ n
- 1));
1432 mystrcat(result
, "+", MAXLNLEN
); // XXX SPEC. MORPHCODE
1434 cat_result(result
, pSMgr
->suggest_morph(cw
+ n
));
1436 return line_tok(result
, slst
, MSEP_REC
);
1439 // END OF LANG_hu section
1445 cat_result(result
, pSMgr
->suggest_morph(cw
));
1447 memcpy(wspace
,cw
,wl
);
1449 *(wspace
+wl
+1) = '\0';
1450 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1455 wl
= mkallsmall2(cw
, unicw
, nc
);
1456 memcpy(wspace
,cw
,(wl
+1));
1457 wl2
= mkinitcap2(cw
, unicw
, nc
);
1458 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1459 cat_result(result
, pSMgr
->suggest_morph(cw
));
1462 *(wspace
+wl
+1) = '\0';
1463 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1465 memcpy(wspace
, cw
, wl2
);
1466 *(wspace
+wl2
) = '.';
1467 *(wspace
+wl2
+1) = '\0';
1469 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1474 cat_result(result
, pSMgr
->suggest_morph(cw
));
1476 memcpy(wspace
,cw
,wl
);
1478 *(wspace
+wl
+1) = '\0';
1479 cat_result(result
, pSMgr
->suggest_morph(cw
));
1481 wl
= mkallsmall2(cw
, unicw
, nc
);
1482 memcpy(wspace
,cw
,(wl
+1));
1483 wl2
= mkinitcap2(cw
, unicw
, nc
);
1485 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1486 cat_result(result
, pSMgr
->suggest_morph(cw
));
1489 *(wspace
+wl
+1) = '\0';
1490 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1492 memcpy(wspace
, cw
, wl2
);
1493 *(wspace
+wl2
) = '.';
1494 *(wspace
+wl2
+1) = '\0';
1496 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1503 // word reversing wrapper for complex prefixes
1504 if (complexprefixes
) {
1505 if (utf8
) reverseword_utf(result
); else reverseword(result
);
1507 return line_tok(result
, slst
, MSEP_REC
);
1511 // compound word with dash (HU) I18n
1514 // LANG_hu section: set dash information for suggestions
1515 if (langnum
== LANG_hu
) dash
= (char *) strchr(cw
,'-');
1516 if ((langnum
== LANG_hu
) && dash
) {
1518 // examine 2 sides of the dash
1519 if (dash
[1] == '\0') { // base word ending with dash
1520 if (spell(cw
)) return line_tok(pSMgr
->suggest_morph(cw
), slst
, MSEP_REC
);
1521 } else if ((dash
[1] == 'e') && (dash
[2] == '\0')) { // XXX (HU) -e hat.
1522 if (spell(cw
) && (spell("-e"))) {
1523 st
= pSMgr
->suggest_morph(cw
);
1525 mystrcat(result
, st
, MAXLNLEN
);
1528 mystrcat(result
,"+", MAXLNLEN
); // XXX spec. separator in MORPHCODE
1529 st
= pSMgr
->suggest_morph("-e");
1531 mystrcat(result
, st
, MAXLNLEN
);
1534 return line_tok(result
, slst
, MSEP_REC
);
1537 // first word ending with dash: word- XXX ???
1538 char r2
= *(dash
+ 1);
1541 nresult
= spell(cw
);
1544 if (nresult
&& spell(dash
+1) && ((strlen(dash
+1) > 1) ||
1545 ((dash
[1] > '0') && (dash
[1] < '9')))) {
1546 st
= pSMgr
->suggest_morph(cw
);
1548 mystrcat(result
, st
, MAXLNLEN
);
1550 mystrcat(result
,"+", MAXLNLEN
); // XXX spec. separator in MORPHCODE
1552 st
= pSMgr
->suggest_morph(dash
+1);
1554 mystrcat(result
, st
, MAXLNLEN
);
1557 return line_tok(result
, slst
, MSEP_REC
);
1560 // affixed number in correct word
1561 if (nresult
&& (dash
> cw
) && (((*(dash
-1)<='9') &&
1562 (*(dash
-1)>='0')) || (*(dash
-1)=='.'))) {
1565 if (*(dash
- n
) == '.') n
++;
1566 // search first not a number character to left from dash
1567 while (((dash
- n
)>=cw
) && ((*(dash
- n
)=='0') || (n
< 3)) && (n
< 6)) {
1570 if ((dash
- n
) < cw
) n
--;
1571 // numbers: valami1000000-hoz
1572 // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
1574 for(; n
>= 1; n
--) {
1575 if ((*(dash
- n
) >= '0') && (*(dash
- n
) <= '9') && checkword(dash
- n
, NULL
, NULL
)) {
1576 mystrcat(result
, cw
, MAXLNLEN
);
1577 result
[dash
- cw
- n
] = '\0';
1578 st
= pSMgr
->suggest_morph(dash
- n
);
1580 mystrcat(result
, st
, MAXLNLEN
);
1583 return line_tok(result
, slst
, MSEP_REC
);
1591 int Hunspell::generate(char*** slst
, const char * word
, char ** pl
, int pln
)
1594 if (!pSMgr
|| !pln
) return 0;
1596 int pl2n
= analyze(&pl2
, word
);
1599 char cw
[MAXWORDUTF8LEN
];
1600 cleanword(cw
, word
, &captype
, &abbv
);
1601 char result
[MAXLNLEN
];
1604 for (int i
= 0; i
< pln
; i
++) {
1605 cat_result(result
, pSMgr
->suggest_gen(pl2
, pl2n
, pl
[i
]));
1607 freelist(&pl2
, pl2n
);
1611 if (captype
== ALLCAP
) mkallcap(result
);
1614 int linenum
= line_tok(result
, slst
, MSEP_REC
);
1617 if (captype
== INITCAP
|| captype
== HUHINITCAP
) {
1618 for (int j
=0; j
< linenum
; j
++) mkinitcap((*slst
)[j
]);
1621 // temporary filtering of prefix related errors (eg.
1622 // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
1625 for (int j
=0; j
< linenum
; j
++) {
1626 if (!spell((*slst
)[j
])) {
1630 if (r
< j
) (*slst
)[r
] = (*slst
)[j
];
1634 if (r
> 0) return r
;
1641 int Hunspell::generate(char*** slst
, const char * word
, const char * pattern
)
1644 int pln
= analyze(&pl
, pattern
);
1645 int n
= generate(slst
, word
, pl
, pln
);
1647 return uniqlist(*slst
, n
);
1650 // minimal XML parser functions
1651 int Hunspell::get_xml_par(char * dest
, const char * par
, int max
)
1656 char * dmax
= dest
+ max
;
1657 if (end
== '>') end
= '<';
1658 else if (end
!= '\'' && end
!= '"') return 0; // bad XML
1659 for (par
++; d
< dmax
&& *par
!= '\0' && *par
!= end
; par
++, d
++) *d
= *par
;
1661 mystrrep(dest
, "<", "<");
1662 mystrrep(dest
, "&", "&");
1666 // return the beginning of the element (attr == NULL) or the attribute
1667 const char * Hunspell::get_xml_pos(const char * s
, const char * attr
)
1669 const char * end
= strchr(s
, '>');
1671 if (attr
== NULL
) return end
;
1673 p
= strstr(p
, attr
);
1674 if (!p
|| p
>= end
) return 0;
1675 } while (*(p
-1) != ' ' && *(p
-1) != '\n');
1676 return p
+ strlen(attr
);
1679 int Hunspell::check_xml_par(const char * q
, const char * attr
, const char * value
) {
1680 char cw
[MAXWORDUTF8LEN
];
1681 if (get_xml_par(cw
, get_xml_pos(q
, attr
), MAXWORDUTF8LEN
- 1) &&
1682 strcmp(cw
, value
) == 0) return 1;
1686 int Hunspell::get_xml_list(char ***slst
, char * list
, const char * tag
) {
1689 if (!list
) return 0;
1690 for (p
= list
; (p
= strstr(p
, tag
)); p
++) n
++;
1691 if (n
== 0) return 0;
1692 *slst
= (char **) malloc(sizeof(char *) * n
);
1693 if (!*slst
) return 0;
1694 for (p
= list
, n
= 0; (p
= strstr(p
, tag
)); p
++, n
++) {
1696 (*slst
)[n
] = (char *) malloc(l
);
1697 if (!(*slst
)[n
]) return (n
> 0 ? n
- 1 : 0);
1698 get_xml_par((*slst
)[n
], p
+ strlen(tag
) - 1, l
);
1703 int Hunspell::spellml(char*** slst
, const char * word
)
1706 char cw
[MAXWORDUTF8LEN
], cw2
[MAXWORDUTF8LEN
];
1707 q
= (char *) strstr(word
, "<query");
1708 if (!q
) return 0; // bad XML input
1709 q2
= strchr(q
, '>');
1710 if (!q2
) return 0; // bad XML input
1711 q2
= strstr(q2
, "<word");
1712 if (!q2
) return 0; // bad XML input
1713 if (check_xml_par(q
, "type=", "analyze")) {
1715 if (get_xml_par(cw
, strchr(q2
, '>'), MAXWORDUTF8LEN
)) n
= analyze(slst
, cw
);
1716 if (n
== 0) return 0;
1717 // convert the result to <code><a>ana1</a><a>ana2</a></code> format
1718 for (int i
= 0; i
< n
; i
++) s
+= strlen((*slst
)[i
]);
1719 char * r
= (char *) malloc(6 + 5 * s
+ 7 * n
+ 7 + 1); // XXX 5*s->&->&
1721 strcpy(r
, "<code>");
1722 for (int i
= 0; i
< n
; i
++) {
1724 strcpy(r
+ l
, "<a>");
1725 strcpy(r
+ l
+ 3, (*slst
)[i
]);
1726 mystrrep(r
+ l
+ 3, "\t", " ");
1727 mystrrep(r
+ l
+ 3, "<", "<");
1728 mystrrep(r
+ l
+ 3, "&", "&");
1732 strcat(r
, "</code>");
1735 } else if (check_xml_par(q
, "type=", "stem")) {
1736 if (get_xml_par(cw
, strchr(q2
, '>'), MAXWORDUTF8LEN
)) return stem(slst
, cw
);
1737 } else if (check_xml_par(q
, "type=", "generate")) {
1738 int n
= get_xml_par(cw
, strchr(q2
, '>'), MAXWORDUTF8LEN
);
1739 if (n
== 0) return 0;
1740 char * q3
= strstr(q2
+ 1, "<word");
1742 if (get_xml_par(cw2
, strchr(q3
, '>'), MAXWORDUTF8LEN
)) {
1743 return generate(slst
, cw
, cw2
);
1747 if ((q2
= strstr(q2
+ 1, "<code")) &&
1748 (n
= get_xml_list(&slst2
, strchr(q2
, '>'), "<a>"))) {
1749 int n2
= generate(slst
, cw
, slst2
, n
);
1750 freelist(&slst2
, n
);
1751 return uniqlist(*slst
, n2
);
1759 #ifdef HUNSPELL_EXPERIMENTAL
1760 // XXX need UTF-8 support
// Collect morphological suggestions for a (possibly misspelled) word.
// This function sits between #ifdef HUNSPELL_EXPERIMENTAL and its #endif.
//
// Visible behaviour:
//  - returns NULL when pSMgr is not set or maxdic is 0, when strlen(word)
//    reaches MAXWORDUTF8LEN / MAXWORDLEN (which limit applies is decided by
//    a condition not shown in this listing -- presumably utf8; TODO confirm),
//    or when cleanword() reports length 0;
//  - appends the strings returned by
//    pSMgr->suggest_morph_for_spelling_error() for cw and for variants built
//    in wspace to result, separated by "\n", via mystrcat(..., MAXLNLEN);
//  - returns mystrdup(result) when result is non-empty.
// NOTE(review): this listing omits many lines of the function (branch
// conditions, braces, the final return); verify details against the full
// source before changing it.
1761 char * Hunspell::morph_with_correction(const char * word
)
1763 char cw
[MAXWORDUTF8LEN
];
1764 char wspace
[MAXWORDUTF8LEN
];
// nothing to do without a suggestion manager or a loaded dictionary
1765 if (! pSMgr
|| maxdic
== 0) return NULL
;
1766 int wl
= strlen(word
);
1768 if (wl
>= MAXWORDUTF8LEN
) return NULL
;
1770 if (wl
>= MAXWORDLEN
) return NULL
;
1774 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1775 if (wl
== 0) return NULL
;
1777 char result
[MAXLNLEN
];
1785 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1787 mystrcat(result
, st
, MAXLNLEN
);
1791 memcpy(wspace
,cw
,wl
);
1793 *(wspace
+wl
+1) = '\0';
1794 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1796 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1797 mystrcat(result
, st
, MAXLNLEN
);
1804 memcpy(wspace
,cw
,(wl
+1));
1806 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1808 mystrcat(result
, st
, MAXLNLEN
);
1811 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1813 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1814 mystrcat(result
, st
, MAXLNLEN
);
1818 memcpy(wspace
,cw
,wl
);
1820 *(wspace
+wl
+1) = '\0';
1822 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1824 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1825 mystrcat(result
, st
, MAXLNLEN
);
1829 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1831 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1832 mystrcat(result
, st
, MAXLNLEN
);
1839 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1841 mystrcat(result
, st
, MAXLNLEN
);
1844 memcpy(wspace
,cw
,(wl
+1));
1846 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1848 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1849 mystrcat(result
, st
, MAXLNLEN
);
1855 memcpy(wspace
,cw
,(wl
+1));
1856 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1858 mystrcat(result
, st
, MAXLNLEN
);
1862 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1864 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1865 mystrcat(result
, st
, MAXLNLEN
);
1869 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1871 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1872 mystrcat(result
, st
, MAXLNLEN
);
1876 memcpy(wspace
,cw
,(wl
+1));
1878 *(wspace
+wl
+1) = '\0';
1879 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1880 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1882 mystrcat(result
, st
, MAXLNLEN
);
1886 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1888 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1889 mystrcat(result
, st
, MAXLNLEN
);
1893 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1895 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1896 mystrcat(result
, st
, MAXLNLEN
);
// hand the caller a heap copy of the accumulated text, if there is any
1904 if (*result
) return mystrdup(result
);
1908 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1910 Hunhandle
*Hunspell_create(const char * affpath
, const char * dpath
)
1912 return (Hunhandle
*)(new Hunspell(affpath
, dpath
));
1915 Hunhandle
*Hunspell_create_key(const char * affpath
, const char * dpath
,
1918 return (Hunhandle
*)(new Hunspell(affpath
, dpath
, key
));
1921 void Hunspell_destroy(Hunhandle
*pHunspell
)
1923 delete (Hunspell
*)(pHunspell
);
1926 int Hunspell_spell(Hunhandle
*pHunspell
, const char *word
)
1928 return ((Hunspell
*)pHunspell
)->spell(word
);
1931 char *Hunspell_get_dic_encoding(Hunhandle
*pHunspell
)
1933 return ((Hunspell
*)pHunspell
)->get_dic_encoding();
1936 int Hunspell_suggest(Hunhandle
*pHunspell
, char*** slst
, const char * word
)
1938 return ((Hunspell
*)pHunspell
)->suggest(slst
, word
);
1941 int Hunspell_analyze(Hunhandle
*pHunspell
, char*** slst
, const char * word
)
1943 return ((Hunspell
*)pHunspell
)->analyze(slst
, word
);
1946 int Hunspell_stem(Hunhandle
*pHunspell
, char*** slst
, const char * word
)
1948 return ((Hunspell
*)pHunspell
)->stem(slst
, word
);
1951 int Hunspell_stem(Hunhandle
*pHunspell
, char*** slst
, char** desc
, int n
)
1953 return ((Hunspell
*)pHunspell
)->stem(slst
, desc
, n
);
1956 int Hunspell_generate(Hunhandle
*pHunspell
, char*** slst
, const char * word
,
1959 return ((Hunspell
*)pHunspell
)->generate(slst
, word
, word2
);
1962 int Hunspell_generate(Hunhandle
*pHunspell
, char*** slst
, const char * word
,
1965 return ((Hunspell
*)pHunspell
)->generate(slst
, word
, desc
, n
);
1968 /* functions for run-time modification of the dictionary */
1970 /* add word to the run-time dictionary */
1972 int Hunspell_add(Hunhandle
*pHunspell
, const char * word
) {
1973 return ((Hunspell
*)pHunspell
)->add(word
);
/* add word to the run-time dictionary with the affix flags of
 * the example (a dictionary word): Hunspell will recognize
 * affixed forms of the new word, too.
 */
1981 int Hunspell_add_with_affix(Hunhandle
*pHunspell
, const char * word
,
1982 const char * example
) {
1983 return ((Hunspell
*)pHunspell
)->add_with_affix(word
, example
);
1986 /* remove word from the run-time dictionary */
1988 int Hunspell_remove(Hunhandle
*pHunspell
, const char * word
) {
1989 return ((Hunspell
*)pHunspell
)->remove(word
);
1992 void Hunspell_free_list(Hunhandle
*pHunspell
, char *** slst
, int n
) {