Bug 470455 - test_database_sync_embed_visits.js leaks, r=sdwilsh
[wine-gecko.git] / extensions / spellcheck / hunspell / src / hunspell.cpp
blob4f9b46f5bfa40b1bcc5fb4292ef0229b86be4806
1 /******* BEGIN LICENSE BLOCK *******
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 * and László Németh (Hunspell). Portions created by the Initial Developers
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 * David Einstein (deinst@world.std.com)
20 * László Németh (nemethl@gyorsposta.hu)
21 * Davide Prina
22 * Giuseppe Modugno
23 * Gianluca Turconi
24 * Simon Brouwer
25 * Noll Janos
26 * Biro Arpad
27 * Goldman Eleonora
28 * Sarlos Tamas
29 * Bencsath Boldizsar
30 * Halacsy Peter
31 * Dvornik Laszlo
32 * Gefferth Andras
33 * Nagy Viktor
34 * Varga Daniel
35 * Chris Halls
36 * Rene Engelhard
37 * Bram Moolenaar
38 * Dafydd Jones
39 * Harri Pitkanen
40 * Andras Timar
41 * Tor Lillqvist
43 * Alternatively, the contents of this file may be used under the terms of
44 * either the GNU General Public License Version 2 or later (the "GPL"), or
45 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
46 * in which case the provisions of the GPL or the LGPL are applicable instead
47 * of those above. If you wish to allow use of your version of this file only
48 * under the terms of either the GPL or the LGPL, and not to allow others to
49 * use your version of this file under the terms of the MPL, indicate your
50 * decision by deleting the provisions above and replace them with the notice
51 * and other provisions required by the GPL or the LGPL. If you do not delete
52 * the provisions above, a recipient may use your version of this file under
53 * the terms of any one of the MPL, the GPL or the LGPL.
55 ******* END LICENSE BLOCK *******/
57 #ifndef MOZILLA_CLIENT
58 #include <cstdlib>
59 #include <cstring>
60 #include <cstdio>
61 #else
62 #include <stdlib.h>
63 #include <string.h>
64 #include <stdio.h>
65 #endif
67 #include "csutil.hxx"
68 #include "hunspell.h"
69 #include "hunspell.hxx"
71 #ifndef MOZILLA_CLIENT
72 #ifndef W32
73 using namespace std;
74 #endif
75 #endif
77 Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key)
79 encoding = NULL;
80 csconv = NULL;
81 utf8 = 0;
82 complexprefixes = 0;
83 affixpath = mystrdup(affpath);
84 maxdic = 0;
86 /* first set up the hash manager */
87 pHMgr[0] = new HashMgr(dpath, affpath, key);
88 if (pHMgr[0]) maxdic = 1;
90 /* next set up the affix manager */
91 /* it needs access to the hash manager lookup methods */
92 pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key);
94 /* get the preferred try string and the dictionary */
95 /* encoding from the Affix Manager for that dictionary */
96 char * try_string = pAMgr->get_try_string();
97 encoding = pAMgr->get_encoding();
98 csconv = get_current_cs(encoding);
99 langnum = pAMgr->get_langnum();
100 utf8 = pAMgr->get_utf8();
101 complexprefixes = pAMgr->get_complexprefixes();
102 wordbreak = pAMgr->get_breaktable();
104 /* and finally set up the suggestion manager */
105 pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
106 if (try_string) free(try_string);
109 Hunspell::~Hunspell()
111 if (pSMgr) delete pSMgr;
112 if (pAMgr) delete pAMgr;
113 for (int i = 0; i < maxdic; i++) delete pHMgr[i];
114 maxdic = 0;
115 pSMgr = NULL;
116 pAMgr = NULL;
117 #ifdef MOZILLA_CLIENT
118 free(csconv);
119 #endif
120 csconv= NULL;
121 if (encoding) free(encoding);
122 encoding = NULL;
123 if (affixpath) free(affixpath);
124 affixpath = NULL;
127 // load extra dictionaries
128 int Hunspell::add_dic(const char * dpath, const char * key) {
129 if (maxdic == MAXDIC || !affixpath) return 1;
130 pHMgr[maxdic] = new HashMgr(dpath, affixpath, key);
131 if (pHMgr[maxdic]) maxdic++; else return 1;
132 return 0;
135 // make a copy of src at destination while removing all leading
136 // blanks and removing any trailing periods after recording
137 // their presence with the abbreviation flag
138 // also since already going through character by character,
139 // set the capitalization type
140 // return the length of the "cleaned" (and UTF-8 encoded) word
142 int Hunspell::cleanword2(char * dest, const char * src,
143 w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev)
145 unsigned char * p = (unsigned char *) dest;
146 const unsigned char * q = (const unsigned char * ) src;
148 // first skip over any leading blanks
149 while ((*q != '\0') && (*q == ' ')) q++;
151 // now strip off any trailing periods (recording their presence)
152 *pabbrev = 0;
153 int nl = strlen((const char *)q);
154 while ((nl > 0) && (*(q+nl-1)=='.')) {
155 nl--;
156 (*pabbrev)++;
159 // if no characters are left it can't be capitalized
160 if (nl <= 0) {
161 *pcaptype = NOCAP;
162 *p = '\0';
163 return 0;
166 strncpy(dest, (char *) q, nl);
167 *(dest + nl) = '\0';
168 nl = strlen(dest);
169 if (utf8) {
170 *nc = u8_u16(dest_utf, MAXWORDLEN, dest);
171 // don't check too long words
172 if (*nc >= MAXWORDLEN) return 0;
173 if (*nc == -1) { // big Unicode character (non BMP area)
174 *pcaptype = NOCAP;
175 return nl;
177 *pcaptype = get_captype_utf8(dest_utf, *nc, langnum);
178 } else {
179 *pcaptype = get_captype(dest, nl, csconv);
180 *nc = nl;
182 return nl;
185 int Hunspell::cleanword(char * dest, const char * src,
186 int * pcaptype, int * pabbrev)
188 unsigned char * p = (unsigned char *) dest;
189 const unsigned char * q = (const unsigned char * ) src;
190 int firstcap = 0;
192 // first skip over any leading blanks
193 while ((*q != '\0') && (*q == ' ')) q++;
195 // now strip off any trailing periods (recording their presence)
196 *pabbrev = 0;
197 int nl = strlen((const char *)q);
198 while ((nl > 0) && (*(q+nl-1)=='.')) {
199 nl--;
200 (*pabbrev)++;
203 // if no characters are left it can't be capitalized
204 if (nl <= 0) {
205 *pcaptype = NOCAP;
206 *p = '\0';
207 return 0;
210 // now determine the capitalization type of the first nl letters
211 int ncap = 0;
212 int nneutral = 0;
213 int nc = 0;
215 if (!utf8) {
216 while (nl > 0) {
217 nc++;
218 if (csconv[(*q)].ccase) ncap++;
219 if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
220 *p++ = *q++;
221 nl--;
223 // remember to terminate the destination string
224 *p = '\0';
225 firstcap = csconv[(unsigned char)(*dest)].ccase;
226 } else {
227 unsigned short idx;
228 w_char t[MAXWORDLEN];
229 nc = u8_u16(t, MAXWORDLEN, src);
230 for (int i = 0; i < nc; i++) {
231 idx = (t[i].h << 8) + t[i].l;
232 unsigned short low = unicodetolower(idx, langnum);
233 if (idx != low) ncap++;
234 if (unicodetoupper(idx, langnum) == low) nneutral++;
236 u16_u8(dest, MAXWORDUTF8LEN, t, nc);
237 if (ncap) {
238 idx = (t[0].h << 8) + t[0].l;
239 firstcap = (idx != unicodetolower(idx, langnum));
243 // now finally set the captype
244 if (ncap == 0) {
245 *pcaptype = NOCAP;
246 } else if ((ncap == 1) && firstcap) {
247 *pcaptype = INITCAP;
248 } else if ((ncap == nc) || ((ncap + nneutral) == nc)){
249 *pcaptype = ALLCAP;
250 } else if ((ncap > 1) && firstcap) {
251 *pcaptype = HUHINITCAP;
252 } else {
253 *pcaptype = HUHCAP;
255 return strlen(dest);
258 void Hunspell::mkallcap(char * p)
260 if (utf8) {
261 w_char u[MAXWORDLEN];
262 int nc = u8_u16(u, MAXWORDLEN, p);
263 unsigned short idx;
264 for (int i = 0; i < nc; i++) {
265 idx = (u[i].h << 8) + u[i].l;
266 if (idx != unicodetoupper(idx, langnum)) {
267 u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8);
268 u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF);
271 u16_u8(p, MAXWORDUTF8LEN, u, nc);
272 } else {
273 while (*p != '\0') {
274 *p = csconv[((unsigned char) *p)].cupper;
275 p++;
280 int Hunspell::mkallcap2(char * p, w_char * u, int nc)
282 if (utf8) {
283 unsigned short idx;
284 for (int i = 0; i < nc; i++) {
285 idx = (u[i].h << 8) + u[i].l;
286 unsigned short up = unicodetoupper(idx, langnum);
287 if (idx != up) {
288 u[i].h = (unsigned char) (up >> 8);
289 u[i].l = (unsigned char) (up & 0x00FF);
292 u16_u8(p, MAXWORDUTF8LEN, u, nc);
293 return strlen(p);
294 } else {
295 while (*p != '\0') {
296 *p = csconv[((unsigned char) *p)].cupper;
297 p++;
300 return nc;
304 void Hunspell::mkallsmall(char * p)
306 while (*p != '\0') {
307 *p = csconv[((unsigned char) *p)].clower;
308 p++;
312 int Hunspell::mkallsmall2(char * p, w_char * u, int nc)
314 if (utf8) {
315 unsigned short idx;
316 for (int i = 0; i < nc; i++) {
317 idx = (u[i].h << 8) + u[i].l;
318 unsigned short low = unicodetolower(idx, langnum);
319 if (idx != low) {
320 u[i].h = (unsigned char) (low >> 8);
321 u[i].l = (unsigned char) (low & 0x00FF);
324 u16_u8(p, MAXWORDUTF8LEN, u, nc);
325 return strlen(p);
326 } else {
327 while (*p != '\0') {
328 *p = csconv[((unsigned char) *p)].clower;
329 p++;
332 return nc;
335 // convert UTF-8 sharp S codes to latin 1
336 char * Hunspell::sharps_u8_l1(char * dest, char * source) {
337 char * p = dest;
338 *p = *source;
339 for (p++, source++; *(source - 1); p++, source++) {
340 *p = *source;
341 if (*source == '\x9F') *--p = '\xDF';
343 return dest;
346 // recursive search for right ss - sharp s permutations
347 hentry * Hunspell::spellsharps(char * base, char * pos, int n,
348 int repnum, char * tmp, int * info, char **root) {
349 pos = strstr(pos, "ss");
350 if (pos && (n < MAXSHARPS)) {
351 *pos = '\xC3';
352 *(pos + 1) = '\x9F';
353 hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root);
354 if (h) return h;
355 *pos = 's';
356 *(pos + 1) = 's';
357 h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root);
358 if (h) return h;
359 } else if (repnum > 0) {
360 if (utf8) return checkword(base, info, root);
361 return checkword(sharps_u8_l1(tmp, base), info, root);
363 return NULL;
366 int Hunspell::is_keepcase(const hentry * rv) {
367 return pAMgr && rv->astr && pAMgr->get_keepcase() &&
368 TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen);
371 /* insert a word to the beginning of the suggestion array and return ns */
372 int Hunspell::insert_sug(char ***slst, char * word, int ns) {
373 char * dup = mystrdup(word);
374 if (!dup) return ns;
375 if (ns == MAXSUGGESTION) {
376 ns--;
377 free((*slst)[ns]);
379 for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
380 (*slst)[0] = dup;
381 return ns + 1;
384 int Hunspell::spell(const char * word, int * info, char ** root)
386 struct hentry * rv=NULL;
387 // need larger vector. For example, Turkish capital letter I converted a
388 // 2-byte UTF-8 character (dotless i) by mkallsmall.
389 char cw[MAXWORDUTF8LEN];
390 char wspace[MAXWORDUTF8LEN];
391 w_char unicw[MAXWORDLEN];
392 // Hunspell supports XML input of the simplified API (see manual)
393 if (strcmp(word, SPELL_XML) == 0) return 1;
394 int nc = strlen(word);
395 int wl2 = 0;
396 if (utf8) {
397 if (nc >= MAXWORDUTF8LEN) return 0;
398 } else {
399 if (nc >= MAXWORDLEN) return 0;
401 int captype = 0;
402 int abbv = 0;
403 int wl = 0;
405 // input conversion
406 RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
407 if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
408 else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
410 int info2 = 0;
411 if (wl == 0 || maxdic == 0) return 1;
412 if (root) *root = NULL;
414 // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.)
415 enum { NBEGIN, NNUM, NSEP };
416 int nstate = NBEGIN;
417 int i;
419 for (i = 0; (i < wl); i++) {
420 if ((cw[i] <= '9') && (cw[i] >= '0')) {
421 nstate = NNUM;
422 } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) {
423 if ((nstate == NSEP) || (i == 0)) break;
424 nstate = NSEP;
425 } else break;
427 if ((i == wl) && (nstate == NNUM)) return 1;
428 if (!info) info = &info2; else *info = 0;
430 switch(captype) {
431 case HUHCAP:
432 case HUHINITCAP:
433 case NOCAP: {
434 rv = checkword(cw, info, root);
435 if ((abbv) && !(rv)) {
436 memcpy(wspace,cw,wl);
437 *(wspace+wl) = '.';
438 *(wspace+wl+1) = '\0';
439 rv = checkword(wspace, info, root);
441 break;
443 case ALLCAP: {
444 rv = checkword(cw, info, root);
445 if (rv) break;
446 if (abbv) {
447 memcpy(wspace,cw,wl);
448 *(wspace+wl) = '.';
449 *(wspace+wl+1) = '\0';
450 rv = checkword(wspace, info, root);
451 if (rv) break;
453 // Spec. prefix handling for Catalan, French, Italian:
454 // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
455 if (pAMgr && strchr(cw, '\'')) {
456 wl = mkallsmall2(cw, unicw, nc);
457 char * apostrophe = strchr(cw, '\'');
458 if (utf8) {
459 w_char tmpword[MAXWORDLEN];
460 *apostrophe = '\0';
461 wl2 = u8_u16(tmpword, MAXWORDLEN, cw);
462 *apostrophe = '\'';
463 if (wl2 < nc) {
464 mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1);
465 rv = checkword(cw, info, root);
466 if (rv) break;
468 } else {
469 mkinitcap2(apostrophe + 1, unicw, nc);
470 rv = checkword(cw, info, root);
471 if (rv) break;
473 mkinitcap2(cw, unicw, nc);
474 rv = checkword(cw, info, root);
475 if (rv) break;
477 if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
478 char tmpword[MAXWORDUTF8LEN];
479 wl = mkallsmall2(cw, unicw, nc);
480 memcpy(wspace,cw,(wl+1));
481 rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
482 if (!rv) {
483 wl2 = mkinitcap2(cw, unicw, nc);
484 rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
486 if ((abbv) && !(rv)) {
487 *(wspace+wl) = '.';
488 *(wspace+wl+1) = '\0';
489 rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
490 if (!rv) {
491 memcpy(wspace, cw, wl2);
492 *(wspace+wl2) = '.';
493 *(wspace+wl2+1) = '\0';
494 rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
497 if (rv) break;
500 case INITCAP: {
501 wl = mkallsmall2(cw, unicw, nc);
502 memcpy(wspace,cw,(wl+1));
503 wl2 = mkinitcap2(cw, unicw, nc);
504 if (captype == INITCAP) *info += SPELL_INITCAP;
505 rv = checkword(cw, info, root);
506 if (captype == INITCAP) *info -= SPELL_INITCAP;
507 // forbid bad capitalization
508 // (for example, ijs -> Ijs instead of IJs in Dutch)
509 // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
510 if (*info & SPELL_FORBIDDEN) {
511 rv = NULL;
512 break;
514 if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
515 if (rv) break;
517 rv = checkword(wspace, info, root);
518 if (abbv && !rv) {
520 *(wspace+wl) = '.';
521 *(wspace+wl+1) = '\0';
522 rv = checkword(wspace, info, root);
523 if (!rv) {
524 memcpy(wspace, cw, wl2);
525 *(wspace+wl2) = '.';
526 *(wspace+wl2+1) = '\0';
527 if (captype == INITCAP) *info += SPELL_INITCAP;
528 rv = checkword(wspace, info, root);
529 if (captype == INITCAP) *info -= SPELL_INITCAP;
530 if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
531 break;
534 if (rv && is_keepcase(rv) &&
535 ((captype == ALLCAP) ||
536 // if CHECKSHARPS: KEEPCASE words with \xDF are allowed
537 // in INITCAP form, too.
538 !(pAMgr->get_checksharps() &&
539 ((utf8 && strstr(wspace, "\xC3\x9F")) ||
540 (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL;
541 break;
545 if (rv) return 1;
547 // recursive breaking at break points
548 if (wordbreak) {
549 char * s;
550 char r;
551 int corr = 0;
552 wl = strlen(cw);
553 int numbreak = pAMgr ? pAMgr->get_numbreak() : 0;
554 // check boundary patterns (^begin and end$)
555 for (int j = 0; j < numbreak; j++) {
556 int plen = strlen(wordbreak[j]);
557 if (plen == 1 || plen > wl) continue;
558 if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0
559 && spell(cw + plen - 1)) return 1;
560 if (wordbreak[j][plen - 1] == '$' &&
561 strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) {
562 r = cw[wl - plen + 1];
563 cw[wl - plen + 1] = '\0';
564 if (spell(cw)) return 1;
565 cw[wl - plen + 1] = r;
568 // other patterns
569 for (int j = 0; j < numbreak; j++) {
570 int result = 0;
571 int plen = strlen(wordbreak[j]);
572 s=(char *) strstr(cw, wordbreak[j]);
573 if (s && (s > cw) && (s < cw + wl - plen)) {
574 if (!spell(s + plen)) continue;
575 r = *s;
576 *s = '\0';
577 // examine 2 sides of the break point
578 if (spell(cw)) return 1;
579 *s = r;
581 // LANG_hu: spec. dash rule
582 if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) {
583 r = s[1];
584 s[1] = '\0';
585 if (spell(cw)) return 1; // check the first part with dash
586 s[1] = r;
588 // end of LANG speficic region
594 return 0;
// Look up the word w: first in the hash tables of the loaded dictionaries,
// then through affix stripping and finally through compound checking.
//   info - optional; SPELL_FORBIDDEN and SPELL_COMPOUND are added to *info
//   root - optional; receives a mystrdup() copy of the word of the matching
//          entry when the match comes from the affix or compound check
// Returns the matching entry, or NULL when the word is unknown or forbidden.
// NOTE(review): brace-only lines are absent from this listing, so the exact
// extent of the dictionary loop below should be confirmed against the
// original file before this function is restructured.
597 struct hentry * Hunspell::checkword(const char * w, int * info, char ** root)
599 struct hentry * he = NULL;
600 int len, i;
601 char w2[MAXWORDUTF8LEN];
602 const char * word;
// when the affix manager reports characters to ignore, work on a private
// copy (w2) with those characters removed
604 char * ignoredchars = pAMgr->get_ignore();
605 if (ignoredchars != NULL) {
606 strcpy(w2, w);
607 if (utf8) {
608 int ignoredchars_utf16_len;
609 unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
610 remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
611 } else {
612 remove_ignored_chars(w2,ignoredchars);
614 word = w2;
615 } else word = w;
617 // word reversing wrapper for complex prefixes
618 if (complexprefixes) {
// make sure the reversal happens on the private copy, never on w itself
619 if (word != w2) {
620 strcpy(w2, word);
621 word = w2;
623 if (utf8) reverseword_utf(w2); else reverseword(w2);
626 // look word in hash table
627 for (i = 0; (i < maxdic) && !he; i ++) {
628 he = (pHMgr[i])->lookup(word);
630 // check forbidden and onlyincompound words
631 if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
632 if (info) *info += SPELL_FORBIDDEN;
633 // LANG_hu section: set dash information for suggestions
634 if (langnum == LANG_hu) {
635 if (pAMgr->get_compoundflag() &&
636 TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
637 if (info) *info += SPELL_COMPOUND;
640 return NULL;
643 // he = next not needaffix, onlyincompound homonym or onlyupcase word
644 while (he && (he->astr) &&
645 ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
646 (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
647 (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))
648 )) he = he->next_homonym;
651 // check with affixes
652 if (!he && pAMgr) {
653 // try stripping off affixes */
654 len = strlen(word);
655 he = pAMgr->affix_check(word, len, 0);
657 // check compound restriction and onlyupcase
658 if (he && he->astr && (
659 (pAMgr->get_onlyincompound() &&
660 TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
661 (info && (*info & SPELL_INITCAP) &&
662 TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
663 he = NULL;
666 if (he) {
// an affixed form of a forbidden word is rejected as well
667 if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
668 if (info) *info += SPELL_FORBIDDEN;
669 return NULL;
671 if (root) {
672 *root = mystrdup(&(he->word));
// the stored word is reversed back for complex-prefix dictionaries
673 if (*root && complexprefixes) {
674 if (utf8) reverseword_utf(*root); else reverseword(*root);
677 // try check compound word
678 } else if (pAMgr->get_compound()) {
679 he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0);
680 // LANG_hu section: `moving rule' with last dash
681 if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) {
// retry the compound check on a copy without the trailing dash
682 char * dup = mystrdup(word);
683 if (!dup) return NULL;
684 dup[len-1] = '\0';
685 he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0);
686 free(dup);
688 // end of LANG specific region
689 if (he) {
690 if (root) {
691 *root = mystrdup(&(he->word));
692 if (*root && complexprefixes) {
693 if (utf8) reverseword_utf(*root); else reverseword(*root);
696 if (info) *info += SPELL_COMPOUND;
702 return he;
705 int Hunspell::suggest(char*** slst, const char * word)
707 int onlycmpdsug = 0;
708 char cw[MAXWORDUTF8LEN];
709 char wspace[MAXWORDUTF8LEN];
710 if (!pSMgr || maxdic == 0) return 0;
711 w_char unicw[MAXWORDLEN];
712 *slst = NULL;
713 // process XML input of the simplified API (see manual)
714 if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
715 return spellml(slst, word);
717 int nc = strlen(word);
718 if (utf8) {
719 if (nc >= MAXWORDUTF8LEN) return 0;
720 } else {
721 if (nc >= MAXWORDLEN) return 0;
723 int captype = 0;
724 int abbv = 0;
725 int wl = 0;
727 // input conversion
728 RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
729 if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
730 else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
732 if (wl == 0) return 0;
733 int ns = 0;
734 int capwords = 0;
736 switch(captype) {
737 case NOCAP: {
738 ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
739 break;
742 case INITCAP: {
743 capwords = 1;
744 ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
745 if (ns == -1) break;
746 memcpy(wspace,cw,(wl+1));
747 mkallsmall2(wspace, unicw, nc);
748 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
749 break;
751 case HUHINITCAP:
752 capwords = 1;
753 case HUHCAP: {
754 ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
755 if (ns != -1) {
756 int prevns;
757 // something.The -> something. The
758 char * dot = strchr(cw, '.');
759 if (dot && (dot > cw)) {
760 int captype_;
761 if (utf8) {
762 w_char w_[MAXWORDLEN];
763 int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1);
764 captype_ = get_captype_utf8(w_, wl_, langnum);
765 } else captype_ = get_captype(dot+1, strlen(dot+1), csconv);
766 if (captype_ == INITCAP) {
767 char * st = mystrdup(cw);
768 if (st) st = (char *) realloc(st, wl + 2);
769 if (st) {
770 st[(dot - cw) + 1] = ' ';
771 strcpy(st + (dot - cw) + 2, dot + 1);
772 ns = insert_sug(slst, st, ns);
773 free(st);
777 if (captype == HUHINITCAP) {
778 // TheOpenOffice.org -> The OpenOffice.org
779 memcpy(wspace,cw,(wl+1));
780 mkinitsmall2(wspace, unicw, nc);
781 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
783 memcpy(wspace,cw,(wl+1));
784 mkallsmall2(wspace, unicw, nc);
785 if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
786 prevns = ns;
787 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
788 if (captype == HUHINITCAP) {
789 mkinitcap2(wspace, unicw, nc);
790 if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
791 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
793 // aNew -> "a New" (instead of "a new")
794 for (int j = prevns; j < ns; j++) {
795 char * space = strchr((*slst)[j],' ');
796 if (space) {
797 int slen = strlen(space + 1);
798 // different case after space (need capitalisation)
799 if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) {
800 w_char w[MAXWORDLEN];
801 int wc = 0;
802 char * r = (*slst)[j];
803 if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1);
804 mkinitcap2(space + 1, w, wc);
805 // set as first suggestion
806 for (int k = j; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
807 (*slst)[0] = r;
812 break;
815 case ALLCAP: {
816 memcpy(wspace, cw, (wl+1));
817 mkallsmall2(wspace, unicw, nc);
818 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
819 if (ns == -1) break;
820 if (pAMgr && pAMgr->get_keepcase() && spell(wspace))
821 ns = insert_sug(slst, wspace, ns);
822 mkinitcap2(wspace, unicw, nc);
823 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
824 for (int j=0; j < ns; j++) {
825 mkallcap((*slst)[j]);
826 if (pAMgr && pAMgr->get_checksharps()) {
827 char * pos;
828 if (utf8) {
829 pos = strstr((*slst)[j], "\xC3\x9F");
830 while (pos) {
831 *pos = 'S';
832 *(pos+1) = 'S';
833 pos = strstr(pos+2, "\xC3\x9F");
835 } else {
836 pos = strchr((*slst)[j], '\xDF');
837 while (pos) {
838 (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2);
839 mystrrep((*slst)[j], "\xDF", "SS");
840 pos = strchr((*slst)[j], '\xDF');
845 break;
849 // LANG_hu section: replace '-' with ' ' in Hungarian
850 if (langnum == LANG_hu) {
851 for (int j=0; j < ns; j++) {
852 char * pos = strchr((*slst)[j],'-');
853 if (pos) {
854 int info;
855 char w[MAXWORDUTF8LEN];
856 *pos = '\0';
857 strcpy(w, (*slst)[j]);
858 strcat(w, pos + 1);
859 spell(w, &info, NULL);
860 if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
861 *pos = ' ';
862 } else *pos = '-';
866 // END OF LANG_hu section
868 // try ngram approach since found nothing
869 if ((ns == 0 || onlycmpdsug) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) {
870 switch(captype) {
871 case NOCAP: {
872 ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic);
873 break;
875 case HUHINITCAP:
876 capwords = 1;
877 case HUHCAP: {
878 memcpy(wspace,cw,(wl+1));
879 mkallsmall2(wspace, unicw, nc);
880 ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
881 break;
883 case INITCAP: {
884 capwords = 1;
885 memcpy(wspace,cw,(wl+1));
886 mkallsmall2(wspace, unicw, nc);
887 ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
888 break;
890 case ALLCAP: {
891 memcpy(wspace,cw,(wl+1));
892 mkallsmall2(wspace, unicw, nc);
893 int oldns = ns;
894 ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
895 for (int j = oldns; j < ns; j++)
896 mkallcap((*slst)[j]);
897 break;
902 // try dash suggestion (Afo-American -> Afro-American)
903 if (strchr(cw, '-')) {
904 char * pos = strchr(cw, '-');
905 char * ppos = cw;
906 int nodashsug = 1;
907 char ** nlst = NULL;
908 int nn = 0;
909 int last = 0;
910 for (int j = 0; j < ns && nodashsug == 1; j++) {
911 if (strchr((*slst)[j], '-')) nodashsug = 0;
913 while (nodashsug && !last) {
914 if (*pos == '\0') last = 1; else *pos = '\0';
915 if (!spell(ppos)) {
916 nn = suggest(&nlst, ppos);
917 for (int j = nn - 1; j >= 0; j--) {
918 strncpy(wspace, cw, ppos - cw);
919 strcpy(wspace + (ppos - cw), nlst[j]);
920 if (!last) {
921 strcat(wspace, "-");
922 strcat(wspace, pos + 1);
924 ns = insert_sug(slst, wspace, ns);
925 free(nlst[j]);
927 if (nlst != NULL) free(nlst);
928 nodashsug = 0;
930 if (!last) {
931 *pos = '-';
932 ppos = pos + 1;
933 pos = strchr(ppos, '-');
935 if (!pos) pos = cw + strlen(cw);
939 // word reversing wrapper for complex prefixes
940 if (complexprefixes) {
941 for (int j = 0; j < ns; j++) {
942 if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
946 // capitalize
947 if (capwords) for (int j=0; j < ns; j++) {
948 mkinitcap((*slst)[j]);
951 // expand suggestions with dot(s)
952 if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
953 for (int j = 0; j < ns; j++) {
954 (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
955 strcat((*slst)[j], word + strlen(word) - abbv);
959 // remove bad capitalized and forbidden forms
960 if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
961 switch (captype) {
962 case INITCAP:
963 case ALLCAP: {
964 int l = 0;
965 for (int j=0; j < ns; j++) {
966 if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) {
967 char s[MAXSWUTF8L];
968 w_char w[MAXSWL];
969 int len;
970 if (utf8) {
971 len = u8_u16(w, MAXSWL, (*slst)[j]);
972 } else {
973 strcpy(s, (*slst)[j]);
974 len = strlen(s);
976 mkallsmall2(s, w, len);
977 free((*slst)[j]);
978 if (spell(s)) {
979 (*slst)[l] = mystrdup(s);
980 if ((*slst)[l]) l++;
981 } else {
982 mkinitcap2(s, w, len);
983 if (spell(s)) {
984 (*slst)[l] = mystrdup(s);
985 if ((*slst)[l]) l++;
988 } else {
989 (*slst)[l] = (*slst)[j];
990 l++;
993 ns = l;
998 // remove duplications
999 int l = 0;
1000 for (int j = 0; j < ns; j++) {
1001 (*slst)[l] = (*slst)[j];
1002 for (int k = 0; k < l; k++) {
1003 if (strcmp((*slst)[k], (*slst)[j]) == 0) {
1004 free((*slst)[j]);
1005 l--;
1008 l++;
1011 // output conversion
1012 rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
1013 for (int j = 0; rl && j < ns; j++) {
1014 if (rl->conv((*slst)[j], wspace)) {
1015 free((*slst)[j]);
1016 (*slst)[j] = mystrdup(wspace);
1020 // if suggestions removed by nosuggest, onlyincompound parameters
1021 if (l == 0 && *slst) {
1022 free(*slst);
1023 *slst = NULL;
1025 return l;
1028 void Hunspell::free_list(char *** slst, int n) {
1029 freelist(slst, n);
1032 char * Hunspell::get_dic_encoding()
1034 return encoding;
1037 #ifdef HUNSPELL_EXPERIMENTAL
1038 // XXX need UTF-8 support
1039 int Hunspell::suggest_auto(char*** slst, const char * word)
1041 char cw[MAXWORDUTF8LEN];
1042 char wspace[MAXWORDUTF8LEN];
1043 if (!pSMgr || maxdic == 0) return 0;
1044 int wl = strlen(word);
1045 if (utf8) {
1046 if (wl >= MAXWORDUTF8LEN) return 0;
1047 } else {
1048 if (wl >= MAXWORDLEN) return 0;
1050 int captype = 0;
1051 int abbv = 0;
1052 wl = cleanword(cw, word, &captype, &abbv);
1053 if (wl == 0) return 0;
1054 int ns = 0;
1055 *slst = NULL; // HU, nsug in pSMgr->suggest
1057 switch(captype) {
1058 case NOCAP: {
1059 ns = pSMgr->suggest_auto(slst, cw, ns);
1060 if (ns>0) break;
1061 break;
1064 case INITCAP: {
1065 memcpy(wspace,cw,(wl+1));
1066 mkallsmall(wspace);
1067 ns = pSMgr->suggest_auto(slst, wspace, ns);
1068 for (int j=0; j < ns; j++)
1069 mkinitcap((*slst)[j]);
1070 ns = pSMgr->suggest_auto(slst, cw, ns);
1071 break;
1075 case HUHINITCAP:
1076 case HUHCAP: {
1077 ns = pSMgr->suggest_auto(slst, cw, ns);
1078 if (ns == 0) {
1079 memcpy(wspace,cw,(wl+1));
1080 mkallsmall(wspace);
1081 ns = pSMgr->suggest_auto(slst, wspace, ns);
1083 break;
1086 case ALLCAP: {
1087 memcpy(wspace,cw,(wl+1));
1088 mkallsmall(wspace);
1089 ns = pSMgr->suggest_auto(slst, wspace, ns);
1091 mkinitcap(wspace);
1092 ns = pSMgr->suggest_auto(slst, wspace, ns);
1094 for (int j=0; j < ns; j++)
1095 mkallcap((*slst)[j]);
1096 break;
1100 // word reversing wrapper for complex prefixes
1101 if (complexprefixes) {
1102 for (int j = 0; j < ns; j++) {
1103 if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
1107 // expand suggestions with dot(s)
1108 if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
1109 for (int j = 0; j < ns; j++) {
1110 (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
1111 strcat((*slst)[j], word + strlen(word) - abbv);
1115 // LANG_hu section: replace '-' with ' ' in Hungarian
1116 if (langnum == LANG_hu) {
1117 for (int j=0; j < ns; j++) {
1118 char * pos = strchr((*slst)[j],'-');
1119 if (pos) {
1120 int info;
1121 char w[MAXWORDUTF8LEN];
1122 *pos = '\0';
1123 strcpy(w, (*slst)[j]);
1124 strcat(w, pos + 1);
1125 spell(w, &info, NULL);
1126 if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
1127 *pos = ' ';
1128 } else *pos = '-';
1132 // END OF LANG_hu section
1133 return ns;
1135 #endif
1137 int Hunspell::stem(char*** slst, char ** desc, int n)
1139 char result[MAXLNLEN];
1140 char result2[MAXLNLEN];
1141 *slst = NULL;
1142 if (n == 0) return 0;
1143 *result2 = '\0';
1144 for (int i = 0; i < n; i++) {
1145 *result = '\0';
1146 // add compound word parts (except the last one)
1147 char * s = (char *) desc[i];
1148 char * part = strstr(s, MORPH_PART);
1149 if (part) {
1150 char * nextpart = strstr(part + 1, MORPH_PART);
1151 while (nextpart) {
1152 copy_field(result + strlen(result), part, MORPH_PART);
1153 part = nextpart;
1154 nextpart = strstr(part + 1, MORPH_PART);
1156 s = part;
1159 char **pl;
1160 char tok[MAXLNLEN];
1161 strcpy(tok, s);
1162 char * alt = strstr(tok, " | ");
1163 while (alt) {
1164 alt[1] = MSEP_ALT;
1165 alt = strstr(alt, " | ");
1167 int pln = line_tok(tok, &pl, MSEP_ALT);
1168 for (int k = 0; k < pln; k++) {
1169 // add derivational suffixes
1170 if (strstr(pl[k], MORPH_DERI_SFX)) {
1171 // remove inflectional suffixes
1172 char * is = strstr(pl[k], MORPH_INFL_SFX);
1173 if (is) *is = '\0';
1174 char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]);
1175 if (sg) {
1176 char ** gen;
1177 int genl = line_tok(sg, &gen, MSEP_REC);
1178 free(sg);
1179 for (int j = 0; j < genl; j++) {
1180 sprintf(result2 + strlen(result2), "%c%s%s",
1181 MSEP_REC, result, gen[j]);
1183 freelist(&gen, genl);
1185 } else {
1186 sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result);
1187 if (strstr(pl[k], MORPH_SURF_PFX)) {
1188 copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX);
1190 copy_field(result2 + strlen(result2), pl[k], MORPH_STEM);
1193 freelist(&pl, pln);
1195 int sln = line_tok(result2, slst, MSEP_REC);
1196 return uniqlist(*slst, sln);
1200 int Hunspell::stem(char*** slst, const char * word)
1202 char ** pl;
1203 int pln = analyze(&pl, word);
1204 int pln2 = stem(slst, pl, pln);
1205 freelist(&pl, pln);
1206 return pln2;
1209 #ifdef HUNSPELL_EXPERIMENTAL
1210 int Hunspell::suggest_pos_stems(char*** slst, const char * word)
1212 char cw[MAXWORDUTF8LEN];
1213 char wspace[MAXWORDUTF8LEN];
1214 if (! pSMgr || maxdic == 0) return 0;
1215 int wl = strlen(word);
1216 if (utf8) {
1217 if (wl >= MAXWORDUTF8LEN) return 0;
1218 } else {
1219 if (wl >= MAXWORDLEN) return 0;
1221 int captype = 0;
1222 int abbv = 0;
1223 wl = cleanword(cw, word, &captype, &abbv);
1224 if (wl == 0) return 0;
1226 int ns = 0; // ns=0 = normalized input
1228 *slst = NULL; // HU, nsug in pSMgr->suggest
1230 switch(captype) {
1231 case HUHCAP:
1232 case NOCAP: {
1233 ns = pSMgr->suggest_pos_stems(slst, cw, ns);
1235 if ((abbv) && (ns == 0)) {
1236 memcpy(wspace,cw,wl);
1237 *(wspace+wl) = '.';
1238 *(wspace+wl+1) = '\0';
1239 ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1242 break;
1245 case INITCAP: {
1247 ns = pSMgr->suggest_pos_stems(slst, cw, ns);
1249 if (ns == 0 || ((*slst)[0][0] == '#')) {
1250 memcpy(wspace,cw,(wl+1));
1251 mkallsmall(wspace);
1252 ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1255 break;
1259 case ALLCAP: {
1260 ns = pSMgr->suggest_pos_stems(slst, cw, ns);
1261 if (ns != 0) break;
1263 memcpy(wspace,cw,(wl+1));
1264 mkallsmall(wspace);
1265 ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1267 if (ns == 0) {
1268 mkinitcap(wspace);
1269 ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1271 break;
1275 return ns;
1277 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1279 const char * Hunspell::get_wordchars()
1281 return pAMgr->get_wordchars();
1284 unsigned short * Hunspell::get_wordchars_utf16(int * len)
1286 return pAMgr->get_wordchars_utf16(len);
1289 void Hunspell::mkinitcap(char * p)
1291 if (!utf8) {
1292 if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
1293 } else {
1294 int len;
1295 w_char u[MAXWORDLEN];
1296 len = u8_u16(u, MAXWORDLEN, p);
1297 unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
1298 u[0].h = (unsigned char) (i >> 8);
1299 u[0].l = (unsigned char) (i & 0x00FF);
1300 u16_u8(p, MAXWORDUTF8LEN, u, len);
1304 int Hunspell::mkinitcap2(char * p, w_char * u, int nc)
1306 if (!utf8) {
1307 if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
1308 } else if (nc > 0) {
1309 unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
1310 u[0].h = (unsigned char) (i >> 8);
1311 u[0].l = (unsigned char) (i & 0x00FF);
1312 u16_u8(p, MAXWORDUTF8LEN, u, nc);
1313 return strlen(p);
1315 return nc;
1318 int Hunspell::mkinitsmall2(char * p, w_char * u, int nc)
1320 if (!utf8) {
1321 if (*p != '\0') *p = csconv[((unsigned char)*p)].clower;
1322 } else if (nc > 0) {
1323 unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum);
1324 u[0].h = (unsigned char) (i >> 8);
1325 u[0].l = (unsigned char) (i & 0x00FF);
1326 u16_u8(p, MAXWORDUTF8LEN, u, nc);
1327 return strlen(p);
1329 return nc;
1332 int Hunspell::add(const char * word)
1334 if (pHMgr[0]) return (pHMgr[0])->add(word);
1335 return 0;
1338 int Hunspell::add_with_affix(const char * word, const char * example)
1340 if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example);
1341 return 0;
1344 int Hunspell::remove(const char * word)
1346 if (pHMgr[0]) return (pHMgr[0])->remove(word);
1347 return 0;
1350 const char * Hunspell::get_version()
1352 return pAMgr->get_version();
1355 struct cs_info * Hunspell::get_csconv()
1357 return csconv;
1360 void Hunspell::cat_result(char * result, char * st)
1362 if (st) {
1363 if (*result) mystrcat(result, "\n", MAXLNLEN);
1364 mystrcat(result, st, MAXLNLEN);
1365 free(st);
1369 int Hunspell::analyze(char*** slst, const char * word)
1371 char cw[MAXWORDUTF8LEN];
1372 char wspace[MAXWORDUTF8LEN];
1373 w_char unicw[MAXWORDLEN];
1374 int wl2 = 0;
1375 *slst = NULL;
1376 if (! pSMgr || maxdic == 0) return 0;
1377 int nc = strlen(word);
1378 if (utf8) {
1379 if (nc >= MAXWORDUTF8LEN) return 0;
1380 } else {
1381 if (nc >= MAXWORDLEN) return 0;
1383 int captype = 0;
1384 int abbv = 0;
1385 int wl = 0;
1387 // input conversion
1388 RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
1389 if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
1390 else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
1392 if (wl == 0) {
1393 if (abbv) {
1394 for (wl = 0; wl < abbv; wl++) cw[wl] = '.';
1395 cw[wl] = '\0';
1396 abbv = 0;
1397 } else return 0;
1400 char result[MAXLNLEN];
1401 char * st = NULL;
1403 *result = '\0';
1405 int n = 0;
1406 int n2 = 0;
1407 int n3 = 0;
1409 // test numbers
1410 // LANG_hu section: set dash information for suggestions
1411 if (langnum == LANG_hu) {
1412 while ((n < wl) &&
1413 (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
1414 n++;
1415 if ((cw[n] == '.') || (cw[n] == ',')) {
1416 if (((n2 == 0) && (n > 3)) ||
1417 ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break;
1418 n2++;
1419 n3 = n;
1423 if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0;
1424 if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) {
1425 mystrcat(result, cw, MAXLNLEN);
1426 result[n - 1] = '\0';
1427 if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1));
1428 else {
1429 char sign = cw[n];
1430 cw[n] = '\0';
1431 cat_result(result, pSMgr->suggest_morph(cw + n - 1));
1432 mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE
1433 cw[n] = sign;
1434 cat_result(result, pSMgr->suggest_morph(cw + n));
1436 return line_tok(result, slst, MSEP_REC);
1439 // END OF LANG_hu section
1441 switch(captype) {
1442 case HUHCAP:
1443 case HUHINITCAP:
1444 case NOCAP: {
1445 cat_result(result, pSMgr->suggest_morph(cw));
1446 if (abbv) {
1447 memcpy(wspace,cw,wl);
1448 *(wspace+wl) = '.';
1449 *(wspace+wl+1) = '\0';
1450 cat_result(result, pSMgr->suggest_morph(wspace));
1452 break;
1454 case INITCAP: {
1455 wl = mkallsmall2(cw, unicw, nc);
1456 memcpy(wspace,cw,(wl+1));
1457 wl2 = mkinitcap2(cw, unicw, nc);
1458 cat_result(result, pSMgr->suggest_morph(wspace));
1459 cat_result(result, pSMgr->suggest_morph(cw));
1460 if (abbv) {
1461 *(wspace+wl) = '.';
1462 *(wspace+wl+1) = '\0';
1463 cat_result(result, pSMgr->suggest_morph(wspace));
1465 memcpy(wspace, cw, wl2);
1466 *(wspace+wl2) = '.';
1467 *(wspace+wl2+1) = '\0';
1469 cat_result(result, pSMgr->suggest_morph(wspace));
1471 break;
1473 case ALLCAP: {
1474 cat_result(result, pSMgr->suggest_morph(cw));
1475 if (abbv) {
1476 memcpy(wspace,cw,wl);
1477 *(wspace+wl) = '.';
1478 *(wspace+wl+1) = '\0';
1479 cat_result(result, pSMgr->suggest_morph(cw));
1481 wl = mkallsmall2(cw, unicw, nc);
1482 memcpy(wspace,cw,(wl+1));
1483 wl2 = mkinitcap2(cw, unicw, nc);
1485 cat_result(result, pSMgr->suggest_morph(wspace));
1486 cat_result(result, pSMgr->suggest_morph(cw));
1487 if (abbv) {
1488 *(wspace+wl) = '.';
1489 *(wspace+wl+1) = '\0';
1490 cat_result(result, pSMgr->suggest_morph(wspace));
1492 memcpy(wspace, cw, wl2);
1493 *(wspace+wl2) = '.';
1494 *(wspace+wl2+1) = '\0';
1496 cat_result(result, pSMgr->suggest_morph(wspace));
1498 break;
1502 if (*result) {
1503 // word reversing wrapper for complex prefixes
1504 if (complexprefixes) {
1505 if (utf8) reverseword_utf(result); else reverseword(result);
1507 return line_tok(result, slst, MSEP_REC);
1511 // compound word with dash (HU) I18n
1512 char * dash = NULL;
1513 int nresult = 0;
1514 // LANG_hu section: set dash information for suggestions
1515 if (langnum == LANG_hu) dash = (char *) strchr(cw,'-');
1516 if ((langnum == LANG_hu) && dash) {
1517 *dash='\0';
1518 // examine 2 sides of the dash
1519 if (dash[1] == '\0') { // base word ending with dash
1520 if (spell(cw)) return line_tok(pSMgr->suggest_morph(cw), slst, MSEP_REC);
1521 } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
1522 if (spell(cw) && (spell("-e"))) {
1523 st = pSMgr->suggest_morph(cw);
1524 if (st) {
1525 mystrcat(result, st, MAXLNLEN);
1526 free(st);
1528 mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
1529 st = pSMgr->suggest_morph("-e");
1530 if (st) {
1531 mystrcat(result, st, MAXLNLEN);
1532 free(st);
1534 return line_tok(result, slst, MSEP_REC);
1536 } else {
1537 // first word ending with dash: word- XXX ???
1538 char r2 = *(dash + 1);
1539 dash[0]='-';
1540 dash[1]='\0';
1541 nresult = spell(cw);
1542 dash[1] = r2;
1543 dash[0]='\0';
1544 if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) ||
1545 ((dash[1] > '0') && (dash[1] < '9')))) {
1546 st = pSMgr->suggest_morph(cw);
1547 if (st) {
1548 mystrcat(result, st, MAXLNLEN);
1549 free(st);
1550 mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
1552 st = pSMgr->suggest_morph(dash+1);
1553 if (st) {
1554 mystrcat(result, st, MAXLNLEN);
1555 free(st);
1557 return line_tok(result, slst, MSEP_REC);
1560 // affixed number in correct word
1561 if (nresult && (dash > cw) && (((*(dash-1)<='9') &&
1562 (*(dash-1)>='0')) || (*(dash-1)=='.'))) {
1563 *dash='-';
1564 n = 1;
1565 if (*(dash - n) == '.') n++;
1566 // search first not a number character to left from dash
1567 while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) {
1568 n++;
1570 if ((dash - n) < cw) n--;
1571 // numbers: valami1000000-hoz
1572 // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
1573 // 56-hoz, 6-hoz
1574 for(; n >= 1; n--) {
1575 if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) {
1576 mystrcat(result, cw, MAXLNLEN);
1577 result[dash - cw - n] = '\0';
1578 st = pSMgr->suggest_morph(dash - n);
1579 if (st) {
1580 mystrcat(result, st, MAXLNLEN);
1581 free(st);
1583 return line_tok(result, slst, MSEP_REC);
1588 return 0;
1591 int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln)
1593 *slst = NULL;
1594 if (!pSMgr || !pln) return 0;
1595 char **pl2;
1596 int pl2n = analyze(&pl2, word);
1597 int captype = 0;
1598 int abbv = 0;
1599 char cw[MAXWORDUTF8LEN];
1600 cleanword(cw, word, &captype, &abbv);
1601 char result[MAXLNLEN];
1602 *result = '\0';
1604 for (int i = 0; i < pln; i++) {
1605 cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i]));
1607 freelist(&pl2, pl2n);
1609 if (*result) {
1610 // allcap
1611 if (captype == ALLCAP) mkallcap(result);
1613 // line split
1614 int linenum = line_tok(result, slst, MSEP_REC);
1616 // capitalize
1617 if (captype == INITCAP || captype == HUHINITCAP) {
1618 for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]);
1621 // temporary filtering of prefix related errors (eg.
1622 // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
1624 int r = 0;
1625 for (int j=0; j < linenum; j++) {
1626 if (!spell((*slst)[j])) {
1627 free((*slst)[j]);
1628 (*slst)[j] = NULL;
1629 } else {
1630 if (r < j) (*slst)[r] = (*slst)[j];
1631 r++;
1634 if (r > 0) return r;
1635 free(*slst);
1636 *slst = NULL;
1638 return 0;
1641 int Hunspell::generate(char*** slst, const char * word, const char * pattern)
1643 char **pl;
1644 int pln = analyze(&pl, pattern);
1645 int n = generate(slst, word, pl, pln);
1646 freelist(&pl, pln);
1647 return uniqlist(*slst, n);
1650 // minimal XML parser functions
1651 int Hunspell::get_xml_par(char * dest, const char * par, int max)
1653 char * d = dest;
1654 if (!par) return 0;
1655 char end = *par;
1656 char * dmax = dest + max;
1657 if (end == '>') end = '<';
1658 else if (end != '\'' && end != '"') return 0; // bad XML
1659 for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par;
1660 *d = '\0';
1661 mystrrep(dest, "&lt;", "<");
1662 mystrrep(dest, "&amp;", "&");
1663 return d - dest;
1666 // return the beginning of the element (attr == NULL) or the attribute
1667 const char * Hunspell::get_xml_pos(const char * s, const char * attr)
1669 const char * end = strchr(s, '>');
1670 const char * p = s;
1671 if (attr == NULL) return end;
1672 do {
1673 p = strstr(p, attr);
1674 if (!p || p >= end) return 0;
1675 } while (*(p-1) != ' ' && *(p-1) != '\n');
1676 return p + strlen(attr);
1679 int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) {
1680 char cw[MAXWORDUTF8LEN];
1681 if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) &&
1682 strcmp(cw, value) == 0) return 1;
1683 return 0;
1686 int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) {
1687 int n = 0;
1688 char * p;
1689 if (!list) return 0;
1690 for (p = list; (p = strstr(p, tag)); p++) n++;
1691 if (n == 0) return 0;
1692 *slst = (char **) malloc(sizeof(char *) * n);
1693 if (!*slst) return 0;
1694 for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) {
1695 int l = strlen(p);
1696 (*slst)[n] = (char *) malloc(l);
1697 if (!(*slst)[n]) return (n > 0 ? n - 1 : 0);
1698 get_xml_par((*slst)[n], p + strlen(tag) - 1, l);
1700 return n;
1703 int Hunspell::spellml(char*** slst, const char * word)
1705 char *q, *q2;
1706 char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN];
1707 q = (char *) strstr(word, "<query");
1708 if (!q) return 0; // bad XML input
1709 q2 = strchr(q, '>');
1710 if (!q2) return 0; // bad XML input
1711 q2 = strstr(q2, "<word");
1712 if (!q2) return 0; // bad XML input
1713 if (check_xml_par(q, "type=", "analyze")) {
1714 int n = 0, s = 0;
1715 if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) n = analyze(slst, cw);
1716 if (n == 0) return 0;
1717 // convert the result to <code><a>ana1</a><a>ana2</a></code> format
1718 for (int i = 0; i < n; i++) s+= strlen((*slst)[i]);
1719 char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->&amp;
1720 if (!r) return 0;
1721 strcpy(r, "<code>");
1722 for (int i = 0; i < n; i++) {
1723 int l = strlen(r);
1724 strcpy(r + l, "<a>");
1725 strcpy(r + l + 3, (*slst)[i]);
1726 mystrrep(r + l + 3, "\t", " ");
1727 mystrrep(r + l + 3, "<", "&lt;");
1728 mystrrep(r + l + 3, "&", "&amp;");
1729 strcat(r, "</a>");
1730 free((*slst)[i]);
1732 strcat(r, "</code>");
1733 (*slst)[0] = r;
1734 return 1;
1735 } else if (check_xml_par(q, "type=", "stem")) {
1736 if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) return stem(slst, cw);
1737 } else if (check_xml_par(q, "type=", "generate")) {
1738 int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN);
1739 if (n == 0) return 0;
1740 char * q3 = strstr(q2 + 1, "<word");
1741 if (q3) {
1742 if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN)) {
1743 return generate(slst, cw, cw2);
1745 } else {
1746 char ** slst2;
1747 if ((q2 = strstr(q2 + 1, "<code")) &&
1748 (n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) {
1749 int n2 = generate(slst, cw, slst2, n);
1750 freelist(&slst2, n);
1751 return uniqlist(*slst, n2);
1755 return 0;
1759 #ifdef HUNSPELL_EXPERIMENTAL
1760 // XXX need UTF-8 support
1761 char * Hunspell::morph_with_correction(const char * word)
1763 char cw[MAXWORDUTF8LEN];
1764 char wspace[MAXWORDUTF8LEN];
1765 if (! pSMgr || maxdic == 0) return NULL;
1766 int wl = strlen(word);
1767 if (utf8) {
1768 if (wl >= MAXWORDUTF8LEN) return NULL;
1769 } else {
1770 if (wl >= MAXWORDLEN) return NULL;
1772 int captype = 0;
1773 int abbv = 0;
1774 wl = cleanword(cw, word, &captype, &abbv);
1775 if (wl == 0) return NULL;
1777 char result[MAXLNLEN];
1778 char * st = NULL;
1780 *result = '\0';
1783 switch(captype) {
1784 case NOCAP: {
1785 st = pSMgr->suggest_morph_for_spelling_error(cw);
1786 if (st) {
1787 mystrcat(result, st, MAXLNLEN);
1788 free(st);
1790 if (abbv) {
1791 memcpy(wspace,cw,wl);
1792 *(wspace+wl) = '.';
1793 *(wspace+wl+1) = '\0';
1794 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1795 if (st) {
1796 if (*result) mystrcat(result, "\n", MAXLNLEN);
1797 mystrcat(result, st, MAXLNLEN);
1798 free(st);
1801 break;
1803 case INITCAP: {
1804 memcpy(wspace,cw,(wl+1));
1805 mkallsmall(wspace);
1806 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1807 if (st) {
1808 mystrcat(result, st, MAXLNLEN);
1809 free(st);
1811 st = pSMgr->suggest_morph_for_spelling_error(cw);
1812 if (st) {
1813 if (*result) mystrcat(result, "\n", MAXLNLEN);
1814 mystrcat(result, st, MAXLNLEN);
1815 free(st);
1817 if (abbv) {
1818 memcpy(wspace,cw,wl);
1819 *(wspace+wl) = '.';
1820 *(wspace+wl+1) = '\0';
1821 mkallsmall(wspace);
1822 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1823 if (st) {
1824 if (*result) mystrcat(result, "\n", MAXLNLEN);
1825 mystrcat(result, st, MAXLNLEN);
1826 free(st);
1828 mkinitcap(wspace);
1829 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1830 if (st) {
1831 if (*result) mystrcat(result, "\n", MAXLNLEN);
1832 mystrcat(result, st, MAXLNLEN);
1833 free(st);
1836 break;
1838 case HUHCAP: {
1839 st = pSMgr->suggest_morph_for_spelling_error(cw);
1840 if (st) {
1841 mystrcat(result, st, MAXLNLEN);
1842 free(st);
1844 memcpy(wspace,cw,(wl+1));
1845 mkallsmall(wspace);
1846 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1847 if (st) {
1848 if (*result) mystrcat(result, "\n", MAXLNLEN);
1849 mystrcat(result, st, MAXLNLEN);
1850 free(st);
1852 break;
1854 case ALLCAP: {
1855 memcpy(wspace,cw,(wl+1));
1856 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1857 if (st) {
1858 mystrcat(result, st, MAXLNLEN);
1859 free(st);
1861 mkallsmall(wspace);
1862 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1863 if (st) {
1864 if (*result) mystrcat(result, "\n", MAXLNLEN);
1865 mystrcat(result, st, MAXLNLEN);
1866 free(st);
1868 mkinitcap(wspace);
1869 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1870 if (st) {
1871 if (*result) mystrcat(result, "\n", MAXLNLEN);
1872 mystrcat(result, st, MAXLNLEN);
1873 free(st);
1875 if (abbv) {
1876 memcpy(wspace,cw,(wl+1));
1877 *(wspace+wl) = '.';
1878 *(wspace+wl+1) = '\0';
1879 if (*result) mystrcat(result, "\n", MAXLNLEN);
1880 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1881 if (st) {
1882 mystrcat(result, st, MAXLNLEN);
1883 free(st);
1885 mkallsmall(wspace);
1886 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1887 if (st) {
1888 if (*result) mystrcat(result, "\n", MAXLNLEN);
1889 mystrcat(result, st, MAXLNLEN);
1890 free(st);
1892 mkinitcap(wspace);
1893 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1894 if (st) {
1895 if (*result) mystrcat(result, "\n", MAXLNLEN);
1896 mystrcat(result, st, MAXLNLEN);
1897 free(st);
1900 break;
1904 if (*result) return mystrdup(result);
1905 return NULL;
1908 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1910 Hunhandle *Hunspell_create(const char * affpath, const char * dpath)
1912 return (Hunhandle*)(new Hunspell(affpath, dpath));
1915 Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath,
1916 const char * key)
1918 return (Hunhandle*)(new Hunspell(affpath, dpath, key));
1921 void Hunspell_destroy(Hunhandle *pHunspell)
1923 delete (Hunspell*)(pHunspell);
1926 int Hunspell_spell(Hunhandle *pHunspell, const char *word)
1928 return ((Hunspell*)pHunspell)->spell(word);
1931 char *Hunspell_get_dic_encoding(Hunhandle *pHunspell)
1933 return ((Hunspell*)pHunspell)->get_dic_encoding();
1936 int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word)
1938 return ((Hunspell*)pHunspell)->suggest(slst, word);
1941 int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word)
1943 return ((Hunspell*)pHunspell)->analyze(slst, word);
1946 int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word)
1948 return ((Hunspell*)pHunspell)->stem(slst, word);
1951 int Hunspell_stem(Hunhandle *pHunspell, char*** slst, char** desc, int n)
1953 return ((Hunspell*)pHunspell)->stem(slst, desc, n);
1956 int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
1957 const char * word2)
1959 return ((Hunspell*)pHunspell)->generate(slst, word, word2);
1962 int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
1963 char** desc, int n)
1965 return ((Hunspell*)pHunspell)->generate(slst, word, desc, n);
1968 /* functions for run-time modification of the dictionary */
1970 /* add word to the run-time dictionary */
1972 int Hunspell_add(Hunhandle *pHunspell, const char * word) {
1973 return ((Hunspell*)pHunspell)->add(word);
1976 /* add word to the run-time dictionary with affix flags of
1977 * the example (a dictionary word): Hunspell will recognize
1978 * affixed forms of the new word, too.
1981 int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word,
1982 const char * example) {
1983 return ((Hunspell*)pHunspell)->add_with_affix(word, example);
1986 /* remove word from the run-time dictionary */
1988 int Hunspell_remove(Hunhandle *pHunspell, const char * word) {
1989 return ((Hunspell*)pHunspell)->remove(word);
1992 void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n) {
1993 freelist(slst, n);