1 /*-------------------------------------------------------------------------
4 * Normalizing word with ISpell
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 *-------------------------------------------------------------------------
17 #include "tsearch/dicts/spell.h"
18 #include "tsearch/ts_locale.h"
19 #include "utils/memutils.h"
23 * Initialization requires a lot of memory that's not needed
24 * after the initialization is done. In init function,
25 * CurrentMemoryContext is a long lived memory context associated
26 * with the dictionary cache entry, so we use a temporary context
27 * for the short-lived stuff.
29 static MemoryContext tmpCtx
= NULL
;
/* Allocate sz bytes in the temporary init context tmpCtx (not zeroed here). */
31 #define tmpalloc(sz) MemoryContextAlloc(tmpCtx, (sz))
/* Same as tmpalloc, but via MemoryContextAllocZero. */
32 #define tmpalloc0(sz) MemoryContextAllocZero(tmpCtx, (sz))
38 * XXX: This assumes that CurrentMemoryContext doesn't have any children
39 * other than the one we create here.
41 if (CurrentMemoryContext
->firstchild
== NULL
)
43 tmpCtx
= AllocSetContextCreate(CurrentMemoryContext
,
44 "Ispell dictionary init context",
45 ALLOCSET_DEFAULT_MINSIZE
,
46 ALLOCSET_DEFAULT_INITSIZE
,
47 ALLOCSET_DEFAULT_MAXSIZE
);
50 tmpCtx
= CurrentMemoryContext
->firstchild
;
54 lowerstr_ctx(char *src
)
56 MemoryContext saveCtx
;
59 saveCtx
= MemoryContextSwitchTo(tmpCtx
);
61 MemoryContextSwitchTo(saveCtx
);
/*
 * Word-length limit that NormalizeSubWord compares wrdlen against; its
 * scratch buffers are sized 2 * MAXNORMLEN.
 */
67 #define MAXNORMLEN 256
/*
 * Prefix test: compares only the first strlen(p) bytes of s with p, so the
 * result is 0 when s begins with p.  Note that p is evaluated twice.
 */
69 #define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
/*
 * Fetch byte N of the L-byte string W.  For T == FF_PREFIX the index counts
 * from the start of the string; for any other T it counts from the end
 * (byte L - 1 - N).  The result is read as uint8.
 */
70 #define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
/* GETWCHAR applied to the repl string (length replen) of the AFFIX pointed to by A. */
71 #define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
/*
 * Shared empty string.  NIAddAffix stores this pointer instead of a fresh
 * copy when an affix's mask, find or repl string is empty.
 */
73 static char *VoidString
= "";
76 cmpspell(const void *s1
, const void *s2
)
78 return (strcmp((*(const SPELL
**) s1
)->word
, (*(const SPELL
**) s2
)->word
));
81 cmpspellaffix(const void *s1
, const void *s2
)
83 return (strncmp((*(const SPELL
**) s1
)->p
.flag
, (*(const SPELL
**) s2
)->p
.flag
, MAXFLAGLEN
));
87 findchar(char *str
, int c
)
100 /* backward string compare for suffix tree operations */
102 strbcmp(const unsigned char *s1
, const unsigned char *s2
)
104 int l1
= strlen((const char *) s1
) - 1,
105 l2
= strlen((const char *) s2
) - 1;
107 while (l1
>= 0 && l2
>= 0)
124 strbncmp(const unsigned char *s1
, const unsigned char *s2
, size_t count
)
126 int l1
= strlen((const char *) s1
) - 1,
127 l2
= strlen((const char *) s2
) - 1,
130 while (l1
>= 0 && l2
>= 0 && l
> 0)
150 cmpaffix(const void *s1
, const void *s2
)
152 const AFFIX
*a1
= (const AFFIX
*) s1
;
153 const AFFIX
*a2
= (const AFFIX
*) s2
;
155 if (a1
->type
< a2
->type
)
157 if (a1
->type
> a2
->type
)
159 if (a1
->type
== FF_PREFIX
)
160 return strcmp(a1
->repl
, a2
->repl
);
162 return strbcmp((const unsigned char *) a1
->repl
,
163 (const unsigned char *) a2
->repl
);
167 NIAddSpell(IspellDict
*Conf
, const char *word
, const char *flag
)
169 if (Conf
->nspell
>= Conf
->mspell
)
173 Conf
->mspell
+= 1024 * 20;
174 Conf
->Spell
= (SPELL
**) repalloc(Conf
->Spell
, Conf
->mspell
* sizeof(SPELL
*));
178 Conf
->mspell
= 1024 * 20;
179 Conf
->Spell
= (SPELL
**) tmpalloc(Conf
->mspell
* sizeof(SPELL
*));
182 Conf
->Spell
[Conf
->nspell
] = (SPELL
*) tmpalloc(SPELLHDRSZ
+ strlen(word
) + 1);
183 strcpy(Conf
->Spell
[Conf
->nspell
]->word
, word
);
184 strncpy(Conf
->Spell
[Conf
->nspell
]->p
.flag
, flag
, MAXFLAGLEN
);
191 * Note caller must already have applied get_tsearch_config_filename
194 NIImportDictionary(IspellDict
*Conf
, const char *filename
)
196 tsearch_readline_state trst
;
201 if (!tsearch_readline_begin(&trst
, filename
))
203 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
204 errmsg("could not open dictionary file \"%s\": %m",
207 while ((line
= tsearch_readline(&trst
)) != NULL
)
213 /* Extract flag from the line */
215 if ((s
= findchar(line
, '/')))
221 /* we allow only single-byte flags, for faster processing */
222 if (pg_mblen(s
) == 1 && t_isprint(s
) && !t_isspace(s
))
234 /* Remove trailing spaces */
245 pstr
= lowerstr_ctx(line
);
247 NIAddSpell(Conf
, pstr
, flag
);
252 tsearch_readline_end(&trst
);
257 FindWord(IspellDict
*Conf
, const char *word
, int affixflag
, int flag
)
259 SPNode
*node
= Conf
->Dictionary
;
263 uint8
*ptr
= (uint8
*) word
;
265 flag
&= FF_DICTFLAGMASK
;
269 StopLow
= node
->data
;
270 StopHigh
= node
->data
+ node
->length
;
271 while (StopLow
< StopHigh
)
273 StopMiddle
= StopLow
+ ((StopHigh
- StopLow
) >> 1);
274 if (StopMiddle
->val
== *ptr
)
276 if (*(ptr
+ 1) == '\0' && StopMiddle
->isword
)
280 if (StopMiddle
->compoundflag
& FF_COMPOUNDONLY
)
283 else if ((flag
& StopMiddle
->compoundflag
) == 0)
286 if ((affixflag
== 0) || (strchr(Conf
->AffixData
[StopMiddle
->affix
], affixflag
) != NULL
))
289 node
= StopMiddle
->node
;
293 else if (StopMiddle
->val
< *ptr
)
294 StopLow
= StopMiddle
+ 1;
296 StopHigh
= StopMiddle
;
298 if (StopLow
>= StopHigh
)
305 NIAddAffix(IspellDict
*Conf
, int flag
, char flagflags
, const char *mask
, const char *find
, const char *repl
, int type
)
309 if (Conf
->naffixes
>= Conf
->maffixes
)
313 Conf
->maffixes
+= 16;
314 Conf
->Affix
= (AFFIX
*) repalloc((void *) Conf
->Affix
, Conf
->maffixes
* sizeof(AFFIX
));
319 Conf
->Affix
= (AFFIX
*) palloc(Conf
->maffixes
* sizeof(AFFIX
));
323 Affix
= Conf
->Affix
+ Conf
->naffixes
;
325 if (strcmp(mask
, ".") == 0)
330 else if (RS_isRegis(mask
))
334 RS_compile(&(Affix
->reg
.regis
), (type
== FF_SUFFIX
) ? true : false,
335 (mask
&& *mask
) ? mask
: VoidString
);
347 tmask
= (char *) tmpalloc(strlen(mask
) + 3);
348 if (type
== FF_SUFFIX
)
349 sprintf(tmask
, "%s$", mask
);
351 sprintf(tmask
, "^%s", mask
);
353 masklen
= strlen(tmask
);
354 wmask
= (pg_wchar
*) tmpalloc((masklen
+ 1) * sizeof(pg_wchar
));
355 wmasklen
= pg_mb2wchar_with_len(tmask
, wmask
, masklen
);
357 err
= pg_regcomp(&(Affix
->reg
.regex
), wmask
, wmasklen
, REG_ADVANCED
| REG_NOSUB
);
362 pg_regerror(err
, &(Affix
->reg
.regex
), errstr
, sizeof(errstr
));
364 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION
),
365 errmsg("invalid regular expression: %s", errstr
)));
369 Affix
->flagflags
= flagflags
;
370 if ((Affix
->flagflags
& FF_COMPOUNDONLY
) || (Affix
->flagflags
& FF_COMPOUNDPERMITFLAG
))
372 if ((Affix
->flagflags
& FF_COMPOUNDFLAG
) == 0)
373 Affix
->flagflags
|= FF_COMPOUNDFLAG
;
378 Affix
->find
= (find
&& *find
) ? pstrdup(find
) : VoidString
;
379 if ((Affix
->replen
= strlen(repl
)) > 0)
380 Affix
->repl
= pstrdup(repl
);
382 Affix
->repl
= VoidString
;
386 #define PAE_WAIT_MASK 0
388 #define PAE_WAIT_FIND 2
390 #define PAE_WAIT_REPL 4
394 parse_affentry(char *str
, char *mask
, char *find
, char *repl
)
396 int state
= PAE_WAIT_MASK
;
401 *mask
= *find
= *repl
= '\0';
405 if (state
== PAE_WAIT_MASK
)
407 if (t_iseq(str
, '#'))
409 else if (!t_isspace(str
))
411 COPYCHAR(pmask
, str
);
412 pmask
+= pg_mblen(str
);
416 else if (state
== PAE_INMASK
)
418 if (t_iseq(str
, '>'))
421 state
= PAE_WAIT_FIND
;
423 else if (!t_isspace(str
))
425 COPYCHAR(pmask
, str
);
426 pmask
+= pg_mblen(str
);
429 else if (state
== PAE_WAIT_FIND
)
431 if (t_iseq(str
, '-'))
435 else if (t_isalpha(str
) || t_iseq(str
, '\'') /* english 's */ )
437 COPYCHAR(prepl
, str
);
438 prepl
+= pg_mblen(str
);
441 else if (!t_isspace(str
))
443 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
444 errmsg("syntax error")));
446 else if (state
== PAE_INFIND
)
448 if (t_iseq(str
, ','))
451 state
= PAE_WAIT_REPL
;
453 else if (t_isalpha(str
))
455 COPYCHAR(pfind
, str
);
456 pfind
+= pg_mblen(str
);
458 else if (!t_isspace(str
))
460 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
461 errmsg("syntax error")));
463 else if (state
== PAE_WAIT_REPL
)
465 if (t_iseq(str
, '-'))
467 break; /* void repl */
469 else if (t_isalpha(str
))
471 COPYCHAR(prepl
, str
);
472 prepl
+= pg_mblen(str
);
475 else if (!t_isspace(str
))
477 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
478 errmsg("syntax error")));
480 else if (state
== PAE_INREPL
)
482 if (t_iseq(str
, '#'))
487 else if (t_isalpha(str
))
489 COPYCHAR(prepl
, str
);
490 prepl
+= pg_mblen(str
);
492 else if (!t_isspace(str
))
494 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
495 errmsg("syntax error")));
498 elog(ERROR
, "unrecognized state in parse_affentry: %d", state
);
500 str
+= pg_mblen(str
);
503 *pmask
= *pfind
= *prepl
= '\0';
505 return (*mask
&& (*find
|| *repl
)) ? true : false;
509 addFlagValue(IspellDict
*Conf
, char *s
, uint32 val
)
511 while (*s
&& t_isspace(s
))
516 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
517 errmsg("syntax error")));
519 if (pg_mblen(s
) != 1)
521 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
522 errmsg("multibyte flag character is not allowed")));
524 Conf
->flagval
[(unsigned int) *s
] = (unsigned char) val
;
525 Conf
->usecompound
= true;
529 NIImportOOAffixes(IspellDict
*Conf
, const char *filename
)
540 bool isSuffix
= false;
543 tsearch_readline_state trst
;
545 char scanbuf
[BUFSIZ
];
550 /* read file to find any flag */
551 memset(Conf
->flagval
, 0, sizeof(Conf
->flagval
));
552 Conf
->usecompound
= false;
554 if (!tsearch_readline_begin(&trst
, filename
))
556 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
557 errmsg("could not open affix file \"%s\": %m",
560 while ((recoded
= tsearch_readline(&trst
)) != NULL
)
562 if (*recoded
== '\0' || t_isspace(recoded
) || t_iseq(recoded
, '#'))
568 if (STRNCMP(recoded
, "COMPOUNDFLAG") == 0)
569 addFlagValue(Conf
, recoded
+ strlen("COMPOUNDFLAG"),
571 else if (STRNCMP(recoded
, "COMPOUNDBEGIN") == 0)
572 addFlagValue(Conf
, recoded
+ strlen("COMPOUNDBEGIN"),
574 else if (STRNCMP(recoded
, "COMPOUNDLAST") == 0)
575 addFlagValue(Conf
, recoded
+ strlen("COMPOUNDLAST"),
577 /* COMPOUNDLAST and COMPOUNDEND are synonyms */
578 else if (STRNCMP(recoded
, "COMPOUNDEND") == 0)
579 addFlagValue(Conf
, recoded
+ strlen("COMPOUNDEND"),
581 else if (STRNCMP(recoded
, "COMPOUNDMIDDLE") == 0)
582 addFlagValue(Conf
, recoded
+ strlen("COMPOUNDMIDDLE"),
584 else if (STRNCMP(recoded
, "ONLYINCOMPOUND") == 0)
585 addFlagValue(Conf
, recoded
+ strlen("ONLYINCOMPOUND"),
587 else if (STRNCMP(recoded
, "COMPOUNDPERMITFLAG") == 0)
588 addFlagValue(Conf
, recoded
+ strlen("COMPOUNDPERMITFLAG"),
589 FF_COMPOUNDPERMITFLAG
);
590 else if (STRNCMP(recoded
, "COMPOUNDFORBIDFLAG") == 0)
591 addFlagValue(Conf
, recoded
+ strlen("COMPOUNDFORBIDFLAG"),
592 FF_COMPOUNDFORBIDFLAG
);
593 else if (STRNCMP(recoded
, "FLAG") == 0)
595 char *s
= recoded
+ strlen("FLAG");
597 while (*s
&& t_isspace(s
))
600 if (*s
&& STRNCMP(s
, "default") != 0)
602 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
603 errmsg("Ispell dictionary supports only default flag value")));
608 tsearch_readline_end(&trst
);
610 sprintf(scanbuf
, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ
/ 5, BUFSIZ
/ 5, BUFSIZ
/ 5, BUFSIZ
/ 5);
612 if (!tsearch_readline_begin(&trst
, filename
))
614 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
615 errmsg("could not open affix file \"%s\": %m",
618 while ((recoded
= tsearch_readline(&trst
)) != NULL
)
620 if (*recoded
== '\0' || t_isspace(recoded
) || t_iseq(recoded
, '#'))
623 scanread
= sscanf(recoded
, scanbuf
, type
, sflag
, find
, repl
, mask
);
627 ptype
= lowerstr_ctx(type
);
628 if (scanread
< 4 || (STRNCMP(ptype
, "sfx") && STRNCMP(ptype
, "pfx")))
633 if (strlen(sflag
) != 1)
636 isSuffix
= (STRNCMP(ptype
, "sfx") == 0) ? true : false;
637 if (t_iseq(find
, 'y') || t_iseq(find
, 'Y'))
638 flagflags
= FF_CROSSPRODUCT
;
647 if (strlen(sflag
) != 1 || flag
!= *sflag
|| flag
== 0)
649 prepl
= lowerstr_ctx(repl
);
651 if ((ptr
= strchr(prepl
, '/')) != NULL
)
654 ptr
= repl
+ (ptr
- prepl
) + 1;
657 aflg
|= Conf
->flagval
[(unsigned int) *ptr
];
661 pfind
= lowerstr_ctx(find
);
662 pmask
= lowerstr_ctx(mask
);
663 if (t_iseq(find
, '0'))
665 if (t_iseq(repl
, '0'))
668 NIAddAffix(Conf
, flag
, flagflags
| aflg
, pmask
, pfind
, prepl
,
669 isSuffix
? FF_SUFFIX
: FF_PREFIX
);
679 tsearch_readline_end(&trst
);
687 * Note caller must already have applied get_tsearch_config_filename
690 NIImportAffixes(IspellDict
*Conf
, const char *filename
)
697 bool suffixes
= false;
698 bool prefixes
= false;
701 tsearch_readline_state trst
;
702 bool oldformat
= false;
703 char *recoded
= NULL
;
707 if (!tsearch_readline_begin(&trst
, filename
))
709 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
710 errmsg("could not open affix file \"%s\": %m",
713 memset(Conf
->flagval
, 0, sizeof(Conf
->flagval
));
714 Conf
->usecompound
= false;
716 while ((recoded
= tsearch_readline(&trst
)) != NULL
)
718 pstr
= lowerstr(recoded
);
720 /* Skip comments and empty lines */
721 if (*pstr
== '#' || *pstr
== '\n')
724 if (STRNCMP(pstr
, "compoundwords") == 0)
726 s
= findchar(pstr
, 'l');
729 s
= recoded
+ (s
- pstr
); /* we need non-lowercased
731 while (*s
&& !t_isspace(s
))
733 while (*s
&& t_isspace(s
))
736 if (*s
&& pg_mblen(s
) == 1)
738 Conf
->flagval
[(unsigned int) *s
] = FF_COMPOUNDFLAG
;
739 Conf
->usecompound
= true;
745 if (STRNCMP(pstr
, "suffixes") == 0)
752 if (STRNCMP(pstr
, "prefixes") == 0)
759 if (STRNCMP(pstr
, "flag") == 0)
761 s
= recoded
+ 4; /* we need non-lowercased string */
764 while (*s
&& t_isspace(s
))
768 /* allow only single-byte flags */
769 if (pg_mblen(s
) != 1)
771 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
772 errmsg("multibyte flag character is not allowed")));
776 flagflags
|= FF_CROSSPRODUCT
;
781 flagflags
|= FF_COMPOUNDONLY
;
788 /* allow only single-byte flags */
789 if (pg_mblen(s
) != 1)
791 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
792 errmsg("multibyte flag character is not allowed")));
794 flag
= (unsigned char) *s
;
797 if (STRNCMP(recoded
, "COMPOUNDFLAG") == 0 || STRNCMP(recoded
, "COMPOUNDMIN") == 0 ||
798 STRNCMP(recoded
, "PFX") == 0 || STRNCMP(recoded
, "SFX") == 0)
802 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
803 errmsg("wrong affix file format for flag")));
804 tsearch_readline_end(&trst
);
805 NIImportOOAffixes(Conf
, filename
);
808 if ((!suffixes
) && (!prefixes
))
811 if (!parse_affentry(pstr
, mask
, find
, repl
))
814 NIAddAffix(Conf
, flag
, flagflags
, mask
, find
, repl
, suffixes
? FF_SUFFIX
: FF_PREFIX
);
820 tsearch_readline_end(&trst
);
824 MergeAffix(IspellDict
*Conf
, int a1
, int a2
)
828 while (Conf
->nAffixData
+ 1 >= Conf
->lenAffixData
)
830 Conf
->lenAffixData
*= 2;
831 Conf
->AffixData
= (char **) repalloc(Conf
->AffixData
,
832 sizeof(char *) * Conf
->lenAffixData
);
835 ptr
= Conf
->AffixData
+ Conf
->nAffixData
;
836 *ptr
= palloc(strlen(Conf
->AffixData
[a1
]) + strlen(Conf
->AffixData
[a2
]) +
837 1 /* space */ + 1 /* \0 */ );
838 sprintf(*ptr
, "%s %s", Conf
->AffixData
[a1
], Conf
->AffixData
[a2
]);
843 return Conf
->nAffixData
- 1;
847 makeCompoundFlags(IspellDict
*Conf
, int affix
)
850 char *str
= Conf
->AffixData
[affix
];
854 flag
|= Conf
->flagval
[(unsigned int) *str
];
858 return (flag
& FF_DICTFLAGMASK
);
862 mkSPNode(IspellDict
*Conf
, int low
, int high
, int level
)
866 char lastchar
= '\0';
871 for (i
= low
; i
< high
; i
++)
872 if (Conf
->Spell
[i
]->p
.d
.len
> level
&& lastchar
!= Conf
->Spell
[i
]->word
[level
])
875 lastchar
= Conf
->Spell
[i
]->word
[level
];
881 rs
= (SPNode
*) palloc0(SPNHDRSZ
+ nchar
* sizeof(SPNodeData
));
886 for (i
= low
; i
< high
; i
++)
887 if (Conf
->Spell
[i
]->p
.d
.len
> level
)
889 if (lastchar
!= Conf
->Spell
[i
]->word
[level
])
893 data
->node
= mkSPNode(Conf
, lownew
, i
, level
+ 1);
897 lastchar
= Conf
->Spell
[i
]->word
[level
];
899 data
->val
= ((uint8
*) (Conf
->Spell
[i
]->word
))[level
];
900 if (Conf
->Spell
[i
]->p
.d
.len
== level
+ 1)
902 bool clearCompoundOnly
= false;
904 if (data
->isword
&& data
->affix
!= Conf
->Spell
[i
]->p
.d
.affix
)
907 * MergeAffix is called a few times. If one of the words is
908 * allowed to be in a compound word and another isn't, then
909 * clear the FF_COMPOUNDONLY flag.
912 clearCompoundOnly
= (FF_COMPOUNDONLY
& data
->compoundflag
913 & makeCompoundFlags(Conf
, Conf
->Spell
[i
]->p
.d
.affix
))
915 data
->affix
= MergeAffix(Conf
, data
->affix
, Conf
->Spell
[i
]->p
.d
.affix
);
918 data
->affix
= Conf
->Spell
[i
]->p
.d
.affix
;
921 data
->compoundflag
= makeCompoundFlags(Conf
, data
->affix
);
923 if ((data
->compoundflag
& FF_COMPOUNDONLY
) &&
924 (data
->compoundflag
& FF_COMPOUNDFLAG
) == 0)
925 data
->compoundflag
|= FF_COMPOUNDFLAG
;
927 if (clearCompoundOnly
)
928 data
->compoundflag
&= ~FF_COMPOUNDONLY
;
932 data
->node
= mkSPNode(Conf
, lownew
, high
, level
+ 1);
938 * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
942 NISortDictionary(IspellDict
*Conf
)
950 /* compress affixes */
952 /* Count the number of different flags used in the dictionary */
954 qsort((void *) Conf
->Spell
, Conf
->nspell
, sizeof(SPELL
*), cmpspellaffix
);
957 for (i
= 0; i
< Conf
->nspell
; i
++)
959 if (i
== 0 || strncmp(Conf
->Spell
[i
]->p
.flag
, Conf
->Spell
[i
- 1]->p
.flag
, MAXFLAGLEN
))
964 * Fill in Conf->AffixData with the affixes that were used in the
965 * dictionary. Replace textual flag-field of Conf->Spell entries with
966 * indexes into Conf->AffixData array.
968 Conf
->AffixData
= (char **) palloc0(naffix
* sizeof(char *));
971 for (i
= 0; i
< Conf
->nspell
; i
++)
973 if (i
== 0 || strncmp(Conf
->Spell
[i
]->p
.flag
, Conf
->AffixData
[curaffix
], MAXFLAGLEN
))
976 Assert(curaffix
< naffix
);
977 Conf
->AffixData
[curaffix
] = pstrdup(Conf
->Spell
[i
]->p
.flag
);
980 Conf
->Spell
[i
]->p
.d
.affix
= curaffix
;
981 Conf
->Spell
[i
]->p
.d
.len
= strlen(Conf
->Spell
[i
]->word
);
984 Conf
->lenAffixData
= Conf
->nAffixData
= naffix
;
986 qsort((void *) Conf
->Spell
, Conf
->nspell
, sizeof(SPELL
*), cmpspell
);
987 Conf
->Dictionary
= mkSPNode(Conf
, 0, Conf
->nspell
, 0);
993 mkANode(IspellDict
*Conf
, int low
, int high
, int level
, int type
)
997 uint8 lastchar
= '\0';
1004 for (i
= low
; i
< high
; i
++)
1005 if (Conf
->Affix
[i
].replen
> level
&& lastchar
!= GETCHAR(Conf
->Affix
+ i
, level
, type
))
1008 lastchar
= GETCHAR(Conf
->Affix
+ i
, level
, type
);
1014 aff
= (AFFIX
**) tmpalloc(sizeof(AFFIX
*) * (high
- low
+ 1));
1017 rs
= (AffixNode
*) palloc0(ANHRDSZ
+ nchar
* sizeof(AffixNodeData
));
1022 for (i
= low
; i
< high
; i
++)
1023 if (Conf
->Affix
[i
].replen
> level
)
1025 if (lastchar
!= GETCHAR(Conf
->Affix
+ i
, level
, type
))
1029 data
->node
= mkANode(Conf
, lownew
, i
, level
+ 1, type
);
1033 data
->aff
= (AFFIX
**) palloc(sizeof(AFFIX
*) * naff
);
1034 memcpy(data
->aff
, aff
, sizeof(AFFIX
*) * naff
);
1040 lastchar
= GETCHAR(Conf
->Affix
+ i
, level
, type
);
1042 data
->val
= GETCHAR(Conf
->Affix
+ i
, level
, type
);
1043 if (Conf
->Affix
[i
].replen
== level
+ 1)
1044 { /* affix stopped */
1045 aff
[naff
++] = Conf
->Affix
+ i
;
1049 data
->node
= mkANode(Conf
, lownew
, high
, level
+ 1, type
);
1053 data
->aff
= (AFFIX
**) palloc(sizeof(AFFIX
*) * naff
);
1054 memcpy(data
->aff
, aff
, sizeof(AFFIX
*) * naff
);
1064 mkVoidAffix(IspellDict
*Conf
, bool issuffix
, int startsuffix
)
1068 int start
= (issuffix
) ? startsuffix
: 0;
1069 int end
= (issuffix
) ? Conf
->naffixes
: startsuffix
;
1070 AffixNode
*Affix
= (AffixNode
*) palloc0(ANHRDSZ
+ sizeof(AffixNodeData
));
1077 Affix
->data
->node
= Conf
->Suffix
;
1078 Conf
->Suffix
= Affix
;
1082 Affix
->data
->node
= Conf
->Prefix
;
1083 Conf
->Prefix
= Affix
;
1087 for (i
= start
; i
< end
; i
++)
1088 if (Conf
->Affix
[i
].replen
== 0)
1094 Affix
->data
->aff
= (AFFIX
**) palloc(sizeof(AFFIX
*) * cnt
);
1095 Affix
->data
->naff
= (uint32
) cnt
;
1098 for (i
= start
; i
< end
; i
++)
1099 if (Conf
->Affix
[i
].replen
== 0)
1101 Affix
->data
->aff
[cnt
] = Conf
->Affix
+ i
;
1107 isAffixInUse(IspellDict
*Conf
, char flag
)
1111 for (i
= 0; i
< Conf
->nAffixData
; i
++)
1112 if (strchr(Conf
->AffixData
[i
], flag
) != NULL
)
1119 NISortAffixes(IspellDict
*Conf
)
1124 int firstsuffix
= Conf
->naffixes
;
1128 if (Conf
->naffixes
== 0)
1131 if (Conf
->naffixes
> 1)
1132 qsort((void *) Conf
->Affix
, Conf
->naffixes
, sizeof(AFFIX
), cmpaffix
);
1133 Conf
->CompoundAffix
= ptr
= (CMPDAffix
*) palloc(sizeof(CMPDAffix
) * Conf
->naffixes
);
1136 for (i
= 0; i
< Conf
->naffixes
; i
++)
1138 Affix
= &(((AFFIX
*) Conf
->Affix
)[i
]);
1139 if (Affix
->type
== FF_SUFFIX
&& i
< firstsuffix
)
1142 if ((Affix
->flagflags
& FF_COMPOUNDFLAG
) && Affix
->replen
> 0 &&
1143 isAffixInUse(Conf
, (char) Affix
->flag
))
1145 if (ptr
== Conf
->CompoundAffix
||
1146 ptr
->issuffix
!= (ptr
- 1)->issuffix
||
1147 strbncmp((const unsigned char *) (ptr
- 1)->affix
,
1148 (const unsigned char *) Affix
->repl
,
1151 /* leave only unique and minimal suffixes */
1152 ptr
->affix
= Affix
->repl
;
1153 ptr
->len
= Affix
->replen
;
1154 ptr
->issuffix
= (Affix
->type
== FF_SUFFIX
) ? true : false;
1160 Conf
->CompoundAffix
= (CMPDAffix
*) repalloc(Conf
->CompoundAffix
, sizeof(CMPDAffix
) * (ptr
- Conf
->CompoundAffix
+ 1));
1162 Conf
->Prefix
= mkANode(Conf
, 0, firstsuffix
, 0, FF_PREFIX
);
1163 Conf
->Suffix
= mkANode(Conf
, firstsuffix
, Conf
->naffixes
, 0, FF_SUFFIX
);
1164 mkVoidAffix(Conf
, true, firstsuffix
);
1165 mkVoidAffix(Conf
, false, firstsuffix
);
1168 static AffixNodeData
*
1169 FindAffixes(AffixNode
*node
, const char *word
, int wrdlen
, int *level
, int type
)
1171 AffixNodeData
*StopLow
,
1177 { /* search void affixes */
1178 if (node
->data
->naff
)
1180 node
= node
->data
->node
;
1183 while (node
&& *level
< wrdlen
)
1185 StopLow
= node
->data
;
1186 StopHigh
= node
->data
+ node
->length
;
1187 while (StopLow
< StopHigh
)
1189 StopMiddle
= StopLow
+ ((StopHigh
- StopLow
) >> 1);
1190 symbol
= GETWCHAR(word
, wrdlen
, *level
, type
);
1192 if (StopMiddle
->val
== symbol
)
1195 if (StopMiddle
->naff
)
1197 node
= StopMiddle
->node
;
1200 else if (StopMiddle
->val
< symbol
)
1201 StopLow
= StopMiddle
+ 1;
1203 StopHigh
= StopMiddle
;
1205 if (StopLow
>= StopHigh
)
1212 CheckAffix(const char *word
, size_t len
, AFFIX
*Affix
, int flagflags
, char *newword
, int *baselen
)
1215 * Check compound allow flags
1220 if (Affix
->flagflags
& FF_COMPOUNDONLY
)
1223 else if (flagflags
& FF_COMPOUNDBEGIN
)
1225 if (Affix
->flagflags
& FF_COMPOUNDFORBIDFLAG
)
1227 if ((Affix
->flagflags
& FF_COMPOUNDBEGIN
) == 0)
1228 if (Affix
->type
== FF_SUFFIX
)
1231 else if (flagflags
& FF_COMPOUNDMIDDLE
)
1233 if ((Affix
->flagflags
& FF_COMPOUNDMIDDLE
) == 0 ||
1234 (Affix
->flagflags
& FF_COMPOUNDFORBIDFLAG
))
1237 else if (flagflags
& FF_COMPOUNDLAST
)
1239 if (Affix
->flagflags
& FF_COMPOUNDFORBIDFLAG
)
1241 if ((Affix
->flagflags
& FF_COMPOUNDLAST
) == 0)
1242 if (Affix
->type
== FF_PREFIX
)
1247 * make replace pattern of affix
1249 if (Affix
->type
== FF_SUFFIX
)
1251 strcpy(newword
, word
);
1252 strcpy(newword
+ len
- Affix
->replen
, Affix
->find
);
1253 if (baselen
) /* store length of non-changed part of word */
1254 *baselen
= len
- Affix
->replen
;
1259 * if the prefix covers the whole unchanged part of the word, then the
1260 * word consists only of prefix and suffix, so bail out
1262 if (baselen
&& *baselen
+ strlen(Affix
->find
) <= Affix
->replen
)
1264 strcpy(newword
, Affix
->find
);
1265 strcat(newword
, word
+ Affix
->replen
);
1269 * check resulting word
1271 if (Affix
->issimple
)
1273 else if (Affix
->isregis
)
1275 if (RS_execute(&(Affix
->reg
.regis
), newword
))
1285 /* Convert data string to wide characters */
1286 newword_len
= strlen(newword
);
1287 data
= (pg_wchar
*) palloc((newword_len
+ 1) * sizeof(pg_wchar
));
1288 data_len
= pg_mb2wchar_with_len(newword
, data
, newword_len
);
1290 if (!(err
= pg_regexec(&(Affix
->reg
.regex
), data
, data_len
, 0, NULL
, 0, NULL
, 0)))
1302 addToResult(char **forms
, char **cur
, char *word
)
1304 if (cur
- forms
>= MAX_NORM
- 1)
1306 if (forms
== cur
|| strcmp(word
, *(cur
- 1)) != 0)
1308 *cur
= pstrdup(word
);
1317 NormalizeSubWord(IspellDict
*Conf
, char *word
, int flag
)
1319 AffixNodeData
*suffix
= NULL
,
1323 int wrdlen
= strlen(word
),
1327 char newword
[2 * MAXNORMLEN
] = "";
1328 char pnewword
[2 * MAXNORMLEN
] = "";
1329 AffixNode
*snode
= Conf
->Suffix
,
1334 if (wrdlen
> MAXNORMLEN
)
1336 cur
= forms
= (char **) palloc(MAX_NORM
* sizeof(char *));
1340 /* Check whether the word itself is in normal form */
1341 if (FindWord(Conf
, word
, 0, flag
))
1343 *cur
= pstrdup(word
);
1348 /* Find all other NORMAL forms of the 'word' (check only prefix) */
1349 pnode
= Conf
->Prefix
;
1353 prefix
= FindAffixes(pnode
, word
, wrdlen
, &plevel
, FF_PREFIX
);
1356 for (j
= 0; j
< prefix
->naff
; j
++)
1358 if (CheckAffix(word
, wrdlen
, prefix
->aff
[j
], flag
, newword
, NULL
))
1360 /* prefix success */
1361 if (FindWord(Conf
, newword
, prefix
->aff
[j
]->flag
, flag
))
1362 cur
+= addToResult(forms
, cur
, newword
);
1365 pnode
= prefix
->node
;
1369 * Find all other NORMAL forms of the 'word' (check suffix and then
1376 /* find possible suffix */
1377 suffix
= FindAffixes(snode
, word
, wrdlen
, &slevel
, FF_SUFFIX
);
1380 /* foreach suffix check affix */
1381 for (i
= 0; i
< suffix
->naff
; i
++)
1383 if (CheckAffix(word
, wrdlen
, suffix
->aff
[i
], flag
, newword
, &baselen
))
1385 /* suffix success */
1386 if (FindWord(Conf
, newword
, suffix
->aff
[i
]->flag
, flag
))
1387 cur
+= addToResult(forms
, cur
, newword
);
1389 /* now we will look up the changed word with prefixes */
1390 pnode
= Conf
->Prefix
;
1392 swrdlen
= strlen(newword
);
1395 prefix
= FindAffixes(pnode
, newword
, swrdlen
, &plevel
, FF_PREFIX
);
1398 for (j
= 0; j
< prefix
->naff
; j
++)
1400 if (CheckAffix(newword
, swrdlen
, prefix
->aff
[j
], flag
, pnewword
, &baselen
))
1402 /* prefix success */
1403 int ff
= (prefix
->aff
[j
]->flagflags
& suffix
->aff
[i
]->flagflags
& FF_CROSSPRODUCT
) ?
1404 0 : prefix
->aff
[j
]->flag
;
1406 if (FindWord(Conf
, pnewword
, ff
, flag
))
1407 cur
+= addToResult(forms
, cur
, pnewword
);
1410 pnode
= prefix
->node
;
1415 snode
= suffix
->node
;
1426 typedef struct SplitVar
1431 struct SplitVar
*next
;
1435 CheckCompoundAffixes(CMPDAffix
**ptr
, char *word
, int len
, bool CheckInPlace
)
1441 while ((*ptr
)->affix
)
1443 if (len
> (*ptr
)->len
&& strncmp((*ptr
)->affix
, word
, (*ptr
)->len
) == 0)
1446 issuffix
= (*ptr
)->issuffix
;
1448 return (issuffix
) ? len
: 0;
1457 while ((*ptr
)->affix
)
1459 if (len
> (*ptr
)->len
&& (affbegin
= strstr(word
, (*ptr
)->affix
)) != NULL
)
1461 len
= (*ptr
)->len
+ (affbegin
- word
);
1462 issuffix
= (*ptr
)->issuffix
;
1464 return (issuffix
) ? len
: 0;
1473 CopyVar(SplitVar
*s
, int makedup
)
1475 SplitVar
*v
= (SplitVar
*) palloc(sizeof(SplitVar
));
1482 v
->lenstem
= s
->lenstem
;
1483 v
->stem
= (char **) palloc(sizeof(char *) * v
->lenstem
);
1484 v
->nstem
= s
->nstem
;
1485 for (i
= 0; i
< s
->nstem
; i
++)
1486 v
->stem
[i
] = (makedup
) ? pstrdup(s
->stem
[i
]) : s
->stem
[i
];
1491 v
->stem
= (char **) palloc(sizeof(char *) * v
->lenstem
);
1498 AddStem(SplitVar
*v
, char *word
)
1500 if ( v
->nstem
>= v
->lenstem
)
1503 v
->stem
= (char **) repalloc(v
->stem
, sizeof(char *) * v
->lenstem
);
1506 v
->stem
[v
->nstem
] = word
;
1511 SplitToVariants(IspellDict
*Conf
, SPNode
*snode
, SplitVar
*orig
, char *word
, int wordlen
, int startpos
, int minpos
)
1513 SplitVar
*var
= NULL
;
1514 SPNodeData
*StopLow
,
1517 SPNode
*node
= (snode
) ? snode
: Conf
->Dictionary
;
1518 int level
= (snode
) ? minpos
: startpos
; /* recursive
1523 int compoundflag
= 0;
1525 notprobed
= (char *) palloc(wordlen
);
1526 memset(notprobed
, 1, wordlen
);
1527 var
= CopyVar(orig
, 1);
1529 while (level
< wordlen
)
1531 /* find word with epenthetic or/and compound affix */
1532 caff
= Conf
->CompoundAffix
;
1533 while (level
> startpos
&& (lenaff
= CheckCompoundAffixes(&caff
, word
+ level
, wordlen
- level
, (node
) ? true : false)) >= 0)
1536 * there is one of compound affixes, so check word for existings
1538 char buf
[MAXNORMLEN
];
1541 lenaff
= level
- startpos
+ lenaff
;
1543 if (!notprobed
[startpos
+ lenaff
- 1])
1546 if (level
+ lenaff
- 1 <= minpos
)
1549 if ( lenaff
>= MAXNORMLEN
)
1550 continue; /* skip too big value */
1552 memcpy(buf
, word
+ startpos
, lenaff
);
1556 compoundflag
= FF_COMPOUNDBEGIN
;
1557 else if (level
== wordlen
- 1)
1558 compoundflag
= FF_COMPOUNDLAST
;
1560 compoundflag
= FF_COMPOUNDMIDDLE
;
1561 subres
= NormalizeSubWord(Conf
, buf
, compoundflag
);
1564 /* Yes, it was a word from dictionary */
1565 SplitVar
*new = CopyVar(var
, 0);
1566 SplitVar
*ptr
= var
;
1567 char **sptr
= subres
;
1569 notprobed
[startpos
+ lenaff
- 1] = 0;
1573 AddStem( new, *sptr
);
1580 ptr
->next
= SplitToVariants(Conf
, NULL
, new, word
, wordlen
, startpos
+ lenaff
, startpos
+ lenaff
);
1590 StopLow
= node
->data
;
1591 StopHigh
= node
->data
+ node
->length
;
1592 while (StopLow
< StopHigh
)
1594 StopMiddle
= StopLow
+ ((StopHigh
- StopLow
) >> 1);
1595 if (StopMiddle
->val
== ((uint8
*) (word
))[level
])
1597 else if (StopMiddle
->val
< ((uint8
*) (word
))[level
])
1598 StopLow
= StopMiddle
+ 1;
1600 StopHigh
= StopMiddle
;
1603 if (StopLow
< StopHigh
)
1605 if (level
== FF_COMPOUNDBEGIN
)
1606 compoundflag
= FF_COMPOUNDBEGIN
;
1607 else if (level
== wordlen
- 1)
1608 compoundflag
= FF_COMPOUNDLAST
;
1610 compoundflag
= FF_COMPOUNDMIDDLE
;
1612 /* find infinitive */
1613 if (StopMiddle
->isword
&&
1614 (StopMiddle
->compoundflag
& compoundflag
) &&
1617 /* ok, we found full compoundallowed word */
1620 /* and its length is more than the minimum */
1621 if (wordlen
== level
+ 1)
1623 /* well, it was last word */
1624 AddStem( var
, pnstrdup(word
+ startpos
, wordlen
- startpos
) );
1630 /* then we will search for a longer word at the same point */
1631 SplitVar
*ptr
= var
;
1635 ptr
->next
= SplitToVariants(Conf
, node
, var
, word
, wordlen
, startpos
, level
);
1636 /* we can find next word */
1638 AddStem( var
, pnstrdup(word
+ startpos
, level
- startpos
) );
1639 node
= Conf
->Dictionary
;
1645 node
= StopMiddle
->node
;
1652 AddStem( var
, pnstrdup(word
+ startpos
, wordlen
- startpos
) );
1658 addNorm( TSLexeme
**lres
, TSLexeme
**lcur
, char *word
, int flags
, uint16 NVariant
)
1660 if ( *lres
== NULL
)
1661 *lcur
= *lres
= (TSLexeme
*) palloc(MAX_NORM
* sizeof(TSLexeme
));
1663 if ( *lcur
- *lres
< MAX_NORM
-1 ) {
1664 (*lcur
)->lexeme
= word
;
1665 (*lcur
)->flags
= flags
;
1666 (*lcur
)->nvariant
= NVariant
;
1668 (*lcur
)->lexeme
= NULL
;
1673 NINormalizeWord(IspellDict
*Conf
, char *word
)
1676 TSLexeme
*lcur
= NULL
,
1678 uint16 NVariant
= 1;
1680 res
= NormalizeSubWord(Conf
, word
, 0);
1686 while (*ptr
&& (lcur
-lres
) < MAX_NORM
)
1688 addNorm( &lres
, &lcur
, *ptr
, 0, NVariant
++);
1694 if (Conf
->usecompound
)
1696 int wordlen
= strlen(word
);
1698 *var
= SplitToVariants(Conf
, NULL
, NULL
, word
, wordlen
, 0, -1);
1705 char **subres
= NormalizeSubWord(Conf
, var
->stem
[var
->nstem
- 1], FF_COMPOUNDLAST
);
1709 char **subptr
= subres
;
1713 for (i
= 0; i
< var
->nstem
- 1; i
++)
1715 addNorm( &lres
, &lcur
, (subptr
== subres
) ? var
->stem
[i
] : pstrdup(var
->stem
[i
]), 0, NVariant
);
1718 addNorm( &lres
, &lcur
, *subptr
, 0, NVariant
);
1724 var
->stem
[0] = NULL
;
1725 pfree(var
->stem
[var
->nstem
- 1]);
1729 for (i
= 0; i
< var
->nstem
&& var
->stem
[i
]; i
++)
1730 pfree(var
->stem
[i
]);