1 /*-------------------------------------------------------------------------
4 * Thesaurus dictionary: phrase to phrase substitution
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
10 * src/backend/tsearch/dict_thesaurus.c
12 *-------------------------------------------------------------------------
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 #include "utils/fmgrprotos.h"
22 #include "utils/regproc.h"
/*
 * DT_USEASIS is a private marker stored in TSLexeme.flags while the
 * thesaurus file is being loaded.  addWrd() sets it on substitute words that
 * must not be normalized; compileTheSubstitute() tests it and clears the
 * flags before the lexeme is stored in the final result.
 */
#define DT_USEASIS		0x1000
30 typedef struct LexemeInfo
32 uint32 idsubst
; /* entry's number in DictThesaurus->subst */
33 uint16 posinsubst
; /* pos info in entry */
34 uint16 tnvariant
; /* total num lexemes in one variant */
35 struct LexemeInfo
*nextentry
;
36 struct LexemeInfo
*nextvariant
;
47 uint16 lastlexeme
; /* number lexemes to substitute */
49 TSLexeme
*res
; /* prepared substituted result */
54 /* subdictionary to normalize lexemes */
56 TSDictionaryCacheEntry
*subdict
;
58 /* Array to search lexeme by exact match */
60 int nwrds
; /* current number of words */
61 int ntwrds
; /* allocated array length */
64 * Storage of substituted result, n-th element is for n-th expression
72 newLexeme(DictThesaurus
*d
, char *b
, char *e
, uint32 idsubst
, uint16 posinsubst
)
76 if (d
->nwrds
>= d
->ntwrds
)
81 d
->wrds
= (TheLexeme
*) palloc(sizeof(TheLexeme
) * d
->ntwrds
);
86 d
->wrds
= (TheLexeme
*) repalloc(d
->wrds
, sizeof(TheLexeme
) * d
->ntwrds
);
90 ptr
= d
->wrds
+ d
->nwrds
;
93 ptr
->lexeme
= palloc(e
- b
+ 1);
95 memcpy(ptr
->lexeme
, b
, e
- b
);
96 ptr
->lexeme
[e
- b
] = '\0';
98 ptr
->entries
= (LexemeInfo
*) palloc(sizeof(LexemeInfo
));
100 ptr
->entries
->nextentry
= NULL
;
101 ptr
->entries
->idsubst
= idsubst
;
102 ptr
->entries
->posinsubst
= posinsubst
;
106 addWrd(DictThesaurus
*d
, char *b
, char *e
, uint32 idsubst
, uint16 nwrd
, uint16 posinsubst
, bool useasis
)
109 static int ntres
= 0;
116 if (idsubst
>= d
->nsubst
)
121 d
->subst
= (TheSubstitute
*) palloc(sizeof(TheSubstitute
) * d
->nsubst
);
126 d
->subst
= (TheSubstitute
*) repalloc(d
->subst
, sizeof(TheSubstitute
) * d
->nsubst
);
131 ptr
= d
->subst
+ idsubst
;
133 ptr
->lastlexeme
= posinsubst
- 1;
135 if (nres
+ 1 >= ntres
)
140 ptr
->res
= (TSLexeme
*) palloc(sizeof(TSLexeme
) * ntres
);
145 ptr
->res
= (TSLexeme
*) repalloc(ptr
->res
, sizeof(TSLexeme
) * ntres
);
149 ptr
->res
[nres
].lexeme
= palloc(e
- b
+ 1);
150 memcpy(ptr
->res
[nres
].lexeme
, b
, e
- b
);
151 ptr
->res
[nres
].lexeme
[e
- b
] = '\0';
153 ptr
->res
[nres
].nvariant
= nwrd
;
155 ptr
->res
[nres
].flags
= DT_USEASIS
;
157 ptr
->res
[nres
].flags
= 0;
159 ptr
->res
[++nres
].lexeme
= NULL
;
164 #define TR_WAITSUBS 3
168 thesaurusRead(const char *filename
, DictThesaurus
*d
)
170 tsearch_readline_state trst
;
172 bool useasis
= false;
175 filename
= get_tsearch_config_filename(filename
, "ths");
176 if (!tsearch_readline_begin(&trst
, filename
))
178 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
179 errmsg("could not open thesaurus file \"%s\": %m",
182 while ((line
= tsearch_readline(&trst
)) != NULL
)
185 int state
= TR_WAITLEX
;
186 char *beginwrd
= NULL
;
187 uint32 posinsubst
= 0;
192 /* is it a comment? */
193 while (*ptr
&& isspace((unsigned char) *ptr
))
194 ptr
+= pg_mblen(ptr
);
196 if (t_iseq(ptr
, '#') || *ptr
== '\0' ||
197 t_iseq(ptr
, '\n') || t_iseq(ptr
, '\r'))
205 if (state
== TR_WAITLEX
)
207 if (t_iseq(ptr
, ':'))
211 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
212 errmsg("unexpected delimiter")));
215 else if (!isspace((unsigned char) *ptr
))
221 else if (state
== TR_INLEX
)
223 if (t_iseq(ptr
, ':'))
225 newLexeme(d
, beginwrd
, ptr
, idsubst
, posinsubst
++);
228 else if (isspace((unsigned char) *ptr
))
230 newLexeme(d
, beginwrd
, ptr
, idsubst
, posinsubst
++);
234 else if (state
== TR_WAITSUBS
)
236 if (t_iseq(ptr
, '*'))
240 beginwrd
= ptr
+ pg_mblen(ptr
);
242 else if (t_iseq(ptr
, '\\'))
246 beginwrd
= ptr
+ pg_mblen(ptr
);
248 else if (!isspace((unsigned char) *ptr
))
255 else if (state
== TR_INSUBS
)
257 if (isspace((unsigned char) *ptr
))
261 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
262 errmsg("unexpected end of line or lexeme")));
263 addWrd(d
, beginwrd
, ptr
, idsubst
, nwrd
++, posinsubst
, useasis
);
268 elog(ERROR
, "unrecognized thesaurus state: %d", state
);
270 ptr
+= pg_mblen(ptr
);
273 if (state
== TR_INSUBS
)
277 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
278 errmsg("unexpected end of line or lexeme")));
279 addWrd(d
, beginwrd
, ptr
, idsubst
, nwrd
++, posinsubst
, useasis
);
284 if (!(nwrd
&& posinsubst
))
286 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
287 errmsg("unexpected end of line")));
289 if (nwrd
!= (uint16
) nwrd
|| posinsubst
!= (uint16
) posinsubst
)
291 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
292 errmsg("too many lexemes in thesaurus entry")));
299 tsearch_readline_end(&trst
);
303 addCompiledLexeme(TheLexeme
*newwrds
, int *nnw
, int *tnm
, TSLexeme
*lexeme
, LexemeInfo
*src
, uint16 tnvariant
)
308 newwrds
= (TheLexeme
*) repalloc(newwrds
, sizeof(TheLexeme
) * *tnm
);
311 newwrds
[*nnw
].entries
= (LexemeInfo
*) palloc(sizeof(LexemeInfo
));
313 if (lexeme
&& lexeme
->lexeme
)
315 newwrds
[*nnw
].lexeme
= pstrdup(lexeme
->lexeme
);
316 newwrds
[*nnw
].entries
->tnvariant
= tnvariant
;
320 newwrds
[*nnw
].lexeme
= NULL
;
321 newwrds
[*nnw
].entries
->tnvariant
= 1;
324 newwrds
[*nnw
].entries
->idsubst
= src
->idsubst
;
325 newwrds
[*nnw
].entries
->posinsubst
= src
->posinsubst
;
327 newwrds
[*nnw
].entries
->nextentry
= NULL
;
334 cmpLexemeInfo(LexemeInfo
*a
, LexemeInfo
*b
)
336 if (a
== NULL
|| b
== NULL
)
339 if (a
->idsubst
== b
->idsubst
)
341 if (a
->posinsubst
== b
->posinsubst
)
343 if (a
->tnvariant
== b
->tnvariant
)
346 return (a
->tnvariant
> b
->tnvariant
) ? 1 : -1;
349 return (a
->posinsubst
> b
->posinsubst
) ? 1 : -1;
352 return (a
->idsubst
> b
->idsubst
) ? 1 : -1;
356 cmpLexeme(const TheLexeme
*a
, const TheLexeme
*b
)
358 if (a
->lexeme
== NULL
)
360 if (b
->lexeme
== NULL
)
365 else if (b
->lexeme
== NULL
)
368 return strcmp(a
->lexeme
, b
->lexeme
);
372 cmpLexemeQ(const void *a
, const void *b
)
374 return cmpLexeme((const TheLexeme
*) a
, (const TheLexeme
*) b
);
378 cmpTheLexeme(const void *a
, const void *b
)
380 const TheLexeme
*la
= (const TheLexeme
*) a
;
381 const TheLexeme
*lb
= (const TheLexeme
*) b
;
384 if ((res
= cmpLexeme(la
, lb
)) != 0)
387 return -cmpLexemeInfo(la
->entries
, lb
->entries
);
391 compileTheLexeme(DictThesaurus
*d
)
396 TheLexeme
*newwrds
= (TheLexeme
*) palloc(sizeof(TheLexeme
) * tnm
),
399 for (i
= 0; i
< d
->nwrds
; i
++)
403 if (strcmp(d
->wrds
[i
].lexeme
, "?") == 0) /* Is stop word marker? */
404 newwrds
= addCompiledLexeme(newwrds
, &nnw
, &tnm
, NULL
, d
->wrds
[i
].entries
, 0);
407 ptr
= (TSLexeme
*) DatumGetPointer(FunctionCall4(&(d
->subdict
->lexize
),
408 PointerGetDatum(d
->subdict
->dictData
),
409 PointerGetDatum(d
->wrds
[i
].lexeme
),
410 Int32GetDatum(strlen(d
->wrds
[i
].lexeme
)),
411 PointerGetDatum(NULL
)));
415 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
416 errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
418 d
->wrds
[i
].entries
->idsubst
+ 1)));
419 else if (!(ptr
->lexeme
))
421 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
422 errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
424 d
->wrds
[i
].entries
->idsubst
+ 1),
425 errhint("Use \"?\" to represent a stop word within a sample phrase.")));
430 TSLexeme
*remptr
= ptr
+ 1;
432 int curvar
= ptr
->nvariant
;
434 /* compute n words in one variant */
435 while (remptr
->lexeme
)
437 if (remptr
->nvariant
!= (remptr
- 1)->nvariant
)
444 while (remptr
->lexeme
&& remptr
->nvariant
== curvar
)
446 newwrds
= addCompiledLexeme(newwrds
, &nnw
, &tnm
, remptr
, d
->wrds
[i
].entries
, tnvar
);
455 pfree(d
->wrds
[i
].lexeme
);
456 pfree(d
->wrds
[i
].entries
);
467 qsort(d
->wrds
, d
->nwrds
, sizeof(TheLexeme
), cmpTheLexeme
);
471 ptrwrds
= d
->wrds
+ 1;
472 while (ptrwrds
- d
->wrds
< d
->nwrds
)
474 if (cmpLexeme(ptrwrds
, newwrds
) == 0)
476 if (cmpLexemeInfo(ptrwrds
->entries
, newwrds
->entries
))
478 ptrwrds
->entries
->nextentry
= newwrds
->entries
;
479 newwrds
->entries
= ptrwrds
->entries
;
482 pfree(ptrwrds
->entries
);
485 pfree(ptrwrds
->lexeme
);
496 d
->nwrds
= newwrds
- d
->wrds
+ 1;
497 d
->wrds
= (TheLexeme
*) repalloc(d
->wrds
, sizeof(TheLexeme
) * d
->nwrds
);
502 compileTheSubstitute(DictThesaurus
*d
)
506 for (i
= 0; i
< d
->nsubst
; i
++)
508 TSLexeme
*rem
= d
->subst
[i
].res
,
513 outptr
= d
->subst
[i
].res
= (TSLexeme
*) palloc(sizeof(TSLexeme
) * n
);
514 outptr
->lexeme
= NULL
;
517 while (inptr
&& inptr
->lexeme
)
522 if (inptr
->flags
& DT_USEASIS
)
523 { /* do not lexize */
526 tmplex
[1].lexeme
= NULL
;
531 lexized
= (TSLexeme
*) DatumGetPointer(FunctionCall4(&(d
->subdict
->lexize
),
532 PointerGetDatum(d
->subdict
->dictData
),
533 PointerGetDatum(inptr
->lexeme
),
534 Int32GetDatum(strlen(inptr
->lexeme
)),
535 PointerGetDatum(NULL
)));
538 if (lexized
&& lexized
->lexeme
)
540 int toset
= (lexized
->lexeme
&& outptr
!= d
->subst
[i
].res
) ? (outptr
- d
->subst
[i
].res
) : -1;
542 while (lexized
->lexeme
)
544 if (outptr
- d
->subst
[i
].res
+ 1 >= n
)
546 int diff
= outptr
- d
->subst
[i
].res
;
549 d
->subst
[i
].res
= (TSLexeme
*) repalloc(d
->subst
[i
].res
, sizeof(TSLexeme
) * n
);
550 outptr
= d
->subst
[i
].res
+ diff
;
554 outptr
->lexeme
= pstrdup(lexized
->lexeme
);
561 d
->subst
[i
].res
[toset
].flags
|= TSL_ADDPOS
;
566 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
567 errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
568 inptr
->lexeme
, i
+ 1)));
573 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
574 errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
575 inptr
->lexeme
, i
+ 1)));
579 pfree(inptr
->lexeme
);
583 if (outptr
== d
->subst
[i
].res
)
585 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
586 errmsg("thesaurus substitute phrase is empty (rule %d)",
589 d
->subst
[i
].reslen
= outptr
- d
->subst
[i
].res
;
596 thesaurus_init(PG_FUNCTION_ARGS
)
598 List
*dictoptions
= (List
*) PG_GETARG_POINTER(0);
600 char *subdictname
= NULL
;
601 bool fileloaded
= false;
605 d
= (DictThesaurus
*) palloc0(sizeof(DictThesaurus
));
607 foreach(l
, dictoptions
)
609 DefElem
*defel
= (DefElem
*) lfirst(l
);
611 if (strcmp(defel
->defname
, "dictfile") == 0)
615 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
616 errmsg("multiple DictFile parameters")));
617 thesaurusRead(defGetString(defel
), d
);
620 else if (strcmp(defel
->defname
, "dictionary") == 0)
624 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
625 errmsg("multiple Dictionary parameters")));
626 subdictname
= pstrdup(defGetString(defel
));
631 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
632 errmsg("unrecognized Thesaurus parameter: \"%s\"",
639 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
640 errmsg("missing DictFile parameter")));
643 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
644 errmsg("missing Dictionary parameter")));
646 namelist
= stringToQualifiedNameList(subdictname
, NULL
);
647 d
->subdictOid
= get_ts_dict_oid(namelist
, false);
648 d
->subdict
= lookup_ts_dictionary_cache(d
->subdictOid
);
651 compileTheSubstitute(d
);
653 PG_RETURN_POINTER(d
);
657 findTheLexeme(DictThesaurus
*d
, char *lexeme
)
668 res
= bsearch(&key
, d
->wrds
, d
->nwrds
, sizeof(TheLexeme
), cmpLexemeQ
);
676 matchIdSubst(LexemeInfo
*stored
, uint32 idsubst
)
684 for (; stored
; stored
= stored
->nextvariant
)
685 if (stored
->idsubst
== idsubst
)
696 findVariant(LexemeInfo
*in
, LexemeInfo
*stored
, uint16 curpos
, LexemeInfo
**newin
, int newn
)
701 LexemeInfo
*ptr
= newin
[0];
703 for (i
= 0; i
< newn
; i
++)
705 while (newin
[i
] && newin
[i
]->idsubst
< ptr
->idsubst
)
706 newin
[i
] = newin
[i
]->nextentry
;
708 if (newin
[i
] == NULL
)
711 if (newin
[i
]->idsubst
> ptr
->idsubst
)
718 while (newin
[i
]->idsubst
== ptr
->idsubst
)
720 if (newin
[i
]->posinsubst
== curpos
&& newin
[i
]->tnvariant
== newn
)
726 newin
[i
] = newin
[i
]->nextentry
;
727 if (newin
[i
] == NULL
)
731 if (newin
[i
]->idsubst
!= ptr
->idsubst
)
739 if (i
== newn
&& matchIdSubst(stored
, ptr
->idsubst
) && (in
== NULL
|| !matchIdSubst(in
, ptr
->idsubst
)))
742 ptr
->nextvariant
= in
;
747 for (i
= 0; i
< newn
; i
++)
748 newin
[i
] = newin
[i
]->nextentry
;
753 copyTSLexeme(TheSubstitute
*ts
)
758 res
= (TSLexeme
*) palloc(sizeof(TSLexeme
) * (ts
->reslen
+ 1));
759 for (i
= 0; i
< ts
->reslen
; i
++)
762 res
[i
].lexeme
= pstrdup(ts
->res
[i
].lexeme
);
765 res
[ts
->reslen
].lexeme
= NULL
;
771 checkMatch(DictThesaurus
*d
, LexemeInfo
*info
, uint16 curpos
, bool *moreres
)
776 Assert(info
->idsubst
< d
->nsubst
);
777 if (info
->nextvariant
)
779 if (d
->subst
[info
->idsubst
].lastlexeme
== curpos
)
780 return copyTSLexeme(d
->subst
+ info
->idsubst
);
781 info
= info
->nextvariant
;
788 thesaurus_lexize(PG_FUNCTION_ARGS
)
790 DictThesaurus
*d
= (DictThesaurus
*) PG_GETARG_POINTER(0);
791 DictSubState
*dstate
= (DictSubState
*) PG_GETARG_POINTER(3);
792 TSLexeme
*res
= NULL
;
796 bool moreres
= false;
798 if (PG_NARGS() != 4 || dstate
== NULL
)
799 elog(ERROR
, "forbidden call of thesaurus or nested call");
802 PG_RETURN_POINTER(NULL
);
803 stored
= (LexemeInfo
*) dstate
->private_state
;
806 curpos
= stored
->posinsubst
+ 1;
808 if (!d
->subdict
->isvalid
)
809 d
->subdict
= lookup_ts_dictionary_cache(d
->subdictOid
);
811 res
= (TSLexeme
*) DatumGetPointer(FunctionCall4(&(d
->subdict
->lexize
),
812 PointerGetDatum(d
->subdict
->dictData
),
815 PointerGetDatum(NULL
)));
817 if (res
&& res
->lexeme
)
824 uint16 nv
= ptr
->nvariant
;
830 while (ptr
->lexeme
&& nv
== ptr
->nvariant
)
836 infos
= (LexemeInfo
**) palloc(sizeof(LexemeInfo
*) * nlex
);
837 for (i
= 0; i
< nlex
; i
++)
838 if ((infos
[i
] = findTheLexeme(d
, basevar
[i
].lexeme
)) == NULL
)
843 /* no chance to find */
848 info
= findVariant(info
, stored
, curpos
, infos
, nlex
);
853 LexemeInfo
*infos
= findTheLexeme(d
, NULL
);
855 info
= findVariant(NULL
, stored
, curpos
, &infos
, 1);
859 info
= NULL
; /* word isn't recognized */
862 dstate
->private_state
= info
;
866 dstate
->getnext
= false;
867 PG_RETURN_POINTER(NULL
);
870 if ((res
= checkMatch(d
, info
, curpos
, &moreres
)) != NULL
)
872 dstate
->getnext
= moreres
;
873 PG_RETURN_POINTER(res
);
876 dstate
->getnext
= true;
878 PG_RETURN_POINTER(NULL
);