1 /*-------------------------------------------------------------------------
4 * Thesaurus dictionary: phrase to phrase substitution
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 *-------------------------------------------------------------------------
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
26 * Temporarily we use TSLexeme.flags for internal use...
28 #define DT_USEASIS 0x1000
30 typedef struct LexemeInfo
32 uint16 idsubst
; /* entry's number in DictThesaurus->subst */
33 uint16 posinsubst
; /* pos info in entry */
34 uint16 tnvariant
; /* total num lexemes in one variant */
35 struct LexemeInfo
*nextentry
;
36 struct LexemeInfo
*nextvariant
;
47 uint16 lastlexeme
; /* number lexemes to substitute */
49 TSLexeme
*res
; /* prepared substituted result */
54 /* subdictionary to normalize lexemes */
56 TSDictionaryCacheEntry
*subdict
;
58 /* Array to search lexeme by exact match */
64 * Storage of substituted result, n-th element is for n-th expression
72 newLexeme(DictThesaurus
*d
, char *b
, char *e
, uint16 idsubst
, uint16 posinsubst
)
76 if (d
->nwrds
>= d
->ntwrds
)
81 d
->wrds
= (TheLexeme
*) palloc(sizeof(TheLexeme
) * d
->ntwrds
);
86 d
->wrds
= (TheLexeme
*) repalloc(d
->wrds
, sizeof(TheLexeme
) * d
->ntwrds
);
90 ptr
= d
->wrds
+ d
->nwrds
;
93 ptr
->lexeme
= palloc(e
- b
+ 1);
95 memcpy(ptr
->lexeme
, b
, e
- b
);
96 ptr
->lexeme
[e
- b
] = '\0';
98 ptr
->entries
= (LexemeInfo
*) palloc(sizeof(LexemeInfo
));
100 ptr
->entries
->nextentry
= NULL
;
101 ptr
->entries
->idsubst
= idsubst
;
102 ptr
->entries
->posinsubst
= posinsubst
;
/*
 * addWrd: append one word to the result array (res) of rule idsubst.
 *
 * The bytes in [b, e) are copied into a palloc'd, NUL-terminated string that
 * becomes res[nres].lexeme; nwrd is stored as its nvariant.  The element
 * after it gets a NULL lexeme, so the array is always terminated.  The
 * lastlexeme field of the rule is set to posinsubst - 1.
 *
 * ntres is a function-level static, so its value persists across calls.
 *
 * NOTE(review): the conditions selecting palloc vs. repalloc, and
 * DT_USEASIS vs. 0 for flags, are not visible in this listing; the flag
 * choice presumably follows the useasis argument - confirm.
 */
106 addWrd(DictThesaurus
*d
, char *b
, char *e
, uint16 idsubst
, uint16 nwrd
, uint16 posinsubst
, bool useasis
)
109 static int ntres
= 0;
/* rule number beyond the current d->subst array: (re)allocate the array */
116 if (idsubst
>= d
->nsubst
)
121 d
->subst
= (TheSubstitute
*) palloc(sizeof(TheSubstitute
) * d
->nsubst
);
126 d
->subst
= (TheSubstitute
*) repalloc(d
->subst
, sizeof(TheSubstitute
) * d
->nsubst
);
131 ptr
= d
->subst
+ idsubst
;
133 ptr
->lastlexeme
= posinsubst
- 1;
/* the "+ 1" keeps one element in reserve for the NULL terminator */
135 if (nres
+ 1 >= ntres
)
140 ptr
->res
= (TSLexeme
*) palloc(sizeof(TSLexeme
) * ntres
);
145 ptr
->res
= (TSLexeme
*) repalloc(ptr
->res
, sizeof(TSLexeme
) * ntres
);
/* private copy of the input range, with room for the terminator */
150 ptr
->res
[nres
].lexeme
= palloc(e
- b
+ 1);
151 memcpy(ptr
->res
[nres
].lexeme
, b
, e
- b
);
152 ptr
->res
[nres
].lexeme
[e
- b
] = '\0';
154 ptr
->res
[nres
].nvariant
= nwrd
;
156 ptr
->res
[nres
].flags
= DT_USEASIS
;
158 ptr
->res
[nres
].flags
= 0;
/* advance nres and terminate the array at the new position */
160 ptr
->res
[++nres
].lexeme
= NULL
;
165 #define TR_WAITSUBS 3
/*
 * thesaurusRead: load a thesaurus file into d.
 *
 * The file name is resolved with get_tsearch_config_filename(filename, "ths")
 * and the file is read line by line through the tsearch_readline API.  Each
 * line is scanned one multibyte character at a time (pg_mblen) by a state
 * machine using the states TR_WAITLEX, TR_INLEX, TR_WAITSUBS and TR_INSUBS.
 * Words completed in TR_INLEX are handed to newLexeme(), words handled in
 * TR_INSUBS are handed to addWrd().  Malformed input is reported with
 * ERRCODE_CONFIG_FILE_ERROR.
 *
 * NOTE(review): state transitions and several declarations are not visible
 * in this listing - confirm details against the full source.
 */
169 thesaurusRead(char *filename
, DictThesaurus
*d
)
171 tsearch_readline_state trst
;
173 bool useasis
= false;
/* the caller passes a base name; the "ths" extension is supplied here */
176 filename
= get_tsearch_config_filename(filename
, "ths");
177 if (!tsearch_readline_begin(&trst
, filename
))
179 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
180 errmsg("could not open thesaurus file \"%s\": %m",
183 while ((line
= tsearch_readline(&trst
)) != NULL
)
/* per-line scanner state: every line starts out in TR_WAITLEX */
186 int state
= TR_WAITLEX
;
187 char *beginwrd
= NULL
;
188 uint16 posinsubst
= 0;
193 /* is it a comment? */
194 while (*ptr
&& t_isspace(ptr
))
195 ptr
+= pg_mblen(ptr
);
/* after leading whitespace: a hash sign, end of string, or a line break */
197 if (t_iseq(ptr
, '#') || *ptr
== '\0' ||
198 t_iseq(ptr
, '\n') || t_iseq(ptr
, '\r'))
/* TR_WAITLEX: a colon met in this state is an error */
206 if (state
== TR_WAITLEX
)
208 if (t_iseq(ptr
, ':'))
212 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
213 errmsg("unexpected delimiter")));
216 else if (!t_isspace(ptr
))
/* TR_INLEX: a colon or whitespace ends the current sample word */
222 else if (state
== TR_INLEX
)
224 if (t_iseq(ptr
, ':'))
226 newLexeme(d
, beginwrd
, ptr
, idsubst
, posinsubst
++);
229 else if (t_isspace(ptr
))
231 newLexeme(d
, beginwrd
, ptr
, idsubst
, posinsubst
++);
/*
 * TR_WAITSUBS: for an asterisk or a backslash the word is taken to begin
 * right after that character
 */
235 else if (state
== TR_WAITSUBS
)
237 if (t_iseq(ptr
, '*'))
241 beginwrd
= ptr
+ pg_mblen(ptr
);
243 else if (t_iseq(ptr
, '\\'))
247 beginwrd
= ptr
+ pg_mblen(ptr
);
249 else if (!t_isspace(ptr
))
/* TR_INSUBS: a finished substitute word is recorded through addWrd() */
256 else if (state
== TR_INSUBS
)
262 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
263 errmsg("unexpected end of line or lexeme")));
264 addWrd(d
, beginwrd
, ptr
, idsubst
, nwrd
++, posinsubst
, useasis
);
/* any other state value is an internal error */
269 elog(ERROR
, "unrecognized thesaurus state: %d", state
);
/* step to the next (possibly multibyte) character */
271 ptr
+= pg_mblen(ptr
);
/* a substitute word still open at this point is passed to addWrd() too */
274 if (state
== TR_INSUBS
)
278 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
279 errmsg("unexpected end of line or lexeme")));
280 addWrd(d
, beginwrd
, ptr
, idsubst
, nwrd
++, posinsubst
, useasis
);
/* a rule needs both counters to be nonzero */
285 if (!(nwrd
&& posinsubst
))
287 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
288 errmsg("unexpected end of line")));
295 tsearch_readline_end(&trst
);
/*
 * addCompiledLexeme: fill the element newwrds[*nnw] from a normalized lexeme.
 *
 * A fresh LexemeInfo is allocated for the element.  When lexeme and
 * lexeme->lexeme are both non-NULL the string is duplicated with pstrdup()
 * and the given tnvariant is stored; otherwise the element gets a NULL
 * lexeme and tnvariant 1.  idsubst and posinsubst are copied from src and
 * nextentry is set to NULL.
 *
 * The array may be moved by repalloc(); callers in this file assign the
 * return value back to their newwrds pointer.
 *
 * NOTE(review): the growth condition and the update of *nnw / *tnm are not
 * visible in this listing - confirm.
 */
299 addCompiledLexeme(TheLexeme
*newwrds
, int *nnw
, int *tnm
, TSLexeme
*lexeme
, LexemeInfo
*src
, uint16 tnvariant
)
305 newwrds
= (TheLexeme
*) repalloc(newwrds
, sizeof(TheLexeme
) * *tnm
);
308 newwrds
[*nnw
].entries
= (LexemeInfo
*) palloc(sizeof(LexemeInfo
));
/* a real lexeme: keep a private copy and the caller-supplied variant size */
310 if (lexeme
&& lexeme
->lexeme
)
312 newwrds
[*nnw
].lexeme
= pstrdup(lexeme
->lexeme
);
313 newwrds
[*nnw
].entries
->tnvariant
= tnvariant
;
/* no lexeme available: store NULL and a variant size of 1 */
317 newwrds
[*nnw
].lexeme
= NULL
;
318 newwrds
[*nnw
].entries
->tnvariant
= 1;
/* rule number and position are inherited from the source entry */
321 newwrds
[*nnw
].entries
->idsubst
= src
->idsubst
;
322 newwrds
[*nnw
].entries
->posinsubst
= src
->posinsubst
;
324 newwrds
[*nnw
].entries
->nextentry
= NULL
;
/*
 * cmpLexemeInfo: three-level comparison of two LexemeInfo entries.
 *
 * Entries are compared by idsubst first, then by posinsubst, then by
 * tnvariant.  At the first level that differs the result is 1 when the
 * value in a is larger and -1 otherwise.
 *
 * NOTE(review): the results for a NULL argument and for entries equal on
 * all three fields are not visible in this listing - presumably 0; confirm.
 */
331 cmpLexemeInfo(LexemeInfo
*a
, LexemeInfo
*b
)
333 if (a
== NULL
|| b
== NULL
)
336 if (a
->idsubst
== b
->idsubst
)
338 if (a
->posinsubst
== b
->posinsubst
)
340 if (a
->tnvariant
== b
->tnvariant
)
/* same rule and position: order by variant size */
343 return (a
->tnvariant
> b
->tnvariant
) ? 1 : -1;
/* same rule: order by position inside the rule */
346 return (a
->posinsubst
> b
->posinsubst
) ? 1 : -1;
/* different rules: order by rule number */
349 return (a
->idsubst
> b
->idsubst
) ? 1 : -1;
/*
 * cmpLexeme: compare two TheLexeme items by their lexeme strings.
 *
 * A NULL lexeme on either side is tested for before strcmp() is reached;
 * two non-NULL lexemes are ordered by strcmp().
 *
 * NOTE(review): the values returned for the NULL cases are not visible in
 * this listing - confirm.
 */
353 cmpLexeme(TheLexeme
*a
, TheLexeme
*b
)
355 if (a
->lexeme
== NULL
)
357 if (b
->lexeme
== NULL
)
362 else if (b
->lexeme
== NULL
)
/* both strings are present: plain byte-wise comparison */
365 return strcmp(a
->lexeme
, b
->lexeme
);
/*
 * cmpLexemeQ: adapter giving cmpLexeme() the (const void *, const void *)
 * signature; both arguments are cast to TheLexeme pointers.  It is the
 * comparator passed to bsearch() in findTheLexeme().
 */
369 cmpLexemeQ(const void *a
, const void *b
)
371 return cmpLexeme((TheLexeme
*) a
, (TheLexeme
*) b
);
/*
 * cmpTheLexeme: comparator with the (const void *, const void *) signature,
 * passed to qsort() by compileTheLexeme().
 *
 * Items are compared with cmpLexeme() first.  The final return negates
 * cmpLexemeInfo() on the two entries pointers, so that tie-break runs in
 * reverse LexemeInfo order.
 *
 * NOTE(review): the statement executed when cmpLexeme() is nonzero is not
 * visible in this listing - presumably it returns that value; confirm.
 */
375 cmpTheLexeme(const void *a
, const void *b
)
377 TheLexeme
*la
= (TheLexeme
*) a
;
378 TheLexeme
*lb
= (TheLexeme
*) b
;
381 if ((res
= cmpLexeme(la
, lb
)) != 0)
/* note the minus sign: the LexemeInfo ordering is inverted here */
384 return -cmpLexemeInfo(la
->entries
, lb
->entries
);
/*
 * compileTheLexeme: normalize the sample words of d through the
 * subdictionary and rebuild d->wrds.
 *
 * For every entry of d->wrds:
 *   - a word equal to "?" is the stop word marker and is added through
 *     addCompiledLexeme() with a NULL lexeme;
 *   - any other word is passed to the lexize function of the subdictionary.
 *     Two error reports exist for that result: the word is not recognized,
 *     or the word is a stop word.  Otherwise the returned lexemes are added
 *     through addCompiledLexeme() together with a per-variant word count.
 *   - the original lexeme string and entries are freed.
 *
 * Afterwards d->wrds is sorted with cmpTheLexeme, items with equal lexemes
 * are folded together (their LexemeInfo entries are linked through
 * nextentry, or freed), and the array is shrunk with repalloc().
 *
 * NOTE(review): many control-flow lines are not visible in this listing;
 * the description follows the visible statements only - confirm details.
 */
388 compileTheLexeme(DictThesaurus
*d
)
393 TheLexeme
*newwrds
= (TheLexeme
*) palloc(sizeof(TheLexeme
) * tnm
),
396 for (i
= 0; i
< d
->nwrds
; i
++)
400 if (strcmp(d
->wrds
[i
].lexeme
, "?") == 0) /* Is stop word marker? */
401 newwrds
= addCompiledLexeme(newwrds
, &nnw
, &tnm
, NULL
, d
->wrds
[i
].entries
, 0);
/* ordinary word: ask the subdictionary to normalize it */
404 ptr
= (TSLexeme
*) DatumGetPointer(FunctionCall4(&(d
->subdict
->lexize
),
405 PointerGetDatum(d
->subdict
->dictData
),
406 PointerGetDatum(d
->wrds
[i
].lexeme
),
407 Int32GetDatum(strlen(d
->wrds
[i
].lexeme
)),
408 PointerGetDatum(NULL
)));
/* the reported rule number is idsubst + 1 */
412 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
413 errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
415 d
->wrds
[i
].entries
->idsubst
+ 1)));
/* a result whose first lexeme is NULL is reported as a stop word */
416 else if (!(ptr
->lexeme
))
418 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
419 errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
421 d
->wrds
[i
].entries
->idsubst
+ 1),
422 errhint("Use \"?\" to represent a stop word within a sample phrase.")));
427 TSLexeme
*remptr
= ptr
+ 1;
429 int curvar
= ptr
->nvariant
;
431 /* compute n words in one variant */
432 while (remptr
->lexeme
)
434 if (remptr
->nvariant
!= (remptr
- 1)->nvariant
)
/* add every lexeme that belongs to the current variant */
441 while (remptr
->lexeme
&& remptr
->nvariant
== curvar
)
443 newwrds
= addCompiledLexeme(newwrds
, &nnw
, &tnm
, remptr
, d
->wrds
[i
].entries
, tnvar
);
/* the uncompiled word is no longer needed */
452 pfree(d
->wrds
[i
].lexeme
);
453 pfree(d
->wrds
[i
].entries
);
463 qsort(d
->wrds
, d
->nwrds
, sizeof(TheLexeme
), cmpTheLexeme
);
/* fold items with equal lexemes: ptrwrds scans, newwrds is the kept item */
467 ptrwrds
= d
->wrds
+ 1;
468 while (ptrwrds
- d
->wrds
< d
->nwrds
)
470 if (cmpLexeme(ptrwrds
, newwrds
) == 0)
/* differing info: push the scanned entry onto the front of the chain */
472 if (cmpLexemeInfo(ptrwrds
->entries
, newwrds
->entries
))
474 ptrwrds
->entries
->nextentry
= newwrds
->entries
;
475 newwrds
->entries
= ptrwrds
->entries
;
478 pfree(ptrwrds
->entries
);
481 pfree(ptrwrds
->lexeme
);
/* newwrds points at the last kept item, hence the "+ 1" */
492 d
->nwrds
= newwrds
- d
->wrds
+ 1;
493 d
->wrds
= (TheLexeme
*) repalloc(d
->wrds
, sizeof(TheLexeme
) * d
->nwrds
);
/*
 * compileTheSubstitute: rebuild the result array of every rule in d->subst.
 *
 * For each rule the old res array is kept aside and a new one is allocated.
 * Every input lexeme is either taken without lexizing (flag DT_USEASIS) or
 * passed to the lexize function of the subdictionary.  Lexemes obtained
 * this way are duplicated with pstrdup() into the new array, which is
 * enlarged with repalloc() when it runs out of room.  Error reports exist
 * for a substitute word that is a stop word, for one the subdictionary does
 * not recognize, and for a rule whose substitute phrase ends up empty.
 * Each input lexeme string is freed, and reslen is set to the number of
 * output lexemes.
 *
 * NOTE(review): many control-flow lines are not visible in this listing;
 * the description follows the visible statements only - confirm details.
 */
498 compileTheSubstitute(DictThesaurus
*d
)
502 for (i
= 0; i
< d
->nsubst
; i
++)
504 TSLexeme
*rem
= d
->subst
[i
].res
,
/* fresh output array; it starts out empty (first lexeme is NULL) */
509 outptr
= d
->subst
[i
].res
= (TSLexeme
*) palloc(sizeof(TSLexeme
) * n
);
510 outptr
->lexeme
= NULL
;
513 while (inptr
&& inptr
->lexeme
)
518 if (inptr
->flags
& DT_USEASIS
)
519 { /* do not lexize */
522 tmplex
[1].lexeme
= NULL
;
527 lexized
= (TSLexeme
*) DatumGetPointer(
529 &(d
->subdict
->lexize
),
530 PointerGetDatum(d
->subdict
->dictData
),
531 PointerGetDatum(inptr
->lexeme
),
532 Int32GetDatum(strlen(inptr
->lexeme
)),
533 PointerGetDatum(NULL
)
538 if (lexized
&& lexized
->lexeme
)
/*
 * toset is the index of the first lexeme written for this input word,
 * or -1 when that word is the first one of the output array
 */
540 int toset
= (lexized
->lexeme
&& outptr
!= d
->subst
[i
].res
) ? (outptr
- d
->subst
[i
].res
) : -1;
542 while (lexized
->lexeme
)
/* out of room: remember the offset, enlarge, then re-derive outptr */
544 if (outptr
- d
->subst
[i
].res
+ 1 >= n
)
546 int diff
= outptr
- d
->subst
[i
].res
;
549 d
->subst
[i
].res
= (TSLexeme
*) repalloc(d
->subst
[i
].res
, sizeof(TSLexeme
) * n
);
550 outptr
= d
->subst
[i
].res
+ diff
;
554 outptr
->lexeme
= pstrdup(lexized
->lexeme
);
561 d
->subst
[i
].res
[toset
].flags
|= TSL_ADDPOS
;
/* the reported rule number is i + 1 */
566 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
567 errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
568 inptr
->lexeme
, i
+ 1)));
573 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
574 errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
575 inptr
->lexeme
, i
+ 1)));
/* the input string has been consumed */
579 pfree(inptr
->lexeme
);
/* outptr never advanced: nothing was produced for this rule */
583 if (outptr
== d
->subst
[i
].res
)
585 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
586 errmsg("thesaurus substitute phrase is empty (rule %d)",
589 d
->subst
[i
].reslen
= outptr
- d
->subst
[i
].res
;
/*
 * thesaurus_init: build a DictThesaurus from a list of dictionary options.
 *
 * Argument 0 is a List of DefElem.  Option names are matched
 * case-insensitively:
 *   DictFile   - the thesaurus file, loaded with thesaurusRead()
 *   Dictionary - the name of the subdictionary
 * Error reports exist for a repeated DictFile or Dictionary option, for an
 * unrecognized option, and for a missing DictFile or Dictionary option; all
 * of them use ERRCODE_INVALID_PARAMETER_VALUE.
 *
 * The subdictionary name is resolved to an OID and to a dictionary cache
 * entry, compileTheSubstitute() is run, and the palloc0'd DictThesaurus is
 * returned as a pointer datum.
 *
 * NOTE(review): line 648 of the original is not visible in this listing
 * (presumably the compileTheLexeme call) - confirm.
 */
596 thesaurus_init(PG_FUNCTION_ARGS
)
598 List
*dictoptions
= (List
*) PG_GETARG_POINTER(0);
600 char *subdictname
= NULL
;
601 bool fileloaded
= false;
/* zero-filled, so counters and pointers in d start out as 0 / NULL */
604 d
= (DictThesaurus
*) palloc0(sizeof(DictThesaurus
));
606 foreach(l
, dictoptions
)
608 DefElem
*defel
= (DefElem
*) lfirst(l
);
610 if (pg_strcasecmp("DictFile", defel
->defname
) == 0)
614 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
615 errmsg("multiple DictFile parameters")));
616 thesaurusRead(defGetString(defel
), d
);
619 else if (pg_strcasecmp("Dictionary", defel
->defname
) == 0)
623 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
624 errmsg("multiple Dictionary parameters")));
625 subdictname
= pstrdup(defGetString(defel
));
630 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
631 errmsg("unrecognized Thesaurus parameter: \"%s\"",
638 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
639 errmsg("missing DictFile parameter")));
642 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
643 errmsg("missing Dictionary parameter")));
/* resolve the subdictionary by (possibly qualified) name */
645 d
->subdictOid
= TSDictionaryGetDictid(stringToQualifiedNameList(subdictname
), false);
646 d
->subdict
= lookup_ts_dictionary_cache(d
->subdictOid
);
649 compileTheSubstitute(d
);
651 PG_RETURN_POINTER(d
);
/*
 * findTheLexeme: look up a lexeme in d->wrds by binary search.
 *
 * The search uses bsearch() with cmpLexemeQ over the d->nwrds elements of
 * d->wrds.  Callers in this file compare the result against NULL and use it
 * as a LexemeInfo pointer; one of them passes a NULL lexeme.
 *
 * NOTE(review): the construction of key and the return statements are not
 * visible in this listing - confirm.
 */
655 findTheLexeme(DictThesaurus
*d
, char *lexeme
)
666 res
= bsearch(&key
, d
->wrds
, d
->nwrds
, sizeof(TheLexeme
), cmpLexemeQ
);
/*
 * matchIdSubst: scan the nextvariant chain that starts at stored, looking
 * for an entry whose idsubst equals the given rule number.  Callers in this
 * file use the result as a truth value.
 *
 * NOTE(review): the return statements are not visible in this listing -
 * confirm.
 */
674 matchIdSubst(LexemeInfo
*stored
, uint16 idsubst
)
682 for (; stored
; stored
= stored
->nextvariant
)
683 if (stored
->idsubst
== idsubst
)
/*
 * findVariant: search the nextentry chains in newin[0 .. newn-1] for rules
 * that every chain takes part in at position curpos.
 *
 * ptr starts at newin[0].  Each chain is advanced past entries with a
 * smaller idsubst than ptr; within the run of entries for the same rule, an
 * entry with posinsubst == curpos and tnvariant == newn is looked for.
 * When all newn chains succeed (i == newn), the rule is present in stored
 * and not yet present in in (both tested with matchIdSubst), ptr is linked
 * in front of in through nextvariant.  The chains are then advanced again.
 * Note that the elements of newin are overwritten while scanning.
 *
 * NOTE(review): the enclosing loop, the branch bodies and the return value
 * are not visible in this listing; the caller assigns the result to its
 * info variable - confirm details.
 */
694 findVariant(LexemeInfo
*in
, LexemeInfo
*stored
, uint16 curpos
, LexemeInfo
**newin
, int newn
)
699 LexemeInfo
*ptr
= newin
[0];
701 for (i
= 0; i
< newn
; i
++)
/* skip entries of rules numbered below the candidate rule */
703 while (newin
[i
] && newin
[i
]->idsubst
< ptr
->idsubst
)
704 newin
[i
] = newin
[i
]->nextentry
;
706 if (newin
[i
] == NULL
)
709 if (newin
[i
]->idsubst
> ptr
->idsubst
)
/* walk the entries that belong to the candidate rule */
716 while (newin
[i
]->idsubst
== ptr
->idsubst
)
718 if (newin
[i
]->posinsubst
== curpos
&& newin
[i
]->tnvariant
== newn
)
724 newin
[i
] = newin
[i
]->nextentry
;
725 if (newin
[i
] == NULL
)
729 if (newin
[i
]->idsubst
!= ptr
->idsubst
)
/* all chains matched, rule is in stored, and it is not a duplicate in in */
737 if (i
== newn
&& matchIdSubst(stored
, ptr
->idsubst
) && (in
== NULL
|| !matchIdSubst(in
, ptr
->idsubst
)))
740 ptr
->nextvariant
= in
;
/* step every chain forward by one entry */
745 for (i
= 0; i
< newn
; i
++)
746 newin
[i
] = newin
[i
]->nextentry
;
/*
 * copyTSLexeme: build a palloc'd array from the result of one rule.
 *
 * The array has ts->reslen + 1 elements.  For each of the first reslen
 * elements the lexeme string is duplicated with pstrdup(); the last element
 * gets a NULL lexeme as terminator.
 *
 * NOTE(review): lines between the visible ones (original lines 760-764) are
 * not shown in this listing, so copying of other fields cannot be confirmed
 * from here.
 */
753 copyTSLexeme(TheSubstitute
*ts
)
/* one extra element for the terminator */
758 res
= (TSLexeme
*) palloc(sizeof(TSLexeme
) * (ts
->reslen
+ 1));
759 for (i
= 0; i
< ts
->reslen
; i
++)
762 res
[i
].lexeme
= pstrdup(ts
->res
[i
].lexeme
);
765 res
[ts
->reslen
].lexeme
= NULL
;
/*
 * checkMatch: look for a rule that is completed at position curpos.
 *
 * Entries are followed through nextvariant starting at info.  For an entry
 * whose rule has lastlexeme == curpos, a copy of the result of that rule
 * (made by copyTSLexeme) is returned.  The rule number of an entry is
 * asserted to be below d->nsubst.
 *
 * NOTE(review): the loop header, the statement guarded by the nextvariant
 * test (presumably setting *moreres) and the final return are not visible
 * in this listing - confirm.
 */
771 checkMatch(DictThesaurus
*d
, LexemeInfo
*info
, uint16 curpos
, bool *moreres
)
776 Assert(info
->idsubst
< d
->nsubst
);
777 if (info
->nextvariant
)
/* this rule ends exactly at the current position */
779 if (d
->subst
[info
->idsubst
].lastlexeme
== curpos
)
780 return copyTSLexeme(d
->subst
+ info
->idsubst
);
781 info
= info
->nextvariant
;
/*
 * thesaurus_lexize: lexize entry point of the thesaurus dictionary.
 *
 * Argument 0 is the DictThesaurus and argument 3 is a DictSubState.  A call
 * with an argument count other than four, or with a NULL DictSubState, is
 * rejected with an error.
 *
 * The state carried over from the previous call is read from
 * dstate->private; curpos is derived from its posinsubst.  The
 * subdictionary cache entry is looked up again when it is no longer valid,
 * then its lexize function is called.  The lexemes it returns are looked up
 * with findTheLexeme() and combined by findVariant(); the resulting
 * LexemeInfo list is written back to dstate->private.  checkMatch() then
 * decides whether a substitution is returned, and dstate->getnext is set on
 * each of the visible exit paths.
 *
 * NOTE(review): many control-flow lines are not visible in this listing;
 * the description follows the visible statements only - confirm details.
 */
788 thesaurus_lexize(PG_FUNCTION_ARGS
)
790 DictThesaurus
*d
= (DictThesaurus
*) PG_GETARG_POINTER(0);
791 DictSubState
*dstate
= (DictSubState
*) PG_GETARG_POINTER(3);
792 TSLexeme
*res
= NULL
;
796 bool moreres
= false;
798 if (PG_NARGS() != 4 || dstate
== NULL
)
799 elog(ERROR
, "forbidden call of thesaurus or nested call");
802 PG_RETURN_POINTER(NULL
);
/* matching state left behind by the previous call */
803 stored
= (LexemeInfo
*) dstate
->private;
806 curpos
= stored
->posinsubst
+ 1;
/* refresh the cache entry of the subdictionary when it was invalidated */
808 if (!d
->subdict
->isvalid
)
809 d
->subdict
= lookup_ts_dictionary_cache(d
->subdictOid
);
811 res
= (TSLexeme
*) DatumGetPointer(FunctionCall4(&(d
->subdict
->lexize
),
812 PointerGetDatum(d
->subdict
->dictData
),
815 PointerGetDatum(NULL
)));
817 if (res
&& res
->lexeme
)
824 uint16 nv
= ptr
->nvariant
;
/* lexemes sharing one nvariant value are handled as a group */
830 while (ptr
->lexeme
&& nv
== ptr
->nvariant
)
/* one LexemeInfo chain per lexeme of the variant */
836 infos
= (LexemeInfo
**) palloc(sizeof(LexemeInfo
*) * nlex
);
837 for (i
= 0; i
< nlex
; i
++)
838 if ((infos
[i
] = findTheLexeme(d
, basevar
[i
].lexeme
)) == NULL
)
843 /* no chance to find */
848 info
= findVariant(info
, stored
, curpos
, infos
, nlex
);
/* lookup with a NULL lexeme, then a single-chain findVariant() call */
853 LexemeInfo
*infos
= findTheLexeme(d
, NULL
);
855 info
= findVariant(NULL
, stored
, curpos
, &infos
, 1);
859 info
= NULL
; /* word isn't recognized */
/* hand the new matching state over to the next call */
862 dstate
->private = (void *) info
;
866 dstate
->getnext
= false;
867 PG_RETURN_POINTER(NULL
);
/* a completed rule: return its substitution */
870 if ((res
= checkMatch(d
, info
, curpos
, &moreres
)) != NULL
)
872 dstate
->getnext
= moreres
;
873 PG_RETURN_POINTER(res
);
876 dstate
->getnext
= true;
878 PG_RETURN_POINTER(NULL
);