Fix oversight in previous error-reporting patch; mustn't pfree path string
[PostgreSQL.git] / src / backend / tsearch / dict_thesaurus.c
blob9d1f105985ee3bfce500df00127966c19e690f10
1 /*-------------------------------------------------------------------------
3 * dict_thesaurus.c
4 * Thesaurus dictionary: phrase to phrase substitution
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
9 * IDENTIFICATION
10 * $PostgreSQL$
12 *-------------------------------------------------------------------------
14 #include "postgres.h"
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
/*
 * Temporarily, one bit of TSLexeme.flags is borrowed for internal use while
 * the thesaurus file is being loaded: addWrd() sets it on a substitute word
 * that must be stored as written, and compileTheSubstitute() tests it to
 * skip normalization of that word by the subdictionary.
 */
#define DT_USEASIS		0x1000
30 typedef struct LexemeInfo
32 uint16 idsubst; /* entry's number in DictThesaurus->subst */
33 uint16 posinsubst; /* pos info in entry */
34 uint16 tnvariant; /* total num lexemes in one variant */
35 struct LexemeInfo *nextentry;
36 struct LexemeInfo *nextvariant;
37 } LexemeInfo;
39 typedef struct
41 char *lexeme;
42 LexemeInfo *entries;
43 } TheLexeme;
45 typedef struct
47 uint16 lastlexeme; /* number lexemes to substitute */
48 uint16 reslen;
49 TSLexeme *res; /* prepared substituted result */
50 } TheSubstitute;
52 typedef struct
54 /* subdictionary to normalize lexemes */
55 Oid subdictOid;
56 TSDictionaryCacheEntry *subdict;
58 /* Array to search lexeme by exact match */
59 TheLexeme *wrds;
60 int nwrds;
61 int ntwrds;
64 * Storage of substituted result, n-th element is for n-th expression
66 TheSubstitute *subst;
67 int nsubst;
68 } DictThesaurus;
71 static void
72 newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
74 TheLexeme *ptr;
76 if (d->nwrds >= d->ntwrds)
78 if (d->ntwrds == 0)
80 d->ntwrds = 16;
81 d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
83 else
85 d->ntwrds *= 2;
86 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
90 ptr = d->wrds + d->nwrds;
91 d->nwrds++;
93 ptr->lexeme = palloc(e - b + 1);
95 memcpy(ptr->lexeme, b, e - b);
96 ptr->lexeme[e - b] = '\0';
98 ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
100 ptr->entries->nextentry = NULL;
101 ptr->entries->idsubst = idsubst;
102 ptr->entries->posinsubst = posinsubst;
105 static void
106 addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
108 static int nres = 0;
109 static int ntres = 0;
110 TheSubstitute *ptr;
112 if (nwrd == 0)
114 nres = ntres = 0;
116 if (idsubst >= d->nsubst)
118 if (d->nsubst == 0)
120 d->nsubst = 16;
121 d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
123 else
125 d->nsubst *= 2;
126 d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
131 ptr = d->subst + idsubst;
133 ptr->lastlexeme = posinsubst - 1;
135 if (nres + 1 >= ntres)
137 if (ntres == 0)
139 ntres = 2;
140 ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
142 else
144 ntres *= 2;
145 ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
150 ptr->res[nres].lexeme = palloc(e - b + 1);
151 memcpy(ptr->res[nres].lexeme, b, e - b);
152 ptr->res[nres].lexeme[e - b] = '\0';
154 ptr->res[nres].nvariant = nwrd;
155 if (useasis)
156 ptr->res[nres].flags = DT_USEASIS;
157 else
158 ptr->res[nres].flags = 0;
160 ptr->res[++nres].lexeme = NULL;
/*
 * States of the line parser in thesaurusRead().
 */
enum
{
	TR_WAITLEX = 1,				/* between sample words, before the ':' */
	TR_INLEX = 2,				/* inside a sample word */
	TR_WAITSUBS = 3,			/* between substitute words, after the ':' */
	TR_INSUBS = 4				/* inside a substitute word */
};
168 static void
169 thesaurusRead(char *filename, DictThesaurus *d)
171 tsearch_readline_state trst;
172 uint16 idsubst = 0;
173 bool useasis = false;
174 char *line;
176 filename = get_tsearch_config_filename(filename, "ths");
177 if (!tsearch_readline_begin(&trst, filename))
178 ereport(ERROR,
179 (errcode(ERRCODE_CONFIG_FILE_ERROR),
180 errmsg("could not open thesaurus file \"%s\": %m",
181 filename)));
183 while ((line = tsearch_readline(&trst)) != NULL)
185 char *ptr;
186 int state = TR_WAITLEX;
187 char *beginwrd = NULL;
188 uint16 posinsubst = 0;
189 uint16 nwrd = 0;
191 ptr = line;
193 /* is it a comment? */
194 while (*ptr && t_isspace(ptr))
195 ptr += pg_mblen(ptr);
197 if (t_iseq(ptr, '#') || *ptr == '\0' ||
198 t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
200 pfree(line);
201 continue;
204 while (*ptr)
206 if (state == TR_WAITLEX)
208 if (t_iseq(ptr, ':'))
210 if (posinsubst == 0)
211 ereport(ERROR,
212 (errcode(ERRCODE_CONFIG_FILE_ERROR),
213 errmsg("unexpected delimiter")));
214 state = TR_WAITSUBS;
216 else if (!t_isspace(ptr))
218 beginwrd = ptr;
219 state = TR_INLEX;
222 else if (state == TR_INLEX)
224 if (t_iseq(ptr, ':'))
226 newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
227 state = TR_WAITSUBS;
229 else if (t_isspace(ptr))
231 newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
232 state = TR_WAITLEX;
235 else if (state == TR_WAITSUBS)
237 if (t_iseq(ptr, '*'))
239 useasis = true;
240 state = TR_INSUBS;
241 beginwrd = ptr + pg_mblen(ptr);
243 else if (t_iseq(ptr, '\\'))
245 useasis = false;
246 state = TR_INSUBS;
247 beginwrd = ptr + pg_mblen(ptr);
249 else if (!t_isspace(ptr))
251 useasis = false;
252 beginwrd = ptr;
253 state = TR_INSUBS;
256 else if (state == TR_INSUBS)
258 if (t_isspace(ptr))
260 if (ptr == beginwrd)
261 ereport(ERROR,
262 (errcode(ERRCODE_CONFIG_FILE_ERROR),
263 errmsg("unexpected end of line or lexeme")));
264 addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
265 state = TR_WAITSUBS;
268 else
269 elog(ERROR, "unrecognized thesaurus state: %d", state);
271 ptr += pg_mblen(ptr);
274 if (state == TR_INSUBS)
276 if (ptr == beginwrd)
277 ereport(ERROR,
278 (errcode(ERRCODE_CONFIG_FILE_ERROR),
279 errmsg("unexpected end of line or lexeme")));
280 addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
283 idsubst++;
285 if (!(nwrd && posinsubst))
286 ereport(ERROR,
287 (errcode(ERRCODE_CONFIG_FILE_ERROR),
288 errmsg("unexpected end of line")));
290 pfree(line);
293 d->nsubst = idsubst;
295 tsearch_readline_end(&trst);
298 static TheLexeme *
299 addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
302 if (*nnw >= *tnm)
304 *tnm *= 2;
305 newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
308 newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
310 if (lexeme && lexeme->lexeme)
312 newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
313 newwrds[*nnw].entries->tnvariant = tnvariant;
315 else
317 newwrds[*nnw].lexeme = NULL;
318 newwrds[*nnw].entries->tnvariant = 1;
321 newwrds[*nnw].entries->idsubst = src->idsubst;
322 newwrds[*nnw].entries->posinsubst = src->posinsubst;
324 newwrds[*nnw].entries->nextentry = NULL;
326 (*nnw)++;
327 return newwrds;
330 static int
331 cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
333 if (a == NULL || b == NULL)
334 return 0;
336 if (a->idsubst == b->idsubst)
338 if (a->posinsubst == b->posinsubst)
340 if (a->tnvariant == b->tnvariant)
341 return 0;
343 return (a->tnvariant > b->tnvariant) ? 1 : -1;
346 return (a->posinsubst > b->posinsubst) ? 1 : -1;
349 return (a->idsubst > b->idsubst) ? 1 : -1;
352 static int
353 cmpLexeme(TheLexeme *a, TheLexeme *b)
355 if (a->lexeme == NULL)
357 if (b->lexeme == NULL)
358 return 0;
359 else
360 return 1;
362 else if (b->lexeme == NULL)
363 return -1;
365 return strcmp(a->lexeme, b->lexeme);
368 static int
369 cmpLexemeQ(const void *a, const void *b)
371 return cmpLexeme((TheLexeme *) a, (TheLexeme *) b);
374 static int
375 cmpTheLexeme(const void *a, const void *b)
377 TheLexeme *la = (TheLexeme *) a;
378 TheLexeme *lb = (TheLexeme *) b;
379 int res;
381 if ((res = cmpLexeme(la, lb)) != 0)
382 return res;
384 return -cmpLexemeInfo(la->entries, lb->entries);
387 static void
388 compileTheLexeme(DictThesaurus *d)
390 int i,
391 nnw = 0,
392 tnm = 16;
393 TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
394 *ptrwrds;
396 for (i = 0; i < d->nwrds; i++)
398 TSLexeme *ptr;
400 if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
401 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
402 else
404 ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
405 PointerGetDatum(d->subdict->dictData),
406 PointerGetDatum(d->wrds[i].lexeme),
407 Int32GetDatum(strlen(d->wrds[i].lexeme)),
408 PointerGetDatum(NULL)));
410 if (!ptr)
411 ereport(ERROR,
412 (errcode(ERRCODE_CONFIG_FILE_ERROR),
413 errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
414 d->wrds[i].lexeme,
415 d->wrds[i].entries->idsubst + 1)));
416 else if (!(ptr->lexeme))
417 ereport(ERROR,
418 (errcode(ERRCODE_CONFIG_FILE_ERROR),
419 errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
420 d->wrds[i].lexeme,
421 d->wrds[i].entries->idsubst + 1),
422 errhint("Use \"?\" to represent a stop word within a sample phrase.")));
423 else
425 while (ptr->lexeme)
427 TSLexeme *remptr = ptr + 1;
428 int tnvar = 1;
429 int curvar = ptr->nvariant;
431 /* compute n words in one variant */
432 while (remptr->lexeme)
434 if (remptr->nvariant != (remptr - 1)->nvariant)
435 break;
436 tnvar++;
437 remptr++;
440 remptr = ptr;
441 while (remptr->lexeme && remptr->nvariant == curvar)
443 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
444 remptr++;
447 ptr = remptr;
452 pfree(d->wrds[i].lexeme);
453 pfree(d->wrds[i].entries);
456 pfree(d->wrds);
457 d->wrds = newwrds;
458 d->nwrds = nnw;
459 d->ntwrds = tnm;
461 if (d->nwrds > 1)
463 qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
465 /* uniq */
466 newwrds = d->wrds;
467 ptrwrds = d->wrds + 1;
468 while (ptrwrds - d->wrds < d->nwrds)
470 if (cmpLexeme(ptrwrds, newwrds) == 0)
472 if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
474 ptrwrds->entries->nextentry = newwrds->entries;
475 newwrds->entries = ptrwrds->entries;
477 else
478 pfree(ptrwrds->entries);
480 if (ptrwrds->lexeme)
481 pfree(ptrwrds->lexeme);
483 else
485 newwrds++;
486 *newwrds = *ptrwrds;
489 ptrwrds++;
492 d->nwrds = newwrds - d->wrds + 1;
493 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
497 static void
498 compileTheSubstitute(DictThesaurus *d)
500 int i;
502 for (i = 0; i < d->nsubst; i++)
504 TSLexeme *rem = d->subst[i].res,
505 *outptr,
506 *inptr;
507 int n = 2;
509 outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
510 outptr->lexeme = NULL;
511 inptr = rem;
513 while (inptr && inptr->lexeme)
515 TSLexeme *lexized,
516 tmplex[2];
518 if (inptr->flags & DT_USEASIS)
519 { /* do not lexize */
520 tmplex[0] = *inptr;
521 tmplex[0].flags = 0;
522 tmplex[1].lexeme = NULL;
523 lexized = tmplex;
525 else
527 lexized = (TSLexeme *) DatumGetPointer(
528 FunctionCall4(
529 &(d->subdict->lexize),
530 PointerGetDatum(d->subdict->dictData),
531 PointerGetDatum(inptr->lexeme),
532 Int32GetDatum(strlen(inptr->lexeme)),
533 PointerGetDatum(NULL)
538 if (lexized && lexized->lexeme)
540 int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
542 while (lexized->lexeme)
544 if (outptr - d->subst[i].res + 1 >= n)
546 int diff = outptr - d->subst[i].res;
548 n *= 2;
549 d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
550 outptr = d->subst[i].res + diff;
553 *outptr = *lexized;
554 outptr->lexeme = pstrdup(lexized->lexeme);
556 outptr++;
557 lexized++;
560 if (toset > 0)
561 d->subst[i].res[toset].flags |= TSL_ADDPOS;
563 else if (lexized)
565 ereport(ERROR,
566 (errcode(ERRCODE_CONFIG_FILE_ERROR),
567 errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
568 inptr->lexeme, i + 1)));
570 else
572 ereport(ERROR,
573 (errcode(ERRCODE_CONFIG_FILE_ERROR),
574 errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
575 inptr->lexeme, i + 1)));
578 if (inptr->lexeme)
579 pfree(inptr->lexeme);
580 inptr++;
583 if (outptr == d->subst[i].res)
584 ereport(ERROR,
585 (errcode(ERRCODE_CONFIG_FILE_ERROR),
586 errmsg("thesaurus substitute phrase is empty (rule %d)",
587 i + 1)));
589 d->subst[i].reslen = outptr - d->subst[i].res;
591 pfree(rem);
595 Datum
596 thesaurus_init(PG_FUNCTION_ARGS)
598 List *dictoptions = (List *) PG_GETARG_POINTER(0);
599 DictThesaurus *d;
600 char *subdictname = NULL;
601 bool fileloaded = false;
602 ListCell *l;
604 d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
606 foreach(l, dictoptions)
608 DefElem *defel = (DefElem *) lfirst(l);
610 if (pg_strcasecmp("DictFile", defel->defname) == 0)
612 if (fileloaded)
613 ereport(ERROR,
614 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
615 errmsg("multiple DictFile parameters")));
616 thesaurusRead(defGetString(defel), d);
617 fileloaded = true;
619 else if (pg_strcasecmp("Dictionary", defel->defname) == 0)
621 if (subdictname)
622 ereport(ERROR,
623 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
624 errmsg("multiple Dictionary parameters")));
625 subdictname = pstrdup(defGetString(defel));
627 else
629 ereport(ERROR,
630 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
631 errmsg("unrecognized Thesaurus parameter: \"%s\"",
632 defel->defname)));
636 if (!fileloaded)
637 ereport(ERROR,
638 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
639 errmsg("missing DictFile parameter")));
640 if (!subdictname)
641 ereport(ERROR,
642 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
643 errmsg("missing Dictionary parameter")));
645 d->subdictOid = TSDictionaryGetDictid(stringToQualifiedNameList(subdictname), false);
646 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
648 compileTheLexeme(d);
649 compileTheSubstitute(d);
651 PG_RETURN_POINTER(d);
654 static LexemeInfo *
655 findTheLexeme(DictThesaurus *d, char *lexeme)
657 TheLexeme key,
658 *res;
660 if (d->nwrds == 0)
661 return NULL;
663 key.lexeme = lexeme;
664 key.entries = NULL;
666 res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
668 if (res == NULL)
669 return NULL;
670 return res->entries;
673 static bool
674 matchIdSubst(LexemeInfo *stored, uint16 idsubst)
676 bool res = true;
678 if (stored)
680 res = false;
682 for (; stored; stored = stored->nextvariant)
683 if (stored->idsubst == idsubst)
685 res = true;
686 break;
690 return res;
693 static LexemeInfo *
694 findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
696 for (;;)
698 int i;
699 LexemeInfo *ptr = newin[0];
701 for (i = 0; i < newn; i++)
703 while (newin[i] && newin[i]->idsubst < ptr->idsubst)
704 newin[i] = newin[i]->nextentry;
706 if (newin[i] == NULL)
707 return in;
709 if (newin[i]->idsubst > ptr->idsubst)
711 ptr = newin[i];
712 i = -1;
713 continue;
716 while (newin[i]->idsubst == ptr->idsubst)
718 if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
720 ptr = newin[i];
721 break;
724 newin[i] = newin[i]->nextentry;
725 if (newin[i] == NULL)
726 return in;
729 if (newin[i]->idsubst != ptr->idsubst)
731 ptr = newin[i];
732 i = -1;
733 continue;
737 if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
738 { /* found */
740 ptr->nextvariant = in;
741 in = ptr;
744 /* step forward */
745 for (i = 0; i < newn; i++)
746 newin[i] = newin[i]->nextentry;
749 return NULL;
752 static TSLexeme *
753 copyTSLexeme(TheSubstitute *ts)
755 TSLexeme *res;
756 uint16 i;
758 res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
759 for (i = 0; i < ts->reslen; i++)
761 res[i] = ts->res[i];
762 res[i].lexeme = pstrdup(ts->res[i].lexeme);
765 res[ts->reslen].lexeme = NULL;
767 return res;
770 static TSLexeme *
771 checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
773 *moreres = false;
774 while (info)
776 Assert(info->idsubst < d->nsubst);
777 if (info->nextvariant)
778 *moreres = true;
779 if (d->subst[info->idsubst].lastlexeme == curpos)
780 return copyTSLexeme(d->subst + info->idsubst);
781 info = info->nextvariant;
784 return NULL;
787 Datum
788 thesaurus_lexize(PG_FUNCTION_ARGS)
790 DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
791 DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
792 TSLexeme *res = NULL;
793 LexemeInfo *stored,
794 *info = NULL;
795 uint16 curpos = 0;
796 bool moreres = false;
798 if (PG_NARGS() != 4 || dstate == NULL)
799 elog(ERROR, "forbidden call of thesaurus or nested call");
801 if (dstate->isend)
802 PG_RETURN_POINTER(NULL);
803 stored = (LexemeInfo *) dstate->private;
805 if (stored)
806 curpos = stored->posinsubst + 1;
808 if (!d->subdict->isvalid)
809 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
811 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
812 PointerGetDatum(d->subdict->dictData),
813 PG_GETARG_DATUM(1),
814 PG_GETARG_DATUM(2),
815 PointerGetDatum(NULL)));
817 if (res && res->lexeme)
819 TSLexeme *ptr = res,
820 *basevar;
822 while (ptr->lexeme)
824 uint16 nv = ptr->nvariant;
825 uint16 i,
826 nlex = 0;
827 LexemeInfo **infos;
829 basevar = ptr;
830 while (ptr->lexeme && nv == ptr->nvariant)
832 nlex++;
833 ptr++;
836 infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
837 for (i = 0; i < nlex; i++)
838 if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
839 break;
841 if (i < nlex)
843 /* no chance to find */
844 pfree(infos);
845 continue;
848 info = findVariant(info, stored, curpos, infos, nlex);
851 else if (res)
852 { /* stop-word */
853 LexemeInfo *infos = findTheLexeme(d, NULL);
855 info = findVariant(NULL, stored, curpos, &infos, 1);
857 else
859 info = NULL; /* word isn't recognized */
862 dstate->private = (void *) info;
864 if (!info)
866 dstate->getnext = false;
867 PG_RETURN_POINTER(NULL);
870 if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
872 dstate->getnext = moreres;
873 PG_RETURN_POINTER(res);
876 dstate->getnext = true;
878 PG_RETURN_POINTER(NULL);