Fix oversight in previous error-reporting patch; mustn't pfree path string
[PostgreSQL.git] / src / backend / tsearch / ts_parse.c
blob39be8ce187190e733fe06cd2e3b9b326ff80082f
/*-------------------------------------------------------------------------
 *
 * ts_parse.c
 *		main parse functions for tsearch
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */
15 #include "postgres.h"
17 #include "tsearch/ts_cache.h"
18 #include "tsearch/ts_public.h"
19 #include "tsearch/ts_utils.h"
21 #define IGNORE_LONGLEXEME 1
/*
 * Lexize subsystem
 */
/*
 * One token produced by the parser, kept as a node of a singly linked list.
 */
typedef struct ParsedLex
{
	int			type;			/* token type; indexes the configuration map */
	char	   *lemm;			/* token text (not copied by this module) */
	int			lenlemm;		/* length of lemm */
	bool		resfollow;
	struct ParsedLex *next;		/* next token in the list, or NULL */
} ParsedLex;
36 typedef struct ListParsedLex
38 ParsedLex *head;
39 ParsedLex *tail;
40 } ListParsedLex;
42 typedef struct
44 TSConfigCacheEntry *cfg;
45 Oid curDictId;
46 int posDict;
47 DictSubState dictState;
48 ParsedLex *curSub;
49 ListParsedLex towork; /* current list to work */
50 ListParsedLex waste; /* list of lexemes that already lexized */
53 * fields to store last variant to lexize (basically, thesaurus or similar
54 * to, which wants several lexemes
57 ParsedLex *lastRes;
58 TSLexeme *tmpRes;
59 } LexizeData;
61 static void
62 LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
64 ld->cfg = cfg;
65 ld->curDictId = InvalidOid;
66 ld->posDict = 0;
67 ld->towork.head = ld->towork.tail = ld->curSub = NULL;
68 ld->waste.head = ld->waste.tail = NULL;
69 ld->lastRes = NULL;
70 ld->tmpRes = NULL;
73 static void
74 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
76 if (list->tail)
78 list->tail->next = newpl;
79 list->tail = newpl;
81 else
82 list->head = list->tail = newpl;
83 newpl->next = NULL;
86 static ParsedLex *
87 LPLRemoveHead(ListParsedLex *list)
89 ParsedLex *res = list->head;
91 if (list->head)
92 list->head = list->head->next;
94 if (list->head == NULL)
95 list->tail = NULL;
97 return res;
100 static void
101 LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
103 ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
105 newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
106 newpl->type = type;
107 newpl->lemm = lemm;
108 newpl->lenlemm = lenlemm;
109 LPLAddTail(&ld->towork, newpl);
110 ld->curSub = ld->towork.tail;
113 static void
114 RemoveHead(LexizeData *ld)
116 LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
118 ld->posDict = 0;
121 static void
122 setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
124 if (correspondLexem)
126 *correspondLexem = ld->waste.head;
128 else
130 ParsedLex *tmp,
131 *ptr = ld->waste.head;
133 while (ptr)
135 tmp = ptr->next;
136 pfree(ptr);
137 ptr = tmp;
140 ld->waste.head = ld->waste.tail = NULL;
143 static void
144 moveToWaste(LexizeData *ld, ParsedLex *stop)
146 bool go = true;
148 while (ld->towork.head && go)
150 if (ld->towork.head == stop)
152 ld->curSub = stop->next;
153 go = false;
155 RemoveHead(ld);
159 static void
160 setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
162 if (ld->tmpRes)
164 TSLexeme *ptr;
166 for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
167 pfree(ptr->lexeme);
168 pfree(ld->tmpRes);
170 ld->tmpRes = res;
171 ld->lastRes = lex;
174 static TSLexeme *
175 LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
177 int i;
178 ListDictionary *map;
179 TSDictionaryCacheEntry *dict;
180 TSLexeme *res;
182 if (ld->curDictId == InvalidOid)
185 * usial mode: dictionary wants only one word, but we should keep in
186 * mind that we should go through all stack
189 while (ld->towork.head)
191 ParsedLex *curVal = ld->towork.head;
193 map = ld->cfg->map + curVal->type;
195 if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
197 /* skip this type of lexeme */
198 RemoveHead(ld);
199 continue;
202 for (i = ld->posDict; i < map->len; i++)
204 dict = lookup_ts_dictionary_cache(map->dictIds[i]);
206 ld->dictState.isend = ld->dictState.getnext = false;
207 ld->dictState.private = NULL;
208 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
209 &(dict->lexize),
210 PointerGetDatum(dict->dictData),
211 PointerGetDatum(curVal->lemm),
212 Int32GetDatum(curVal->lenlemm),
213 PointerGetDatum(&ld->dictState)
216 if (ld->dictState.getnext)
219 * dictionary wants next word, so setup and store current
220 * position and go to multiword mode
223 ld->curDictId = DatumGetObjectId(map->dictIds[i]);
224 ld->posDict = i + 1;
225 ld->curSub = curVal->next;
226 if (res)
227 setNewTmpRes(ld, curVal, res);
228 return LexizeExec(ld, correspondLexem);
231 if (!res) /* dictionary doesn't know this lexeme */
232 continue;
234 RemoveHead(ld);
235 setCorrLex(ld, correspondLexem);
236 return res;
239 RemoveHead(ld);
242 else
243 { /* curDictId is valid */
244 dict = lookup_ts_dictionary_cache(ld->curDictId);
247 * Dictionary ld->curDictId asks us about following words
250 while (ld->curSub)
252 ParsedLex *curVal = ld->curSub;
254 map = ld->cfg->map + curVal->type;
256 if (curVal->type != 0)
258 bool dictExists = false;
260 if (curVal->type >= ld->cfg->lenmap || map->len == 0)
262 /* skip this type of lexeme */
263 ld->curSub = curVal->next;
264 continue;
268 * We should be sure that current type of lexeme is recognized
269 * by our dictinonary: we just check is it exist in list of
270 * dictionaries ?
272 for (i = 0; i < map->len && !dictExists; i++)
273 if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
274 dictExists = true;
276 if (!dictExists)
279 * Dictionary can't work with current tpe of lexeme,
280 * return to basic mode and redo all stored lexemes
282 ld->curDictId = InvalidOid;
283 return LexizeExec(ld, correspondLexem);
287 ld->dictState.isend = (curVal->type == 0) ? true : false;
288 ld->dictState.getnext = false;
290 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
291 &(dict->lexize),
292 PointerGetDatum(dict->dictData),
293 PointerGetDatum(curVal->lemm),
294 Int32GetDatum(curVal->lenlemm),
295 PointerGetDatum(&ld->dictState)
298 if (ld->dictState.getnext)
300 /* Dictionary wants one more */
301 ld->curSub = curVal->next;
302 if (res)
303 setNewTmpRes(ld, curVal, res);
304 continue;
307 if (res || ld->tmpRes)
310 * Dictionary normalizes lexemes, so we remove from stack all
311 * used lexemes, return to basic mode and redo end of stack
312 * (if it exists)
314 if (res)
316 moveToWaste(ld, ld->curSub);
318 else
320 res = ld->tmpRes;
321 moveToWaste(ld, ld->lastRes);
324 /* reset to initial state */
325 ld->curDictId = InvalidOid;
326 ld->posDict = 0;
327 ld->lastRes = NULL;
328 ld->tmpRes = NULL;
329 setCorrLex(ld, correspondLexem);
330 return res;
334 * Dict don't want next lexem and didn't recognize anything, redo
335 * from ld->towork.head
337 ld->curDictId = InvalidOid;
338 return LexizeExec(ld, correspondLexem);
342 setCorrLex(ld, correspondLexem);
343 return NULL;
347 * Parse string and lexize words.
349 * prs will be filled in.
351 void
352 parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
354 int type,
355 lenlemm;
356 char *lemm = NULL;
357 LexizeData ldata;
358 TSLexeme *norms;
359 TSConfigCacheEntry *cfg;
360 TSParserCacheEntry *prsobj;
361 void *prsdata;
363 cfg = lookup_ts_config_cache(cfgId);
364 prsobj = lookup_ts_parser_cache(cfg->prsId);
366 prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
367 PointerGetDatum(buf),
368 Int32GetDatum(buflen)));
370 LexizeInit(&ldata, cfg);
374 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
375 PointerGetDatum(prsdata),
376 PointerGetDatum(&lemm),
377 PointerGetDatum(&lenlemm)));
379 if (type > 0 && lenlemm >= MAXSTRLEN)
381 #ifdef IGNORE_LONGLEXEME
382 ereport(NOTICE,
383 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
384 errmsg("word is too long to be indexed"),
385 errdetail("Words longer than %d characters are ignored.",
386 MAXSTRLEN)));
387 continue;
388 #else
389 ereport(ERROR,
390 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
391 errmsg("word is too long to be indexed"),
392 errdetail("Words longer than %d characters are ignored.",
393 MAXSTRLEN)));
394 #endif
397 LexizeAddLemm(&ldata, type, lemm, lenlemm);
399 while ((norms = LexizeExec(&ldata, NULL)) != NULL)
401 TSLexeme *ptr = norms;
403 prs->pos++; /* set pos */
405 while (ptr->lexeme)
407 if (prs->curwords == prs->lenwords)
409 prs->lenwords *= 2;
410 prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
413 if (ptr->flags & TSL_ADDPOS)
414 prs->pos++;
415 prs->words[prs->curwords].len = strlen(ptr->lexeme);
416 prs->words[prs->curwords].word = ptr->lexeme;
417 prs->words[prs->curwords].nvariant = ptr->nvariant;
418 prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
419 prs->words[prs->curwords].alen = 0;
420 prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
421 ptr++;
422 prs->curwords++;
424 pfree(norms);
426 } while (type > 0);
428 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
/*
 * Headline framework
 */
434 static void
435 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
437 while (prs->curwords >= prs->lenwords)
439 prs->lenwords *= 2;
440 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
442 memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
443 prs->words[prs->curwords].type = (uint8) type;
444 prs->words[prs->curwords].len = buflen;
445 prs->words[prs->curwords].word = palloc(buflen);
446 memcpy(prs->words[prs->curwords].word, buf, buflen);
447 prs->curwords++;
450 static void
451 hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
453 int i;
454 QueryItem *item = GETQUERY(query);
455 HeadlineWordEntry *word;
457 while (prs->curwords + query->size >= prs->lenwords)
459 prs->lenwords *= 2;
460 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
463 word = &(prs->words[prs->curwords - 1]);
464 for (i = 0; i < query->size; i++)
466 if (item->type == QI_VAL &&
467 tsCompareString( GETOPERAND(query) + item->operand.distance, item->operand.length,
468 buf, buflen, item->operand.prefix ) == 0 )
470 if (word->item)
472 memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
473 prs->words[prs->curwords].item = &item->operand;
474 prs->words[prs->curwords].repeated = 1;
475 prs->curwords++;
477 else
478 word->item = &item->operand;
480 item++;
484 static void
485 addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
487 ParsedLex *tmplexs;
488 TSLexeme *ptr;
490 while (lexs)
493 if (lexs->type > 0)
494 hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
496 ptr = norms;
497 while (ptr && ptr->lexeme)
499 hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
500 ptr++;
503 tmplexs = lexs->next;
504 pfree(lexs);
505 lexs = tmplexs;
508 if (norms)
510 ptr = norms;
511 while (ptr->lexeme)
513 pfree(ptr->lexeme);
514 ptr++;
516 pfree(norms);
520 void
521 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
523 int type,
524 lenlemm;
525 char *lemm = NULL;
526 LexizeData ldata;
527 TSLexeme *norms;
528 ParsedLex *lexs;
529 TSConfigCacheEntry *cfg;
530 TSParserCacheEntry *prsobj;
531 void *prsdata;
533 cfg = lookup_ts_config_cache(cfgId);
534 prsobj = lookup_ts_parser_cache(cfg->prsId);
536 prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
537 PointerGetDatum(buf),
538 Int32GetDatum(buflen)));
540 LexizeInit(&ldata, cfg);
544 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
545 PointerGetDatum(prsdata),
546 PointerGetDatum(&lemm),
547 PointerGetDatum(&lenlemm)));
549 if (type > 0 && lenlemm >= MAXSTRLEN)
551 #ifdef IGNORE_LONGLEXEME
552 ereport(NOTICE,
553 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
554 errmsg("word is too long to be indexed"),
555 errdetail("Words longer than %d characters are ignored.",
556 MAXSTRLEN)));
557 continue;
558 #else
559 ereport(ERROR,
560 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
561 errmsg("word is too long to be indexed"),
562 errdetail("Words longer than %d characters are ignored.",
563 MAXSTRLEN)));
564 #endif
567 LexizeAddLemm(&ldata, type, lemm, lenlemm);
571 if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
572 addHLParsedLex(prs, query, lexs, norms);
573 else
574 addHLParsedLex(prs, query, lexs, NULL);
575 } while (norms);
577 } while (type > 0);
579 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
582 text *
583 generateHeadline(HeadlineParsedText *prs)
585 text *out;
586 char *ptr;
587 int len = 128;
588 int numfragments = 0;
589 int2 infrag = 0;
591 HeadlineWordEntry *wrd = prs->words;
593 out = (text *) palloc(len);
594 ptr = ((char *) out) + VARHDRSZ;
596 while (wrd - prs->words < prs->curwords)
598 while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
600 int dist = ptr - ((char *) out);
602 len *= 2;
603 out = (text *) repalloc(out, len);
604 ptr = ((char *) out) + dist;
607 if (wrd->in && !wrd->repeated)
609 if (!infrag)
612 /* start of a new fragment */
613 infrag = 1;
614 numfragments ++;
615 /* add a fragment delimitor if this is after the first one */
616 if (numfragments > 1)
618 memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
619 ptr += prs->fragdelimlen;
623 if (wrd->replace)
625 *ptr = ' ';
626 ptr++;
628 else
630 if (wrd->selected)
632 memcpy(ptr, prs->startsel, prs->startsellen);
633 ptr += prs->startsellen;
635 memcpy(ptr, wrd->word, wrd->len);
636 ptr += wrd->len;
637 if (wrd->selected)
639 memcpy(ptr, prs->stopsel, prs->stopsellen);
640 ptr += prs->stopsellen;
644 else if (!wrd->repeated)
646 if (infrag)
647 infrag = 0;
648 pfree(wrd->word);
651 wrd++;
654 SET_VARSIZE(out, ptr - ((char *) out));
655 return out;