1 /*-------------------------------------------------------------------------
4 * main parse functions for tsearch
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 *-------------------------------------------------------------------------
17 #include "tsearch/ts_cache.h"
18 #include "tsearch/ts_public.h"
19 #include "tsearch/ts_utils.h"
21 #define IGNORE_LONGLEXEME 1
27 typedef struct ParsedLex
33 struct ParsedLex
*next
;
36 typedef struct ListParsedLex
44 TSConfigCacheEntry
*cfg
;
47 DictSubState dictState
;
49 ListParsedLex towork
; /* current list to work */
50 ListParsedLex waste
; /* list of lexemes that already lexized */
53 * fields to store the last variant to lexize (basically, for a thesaurus or
54 * a similar dictionary, which wants several lexemes
/*
 * LexizeInit: put the lexize state *ld into its starting condition.
 *
 * NOTE(review): this is a fragment; some statements of the original body
 * are not visible here, so the cfg parameter has no visible use.
 */
62 LexizeInit(LexizeData
*ld
, TSConfigCacheEntry
*cfg
)
/* InvalidOid in curDictId is the value LexizeExec tests for its first branch */
65 ld
->curDictId
= InvalidOid
;
/* one chained assignment: towork list head and tail, and curSub, all NULL */
67 ld
->towork
.head
= ld
->towork
.tail
= ld
->curSub
= NULL
;
/* the waste list starts out empty as well */
68 ld
->waste
.head
= ld
->waste
.tail
= NULL
;
/*
 * LPLAddTail: attach newpl to the list pointed to by list.
 *
 * NOTE(review): the condition that selects between the two visible
 * statements is missing from this fragment. Presumably the first handles a
 * non-empty list and the second an empty one -- confirm against the full
 * source.
 */
74 LPLAddTail(ListParsedLex
*list
, ParsedLex
*newpl
)
/* link newpl behind the current tail element */
78 list
->tail
->next
= newpl
;
/* make newpl both the first and the last element */
82 list
->head
= list
->tail
= newpl
;
/*
 * LPLRemoveHead: unlink the first element of the list.
 *
 * RemoveHead passes the result of this function straight to LPLAddTail as
 * the element to append.
 *
 * NOTE(review): no guard against an empty list is visible before
 * list->head->next is read; the guard may be on a missing line -- confirm.
 */
87 LPLRemoveHead(ListParsedLex
*list
)
/* remember the current head before the list is advanced */
89 ParsedLex
*res
= list
->head
;
/* the second element becomes the new head */
92 list
->head
= list
->head
->next
;
/*
 * The list has just become empty. The statement guarded by this test is not
 * visible (presumably it resets the tail -- confirm).
 */
94 if (list
->head
== NULL
)
/*
 * LexizeAddLemm: allocate a ParsedLex for a token and queue it on
 * ld->towork.
 *
 * Only the store of lenlemm is visible in this fragment; the stores of type
 * and lemm are presumably on lines that are missing here -- confirm.
 */
101 LexizeAddLemm(LexizeData
*ld
, int type
, char *lemm
, int lenlemm
)
103 ParsedLex
*newpl
= (ParsedLex
*) palloc(sizeof(ParsedLex
));
/*
 * NOTE(review): newpl already holds a palloc result from its declaration;
 * this second palloc overwrites the pointer, so the first chunk is never
 * used. It looks like a redundant allocation -- confirm nothing on the
 * missing line in between uses it, then drop one of the two calls.
 */
105 newpl
= (ParsedLex
*) palloc(sizeof(ParsedLex
));
108 newpl
->lenlemm
= lenlemm
;
/* queue the new element on the towork list */
109 LPLAddTail(&ld
->towork
, newpl
);
/* curSub now refers to the current tail of towork */
110 ld
->curSub
= ld
->towork
.tail
;
/*
 * RemoveHead: take the first element off ld->towork (LPLRemoveHead) and
 * append it to ld->waste (LPLAddTail).
 */
114 RemoveHead(LexizeData
*ld
)
116 LPLAddTail(&ld
->waste
, LPLRemoveHead(&ld
->towork
));
/*
 * setCorrLex: hand the contents of the waste list over through
 * correspondLexem and leave the waste list empty.
 *
 * NOTE(review): the tests that choose between the visible statements are
 * missing from this fragment. parsetext calls LexizeExec with a NULL
 * correspondLexem, so a NULL check presumably guards the first store --
 * confirm.
 */
122 setCorrLex(LexizeData
*ld
, ParsedLex
**correspondLexem
)
/* the caller receives the head of the waste list */
126 *correspondLexem
= ld
->waste
.head
;
/*
 * This line is cut; it reads as ptr being set up from the waste head. The
 * code that uses ptr is not visible -- confirm against the full source.
 */
131 *ptr
= ld
->waste
.head
;
/* the waste list is emptied by clearing both of its ends */
140 ld
->waste
.head
= ld
->waste
.tail
= NULL
;
/*
 * moveToWaste: walk ld->towork from its head, with special handling when
 * the element stop is reached.
 *
 * NOTE(review): the declaration of go and the statements that actually move
 * elements are not visible in this fragment. Presumably each visited
 * element is moved to the waste list and go is cleared at stop -- confirm.
 */
144 moveToWaste(LexizeData
*ld
, ParsedLex
*stop
)
/* continue while towork is non-empty and the go flag is still set */
148 while (ld
->towork
.head
&& go
)
/* the head of towork is the stop element */
150 if (ld
->towork
.head
== stop
)
/* curSub is moved to the element that follows stop */
152 ld
->curSub
= stop
->next
;
/*
 * setNewTmpRes: called by LexizeExec with the current lexeme and the result
 * returned by the dictionary.
 *
 * NOTE(review): only the loop header below is visible; the loop body and
 * any use of lex and res are on missing lines.
 */
160 setNewTmpRes(LexizeData
*ld
, ParsedLex
*lex
, TSLexeme
*res
)
/* step through the ld->tmpRes array until an entry with a NULL lexeme */
166 for (ptr
= ld
->tmpRes
; ptr
->lexeme
; ptr
++)
175 LexizeExec(LexizeData
*ld
, ParsedLex
**correspondLexem
)
179 TSDictionaryCacheEntry
*dict
;
182 if (ld
->curDictId
== InvalidOid
)
185 * usual mode: dictionary wants only one word, but we should keep in
186 * mind that we should go through all stack
189 while (ld
->towork
.head
)
191 ParsedLex
*curVal
= ld
->towork
.head
;
193 map
= ld
->cfg
->map
+ curVal
->type
;
195 if (curVal
->type
== 0 || curVal
->type
>= ld
->cfg
->lenmap
|| map
->len
== 0)
197 /* skip this type of lexeme */
202 for (i
= ld
->posDict
; i
< map
->len
; i
++)
204 dict
= lookup_ts_dictionary_cache(map
->dictIds
[i
]);
206 ld
->dictState
.isend
= ld
->dictState
.getnext
= false;
207 ld
->dictState
.private = NULL
;
208 res
= (TSLexeme
*) DatumGetPointer(FunctionCall4(
210 PointerGetDatum(dict
->dictData
),
211 PointerGetDatum(curVal
->lemm
),
212 Int32GetDatum(curVal
->lenlemm
),
213 PointerGetDatum(&ld
->dictState
)
216 if (ld
->dictState
.getnext
)
219 * dictionary wants next word, so setup and store current
220 * position and go to multiword mode
223 ld
->curDictId
= DatumGetObjectId(map
->dictIds
[i
]);
225 ld
->curSub
= curVal
->next
;
227 setNewTmpRes(ld
, curVal
, res
);
228 return LexizeExec(ld
, correspondLexem
);
231 if (!res
) /* dictionary doesn't know this lexeme */
235 setCorrLex(ld
, correspondLexem
);
243 { /* curDictId is valid */
244 dict
= lookup_ts_dictionary_cache(ld
->curDictId
);
247 * Dictionary ld->curDictId asks us about following words
252 ParsedLex
*curVal
= ld
->curSub
;
254 map
= ld
->cfg
->map
+ curVal
->type
;
256 if (curVal
->type
!= 0)
258 bool dictExists
= false;
260 if (curVal
->type
>= ld
->cfg
->lenmap
|| map
->len
== 0)
262 /* skip this type of lexeme */
263 ld
->curSub
= curVal
->next
;
268 * We should be sure that the current type of lexeme is recognized
269 * by our dictionary: we just check whether it exists in the list of
272 for (i
= 0; i
< map
->len
&& !dictExists
; i
++)
273 if (ld
->curDictId
== DatumGetObjectId(map
->dictIds
[i
]))
279 * Dictionary can't work with current type of lexeme,
280 * return to basic mode and redo all stored lexemes
282 ld
->curDictId
= InvalidOid
;
283 return LexizeExec(ld
, correspondLexem
);
287 ld
->dictState
.isend
= (curVal
->type
== 0) ? true : false;
288 ld
->dictState
.getnext
= false;
290 res
= (TSLexeme
*) DatumGetPointer(FunctionCall4(
292 PointerGetDatum(dict
->dictData
),
293 PointerGetDatum(curVal
->lemm
),
294 Int32GetDatum(curVal
->lenlemm
),
295 PointerGetDatum(&ld
->dictState
)
298 if (ld
->dictState
.getnext
)
300 /* Dictionary wants one more */
301 ld
->curSub
= curVal
->next
;
303 setNewTmpRes(ld
, curVal
, res
);
307 if (res
|| ld
->tmpRes
)
310 * Dictionary normalizes lexemes, so we remove from stack all
311 * used lexemes, return to basic mode and redo end of stack
316 moveToWaste(ld
, ld
->curSub
);
321 moveToWaste(ld
, ld
->lastRes
);
324 /* reset to initial state */
325 ld
->curDictId
= InvalidOid
;
329 setCorrLex(ld
, correspondLexem
);
334 * Dict doesn't want the next lexeme and didn't recognize anything, redo
335 * from ld->towork.head
337 ld
->curDictId
= InvalidOid
;
338 return LexizeExec(ld
, correspondLexem
);
342 setCorrLex(ld
, correspondLexem
);
347 * Parse string and lexize words.
349 * prs will be filled in.
352 parsetext(Oid cfgId
, ParsedText
*prs
, char *buf
, int buflen
)
359 TSConfigCacheEntry
*cfg
;
360 TSParserCacheEntry
*prsobj
;
363 cfg
= lookup_ts_config_cache(cfgId
);
364 prsobj
= lookup_ts_parser_cache(cfg
->prsId
);
366 prsdata
= (void *) DatumGetPointer(FunctionCall2(&prsobj
->prsstart
,
367 PointerGetDatum(buf
),
368 Int32GetDatum(buflen
)));
370 LexizeInit(&ldata
, cfg
);
374 type
= DatumGetInt32(FunctionCall3(&(prsobj
->prstoken
),
375 PointerGetDatum(prsdata
),
376 PointerGetDatum(&lemm
),
377 PointerGetDatum(&lenlemm
)));
379 if (type
> 0 && lenlemm
>= MAXSTRLEN
)
381 #ifdef IGNORE_LONGLEXEME
383 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
384 errmsg("word is too long to be indexed"),
385 errdetail("Words longer than %d characters are ignored.",
390 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
391 errmsg("word is too long to be indexed"),
392 errdetail("Words longer than %d characters are ignored.",
397 LexizeAddLemm(&ldata
, type
, lemm
, lenlemm
);
399 while ((norms
= LexizeExec(&ldata
, NULL
)) != NULL
)
401 TSLexeme
*ptr
= norms
;
403 prs
->pos
++; /* set pos */
407 if (prs
->curwords
== prs
->lenwords
)
410 prs
->words
= (ParsedWord
*) repalloc((void *) prs
->words
, prs
->lenwords
* sizeof(ParsedWord
));
413 if (ptr
->flags
& TSL_ADDPOS
)
415 prs
->words
[prs
->curwords
].len
= strlen(ptr
->lexeme
);
416 prs
->words
[prs
->curwords
].word
= ptr
->lexeme
;
417 prs
->words
[prs
->curwords
].nvariant
= ptr
->nvariant
;
418 prs
->words
[prs
->curwords
].flags
= ptr
->flags
& TSL_PREFIX
;
419 prs
->words
[prs
->curwords
].alen
= 0;
420 prs
->words
[prs
->curwords
].pos
.pos
= LIMITPOS(prs
->pos
);
428 FunctionCall1(&(prsobj
->prsend
), PointerGetDatum(prsdata
));
/*
 * hladdword: fill the entry at prs->words[prs->curwords] with a private
 * copy of the buflen bytes at buf and with the token type.
 *
 * NOTE(review): the statement that enlarges prs->lenwords inside the loop
 * and any increment of prs->curwords are not visible in this fragment.
 */
435 hladdword(HeadlineParsedText
*prs
, char *buf
, int buflen
, int type
)
/* enlarge the words array until index curwords fits */
437 while (prs
->curwords
>= prs
->lenwords
)
440 prs
->words
= (HeadlineWordEntry
*) repalloc((void *) prs
->words
, prs
->lenwords
* sizeof(HeadlineWordEntry
));
/* start from an all-zero entry so that fields not set below are zero */
442 memset(&(prs
->words
[prs
->curwords
]), 0, sizeof(HeadlineWordEntry
));
/* the int type is narrowed to uint8 for storage */
443 prs
->words
[prs
->curwords
].type
= (uint8
) type
;
444 prs
->words
[prs
->curwords
].len
= buflen
;
/* exactly buflen bytes are allocated and copied; no terminator is added */
445 prs
->words
[prs
->curwords
].word
= palloc(buflen
);
446 memcpy(prs
->words
[prs
->curwords
].word
, buf
, buflen
);
/*
 * hlfinditem: compare the string buf/buflen with the QI_VAL items of query
 * and record matching operands on entries of prs->words.
 *
 * NOTE(review): this is a fragment. The declaration of i, the advance of
 * item, the growth of prs->lenwords and the test that chooses between the
 * two ways of recording a match are not visible.
 */
451 hlfinditem(HeadlineParsedText
*prs
, TSQuery query
, char *buf
, int buflen
)
/* item starts at the query item array returned by GETQUERY */
454 QueryItem
*item
= GETQUERY(query
);
455 HeadlineWordEntry
*word
;
/* make room for up to query->size additional entries after curwords */
457 while (prs
->curwords
+ query
->size
>= prs
->lenwords
)
460 prs
->words
= (HeadlineWordEntry
*) repalloc((void *) prs
->words
, prs
->lenwords
* sizeof(HeadlineWordEntry
));
/*
 * word is the entry just before curwords. The pointer is taken after the
 * repalloc loop, so it refers to the array as it is now.
 */
463 word
= &(prs
->words
[prs
->curwords
- 1]);
464 for (i
= 0; i
< query
->size
; i
++)
/*
 * A match needs a QI_VAL item whose operand text, located at
 * GETOPERAND(query) + distance with the stored length, makes
 * tsCompareString return 0 against buf; the prefix flag is passed along.
 */
466 if (item
->type
== QI_VAL
&&
467 tsCompareString( GETOPERAND(query
) + item
->operand
.distance
, item
->operand
.length
,
468 buf
, buflen
, item
->operand
.prefix
) == 0 )
/* copy the entry at word into slot curwords and mark the copy repeated */
472 memcpy(&(prs
->words
[prs
->curwords
]), word
, sizeof(HeadlineWordEntry
));
473 prs
->words
[prs
->curwords
].item
= &item
->operand
;
474 prs
->words
[prs
->curwords
].repeated
= 1;
/* record the matching operand on the entry at word itself */
478 word
->item
= &item
->operand
;
/*
 * addHLParsedLex: add a parsed token to the headline text with hladdword
 * and look up its normalized lexemes in the query with hlfinditem.
 *
 * NOTE(review): the declarations of ptr and tmplexs, the advance of ptr and
 * the loop over lexs are not visible in this fragment. hlparsetext also
 * calls this function with norms set to NULL.
 */
485 addHLParsedLex(HeadlineParsedText
*prs
, TSQuery query
, ParsedLex
*lexs
, TSLexeme
*norms
)
/* store the original token text together with its type */
494 hladdword(prs
, lexs
->lemm
, lexs
->lenlemm
, lexs
->type
);
/* ptr may be NULL; otherwise stop at the entry whose lexeme is NULL */
497 while (ptr
&& ptr
->lexeme
)
499 hlfinditem(prs
, query
, ptr
->lexeme
, strlen(ptr
->lexeme
));
/* remember the successor of lexs */
503 tmplexs
= lexs
->next
;
521 hlparsetext(Oid cfgId
, HeadlineParsedText
*prs
, TSQuery query
, char *buf
, int buflen
)
529 TSConfigCacheEntry
*cfg
;
530 TSParserCacheEntry
*prsobj
;
533 cfg
= lookup_ts_config_cache(cfgId
);
534 prsobj
= lookup_ts_parser_cache(cfg
->prsId
);
536 prsdata
= (void *) DatumGetPointer(FunctionCall2(&(prsobj
->prsstart
),
537 PointerGetDatum(buf
),
538 Int32GetDatum(buflen
)));
540 LexizeInit(&ldata
, cfg
);
544 type
= DatumGetInt32(FunctionCall3(&(prsobj
->prstoken
),
545 PointerGetDatum(prsdata
),
546 PointerGetDatum(&lemm
),
547 PointerGetDatum(&lenlemm
)));
549 if (type
> 0 && lenlemm
>= MAXSTRLEN
)
551 #ifdef IGNORE_LONGLEXEME
553 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
554 errmsg("word is too long to be indexed"),
555 errdetail("Words longer than %d characters are ignored.",
560 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
561 errmsg("word is too long to be indexed"),
562 errdetail("Words longer than %d characters are ignored.",
567 LexizeAddLemm(&ldata
, type
, lemm
, lenlemm
);
571 if ((norms
= LexizeExec(&ldata
, &lexs
)) != NULL
)
572 addHLParsedLex(prs
, query
, lexs
, norms
);
574 addHLParsedLex(prs
, query
, lexs
, NULL
);
579 FunctionCall1(&(prsobj
->prsend
), PointerGetDatum(prsdata
));
583 generateHeadline(HeadlineParsedText
*prs
)
588 int numfragments
= 0;
591 HeadlineWordEntry
*wrd
= prs
->words
;
593 out
= (text
*) palloc(len
);
594 ptr
= ((char *) out
) + VARHDRSZ
;
596 while (wrd
- prs
->words
< prs
->curwords
)
598 while (wrd
->len
+ prs
->stopsellen
+ prs
->startsellen
+ prs
->fragdelimlen
+ (ptr
- ((char *) out
)) >= len
)
600 int dist
= ptr
- ((char *) out
);
603 out
= (text
*) repalloc(out
, len
);
604 ptr
= ((char *) out
) + dist
;
607 if (wrd
->in
&& !wrd
->repeated
)
612 /* start of a new fragment */
615 /* add a fragment delimiter if this is after the first one */
616 if (numfragments
> 1)
618 memcpy(ptr
, prs
->fragdelim
, prs
->fragdelimlen
);
619 ptr
+= prs
->fragdelimlen
;
632 memcpy(ptr
, prs
->startsel
, prs
->startsellen
);
633 ptr
+= prs
->startsellen
;
635 memcpy(ptr
, wrd
->word
, wrd
->len
);
639 memcpy(ptr
, prs
->stopsel
, prs
->stopsellen
);
640 ptr
+= prs
->stopsellen
;
644 else if (!wrd
->repeated
)
654 SET_VARSIZE(out
, ptr
- ((char *) out
));