src/backend/tsearch/ts_parse.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * ts_parse.c
   4  *              main parse functions for tsearch
   5  *
   6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   7  *
   8  *
   9  * IDENTIFICATION
  10  *        $PostgreSQL$
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include "tsearch/ts_cache.h"
  18 #include "tsearch/ts_public.h"
  19 #include "tsearch/ts_utils.h"
  20
  21 #define IGNORE_LONGLEXEME       1
  22
  23 /*
  24  * Lexize subsystem
  25  */
  26
  /*
   * One token handed over by the parser.  Nodes are chained through "next"
   * into a ListParsedLex.
   */
  typedef struct ParsedLex
  {
      int         type;       /* token type; LexizeExec treats 0 as end of input */
      char       *lemm;       /* token text (the pointer is stored, not copied) */
      int         lenlemm;    /* length of lemm as supplied by the caller */
      bool        resfollow;  /* not referenced by the code in this file section */
      struct ParsedLex *next; /* following node, NULL for the last one */
  } ParsedLex;
  35
  36 typedef struct ListParsedLex
  37 {
  38         ParsedLex  *head;
  39         ParsedLex  *tail;
  40 } ListParsedLex;
  41
  42 typedef struct
  43 {
  44         TSConfigCacheEntry *cfg;
  45         Oid                     curDictId;
  46         int                     posDict;
  47         DictSubState dictState;
  48         ParsedLex  *curSub;
  49         ListParsedLex towork;           /* current list to work */
  50         ListParsedLex waste;            /* list of lexemes that already lexized */
  51
  52         /*
  53          * fields to store last variant to lexize (basically, thesaurus or similar
  54          * to, which wants      several lexemes
  55          */
  56
  57         ParsedLex  *lastRes;
  58         TSLexeme   *tmpRes;
  59 } LexizeData;
  60
  61 static void
  62 LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
  63 {
  64         ld->cfg = cfg;
  65         ld->curDictId = InvalidOid;
  66         ld->posDict = 0;
  67         ld->towork.head = ld->towork.tail = ld->curSub = NULL;
  68         ld->waste.head = ld->waste.tail = NULL;
  69         ld->lastRes = NULL;
  70         ld->tmpRes = NULL;
  71 }
  72
  73 static void
  74 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
  75 {
  76         if (list->tail)
  77         {
  78                 list->tail->next = newpl;
  79                 list->tail = newpl;
  80         }
  81         else
  82                 list->head = list->tail = newpl;
  83         newpl->next = NULL;
  84 }
  85
  86 static ParsedLex *
  87 LPLRemoveHead(ListParsedLex *list)
  88 {
  89         ParsedLex  *res = list->head;
  90
  91         if (list->head)
  92                 list->head = list->head->next;
  93
  94         if (list->head == NULL)
  95                 list->tail = NULL;
  96
  97         return res;
  98 }
  99
 100 static void
 101 LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
 102 {
 103         ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
 104
 105         newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
 106         newpl->type = type;
 107         newpl->lemm = lemm;
 108         newpl->lenlemm = lenlemm;
 109         LPLAddTail(&ld->towork, newpl);
 110         ld->curSub = ld->towork.tail;
 111 }
 112
 113 static void
 114 RemoveHead(LexizeData *ld)
 115 {
 116         LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
 117
 118         ld->posDict = 0;
 119 }
 120
 121 static void
 122 setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
 123 {
 124         if (correspondLexem)
 125         {
 126                 *correspondLexem = ld->waste.head;
 127         }
 128         else
 129         {
 130                 ParsedLex  *tmp,
 131                                    *ptr = ld->waste.head;
 132
 133                 while (ptr)
 134                 {
 135                         tmp = ptr->next;
 136                         pfree(ptr);
 137                         ptr = tmp;
 138                 }
 139         }
 140         ld->waste.head = ld->waste.tail = NULL;
 141 }
 142
 143 static void
 144 moveToWaste(LexizeData *ld, ParsedLex *stop)
 145 {
 146         bool            go = true;
 147
 148         while (ld->towork.head && go)
 149         {
 150                 if (ld->towork.head == stop)
 151                 {
 152                         ld->curSub = stop->next;
 153                         go = false;
 154                 }
 155                 RemoveHead(ld);
 156         }
 157 }
 158
 159 static void
 160 setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
 161 {
 162         if (ld->tmpRes)
 163         {
 164                 TSLexeme   *ptr;
 165
 166                 for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
 167                         pfree(ptr->lexeme);
 168                 pfree(ld->tmpRes);
 169         }
 170         ld->tmpRes = res;
 171         ld->lastRes = lex;
 172 }
 173
 174 static TSLexeme *
 175 LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
 176 {
 177         int                     i;
 178         ListDictionary *map;
 179         TSDictionaryCacheEntry *dict;
 180         TSLexeme   *res;
 181
 182         if (ld->curDictId == InvalidOid)
 183         {
 184                 /*
 185                  * usial mode: dictionary wants only one word, but we should keep in
 186                  * mind that we should go through all stack
 187                  */
 188
 189                 while (ld->towork.head)
 190                 {
 191                         ParsedLex  *curVal = ld->towork.head;
 192
 193                         map = ld->cfg->map + curVal->type;
 194
 195                         if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
 196                         {
 197                                 /* skip this type of lexeme */
 198                                 RemoveHead(ld);
 199                                 continue;
 200                         }
 201
 202                         for (i = ld->posDict; i < map->len; i++)
 203                         {
 204                                 dict = lookup_ts_dictionary_cache(map->dictIds[i]);
 205
 206                                 ld->dictState.isend = ld->dictState.getnext = false;
 207                                 ld->dictState.private = NULL;
 208                                 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
 209                                                                                                                          &(dict->lexize),
 210                                                                                          PointerGetDatum(dict->dictData),
 211                                                                                            PointerGetDatum(curVal->lemm),
 212                                                                                           Int32GetDatum(curVal->lenlemm),
 213                                                                                           PointerGetDatum(&ld->dictState)
 214                                                                                                                                  ));
 215
 216                                 if (ld->dictState.getnext)
 217                                 {
 218                                         /*
 219                                          * dictionary wants next word, so setup and store current
 220                                          * position and go to multiword mode
 221                                          */
 222
 223                                         ld->curDictId = DatumGetObjectId(map->dictIds[i]);
 224                                         ld->posDict = i + 1;
 225                                         ld->curSub = curVal->next;
 226                                         if (res)
 227                                                 setNewTmpRes(ld, curVal, res);
 228                                         return LexizeExec(ld, correspondLexem);
 229                                 }
 230
 231                                 if (!res)               /* dictionary doesn't know this lexeme */
 232                                         continue;
 233
 234                                 RemoveHead(ld);
 235                                 setCorrLex(ld, correspondLexem);
 236                                 return res;
 237                         }
 238
 239                         RemoveHead(ld);
 240                 }
 241         }
 242         else
 243         {                                                       /* curDictId is valid */
 244                 dict = lookup_ts_dictionary_cache(ld->curDictId);
 245
 246                 /*
 247                  * Dictionary ld->curDictId asks  us about following words
 248                  */
 249
 250                 while (ld->curSub)
 251                 {
 252                         ParsedLex  *curVal = ld->curSub;
 253
 254                         map = ld->cfg->map + curVal->type;
 255
 256                         if (curVal->type != 0)
 257                         {
 258                                 bool            dictExists = false;
 259
 260                                 if (curVal->type >= ld->cfg->lenmap || map->len == 0)
 261                                 {
 262                                         /* skip this type of lexeme */
 263                                         ld->curSub = curVal->next;
 264                                         continue;
 265                                 }
 266
 267                                 /*
 268                                  * We should be sure that current type of lexeme is recognized
 269                                  * by our dictinonary: we just check is it exist in list of
 270                                  * dictionaries ?
 271                                  */
 272                                 for (i = 0; i < map->len && !dictExists; i++)
 273                                         if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
 274                                                 dictExists = true;
 275
 276                                 if (!dictExists)
 277                                 {
 278                                         /*
 279                                          * Dictionary can't work with current tpe of lexeme,
 280                                          * return to basic mode and redo all stored lexemes
 281                                          */
 282                                         ld->curDictId = InvalidOid;
 283                                         return LexizeExec(ld, correspondLexem);
 284                                 }
 285                         }
 286
 287                         ld->dictState.isend = (curVal->type == 0) ? true : false;
 288                         ld->dictState.getnext = false;
 289
 290                         res = (TSLexeme *) DatumGetPointer(FunctionCall4(
 291                                                                                                                          &(dict->lexize),
 292                                                                                          PointerGetDatum(dict->dictData),
 293                                                                                            PointerGetDatum(curVal->lemm),
 294                                                                                           Int32GetDatum(curVal->lenlemm),
 295                                                                                           PointerGetDatum(&ld->dictState)
 296                                                                                                                          ));
 297
 298                         if (ld->dictState.getnext)
 299                         {
 300                                 /* Dictionary wants one more */
 301                                 ld->curSub = curVal->next;
 302                                 if (res)
 303                                         setNewTmpRes(ld, curVal, res);
 304                                 continue;
 305                         }
 306
 307                         if (res || ld->tmpRes)
 308                         {
 309                                 /*
 310                                  * Dictionary normalizes lexemes, so we remove from stack all
 311                                  * used lexemes, return to basic mode and redo end of stack
 312                                  * (if it exists)
 313                                  */
 314                                 if (res)
 315                                 {
 316                                         moveToWaste(ld, ld->curSub);
 317                                 }
 318                                 else
 319                                 {
 320                                         res = ld->tmpRes;
 321                                         moveToWaste(ld, ld->lastRes);
 322                                 }
 323
 324                                 /* reset to initial state */
 325                                 ld->curDictId = InvalidOid;
 326                                 ld->posDict = 0;
 327                                 ld->lastRes = NULL;
 328                                 ld->tmpRes = NULL;
 329                                 setCorrLex(ld, correspondLexem);
 330                                 return res;
 331                         }
 332
 333                         /*
 334                          * Dict don't want next lexem and didn't recognize anything, redo
 335                          * from ld->towork.head
 336                          */
 337                         ld->curDictId = InvalidOid;
 338                         return LexizeExec(ld, correspondLexem);
 339                 }
 340         }
 341
 342         setCorrLex(ld, correspondLexem);
 343         return NULL;
 344 }
 345
 346 /*
 347  * Parse string and lexize words.
 348  *
 349  * prs will be filled in.
 350  */
 351 void
 352 parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
 353 {
 354         int                     type,
 355                                 lenlemm;
 356         char       *lemm = NULL;
 357         LexizeData      ldata;
 358         TSLexeme   *norms;
 359         TSConfigCacheEntry *cfg;
 360         TSParserCacheEntry *prsobj;
 361         void       *prsdata;
 362
 363         cfg = lookup_ts_config_cache(cfgId);
 364         prsobj = lookup_ts_parser_cache(cfg->prsId);
 365
 366         prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
 367                                                                                                          PointerGetDatum(buf),
 368                                                                                                          Int32GetDatum(buflen)));
 369
 370         LexizeInit(&ldata, cfg);
 371
 372         do
 373         {
 374                 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
 375                                                                                    PointerGetDatum(prsdata),
 376                                                                                    PointerGetDatum(&lemm),
 377                                                                                    PointerGetDatum(&lenlemm)));
 378
 379                 if (type > 0 && lenlemm >= MAXSTRLEN)
 380                 {
 381 #ifdef IGNORE_LONGLEXEME
 382                         ereport(NOTICE,
 383                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 384                                          errmsg("word is too long to be indexed"),
 385                                          errdetail("Words longer than %d characters are ignored.",
 386                                                            MAXSTRLEN)));
 387                         continue;
 388 #else
 389                         ereport(ERROR,
 390                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 391                                          errmsg("word is too long to be indexed"),
 392                                          errdetail("Words longer than %d characters are ignored.",
 393                                                            MAXSTRLEN)));
 394 #endif
 395                 }
 396
 397                 LexizeAddLemm(&ldata, type, lemm, lenlemm);
 398
 399                 while ((norms = LexizeExec(&ldata, NULL)) != NULL)
 400                 {
 401                         TSLexeme   *ptr = norms;
 402
 403                         prs->pos++;                     /* set pos */
 404
 405                         while (ptr->lexeme)
 406                         {
 407                                 if (prs->curwords == prs->lenwords)
 408                                 {
 409                                         prs->lenwords *= 2;
 410                                         prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
 411                                 }
 412
 413                                 if (ptr->flags & TSL_ADDPOS)
 414                                         prs->pos++;
 415                                 prs->words[prs->curwords].len = strlen(ptr->lexeme);
 416                                 prs->words[prs->curwords].word = ptr->lexeme;
 417                                 prs->words[prs->curwords].nvariant = ptr->nvariant;
 418                                 prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
 419                                 prs->words[prs->curwords].alen = 0;
 420                                 prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
 421                                 ptr++;
 422                                 prs->curwords++;
 423                         }
 424                         pfree(norms);
 425                 }
 426         } while (type > 0);
 427
 428         FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
 429 }
 430
 431 /*
 432  * Headline framework
 433  */
 434 static void
 435 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
 436 {
 437         while (prs->curwords >= prs->lenwords)
 438         {
 439                 prs->lenwords *= 2;
 440                 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
 441         }
 442         memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
 443         prs->words[prs->curwords].type = (uint8) type;
 444         prs->words[prs->curwords].len = buflen;
 445         prs->words[prs->curwords].word = palloc(buflen);
 446         memcpy(prs->words[prs->curwords].word, buf, buflen);
 447         prs->curwords++;
 448 }
 449
 450 static void
 451 hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
 452 {
 453         int                     i;
 454         QueryItem  *item = GETQUERY(query);
 455         HeadlineWordEntry *word;
 456
 457         while (prs->curwords + query->size >= prs->lenwords)
 458         {
 459                 prs->lenwords *= 2;
 460                 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
 461         }
 462
 463         word = &(prs->words[prs->curwords - 1]);
 464         for (i = 0; i < query->size; i++)
 465         {
 466                 if (item->type == QI_VAL &&
 467                         tsCompareString( GETOPERAND(query) + item->operand.distance, item->operand.length,
 468                                                          buf, buflen, item->operand.prefix ) == 0 )
 469                 {
 470                         if (word->item)
 471                         {
 472                                 memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
 473                                 prs->words[prs->curwords].item = &item->operand;
 474                                 prs->words[prs->curwords].repeated = 1;
 475                                 prs->curwords++;
 476                         }
 477                         else
 478                                 word->item = &item->operand;
 479                 }
 480                 item++;
 481         }
 482 }
 483
 484 static void
 485 addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
 486 {
 487         ParsedLex  *tmplexs;
 488         TSLexeme   *ptr;
 489
 490         while (lexs)
 491         {
 492
 493                 if (lexs->type > 0)
 494                         hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
 495
 496                 ptr = norms;
 497                 while (ptr && ptr->lexeme)
 498                 {
 499                         hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
 500                         ptr++;
 501                 }
 502
 503                 tmplexs = lexs->next;
 504                 pfree(lexs);
 505                 lexs = tmplexs;
 506         }
 507
 508         if (norms)
 509         {
 510                 ptr = norms;
 511                 while (ptr->lexeme)
 512                 {
 513                         pfree(ptr->lexeme);
 514                         ptr++;
 515                 }
 516                 pfree(norms);
 517         }
 518 }
 519
 520 void
 521 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
 522 {
 523         int                     type,
 524                                 lenlemm;
 525         char       *lemm = NULL;
 526         LexizeData      ldata;
 527         TSLexeme   *norms;
 528         ParsedLex  *lexs;
 529         TSConfigCacheEntry *cfg;
 530         TSParserCacheEntry *prsobj;
 531         void       *prsdata;
 532
 533         cfg = lookup_ts_config_cache(cfgId);
 534         prsobj = lookup_ts_parser_cache(cfg->prsId);
 535
 536         prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
 537                                                                                                          PointerGetDatum(buf),
 538                                                                                                          Int32GetDatum(buflen)));
 539
 540         LexizeInit(&ldata, cfg);
 541
 542         do
 543         {
 544                 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
 545                                                                                    PointerGetDatum(prsdata),
 546                                                                                    PointerGetDatum(&lemm),
 547                                                                                    PointerGetDatum(&lenlemm)));
 548
 549                 if (type > 0 && lenlemm >= MAXSTRLEN)
 550                 {
 551 #ifdef IGNORE_LONGLEXEME
 552                         ereport(NOTICE,
 553                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 554                                          errmsg("word is too long to be indexed"),
 555                                          errdetail("Words longer than %d characters are ignored.",
 556                                                            MAXSTRLEN)));
 557                         continue;
 558 #else
 559                         ereport(ERROR,
 560                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 561                                          errmsg("word is too long to be indexed"),
 562                                          errdetail("Words longer than %d characters are ignored.",
 563                                                            MAXSTRLEN)));
 564 #endif
 565                 }
 566
 567                 LexizeAddLemm(&ldata, type, lemm, lenlemm);
 568
 569                 do
 570                 {
 571                         if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
 572                                 addHLParsedLex(prs, query, lexs, norms);
 573                         else
 574                                 addHLParsedLex(prs, query, lexs, NULL);
 575                 } while (norms);
 576
 577         } while (type > 0);
 578
 579         FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
 580 }
 581
 582 text *
 583 generateHeadline(HeadlineParsedText *prs)
 584 {
 585         text       *out;
 586         char       *ptr;
 587         int                     len          = 128;
 588         int                     numfragments = 0;
 589         int2                    infrag       = 0;
 590
 591         HeadlineWordEntry *wrd = prs->words;
 592
 593         out = (text *) palloc(len);
 594         ptr = ((char *) out) + VARHDRSZ;
 595
 596         while (wrd - prs->words < prs->curwords)
 597         {
 598                 while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
 599                 {
 600                         int                     dist = ptr - ((char *) out);
 601
 602                         len *= 2;
 603                         out = (text *) repalloc(out, len);
 604                         ptr = ((char *) out) + dist;
 605                 }
 606
 607                 if (wrd->in && !wrd->repeated)
 608                 {
 609                         if (!infrag)
 610                         {
 611
 612                                 /* start of a new fragment */
 613                                 infrag = 1;
 614                                 numfragments ++;
 615                                 /* add a fragment delimitor if this is after the first one */
 616                                 if (numfragments > 1)
 617                                 {
 618                                         memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
 619                                         ptr += prs->fragdelimlen;
 620                                 }
 621
 622                         }
 623                         if (wrd->replace)
 624                         {
 625                                 *ptr = ' ';
 626                                 ptr++;
 627                         }
 628                         else
 629                         {
 630                                 if (wrd->selected)
 631                                 {
 632                                         memcpy(ptr, prs->startsel, prs->startsellen);
 633                                         ptr += prs->startsellen;
 634                                 }
 635                                 memcpy(ptr, wrd->word, wrd->len);
 636                                 ptr += wrd->len;
 637                                 if (wrd->selected)
 638                                 {
 639                                         memcpy(ptr, prs->stopsel, prs->stopsellen);
 640                                         ptr += prs->stopsellen;
 641                                 }
 642                         }
 643                 }
 644                 else if (!wrd->repeated)
 645                 {
 646                         if (infrag)
 647                                 infrag = 0;
 648                         pfree(wrd->word);
 649                 }
 650
 651                 wrd++;
 652         }
 653
 654         SET_VARSIZE(out, ptr - ((char *) out));
 655         return out;
 656 }