src/backend/utils/adt/tsvector.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * tsvector.c
   4  *        I/O functions for tsvector
   5  *
   6  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   7  *
   8  *
   9  * IDENTIFICATION
  10  *        $PostgreSQL$
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include "libpq/pqformat.h"
  18 #include "tsearch/ts_type.h"
  19 #include "tsearch/ts_locale.h"
  20 #include "tsearch/ts_utils.h"
  21 #include "utils/memutils.h"
  22
  23 typedef struct
  24 {
  25         WordEntry       entry;                  /* must be first! */
  26         WordEntryPos *pos;
  27         int                     poslen;                 /* number of elements in pos */
  28 } WordEntryIN;
  29
  30
  31 /* Compare two WordEntryPos values for qsort */
  32 static int
  33 comparePos(const void *a, const void *b)
  34 {
  35         int                     apos = WEP_GETPOS(*(const WordEntryPos *) a);
  36         int                     bpos = WEP_GETPOS(*(const WordEntryPos *) b);
  37
  38         if (apos == bpos)
  39                 return 0;
  40         return (apos > bpos) ? 1 : -1;
  41 }
  42
  43 /*
  44  * Removes duplicate pos entries. If there's two entries with same pos
  45  * but different weight, the higher weight is retained.
  46  *
  47  * Returns new length.
  48  */
  49 static int
  50 uniquePos(WordEntryPos *a, int l)
  51 {
  52         WordEntryPos *ptr,
  53                            *res;
  54
  55         if (l <= 1)
  56                 return l;
  57
  58         qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
  59
  60         res = a;
  61         ptr = a + 1;
  62         while (ptr - a < l)
  63         {
  64                 if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
  65                 {
  66                         res++;
  67                         *res = *ptr;
  68                         if (res - a >= MAXNUMPOS - 1 ||
  69                                 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
  70                                 break;
  71                 }
  72                 else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
  73                         WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
  74                 ptr++;
  75         }
  76
  77         return res + 1 - a;
  78 }
  79
  80 /* Compare two WordEntryIN values for qsort */
  81 static int
  82 compareentry(const void *va, const void *vb, void *arg)
  83 {
  84         const WordEntryIN *a = (const WordEntryIN *) va;
  85         const WordEntryIN *b = (const WordEntryIN *) vb;
  86         char       *BufferStr = (char *) arg;
  87
  88         return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
  89                                                    &BufferStr[b->entry.pos], b->entry.len,
  90                                                    false);
  91 }
  92
  93 /*
  94  * Sort an array of WordEntryIN, remove duplicates.
  95  * *outbuflen receives the amount of space needed for strings and positions.
  96  */
  97 static int
  98 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
  99 {
 100         int                     buflen;
 101         WordEntryIN *ptr,
 102                            *res;
 103
 104         Assert(l >= 1);
 105
 106         if (l > 1)
 107                 qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
 108                                   (void *) buf);
 109
 110         buflen = 0;
 111         res = a;
 112         ptr = a + 1;
 113         while (ptr - a < l)
 114         {
 115                 if (!(ptr->entry.len == res->entry.len &&
 116                           strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
 117                                           res->entry.len) == 0))
 118                 {
 119                         /* done accumulating data into *res, count space needed */
 120                         buflen += res->entry.len;
 121                         if (res->entry.haspos)
 122                         {
 123                                 res->poslen = uniquePos(res->pos, res->poslen);
 124                                 buflen = SHORTALIGN(buflen);
 125                                 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
 126                         }
 127                         res++;
 128                         memcpy(res, ptr, sizeof(WordEntryIN));
 129                 }
 130                 else if (ptr->entry.haspos)
 131                 {
 132                         if (res->entry.haspos)
 133                         {
 134                                 /* append ptr's positions to res's positions */
 135                                 int                     newlen = ptr->poslen + res->poslen;
 136
 137                                 res->pos = (WordEntryPos *)
 138                                         repalloc(res->pos, newlen * sizeof(WordEntryPos));
 139                                 memcpy(&res->pos[res->poslen], ptr->pos,
 140                                            ptr->poslen * sizeof(WordEntryPos));
 141                                 res->poslen = newlen;
 142                                 pfree(ptr->pos);
 143                         }
 144                         else
 145                         {
 146                                 /* just give ptr's positions to pos */
 147                                 res->entry.haspos = 1;
 148                                 res->pos = ptr->pos;
 149                                 res->poslen = ptr->poslen;
 150                         }
 151                 }
 152                 ptr++;
 153         }
 154
 155         /* count space needed for last item */
 156         buflen += res->entry.len;
 157         if (res->entry.haspos)
 158         {
 159                 res->poslen = uniquePos(res->pos, res->poslen);
 160                 buflen = SHORTALIGN(buflen);
 161                 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
 162         }
 163
 164         *outbuflen = buflen;
 165         return res + 1 - a;
 166 }
 167
 168 static int
 169 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
 170 {
 171         return compareentry(a, b, buf);
 172 }
 173
 174
 175 Datum
 176 tsvectorin(PG_FUNCTION_ARGS)
 177 {
 178         char       *buf = PG_GETARG_CSTRING(0);
 179         TSVectorParseState state;
 180         WordEntryIN *arr;
 181         int                     totallen;
 182         int                     arrlen;                 /* allocated size of arr */
 183         WordEntry  *inarr;
 184         int                     len = 0;
 185         TSVector        in;
 186         int                     i;
 187         char       *token;
 188         int                     toklen;
 189         WordEntryPos *pos;
 190         int                     poslen;
 191         char       *strbuf;
 192         int                     stroff;
 193
 194         /*
 195          * Tokens are appended to tmpbuf, cur is a pointer to the end of used
 196          * space in tmpbuf.
 197          */
 198         char       *tmpbuf;
 199         char       *cur;
 200         int                     buflen = 256;   /* allocated size of tmpbuf */
 201
 202         pg_verifymbstr(buf, strlen(buf), false);
 203
 204         state = init_tsvector_parser(buf, false, false);
 205
 206         arrlen = 64;
 207         arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
 208         cur = tmpbuf = (char *) palloc(buflen);
 209
 210         while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
 211         {
 212                 if (toklen >= MAXSTRLEN)
 213                         ereport(ERROR,
 214                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 215                                          errmsg("word is too long (%ld bytes, max %ld bytes)",
 216                                                         (long) toklen,
 217                                                         (long) (MAXSTRLEN - 1))));
 218
 219                 if (cur - tmpbuf > MAXSTRPOS)
 220                         ereport(ERROR,
 221                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 222                                          errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
 223                                                         (long) (cur - tmpbuf), (long) MAXSTRPOS)));
 224
 225                 /*
 226                  * Enlarge buffers if needed
 227                  */
 228                 if (len >= arrlen)
 229                 {
 230                         arrlen *= 2;
 231                         arr = (WordEntryIN *)
 232                                 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
 233                 }
 234                 while ((cur - tmpbuf) + toklen >= buflen)
 235                 {
 236                         int                     dist = cur - tmpbuf;
 237
 238                         buflen *= 2;
 239                         tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
 240                         cur = tmpbuf + dist;
 241                 }
 242                 arr[len].entry.len = toklen;
 243                 arr[len].entry.pos = cur - tmpbuf;
 244                 memcpy((void *) cur, (void *) token, toklen);
 245                 cur += toklen;
 246
 247                 if (poslen != 0)
 248                 {
 249                         arr[len].entry.haspos = 1;
 250                         arr[len].pos = pos;
 251                         arr[len].poslen = poslen;
 252                 }
 253                 else
 254                 {
 255                         arr[len].entry.haspos = 0;
 256                         arr[len].pos = NULL;
 257                         arr[len].poslen = 0;
 258                 }
 259                 len++;
 260         }
 261
 262         close_tsvector_parser(state);
 263
 264         if (len > 0)
 265                 len = uniqueentry(arr, len, tmpbuf, &buflen);
 266         else
 267                 buflen = 0;
 268
 269         if (buflen > MAXSTRPOS)
 270                 ereport(ERROR,
 271                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 272                                  errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
 273
 274         totallen = CALCDATASIZE(len, buflen);
 275         in = (TSVector) palloc0(totallen);
 276         SET_VARSIZE(in, totallen);
 277         in->size = len;
 278         inarr = ARRPTR(in);
 279         strbuf = STRPTR(in);
 280         stroff = 0;
 281         for (i = 0; i < len; i++)
 282         {
 283                 memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
 284                 arr[i].entry.pos = stroff;
 285                 stroff += arr[i].entry.len;
 286                 if (arr[i].entry.haspos)
 287                 {
 288                         if (arr[i].poslen > 0xFFFF)
 289                                 elog(ERROR, "positions array too long");
 290
 291                         /* Copy number of positions */
 292                         stroff = SHORTALIGN(stroff);
 293                         *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
 294                         stroff += sizeof(uint16);
 295
 296                         /* Copy positions */
 297                         memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
 298                         stroff += arr[i].poslen * sizeof(WordEntryPos);
 299
 300                         pfree(arr[i].pos);
 301                 }
 302                 inarr[i] = arr[i].entry;
 303         }
 304
 305         Assert((strbuf + stroff - (char *) in) == totallen);
 306
 307         PG_RETURN_TSVECTOR(in);
 308 }
 309
 310 Datum
 311 tsvectorout(PG_FUNCTION_ARGS)
 312 {
 313         TSVector        out = PG_GETARG_TSVECTOR(0);
 314         char       *outbuf;
 315         int4            i,
 316                                 lenbuf = 0,
 317                                 pp;
 318         WordEntry  *ptr = ARRPTR(out);
 319         char       *curbegin,
 320                            *curin,
 321                            *curout;
 322
 323         lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
 324         for (i = 0; i < out->size; i++)
 325         {
 326                 lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
 327                 if (ptr[i].haspos)
 328                         lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
 329         }
 330
 331         curout = outbuf = (char *) palloc(lenbuf);
 332         for (i = 0; i < out->size; i++)
 333         {
 334                 curbegin = curin = STRPTR(out) + ptr->pos;
 335                 if (i != 0)
 336                         *curout++ = ' ';
 337                 *curout++ = '\'';
 338                 while (curin - curbegin < ptr->len)
 339                 {
 340                         int                     len = pg_mblen(curin);
 341
 342                         if (t_iseq(curin, '\''))
 343                                 *curout++ = '\'';
 344                         else if (t_iseq(curin, '\\'))
 345                                 *curout++ = '\\';
 346
 347                         while (len--)
 348                                 *curout++ = *curin++;
 349                 }
 350
 351                 *curout++ = '\'';
 352                 if ((pp = POSDATALEN(out, ptr)) != 0)
 353                 {
 354                         WordEntryPos *wptr;
 355
 356                         *curout++ = ':';
 357                         wptr = POSDATAPTR(out, ptr);
 358                         while (pp)
 359                         {
 360                                 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
 361                                 switch (WEP_GETWEIGHT(*wptr))
 362                                 {
 363                                         case 3:
 364                                                 *curout++ = 'A';
 365                                                 break;
 366                                         case 2:
 367                                                 *curout++ = 'B';
 368                                                 break;
 369                                         case 1:
 370                                                 *curout++ = 'C';
 371                                                 break;
 372                                         case 0:
 373                                         default:
 374                                                 break;
 375                                 }
 376
 377                                 if (pp > 1)
 378                                         *curout++ = ',';
 379                                 pp--;
 380                                 wptr++;
 381                         }
 382                 }
 383                 ptr++;
 384         }
 385
 386         *curout = '\0';
 387         PG_FREE_IF_COPY(out, 0);
 388         PG_RETURN_CSTRING(outbuf);
 389 }
 390
 391 /*
 392  * Binary Input / Output functions. The binary format is as follows:
 393  *
 394  * uint32       number of lexemes
 395  *
 396  * for each lexeme:
 397  *              lexeme text in client encoding, null-terminated
 398  *              uint16  number of positions
 399  *              for each position:
 400  *                      uint16 WordEntryPos
 401  */
 402
 403 Datum
 404 tsvectorsend(PG_FUNCTION_ARGS)
 405 {
 406         TSVector        vec = PG_GETARG_TSVECTOR(0);
 407         StringInfoData buf;
 408         int                     i,
 409                                 j;
 410         WordEntry  *weptr = ARRPTR(vec);
 411
 412         pq_begintypsend(&buf);
 413
 414         pq_sendint(&buf, vec->size, sizeof(int32));
 415         for (i = 0; i < vec->size; i++)
 416         {
 417                 uint16          npos;
 418
 419                 /*
 420                  * the strings in the TSVector array are not null-terminated, so we
 421                  * have to send the null-terminator separately
 422                  */
 423                 pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
 424                 pq_sendbyte(&buf, '\0');
 425
 426                 npos = POSDATALEN(vec, weptr);
 427                 pq_sendint(&buf, npos, sizeof(uint16));
 428
 429                 if (npos > 0)
 430                 {
 431                         WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
 432
 433                         for (j = 0; j < npos; j++)
 434                                 pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
 435                 }
 436                 weptr++;
 437         }
 438
 439         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 440 }
 441
 442 Datum
 443 tsvectorrecv(PG_FUNCTION_ARGS)
 444 {
 445         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 446         TSVector        vec;
 447         int                     i;
 448         int32           nentries;
 449         int                     datalen;                /* number of bytes used in the variable size
 450                                                                  * area after fixed size TSVector header and
 451                                                                  * WordEntries */
 452         Size            hdrlen;
 453         Size            len;                    /* allocated size of vec */
 454         bool            needSort = false;
 455
 456         nentries = pq_getmsgint(buf, sizeof(int32));
 457         if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
 458                 elog(ERROR, "invalid size of tsvector");
 459
 460         hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
 461
 462         len = hdrlen * 2;                       /* times two to make room for lexemes */
 463         vec = (TSVector) palloc0(len);
 464         vec->size = nentries;
 465
 466         datalen = 0;
 467         for (i = 0; i < nentries; i++)
 468         {
 469                 const char *lexeme;
 470                 uint16          npos;
 471                 size_t          lex_len;
 472
 473                 lexeme = pq_getmsgstring(buf);
 474                 npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
 475
 476                 /* sanity checks */
 477
 478                 lex_len = strlen(lexeme);
 479                 if (lex_len > MAXSTRLEN)
 480                         elog(ERROR, "invalid tsvector: lexeme too long");
 481
 482                 if (datalen > MAXSTRPOS)
 483                         elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
 484
 485                 if (npos > MAXNUMPOS)
 486                         elog(ERROR, "unexpected number of tsvector positions");
 487
 488                 /*
 489                  * Looks valid. Fill the WordEntry struct, and copy lexeme.
 490                  *
 491                  * But make sure the buffer is large enough first.
 492                  */
 493                 while (hdrlen + SHORTALIGN(datalen + lex_len) +
 494                            (npos + 1) * sizeof(WordEntryPos) >= len)
 495                 {
 496                         len *= 2;
 497                         vec = (TSVector) repalloc(vec, len);
 498                 }
 499
 500                 vec->entries[i].haspos = (npos > 0) ? 1 : 0;
 501                 vec->entries[i].len = lex_len;
 502                 vec->entries[i].pos = datalen;
 503
 504                 memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
 505
 506                 datalen += lex_len;
 507
 508                 if (i > 0 && WordEntryCMP(&vec->entries[i],
 509                                                                   &vec->entries[i - 1],
 510                                                                   STRPTR(vec)) <= 0)
 511                         needSort = true;
 512
 513                 /* Receive positions */
 514                 if (npos > 0)
 515                 {
 516                         uint16          j;
 517                         WordEntryPos *wepptr;
 518
 519                         /*
 520                          * Pad to 2-byte alignment if necessary. Though we used palloc0
 521                          * for the initial allocation, subsequent repalloc'd memory areas
 522                          * are not initialized to zero.
 523                          */
 524                         if (datalen != SHORTALIGN(datalen))
 525                         {
 526                                 *(STRPTR(vec) + datalen) = '\0';
 527                                 datalen = SHORTALIGN(datalen);
 528                         }
 529
 530                         memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
 531
 532                         wepptr = POSDATAPTR(vec, &vec->entries[i]);
 533                         for (j = 0; j < npos; j++)
 534                         {
 535                                 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
 536                                 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
 537                                         elog(ERROR, "position information is misordered");
 538                         }
 539
 540                         datalen += (npos + 1) * sizeof(WordEntry);
 541                 }
 542         }
 543
 544         SET_VARSIZE(vec, hdrlen + datalen);
 545
 546         if (needSort)
 547                 qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
 548                                   compareentry, (void *) STRPTR(vec));
 549
 550         PG_RETURN_TSVECTOR(vec);
 551 }