/*-------------------------------------------------------------------------
 *
 * I/O functions for tsvector
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 *
 *-------------------------------------------------------------------------
 */
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_type.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_utils.h"
21 #include "utils/memutils.h"
25 WordEntry entry
; /* must be first! */
27 int poslen
; /* number of elements in pos */
31 /* Compare two WordEntryPos values for qsort */
33 comparePos(const void *a
, const void *b
)
35 int apos
= WEP_GETPOS(*(const WordEntryPos
*) a
);
36 int bpos
= WEP_GETPOS(*(const WordEntryPos
*) b
);
40 return (apos
> bpos
) ? 1 : -1;
44 * Removes duplicate pos entries. If there's two entries with same pos
45 * but different weight, the higher weight is retained.
50 uniquePos(WordEntryPos
*a
, int l
)
58 qsort((void *) a
, l
, sizeof(WordEntryPos
), comparePos
);
64 if (WEP_GETPOS(*ptr
) != WEP_GETPOS(*res
))
68 if (res
- a
>= MAXNUMPOS
- 1 ||
69 WEP_GETPOS(*res
) == MAXENTRYPOS
- 1)
72 else if (WEP_GETWEIGHT(*ptr
) > WEP_GETWEIGHT(*res
))
73 WEP_SETWEIGHT(*res
, WEP_GETWEIGHT(*ptr
));
80 /* Compare two WordEntryIN values for qsort */
82 compareentry(const void *va
, const void *vb
, void *arg
)
84 const WordEntryIN
*a
= (const WordEntryIN
*) va
;
85 const WordEntryIN
*b
= (const WordEntryIN
*) vb
;
86 char *BufferStr
= (char *) arg
;
88 return tsCompareString(&BufferStr
[a
->entry
.pos
], a
->entry
.len
,
89 &BufferStr
[b
->entry
.pos
], b
->entry
.len
,
94 * Sort an array of WordEntryIN, remove duplicates.
95 * *outbuflen receives the amount of space needed for strings and positions.
98 uniqueentry(WordEntryIN
*a
, int l
, char *buf
, int *outbuflen
)
107 qsort_arg((void *) a
, l
, sizeof(WordEntryIN
), compareentry
,
115 if (!(ptr
->entry
.len
== res
->entry
.len
&&
116 strncmp(&buf
[ptr
->entry
.pos
], &buf
[res
->entry
.pos
],
117 res
->entry
.len
) == 0))
119 /* done accumulating data into *res, count space needed */
120 buflen
+= res
->entry
.len
;
121 if (res
->entry
.haspos
)
123 res
->poslen
= uniquePos(res
->pos
, res
->poslen
);
124 buflen
= SHORTALIGN(buflen
);
125 buflen
+= res
->poslen
* sizeof(WordEntryPos
) + sizeof(uint16
);
128 memcpy(res
, ptr
, sizeof(WordEntryIN
));
130 else if (ptr
->entry
.haspos
)
132 if (res
->entry
.haspos
)
134 /* append ptr's positions to res's positions */
135 int newlen
= ptr
->poslen
+ res
->poslen
;
137 res
->pos
= (WordEntryPos
*)
138 repalloc(res
->pos
, newlen
* sizeof(WordEntryPos
));
139 memcpy(&res
->pos
[res
->poslen
], ptr
->pos
,
140 ptr
->poslen
* sizeof(WordEntryPos
));
141 res
->poslen
= newlen
;
146 /* just give ptr's positions to pos */
147 res
->entry
.haspos
= 1;
149 res
->poslen
= ptr
->poslen
;
155 /* count space needed for last item */
156 buflen
+= res
->entry
.len
;
157 if (res
->entry
.haspos
)
159 res
->poslen
= uniquePos(res
->pos
, res
->poslen
);
160 buflen
= SHORTALIGN(buflen
);
161 buflen
+= res
->poslen
* sizeof(WordEntryPos
) + sizeof(uint16
);
169 WordEntryCMP(WordEntry
*a
, WordEntry
*b
, char *buf
)
171 return compareentry(a
, b
, buf
);
176 tsvectorin(PG_FUNCTION_ARGS
)
178 char *buf
= PG_GETARG_CSTRING(0);
179 TSVectorParseState state
;
182 int arrlen
; /* allocated size of arr */
195 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
200 int buflen
= 256; /* allocated size of tmpbuf */
202 pg_verifymbstr(buf
, strlen(buf
), false);
204 state
= init_tsvector_parser(buf
, false, false);
207 arr
= (WordEntryIN
*) palloc(sizeof(WordEntryIN
) * arrlen
);
208 cur
= tmpbuf
= (char *) palloc(buflen
);
210 while (gettoken_tsvector(state
, &token
, &toklen
, &pos
, &poslen
, NULL
))
212 if (toklen
>= MAXSTRLEN
)
214 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
215 errmsg("word is too long (%ld bytes, max %ld bytes)",
217 (long) (MAXSTRLEN
- 1))));
219 if (cur
- tmpbuf
> MAXSTRPOS
)
221 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
222 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
223 (long) (cur
- tmpbuf
), (long) MAXSTRPOS
)));
226 * Enlarge buffers if needed
231 arr
= (WordEntryIN
*)
232 repalloc((void *) arr
, sizeof(WordEntryIN
) * arrlen
);
234 while ((cur
- tmpbuf
) + toklen
>= buflen
)
236 int dist
= cur
- tmpbuf
;
239 tmpbuf
= (char *) repalloc((void *) tmpbuf
, buflen
);
242 arr
[len
].entry
.len
= toklen
;
243 arr
[len
].entry
.pos
= cur
- tmpbuf
;
244 memcpy((void *) cur
, (void *) token
, toklen
);
249 arr
[len
].entry
.haspos
= 1;
251 arr
[len
].poslen
= poslen
;
255 arr
[len
].entry
.haspos
= 0;
262 close_tsvector_parser(state
);
265 len
= uniqueentry(arr
, len
, tmpbuf
, &buflen
);
269 if (buflen
> MAXSTRPOS
)
271 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
272 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen
, MAXSTRPOS
)));
274 totallen
= CALCDATASIZE(len
, buflen
);
275 in
= (TSVector
) palloc0(totallen
);
276 SET_VARSIZE(in
, totallen
);
281 for (i
= 0; i
< len
; i
++)
283 memcpy(strbuf
+ stroff
, &tmpbuf
[arr
[i
].entry
.pos
], arr
[i
].entry
.len
);
284 arr
[i
].entry
.pos
= stroff
;
285 stroff
+= arr
[i
].entry
.len
;
286 if (arr
[i
].entry
.haspos
)
288 if (arr
[i
].poslen
> 0xFFFF)
289 elog(ERROR
, "positions array too long");
291 /* Copy number of positions */
292 stroff
= SHORTALIGN(stroff
);
293 *(uint16
*) (strbuf
+ stroff
) = (uint16
) arr
[i
].poslen
;
294 stroff
+= sizeof(uint16
);
297 memcpy(strbuf
+ stroff
, arr
[i
].pos
, arr
[i
].poslen
* sizeof(WordEntryPos
));
298 stroff
+= arr
[i
].poslen
* sizeof(WordEntryPos
);
302 inarr
[i
] = arr
[i
].entry
;
305 Assert((strbuf
+ stroff
- (char *) in
) == totallen
);
307 PG_RETURN_TSVECTOR(in
);
311 tsvectorout(PG_FUNCTION_ARGS
)
313 TSVector out
= PG_GETARG_TSVECTOR(0);
318 WordEntry
*ptr
= ARRPTR(out
);
323 lenbuf
= out
->size
* 2 /* '' */ + out
->size
- 1 /* space */ + 2 /* \0 */ ;
324 for (i
= 0; i
< out
->size
; i
++)
326 lenbuf
+= ptr
[i
].len
* 2 * pg_database_encoding_max_length() /* for escape */ ;
328 lenbuf
+= 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out
, &(ptr
[i
]));
331 curout
= outbuf
= (char *) palloc(lenbuf
);
332 for (i
= 0; i
< out
->size
; i
++)
334 curbegin
= curin
= STRPTR(out
) + ptr
->pos
;
338 while (curin
- curbegin
< ptr
->len
)
340 int len
= pg_mblen(curin
);
342 if (t_iseq(curin
, '\''))
344 else if (t_iseq(curin
, '\\'))
348 *curout
++ = *curin
++;
352 if ((pp
= POSDATALEN(out
, ptr
)) != 0)
357 wptr
= POSDATAPTR(out
, ptr
);
360 curout
+= sprintf(curout
, "%d", WEP_GETPOS(*wptr
));
361 switch (WEP_GETWEIGHT(*wptr
))
387 PG_FREE_IF_COPY(out
, 0);
388 PG_RETURN_CSTRING(outbuf
);
392 * Binary Input / Output functions. The binary format is as follows:
394 * uint32 number of lexemes
397 * lexeme text in client encoding, null-terminated
398 * uint16 number of positions
400 * uint16 WordEntryPos
404 tsvectorsend(PG_FUNCTION_ARGS
)
406 TSVector vec
= PG_GETARG_TSVECTOR(0);
410 WordEntry
*weptr
= ARRPTR(vec
);
412 pq_begintypsend(&buf
);
414 pq_sendint(&buf
, vec
->size
, sizeof(int32
));
415 for (i
= 0; i
< vec
->size
; i
++)
420 * the strings in the TSVector array are not null-terminated, so we
421 * have to send the null-terminator separately
423 pq_sendtext(&buf
, STRPTR(vec
) + weptr
->pos
, weptr
->len
);
424 pq_sendbyte(&buf
, '\0');
426 npos
= POSDATALEN(vec
, weptr
);
427 pq_sendint(&buf
, npos
, sizeof(uint16
));
431 WordEntryPos
*wepptr
= POSDATAPTR(vec
, weptr
);
433 for (j
= 0; j
< npos
; j
++)
434 pq_sendint(&buf
, wepptr
[j
], sizeof(WordEntryPos
));
439 PG_RETURN_BYTEA_P(pq_endtypsend(&buf
));
443 tsvectorrecv(PG_FUNCTION_ARGS
)
445 StringInfo buf
= (StringInfo
) PG_GETARG_POINTER(0);
449 int datalen
; /* number of bytes used in the variable size
450 * area after fixed size TSVector header and
453 Size len
; /* allocated size of vec */
454 bool needSort
= false;
456 nentries
= pq_getmsgint(buf
, sizeof(int32
));
457 if (nentries
< 0 || nentries
> (MaxAllocSize
/ sizeof(WordEntry
)))
458 elog(ERROR
, "invalid size of tsvector");
460 hdrlen
= DATAHDRSIZE
+ sizeof(WordEntry
) * nentries
;
462 len
= hdrlen
* 2; /* times two to make room for lexemes */
463 vec
= (TSVector
) palloc0(len
);
464 vec
->size
= nentries
;
467 for (i
= 0; i
< nentries
; i
++)
473 lexeme
= pq_getmsgstring(buf
);
474 npos
= (uint16
) pq_getmsgint(buf
, sizeof(uint16
));
478 lex_len
= strlen(lexeme
);
479 if (lex_len
> MAXSTRLEN
)
480 elog(ERROR
, "invalid tsvector: lexeme too long");
482 if (datalen
> MAXSTRPOS
)
483 elog(ERROR
, "invalid tsvector: maximum total lexeme length exceeded");
485 if (npos
> MAXNUMPOS
)
486 elog(ERROR
, "unexpected number of tsvector positions");
489 * Looks valid. Fill the WordEntry struct, and copy lexeme.
491 * But make sure the buffer is large enough first.
493 while (hdrlen
+ SHORTALIGN(datalen
+ lex_len
) +
494 (npos
+ 1) * sizeof(WordEntryPos
) >= len
)
497 vec
= (TSVector
) repalloc(vec
, len
);
500 vec
->entries
[i
].haspos
= (npos
> 0) ? 1 : 0;
501 vec
->entries
[i
].len
= lex_len
;
502 vec
->entries
[i
].pos
= datalen
;
504 memcpy(STRPTR(vec
) + datalen
, lexeme
, lex_len
);
508 if (i
> 0 && WordEntryCMP(&vec
->entries
[i
],
509 &vec
->entries
[i
- 1],
513 /* Receive positions */
517 WordEntryPos
*wepptr
;
520 * Pad to 2-byte alignment if necessary. Though we used palloc0
521 * for the initial allocation, subsequent repalloc'd memory areas
522 * are not initialized to zero.
524 if (datalen
!= SHORTALIGN(datalen
))
526 *(STRPTR(vec
) + datalen
) = '\0';
527 datalen
= SHORTALIGN(datalen
);
530 memcpy(STRPTR(vec
) + datalen
, &npos
, sizeof(uint16
));
532 wepptr
= POSDATAPTR(vec
, &vec
->entries
[i
]);
533 for (j
= 0; j
< npos
; j
++)
535 wepptr
[j
] = (WordEntryPos
) pq_getmsgint(buf
, sizeof(WordEntryPos
));
536 if (j
> 0 && WEP_GETPOS(wepptr
[j
]) <= WEP_GETPOS(wepptr
[j
- 1]))
537 elog(ERROR
, "position information is misordered");
540 datalen
+= (npos
+ 1) * sizeof(WordEntry
);
544 SET_VARSIZE(vec
, hdrlen
+ datalen
);
547 qsort_arg((void *) ARRPTR(vec
), vec
->size
, sizeof(WordEntry
),
548 compareentry
, (void *) STRPTR(vec
));
550 PG_RETURN_TSVECTOR(vec
);