Remove incorrect entries in pg_walsummary's getopt_long call.
[pgsql.git] / contrib / unaccent / unaccent.c
blob707962305f83b5e5161b6763adba652aa09a810c
/*-------------------------------------------------------------------------
 *
 * unaccent.c
 *	  Text search unaccent dictionary
 *
 * Copyright (c) 2009-2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  contrib/unaccent/unaccent.c
 *
 *-------------------------------------------------------------------------
 */
14 #include "postgres.h"
16 #include "catalog/namespace.h"
17 #include "catalog/pg_ts_dict.h"
18 #include "commands/defrem.h"
19 #include "lib/stringinfo.h"
20 #include "tsearch/ts_cache.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "utils/builtins.h"
24 #include "utils/lsyscache.h"
25 #include "utils/regproc.h"
26 #include "utils/syscache.h"
28 PG_MODULE_MAGIC;
/*
 * An unaccent dictionary uses a trie to find a string to replace.  Each node
 * of the trie is an array of 256 TrieChar structs; the N-th element of the
 * array corresponds to next byte value N.  That element can contain both a
 * replacement string (to be used if the source string ends with this byte)
 * and a link to another trie node (to be followed if there are more bytes).
 *
 * Note that the trie search logic pays no attention to multibyte character
 * boundaries.  This is OK as long as both the data entered into the trie and
 * the data we're trying to look up are validly encoded; no partial-character
 * matches will occur.
 */
typedef struct TrieChar
{
	/* child node to follow when more source bytes remain, or NULL */
	struct TrieChar *nextChar;

	/* replacement bytes (not NUL-terminated), or NULL if no rule ends here */
	char	   *replaceTo;

	/* number of bytes stored in replaceTo */
	int			replacelen;
} TrieChar;
50 * placeChar - put str into trie's structure, byte by byte.
52 * If node is NULL, we need to make a new node, which will be returned;
53 * otherwise the return value is the same as node.
55 static TrieChar *
56 placeChar(TrieChar *node, const unsigned char *str, int lenstr,
57 const char *replaceTo, int replacelen)
59 TrieChar *curnode;
61 if (!node)
62 node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
64 Assert(lenstr > 0); /* else str[0] doesn't exist */
66 curnode = node + *str;
68 if (lenstr <= 1)
70 if (curnode->replaceTo)
71 ereport(WARNING,
72 (errcode(ERRCODE_CONFIG_FILE_ERROR),
73 errmsg("duplicate source strings, first one will be used")));
74 else
76 curnode->replacelen = replacelen;
77 curnode->replaceTo = (char *) palloc(replacelen);
78 memcpy(curnode->replaceTo, replaceTo, replacelen);
81 else
83 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
84 replaceTo, replacelen);
87 return node;
91 * initTrie - create trie from file.
93 * Function converts UTF8-encoded file into current encoding.
95 static TrieChar *
96 initTrie(const char *filename)
98 TrieChar *volatile rootTrie = NULL;
99 MemoryContext ccxt = CurrentMemoryContext;
100 tsearch_readline_state trst;
101 volatile bool skip;
103 filename = get_tsearch_config_filename(filename, "rules");
104 if (!tsearch_readline_begin(&trst, filename))
105 ereport(ERROR,
106 (errcode(ERRCODE_CONFIG_FILE_ERROR),
107 errmsg("could not open unaccent file \"%s\": %m",
108 filename)));
113 * pg_do_encoding_conversion() (called by tsearch_readline()) will
114 * emit exception if it finds untranslatable characters in current
115 * locale. We just skip such lines, continuing with the next.
117 skip = true;
119 PG_TRY();
121 char *line;
123 while ((line = tsearch_readline(&trst)) != NULL)
125 /*----------
126 * The format of each line must be "src" or "src trg", where
127 * src and trg are sequences of one or more non-whitespace
128 * characters, separated by whitespace. Whitespace at start
129 * or end of line is ignored. If trg is omitted, an empty
130 * string is used as the replacement. trg can be optionally
131 * quoted, in which case whitespaces are included in it.
133 * We use a simple state machine, with states
134 * 0 initial (before src)
135 * 1 in src
136 * 2 in whitespace after src
137 * 3 in trg (non-quoted)
138 * 4 in trg (quoted)
139 * 5 in whitespace after trg
140 * -1 syntax error detected (two strings)
141 * -2 syntax error detected (unfinished quoted string)
142 *----------
144 int state;
145 char *ptr;
146 char *src = NULL;
147 char *trg = NULL;
148 char *trgstore = NULL;
149 int ptrlen;
150 int srclen = 0;
151 int trglen = 0;
152 int trgstorelen = 0;
153 bool trgquoted = false;
155 state = 0;
156 for (ptr = line; *ptr; ptr += ptrlen)
158 ptrlen = pg_mblen(ptr);
159 /* ignore whitespace, but end src or trg */
160 if (t_isspace(ptr))
162 if (state == 1)
163 state = 2;
164 else if (state == 3)
165 state = 5;
166 /* whitespaces are OK in quoted area */
167 if (state != 4)
168 continue;
170 switch (state)
172 case 0:
173 /* start of src */
174 src = ptr;
175 srclen = ptrlen;
176 state = 1;
177 break;
178 case 1:
179 /* continue src */
180 srclen += ptrlen;
181 break;
182 case 2:
183 /* start of trg */
184 if (*ptr == '"')
186 trgquoted = true;
187 state = 4;
189 else
190 state = 3;
192 trg = ptr;
193 trglen = ptrlen;
194 break;
195 case 3:
196 /* continue non-quoted trg */
197 trglen += ptrlen;
198 break;
199 case 4:
200 /* continue quoted trg */
201 trglen += ptrlen;
204 * If this is a quote, consider it as the end of
205 * trg except if the follow-up character is itself
206 * a quote.
208 if (*ptr == '"')
210 if (*(ptr + 1) == '"')
212 ptr++;
213 trglen += 1;
215 else
216 state = 5;
218 break;
219 default:
220 /* bogus line format */
221 state = -1;
222 break;
226 if (state == 1 || state == 2)
228 /* trg was omitted, so use "" */
229 trg = "";
230 trglen = 0;
233 /* If still in a quoted area, fallback to an error */
234 if (state == 4)
235 state = -2;
237 /* If trg was quoted, remove its quotes and unescape it */
238 if (trgquoted && state > 0)
240 /* Ignore first and end quotes */
241 trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
242 trgstorelen = 0;
243 for (int i = 1; i < trglen - 1; i++)
245 trgstore[trgstorelen] = trg[i];
246 trgstorelen++;
247 /* skip second double quotes */
248 if (trg[i] == '"' && trg[i + 1] == '"')
249 i++;
252 else
254 trgstore = (char *) palloc(sizeof(char) * trglen);
255 trgstorelen = trglen;
256 memcpy(trgstore, trg, trgstorelen);
259 if (state > 0)
260 rootTrie = placeChar(rootTrie,
261 (unsigned char *) src, srclen,
262 trgstore, trgstorelen);
263 else if (state == -1)
264 ereport(WARNING,
265 (errcode(ERRCODE_CONFIG_FILE_ERROR),
266 errmsg("invalid syntax: more than two strings in unaccent rule")));
267 else if (state == -2)
268 ereport(WARNING,
269 (errcode(ERRCODE_CONFIG_FILE_ERROR),
270 errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
272 pfree(trgstore);
273 pfree(line);
275 skip = false;
277 PG_CATCH();
279 ErrorData *errdata;
280 MemoryContext ecxt;
282 ecxt = MemoryContextSwitchTo(ccxt);
283 errdata = CopyErrorData();
284 if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
286 FlushErrorState();
288 else
290 MemoryContextSwitchTo(ecxt);
291 PG_RE_THROW();
294 PG_END_TRY();
296 while (skip);
298 tsearch_readline_end(&trst);
300 return rootTrie;
304 * findReplaceTo - find longest possible match in trie
306 * On success, returns pointer to ending subnode, plus length of matched
307 * source string in *p_matchlen. On failure, returns NULL.
309 static TrieChar *
310 findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
311 int *p_matchlen)
313 TrieChar *result = NULL;
314 int matchlen = 0;
316 *p_matchlen = 0; /* prevent uninitialized-variable warnings */
318 while (node && matchlen < srclen)
320 node = node + src[matchlen];
321 matchlen++;
323 if (node->replaceTo)
325 result = node;
326 *p_matchlen = matchlen;
329 node = node->nextChar;
332 return result;
335 PG_FUNCTION_INFO_V1(unaccent_init);
336 Datum
337 unaccent_init(PG_FUNCTION_ARGS)
339 List *dictoptions = (List *) PG_GETARG_POINTER(0);
340 TrieChar *rootTrie = NULL;
341 bool fileloaded = false;
342 ListCell *l;
344 foreach(l, dictoptions)
346 DefElem *defel = (DefElem *) lfirst(l);
348 if (strcmp(defel->defname, "rules") == 0)
350 if (fileloaded)
351 ereport(ERROR,
352 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
353 errmsg("multiple Rules parameters")));
354 rootTrie = initTrie(defGetString(defel));
355 fileloaded = true;
357 else
359 ereport(ERROR,
360 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
361 errmsg("unrecognized Unaccent parameter: \"%s\"",
362 defel->defname)));
366 if (!fileloaded)
368 ereport(ERROR,
369 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
370 errmsg("missing Rules parameter")));
373 PG_RETURN_POINTER(rootTrie);
376 PG_FUNCTION_INFO_V1(unaccent_lexize);
377 Datum
378 unaccent_lexize(PG_FUNCTION_ARGS)
380 TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
381 char *srcchar = (char *) PG_GETARG_POINTER(1);
382 int32 len = PG_GETARG_INT32(2);
383 char *srcstart = srcchar;
384 TSLexeme *res;
385 StringInfoData buf;
387 /* we allocate storage for the buffer only if needed */
388 buf.data = NULL;
390 while (len > 0)
392 TrieChar *node;
393 int matchlen;
395 node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
396 &matchlen);
397 if (node && node->replaceTo)
399 if (buf.data == NULL)
401 /* initialize buffer */
402 initStringInfo(&buf);
403 /* insert any data we already skipped over */
404 if (srcchar != srcstart)
405 appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
407 appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
409 else
411 matchlen = pg_mblen(srcchar);
412 if (buf.data != NULL)
413 appendBinaryStringInfo(&buf, srcchar, matchlen);
416 srcchar += matchlen;
417 len -= matchlen;
420 /* return a result only if we made at least one substitution */
421 if (buf.data != NULL)
423 res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
424 res->lexeme = buf.data;
425 res->flags = TSL_FILTER;
427 else
428 res = NULL;
430 PG_RETURN_POINTER(res);
434 * Function-like wrapper for dictionary
436 PG_FUNCTION_INFO_V1(unaccent_dict);
437 Datum
438 unaccent_dict(PG_FUNCTION_ARGS)
440 text *str;
441 int strArg;
442 Oid dictOid;
443 TSDictionaryCacheEntry *dict;
444 TSLexeme *res;
446 if (PG_NARGS() == 1)
449 * Use the "unaccent" dictionary that is in the same schema that this
450 * function is in.
452 Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
453 const char *dictname = "unaccent";
455 dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
456 PointerGetDatum(dictname),
457 ObjectIdGetDatum(procnspid));
458 if (!OidIsValid(dictOid))
459 ereport(ERROR,
460 (errcode(ERRCODE_UNDEFINED_OBJECT),
461 errmsg("text search dictionary \"%s.%s\" does not exist",
462 get_namespace_name(procnspid), dictname)));
463 strArg = 0;
465 else
467 dictOid = PG_GETARG_OID(0);
468 strArg = 1;
470 str = PG_GETARG_TEXT_PP(strArg);
472 dict = lookup_ts_dictionary_cache(dictOid);
474 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
475 PointerGetDatum(dict->dictData),
476 PointerGetDatum(VARDATA_ANY(str)),
477 Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
478 PointerGetDatum(NULL)));
480 PG_FREE_IF_COPY(str, strArg);
482 if (res == NULL)
484 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
486 else if (res->lexeme == NULL)
488 pfree(res);
489 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
491 else
493 text *txt = cstring_to_text(res->lexeme);
495 pfree(res->lexeme);
496 pfree(res);
498 PG_RETURN_TEXT_P(txt);