1 /*-------------------------------------------------------------------------
4 * Text search unaccent dictionary
6 * Copyright (c) 2009-2024, PostgreSQL Global Development Group
9 * contrib/unaccent/unaccent.c
11 *-------------------------------------------------------------------------
16 #include "catalog/namespace.h"
17 #include "catalog/pg_ts_dict.h"
18 #include "commands/defrem.h"
19 #include "lib/stringinfo.h"
20 #include "tsearch/ts_cache.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "utils/builtins.h"
24 #include "utils/lsyscache.h"
25 #include "utils/regproc.h"
26 #include "utils/syscache.h"
31 * An unaccent dictionary uses a trie to find a string to replace. Each node
32 * of the trie is an array of 256 TrieChar structs; the N-th element of the
33 * array corresponds to next byte value N. That element can contain both a
34 * replacement string (to be used if the source string ends with this byte)
35 * and a link to another trie node (to be followed if there are more bytes).
37 * Note that the trie search logic pays no attention to multibyte character
38 * boundaries. This is OK as long as both the data entered into the trie and
39 * the data we're trying to look up are validly encoded; no partial-character matches will occur.
/*
 * One slot of a trie node.  Only the nextChar member is visible in this
 * excerpt; code further down also accesses replaceTo and replacelen through
 * TrieChar pointers, so those members presumably belong here as well
 * (NOTE(review): confirm against the complete struct definition).
 */
42 typedef struct TrieChar
/* link to the trie node to consult for the following byte */
44 struct TrieChar
*nextChar
;
50 * placeChar - put str into trie's structure, byte by byte.
52 * If node is NULL, we need to make a new node, which will be returned;
53 * otherwise the return value is the same as node.
/*
 * Parameters:
 *   node        trie node to insert into
 *   str, lenstr remaining bytes of the source string and their count;
 *               lenstr is asserted to be greater than zero
 *   replaceTo, replacelen
 *               replacement bytes and their count; a copy is stored in the
 *               trie, so the caller keeps ownership of its own buffer
 *
 * NOTE(review): the conditions that choose between allocating a node,
 * storing the replacement and recursing are not visible in this excerpt.
 */
56 placeChar(TrieChar
*node
, const unsigned char *str
, int lenstr
,
57 const char *replaceTo
, int replacelen
)
/* a node is an array of 256 TrieChar slots, obtained from palloc0 */
62 node
= (TrieChar
*) palloc0(sizeof(TrieChar
) * 256);
64 Assert(lenstr
> 0); /* else str[0] doesn't exist */
/* pick the slot indexed by the first byte of str */
66 curnode
= node
+ *str
;
/* this slot already carries a replacement: report the duplicate rule */
70 if (curnode
->replaceTo
)
72 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
73 errmsg("duplicate source strings, first one will be used")));
/*
 * Store a private, length-counted copy of the replacement.  Exactly
 * replacelen bytes are allocated and copied, so no terminator is stored.
 */
76 curnode
->replacelen
= replacelen
;
77 curnode
->replaceTo
= (char *) palloc(replacelen
);
78 memcpy(curnode
->replaceTo
, replaceTo
, replacelen
);
/* insert the rest of the string below this slot's nextChar link */
83 curnode
->nextChar
= placeChar(curnode
->nextChar
, str
+ 1, lenstr
- 1,
84 replaceTo
, replacelen
);
91 * initTrie - create trie from file.
93 * Function converts UTF8-encoded file into current encoding.
/*
 * Reads the rules file identified by filename line by line and feeds every
 * parsed rule to placeChar(), accumulating the result in rootTrie.
 *
 * NOTE(review): large parts of this function are not visible in this
 * excerpt (most state-machine cases, the framing around the error handling
 * code, and the return statement).  The comments added below describe only
 * the statements that can be seen.
 */
96 initTrie(const char *filename
)
98 TrieChar
*volatile rootTrie
= NULL
;
/* remember the context current on entry; it is switched to further down */
99 MemoryContext ccxt
= CurrentMemoryContext
;
100 tsearch_readline_state trst
;
/* replace the given name by what get_tsearch_config_filename() returns */
103 filename
= get_tsearch_config_filename(filename
, "rules");
/* a file that cannot be opened is reported as a configuration file error */
104 if (!tsearch_readline_begin(&trst
, filename
))
106 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
107 errmsg("could not open unaccent file \"%s\": %m",
113 * pg_do_encoding_conversion() (called by tsearch_readline()) will
114 * emit exception if it finds untranslatable characters in current
115 * locale. We just skip such lines, continuing with the next.
/* consume lines until tsearch_readline() returns NULL */
123 while ((line
= tsearch_readline(&trst
)) != NULL
)
126 * The format of each line must be "src" or "src trg", where
127 * src and trg are sequences of one or more non-whitespace
128 * characters, separated by whitespace. Whitespace at start
129 * or end of line is ignored. If trg is omitted, an empty
130 * string is used as the replacement. trg can be optionally
131 * quoted, in which case whitespaces are included in it.
133 * We use a simple state machine, with states
134 * 0 initial (before src)
136 * 2 in whitespace after src
137 * 3 in trg (non-quoted)
139 * 5 in whitespace after trg
140 * -1 syntax error detected (two strings)
141 * -2 syntax error detected (unfinished quoted string)
148 char *trgstore
= NULL
;
153 bool trgquoted
= false;
/*
 * Scan the line; ptr advances by ptrlen, which is refreshed from
 * pg_mblen() at the top of every iteration.
 */
156 for (ptr
= line
; *ptr
; ptr
+= ptrlen
)
158 ptrlen
= pg_mblen(ptr
);
159 /* ignore whitespace, but end src or trg */
166 /* whitespaces are OK in quoted area */
196 /* continue non-quoted trg */
200 /* continue quoted trg */
204 * If this is a quote, consider it as the end of
205 * trg except if the follow-up character is itself
/* peek at the byte right after the current one to detect a doubled quote */
210 if (*(ptr
+ 1) == '"')
220 /* bogus line format */
226 if (state
== 1 || state
== 2)
228 /* trg was omitted, so use "" */
233 /* If still in a quoted area, fallback to an error */
237 /* If trg was quoted, remove its quotes and unescape it */
238 if (trgquoted
&& state
> 0)
240 /* Ignore first and end quotes */
/*
 * trglen - 2 bytes are enough: the loop below runs at most trglen - 2
 * times and writes one byte per iteration.
 */
241 trgstore
= (char *) palloc(sizeof(char) * (trglen
- 2));
243 for (int i
= 1; i
< trglen
- 1; i
++)
245 trgstore
[trgstorelen
] = trg
[i
];
247 /* skip second double quotes */
248 if (trg
[i
] == '"' && trg
[i
+ 1] == '"')
/*
 * Verbatim copy of trg, trglen bytes (presumably the branch taken when the
 * quoted-target condition above is false; the else line is not visible).
 */
254 trgstore
= (char *) palloc(sizeof(char) * trglen
);
255 trgstorelen
= trglen
;
256 memcpy(trgstore
, trg
, trgstorelen
);
/* add the rule to the trie: the srclen bytes at src map to trgstore */
260 rootTrie
= placeChar(rootTrie
,
261 (unsigned char *) src
, srclen
,
262 trgstore
, trgstorelen
);
/* state -1: more than two strings were found on the line */
263 else if (state
== -1)
265 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
266 errmsg("invalid syntax: more than two strings in unaccent rule")));
/* state -2: the line ended inside a quoted trg */
267 else if (state
== -2)
269 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
270 errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
/*
 * Switch to the context saved in ccxt before copying the error data, then
 * test its SQLSTATE against ERRCODE_UNTRANSLATABLE_CHARACTER.
 */
282 ecxt
= MemoryContextSwitchTo(ccxt
);
283 errdata
= CopyErrorData();
284 if (errdata
->sqlerrcode
== ERRCODE_UNTRANSLATABLE_CHARACTER
)
/* go back to the context that was current before the switch above */
290 MemoryContextSwitchTo(ecxt
);
/* finish the reader started by tsearch_readline_begin() */
298 tsearch_readline_end(&trst
);
304 * findReplaceTo - find longest possible match in trie
306 * On success, returns pointer to ending subnode, plus length of matched
307 * source string in *p_matchlen. On failure, returns NULL.
/*
 * Parameters visible here:
 *   node        trie node at which the search starts
 *   src, srclen bytes to match and how many of them are available
 * The function also writes through p_matchlen; its declaration is on a
 * line not visible in this excerpt.
 */
310 findReplaceTo(TrieChar
*node
, const unsigned char *src
, int srclen
,
313 TrieChar
*result
= NULL
;
316 *p_matchlen
= 0; /* prevent uninitialized-variable warnings */
/* continue while there is a node to look in and unread bytes remain */
318 while (node
&& matchlen
< srclen
)
/* move to the slot indexed by the byte at offset matchlen */
320 node
= node
+ src
[matchlen
];
/*
 * Publish the current match length to the caller.
 * NOTE(review): the condition guarding this store is not visible here.
 */
326 *p_matchlen
= matchlen
;
/* descend to the node for the next byte; this may be NULL and end the loop */
329 node
= node
->nextChar
;
/*
 * unaccent_init - build the dictionary data from its option list.
 *
 * Argument 0 is a List of DefElem options.  The "rules" option is passed
 * through defGetString() to initTrie(), and the resulting trie pointer is
 * returned.
 *
 * NOTE(review): the conditions guarding the error reports below are not
 * visible in this excerpt.
 */
335 PG_FUNCTION_INFO_V1(unaccent_init
);
337 unaccent_init(PG_FUNCTION_ARGS
)
339 List
*dictoptions
= (List
*) PG_GETARG_POINTER(0);
340 TrieChar
*rootTrie
= NULL
;
341 bool fileloaded
= false;
/* examine every option in the list */
344 foreach(l
, dictoptions
)
346 DefElem
*defel
= (DefElem
*) lfirst(l
);
/* strcmp() makes this an exact, case-sensitive match on the option name */
348 if (strcmp(defel
->defname
, "rules") == 0)
/* presumably reported when fileloaded is already set -- TODO confirm */
352 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
353 errmsg("multiple Rules parameters")));
/* load the rules file named by the option value */
354 rootTrie
= initTrie(defGetString(defel
));
/* error report for an option name other than "rules" */
360 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
361 errmsg("unrecognized Unaccent parameter: \"%s\"",
/* error report for a missing "rules" option */
369 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
370 errmsg("missing Rules parameter")));
/* hand the trie back as the function result */
373 PG_RETURN_POINTER(rootTrie
);
/*
 * unaccent_lexize - apply the trie's replacements to an input string.
 *
 * Arguments: 0 = trie root as produced by unaccent_init, 1 = pointer to the
 * input bytes, 2 = input length (int32).
 *
 * The output buffer is created lazily, so buf.data stays NULL until the
 * first substitution happens.  A TSLexeme array is built only in that case.
 *
 * NOTE(review): the loop framing, the remaining arguments of the
 * findReplaceTo() call and the else keyword are not visible in this excerpt.
 */
376 PG_FUNCTION_INFO_V1(unaccent_lexize
);
378 unaccent_lexize(PG_FUNCTION_ARGS
)
380 TrieChar
*rootTrie
= (TrieChar
*) PG_GETARG_POINTER(0);
381 char *srcchar
= (char *) PG_GETARG_POINTER(1);
382 int32 len
= PG_GETARG_INT32(2);
/* keep the start of the input so skipped bytes can be copied later */
383 char *srcstart
= srcchar
;
387 /* we allocate storage for the buffer only if needed */
/* look for a trie match beginning at the current position */
395 node
= findReplaceTo(rootTrie
, (unsigned char *) srcchar
, len
,
/* a usable match needs both a node and a stored replacement */
397 if (node
&& node
->replaceTo
)
399 if (buf
.data
== NULL
)
401 /* initialize buffer */
402 initStringInfo(&buf
);
403 /* insert any data we already skipped over */
404 if (srcchar
!= srcstart
)
405 appendBinaryStringInfo(&buf
, srcstart
, srcchar
- srcstart
);
/* emit the replacement bytes in place of the matched source bytes */
407 appendBinaryStringInfo(&buf
, node
->replaceTo
, node
->replacelen
);
/*
 * Presumably the no-match path: step over one character, with its length
 * taken from pg_mblen(), and copy it only if a buffer already exists.
 */
411 matchlen
= pg_mblen(srcchar
);
412 if (buf
.data
!= NULL
)
413 appendBinaryStringInfo(&buf
, srcchar
, matchlen
);
420 /* return a result only if we made at least one substitution */
421 if (buf
.data
!= NULL
)
/*
 * Two zero-filled entries are allocated and only the first is filled in;
 * the second stays all-zero (presumably the array terminator -- confirm).
 */
423 res
= (TSLexeme
*) palloc0(sizeof(TSLexeme
) * 2);
424 res
->lexeme
= buf
.data
;
425 res
->flags
= TSL_FILTER
;
430 PG_RETURN_POINTER(res
);
434 * Function-like wrapper for dictionary
/*
 * unaccent_dict - SQL-callable wrapper that runs a text value through an
 * unaccent dictionary and returns text.
 *
 * The dictionary is either the one named "unaccent" in the namespace of
 * this function, or the one whose OID is passed as argument 0.
 *
 * NOTE(review): the conditions selecting between those two lookups, and the
 * first test on the lexize result, are not visible in this excerpt.
 */
436 PG_FUNCTION_INFO_V1(unaccent_dict
);
438 unaccent_dict(PG_FUNCTION_ARGS
)
443 TSDictionaryCacheEntry
*dict
;
449 * Use the "unaccent" dictionary that is in the same schema that this
/* namespace of the function being executed */
452 Oid procnspid
= get_func_namespace(fcinfo
->flinfo
->fn_oid
);
453 const char *dictname
= "unaccent";
/* look the dictionary up by name and namespace */
455 dictOid
= GetSysCacheOid2(TSDICTNAMENSP
, Anum_pg_ts_dict_oid
,
456 PointerGetDatum(dictname
),
457 ObjectIdGetDatum(procnspid
));
/* no such dictionary in that namespace: report it with its qualified name */
458 if (!OidIsValid(dictOid
))
460 (errcode(ERRCODE_UNDEFINED_OBJECT
),
461 errmsg("text search dictionary \"%s.%s\" does not exist",
462 get_namespace_name(procnspid
), dictname
)));
/* alternative path: the dictionary OID comes from argument 0 */
467 dictOid
= PG_GETARG_OID(0);
/* strArg is the index of the text argument */
470 str
= PG_GETARG_TEXT_PP(strArg
);
472 dict
= lookup_ts_dictionary_cache(dictOid
);
/*
 * Call the dictionary's lexize function with its private data, a pointer to
 * the text payload, the payload length, and a NULL fourth argument.
 */
474 res
= (TSLexeme
*) DatumGetPointer(FunctionCall4(&(dict
->lexize
),
475 PointerGetDatum(dict
->dictData
),
476 PointerGetDatum(VARDATA_ANY(str
)),
477 Int32GetDatum(VARSIZE_ANY_EXHDR(str
)),
478 PointerGetDatum(NULL
)));
480 PG_FREE_IF_COPY(str
, strArg
);
/* return a copy of the input argument */
484 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg
));
/* a result without a lexeme also returns a copy of the input */
486 else if (res
->lexeme
== NULL
)
489 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg
));
/* convert the produced lexeme, a C string, into a text value */
493 text
*txt
= cstring_to_text(res
->lexeme
);
498 PG_RETURN_TEXT_P(txt
);