contrib/unaccent/unaccent.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * unaccent.c
   4  *        Text search unaccent dictionary
   5  *
   6  * Copyright (c) 2009-2024, PostgreSQL Global Development Group
   7  *
   8  * IDENTIFICATION
   9  *        contrib/unaccent/unaccent.c
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13
  14 #include "postgres.h"
  15
  16 #include "catalog/namespace.h"
  17 #include "catalog/pg_ts_dict.h"
  18 #include "commands/defrem.h"
  19 #include "lib/stringinfo.h"
  20 #include "tsearch/ts_cache.h"
  21 #include "tsearch/ts_locale.h"
  22 #include "tsearch/ts_public.h"
  23 #include "utils/builtins.h"
  24 #include "utils/lsyscache.h"
  25 #include "utils/regproc.h"
  26 #include "utils/syscache.h"
  27
  28 PG_MODULE_MAGIC;
  29
  30 /*
  31  * An unaccent dictionary uses a trie to find a string to replace.  Each node
  32  * of the trie is an array of 256 TrieChar structs; the N-th element of the
  33  * array corresponds to next byte value N.  That element can contain both a
  34  * replacement string (to be used if the source string ends with this byte)
  35  * and a link to another trie node (to be followed if there are more bytes).
  36  *
  37  * Note that the trie search logic pays no attention to multibyte character
  38  * boundaries.  This is OK as long as both the data entered into the trie and
  39  * the data we're trying to look up are validly encoded; no partial-character
  40  * matches will occur.
  41  */
  42 typedef struct TrieChar
  43 {
  44         struct TrieChar *nextChar;
  45         char       *replaceTo;
  46         int                     replacelen;
  47 } TrieChar;
  48
  49 /*
  50  * placeChar - put str into trie's structure, byte by byte.
  51  *
  52  * If node is NULL, we need to make a new node, which will be returned;
  53  * otherwise the return value is the same as node.
  54  */
  55 static TrieChar *
  56 placeChar(TrieChar *node, const unsigned char *str, int lenstr,
  57                   const char *replaceTo, int replacelen)
  58 {
  59         TrieChar   *curnode;
  60
  61         if (!node)
  62                 node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
  63
  64         Assert(lenstr > 0);                     /* else str[0] doesn't exist */
  65
  66         curnode = node + *str;
  67
  68         if (lenstr <= 1)
  69         {
  70                 if (curnode->replaceTo)
  71                         ereport(WARNING,
  72                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
  73                                          errmsg("duplicate source strings, first one will be used")));
  74                 else
  75                 {
  76                         curnode->replacelen = replacelen;
  77                         curnode->replaceTo = (char *) palloc(replacelen);
  78                         memcpy(curnode->replaceTo, replaceTo, replacelen);
  79                 }
  80         }
  81         else
  82         {
  83                 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
  84                                                                           replaceTo, replacelen);
  85         }
  86
  87         return node;
  88 }
  89
  90 /*
  91  * initTrie  - create trie from file.
  92  *
  93  * Function converts UTF8-encoded file into current encoding.
  94  */
  95 static TrieChar *
  96 initTrie(const char *filename)
  97 {
  98         TrieChar   *volatile rootTrie = NULL;
  99         MemoryContext ccxt = CurrentMemoryContext;
 100         tsearch_readline_state trst;
 101         volatile bool skip;
 102
 103         filename = get_tsearch_config_filename(filename, "rules");
 104         if (!tsearch_readline_begin(&trst, filename))
 105                 ereport(ERROR,
 106                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
 107                                  errmsg("could not open unaccent file \"%s\": %m",
 108                                                 filename)));
 109
 110         do
 111         {
 112                 /*
 113                  * pg_do_encoding_conversion() (called by tsearch_readline()) will
 114                  * emit exception if it finds untranslatable characters in current
 115                  * locale. We just skip such lines, continuing with the next.
 116                  */
 117                 skip = true;
 118
 119                 PG_TRY();
 120                 {
 121                         char       *line;
 122
 123                         while ((line = tsearch_readline(&trst)) != NULL)
 124                         {
 125                                 /*----------
 126                                  * The format of each line must be "src" or "src trg", where
 127                                  * src and trg are sequences of one or more non-whitespace
 128                                  * characters, separated by whitespace.  Whitespace at start
 129                                  * or end of line is ignored.  If trg is omitted, an empty
 130                                  * string is used as the replacement.  trg can be optionally
 131                                  * quoted, in which case whitespaces are included in it.
 132                                  *
 133                                  * We use a simple state machine, with states
 134                                  *      0       initial (before src)
 135                                  *      1       in src
 136                                  *      2       in whitespace after src
 137                                  *      3       in trg (non-quoted)
 138                                  *      4       in trg (quoted)
 139                                  *      5       in whitespace after trg
 140                                  *      -1      syntax error detected (two strings)
 141                                  *      -2      syntax error detected (unfinished quoted string)
 142                                  *----------
 143                                  */
 144                                 int                     state;
 145                                 char       *ptr;
 146                                 char       *src = NULL;
 147                                 char       *trg = NULL;
 148                                 char       *trgstore = NULL;
 149                                 int                     ptrlen;
 150                                 int                     srclen = 0;
 151                                 int                     trglen = 0;
 152                                 int                     trgstorelen = 0;
 153                                 bool            trgquoted = false;
 154
 155                                 state = 0;
 156                                 for (ptr = line; *ptr; ptr += ptrlen)
 157                                 {
 158                                         ptrlen = pg_mblen(ptr);
 159                                         /* ignore whitespace, but end src or trg */
 160                                         if (t_isspace(ptr))
 161                                         {
 162                                                 if (state == 1)
 163                                                         state = 2;
 164                                                 else if (state == 3)
 165                                                         state = 5;
 166                                                 /* whitespaces are OK in quoted area */
 167                                                 if (state != 4)
 168                                                         continue;
 169                                         }
 170                                         switch (state)
 171                                         {
 172                                                 case 0:
 173                                                         /* start of src */
 174                                                         src = ptr;
 175                                                         srclen = ptrlen;
 176                                                         state = 1;
 177                                                         break;
 178                                                 case 1:
 179                                                         /* continue src */
 180                                                         srclen += ptrlen;
 181                                                         break;
 182                                                 case 2:
 183                                                         /* start of trg */
 184                                                         if (*ptr == '"')
 185                                                         {
 186                                                                 trgquoted = true;
 187                                                                 state = 4;
 188                                                         }
 189                                                         else
 190                                                                 state = 3;
 191
 192                                                         trg = ptr;
 193                                                         trglen = ptrlen;
 194                                                         break;
 195                                                 case 3:
 196                                                         /* continue non-quoted trg */
 197                                                         trglen += ptrlen;
 198                                                         break;
 199                                                 case 4:
 200                                                         /* continue quoted trg */
 201                                                         trglen += ptrlen;
 202
 203                                                         /*
 204                                                          * If this is a quote, consider it as the end of
 205                                                          * trg except if the follow-up character is itself
 206                                                          * a quote.
 207                                                          */
 208                                                         if (*ptr == '"')
 209                                                         {
 210                                                                 if (*(ptr + 1) == '"')
 211                                                                 {
 212                                                                         ptr++;
 213                                                                         trglen += 1;
 214                                                                 }
 215                                                                 else
 216                                                                         state = 5;
 217                                                         }
 218                                                         break;
 219                                                 default:
 220                                                         /* bogus line format */
 221                                                         state = -1;
 222                                                         break;
 223                                         }
 224                                 }
 225
 226                                 if (state == 1 || state == 2)
 227                                 {
 228                                         /* trg was omitted, so use "" */
 229                                         trg = "";
 230                                         trglen = 0;
 231                                 }
 232
 233                                 /* If still in a quoted area, fallback to an error */
 234                                 if (state == 4)
 235                                         state = -2;
 236
 237                                 /* If trg was quoted, remove its quotes and unescape it */
 238                                 if (trgquoted && state > 0)
 239                                 {
 240                                         /* Ignore first and end quotes */
 241                                         trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
 242                                         trgstorelen = 0;
 243                                         for (int i = 1; i < trglen - 1; i++)
 244                                         {
 245                                                 trgstore[trgstorelen] = trg[i];
 246                                                 trgstorelen++;
 247                                                 /* skip second double quotes */
 248                                                 if (trg[i] == '"' && trg[i + 1] == '"')
 249                                                         i++;
 250                                         }
 251                                 }
 252                                 else
 253                                 {
 254                                         trgstore = (char *) palloc(sizeof(char) * trglen);
 255                                         trgstorelen = trglen;
 256                                         memcpy(trgstore, trg, trgstorelen);
 257                                 }
 258
 259                                 if (state > 0)
 260                                         rootTrie = placeChar(rootTrie,
 261                                                                                  (unsigned char *) src, srclen,
 262                                                                                  trgstore, trgstorelen);
 263                                 else if (state == -1)
 264                                         ereport(WARNING,
 265                                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
 266                                                          errmsg("invalid syntax: more than two strings in unaccent rule")));
 267                                 else if (state == -2)
 268                                         ereport(WARNING,
 269                                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
 270                                                          errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
 271
 272                                 pfree(trgstore);
 273                                 pfree(line);
 274                         }
 275                         skip = false;
 276                 }
 277                 PG_CATCH();
 278                 {
 279                         ErrorData  *errdata;
 280                         MemoryContext ecxt;
 281
 282                         ecxt = MemoryContextSwitchTo(ccxt);
 283                         errdata = CopyErrorData();
 284                         if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
 285                         {
 286                                 FlushErrorState();
 287                         }
 288                         else
 289                         {
 290                                 MemoryContextSwitchTo(ecxt);
 291                                 PG_RE_THROW();
 292                         }
 293                 }
 294                 PG_END_TRY();
 295         }
 296         while (skip);
 297
 298         tsearch_readline_end(&trst);
 299
 300         return rootTrie;
 301 }
 302
 303 /*
 304  * findReplaceTo - find longest possible match in trie
 305  *
 306  * On success, returns pointer to ending subnode, plus length of matched
 307  * source string in *p_matchlen.  On failure, returns NULL.
 308  */
 309 static TrieChar *
 310 findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
 311                           int *p_matchlen)
 312 {
 313         TrieChar   *result = NULL;
 314         int                     matchlen = 0;
 315
 316         *p_matchlen = 0;                        /* prevent uninitialized-variable warnings */
 317
 318         while (node && matchlen < srclen)
 319         {
 320                 node = node + src[matchlen];
 321                 matchlen++;
 322
 323                 if (node->replaceTo)
 324                 {
 325                         result = node;
 326                         *p_matchlen = matchlen;
 327                 }
 328
 329                 node = node->nextChar;
 330         }
 331
 332         return result;
 333 }
 334
 335 PG_FUNCTION_INFO_V1(unaccent_init);
 336 Datum
 337 unaccent_init(PG_FUNCTION_ARGS)
 338 {
 339         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
 340         TrieChar   *rootTrie = NULL;
 341         bool            fileloaded = false;
 342         ListCell   *l;
 343
 344         foreach(l, dictoptions)
 345         {
 346                 DefElem    *defel = (DefElem *) lfirst(l);
 347
 348                 if (strcmp(defel->defname, "rules") == 0)
 349                 {
 350                         if (fileloaded)
 351                                 ereport(ERROR,
 352                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 353                                                  errmsg("multiple Rules parameters")));
 354                         rootTrie = initTrie(defGetString(defel));
 355                         fileloaded = true;
 356                 }
 357                 else
 358                 {
 359                         ereport(ERROR,
 360                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 361                                          errmsg("unrecognized Unaccent parameter: \"%s\"",
 362                                                         defel->defname)));
 363                 }
 364         }
 365
 366         if (!fileloaded)
 367         {
 368                 ereport(ERROR,
 369                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 370                                  errmsg("missing Rules parameter")));
 371         }
 372
 373         PG_RETURN_POINTER(rootTrie);
 374 }
 375
 376 PG_FUNCTION_INFO_V1(unaccent_lexize);
 377 Datum
 378 unaccent_lexize(PG_FUNCTION_ARGS)
 379 {
 380         TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
 381         char       *srcchar = (char *) PG_GETARG_POINTER(1);
 382         int32           len = PG_GETARG_INT32(2);
 383         char       *srcstart = srcchar;
 384         TSLexeme   *res;
 385         StringInfoData buf;
 386
 387         /* we allocate storage for the buffer only if needed */
 388         buf.data = NULL;
 389
 390         while (len > 0)
 391         {
 392                 TrieChar   *node;
 393                 int                     matchlen;
 394
 395                 node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
 396                                                          &matchlen);
 397                 if (node && node->replaceTo)
 398                 {
 399                         if (buf.data == NULL)
 400                         {
 401                                 /* initialize buffer */
 402                                 initStringInfo(&buf);
 403                                 /* insert any data we already skipped over */
 404                                 if (srcchar != srcstart)
 405                                         appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
 406                         }
 407                         appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
 408                 }
 409                 else
 410                 {
 411                         matchlen = pg_mblen(srcchar);
 412                         if (buf.data != NULL)
 413                                 appendBinaryStringInfo(&buf, srcchar, matchlen);
 414                 }
 415
 416                 srcchar += matchlen;
 417                 len -= matchlen;
 418         }
 419
 420         /* return a result only if we made at least one substitution */
 421         if (buf.data != NULL)
 422         {
 423                 res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
 424                 res->lexeme = buf.data;
 425                 res->flags = TSL_FILTER;
 426         }
 427         else
 428                 res = NULL;
 429
 430         PG_RETURN_POINTER(res);
 431 }
 432
 433 /*
 434  * Function-like wrapper for dictionary
 435  */
 436 PG_FUNCTION_INFO_V1(unaccent_dict);
 437 Datum
 438 unaccent_dict(PG_FUNCTION_ARGS)
 439 {
 440         text       *str;
 441         int                     strArg;
 442         Oid                     dictOid;
 443         TSDictionaryCacheEntry *dict;
 444         TSLexeme   *res;
 445
 446         if (PG_NARGS() == 1)
 447         {
 448                 /*
 449                  * Use the "unaccent" dictionary that is in the same schema that this
 450                  * function is in.
 451                  */
 452                 Oid                     procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
 453                 const char *dictname = "unaccent";
 454
 455                 dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
 456                                                                   PointerGetDatum(dictname),
 457                                                                   ObjectIdGetDatum(procnspid));
 458                 if (!OidIsValid(dictOid))
 459                         ereport(ERROR,
 460                                         (errcode(ERRCODE_UNDEFINED_OBJECT),
 461                                          errmsg("text search dictionary \"%s.%s\" does not exist",
 462                                                         get_namespace_name(procnspid), dictname)));
 463                 strArg = 0;
 464         }
 465         else
 466         {
 467                 dictOid = PG_GETARG_OID(0);
 468                 strArg = 1;
 469         }
 470         str = PG_GETARG_TEXT_PP(strArg);
 471
 472         dict = lookup_ts_dictionary_cache(dictOid);
 473
 474         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
 475                                                                                                          PointerGetDatum(dict->dictData),
 476                                                                                                          PointerGetDatum(VARDATA_ANY(str)),
 477                                                                                                          Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
 478                                                                                                          PointerGetDatum(NULL)));
 479
 480         PG_FREE_IF_COPY(str, strArg);
 481
 482         if (res == NULL)
 483         {
 484                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 485         }
 486         else if (res->lexeme == NULL)
 487         {
 488                 pfree(res);
 489                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 490         }
 491         else
 492         {
 493                 text       *txt = cstring_to_text(res->lexeme);
 494
 495                 pfree(res->lexeme);
 496                 pfree(res);
 497
 498                 PG_RETURN_TEXT_P(txt);
 499         }
 500 }