src/backend/snowball/dict_snowball.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * dict_snowball.c
   4  *              Snowball dictionary
   5  *
   6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   7  *
   8  * IDENTIFICATION
   9  *        $PostgreSQL$
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13 #include "postgres.h"
  14
  15 #include "commands/defrem.h"
  16 #include "fmgr.h"
  17 #include "tsearch/ts_locale.h"
  18 #include "tsearch/ts_public.h"
  19 #include "tsearch/ts_utils.h"
  20 #include "utils/builtins.h"
  21
  22 /* Some platforms define MAXINT and/or MININT, causing conflicts */
  23 #ifdef MAXINT
  24 #undef MAXINT
  25 #endif
  26 #ifdef MININT
  27 #undef MININT
  28 #endif
  29
  30 /* Now we can include the original Snowball header.h */
  31 #include "snowball/libstemmer/header.h"
  32 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
  33 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
  34 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
  35 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
  36 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
  37 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
  38 #include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
  39 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
  40 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
  41 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
  42 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
  43 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
  44 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
  45 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
  46 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
  47 #include "snowball/libstemmer/stem_UTF_8_danish.h"
  48 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
  49 #include "snowball/libstemmer/stem_UTF_8_english.h"
  50 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
  51 #include "snowball/libstemmer/stem_UTF_8_french.h"
  52 #include "snowball/libstemmer/stem_UTF_8_german.h"
  53 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
  54 #include "snowball/libstemmer/stem_UTF_8_italian.h"
  55 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
  56 #include "snowball/libstemmer/stem_UTF_8_porter.h"
  57 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
  58 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
  59 #include "snowball/libstemmer/stem_UTF_8_russian.h"
  60 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
  61 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
  62 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
  63
  64
  65 PG_MODULE_MAGIC;
  66
  67 PG_FUNCTION_INFO_V1(dsnowball_init);
  68 Datum           dsnowball_init(PG_FUNCTION_ARGS);
  69
  70 PG_FUNCTION_INFO_V1(dsnowball_lexize);
  71 Datum           dsnowball_lexize(PG_FUNCTION_ARGS);
  72
  73 /* List of supported modules */
  74 typedef struct stemmer_module
  75 {
  76         const char *name;
  77         pg_enc          enc;
  78         struct SN_env *(*create) (void);
  79         void            (*close) (struct SN_env *);
  80         int                     (*stem) (struct SN_env *);
  81 } stemmer_module;
  82
  83 static const stemmer_module stemmer_modules[] =
  84 {
  85         /*
  86          * Stemmers list from Snowball distribution
  87          */
  88         {"danish", PG_LATIN1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
  89         {"dutch", PG_LATIN1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
  90         {"english", PG_LATIN1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
  91         {"finnish", PG_LATIN1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
  92         {"french", PG_LATIN1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
  93         {"german", PG_LATIN1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
  94         {"hungarian", PG_LATIN1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
  95         {"italian", PG_LATIN1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
  96         {"norwegian", PG_LATIN1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
  97         {"porter", PG_LATIN1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
  98         {"portuguese", PG_LATIN1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
  99         {"spanish", PG_LATIN1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
 100         {"swedish", PG_LATIN1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
 101         {"romanian", PG_LATIN2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
 102         {"russian", PG_KOI8R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
 103         {"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
 104         {"dutch", PG_UTF8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
 105         {"english", PG_UTF8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
 106         {"finnish", PG_UTF8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
 107         {"french", PG_UTF8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
 108         {"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
 109         {"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
 110         {"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
 111         {"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
 112         {"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
 113         {"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
 114         {"romanian", PG_UTF8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
 115         {"russian", PG_UTF8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
 116         {"spanish", PG_UTF8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
 117         {"swedish", PG_UTF8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
 118         {"turkish", PG_UTF8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
 119
 120         /*
 121          * Stemmer with PG_SQL_ASCII encoding should be valid for any server
 122          * encoding
 123          */
 124         {"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
 125
 126         {NULL, 0, NULL, NULL, NULL} /* list end marker */
 127 };
 128
 129
 130 typedef struct DictSnowball
 131 {
 132         struct SN_env *z;
 133         StopList        stoplist;
 134         bool            needrecode;             /* needs recoding before/after call stem */
 135         int                     (*stem) (struct SN_env * z);
 136
 137         /*
 138          * snowball saves alloced memory between calls, so we should run it in our
 139          * private memory context. Note, init function is executed in long lived
 140          * context, so we just remember CurrentMemoryContext
 141          */
 142         MemoryContext dictCtx;
 143 } DictSnowball;
 144
 145
 146 static void
 147 locate_stem_module(DictSnowball *d, char *lang)
 148 {
 149         const stemmer_module *m;
 150
 151         /*
 152          * First, try to find exact match of stemmer module. Stemmer with
 153          * PG_SQL_ASCII encoding is treated as working with any server encoding
 154          */
 155         for (m = stemmer_modules; m->name; m++)
 156         {
 157                 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
 158                         pg_strcasecmp(m->name, lang) == 0)
 159                 {
 160                         d->stem = m->stem;
 161                         d->z = m->create();
 162                         d->needrecode = false;
 163                         return;
 164                 }
 165         }
 166
 167         /*
 168          * Second, try to find stemmer for needed language for UTF8 encoding.
 169          */
 170         for (m = stemmer_modules; m->name; m++)
 171         {
 172                 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
 173                 {
 174                         d->stem = m->stem;
 175                         d->z = m->create();
 176                         d->needrecode = true;
 177                         return;
 178                 }
 179         }
 180
 181         ereport(ERROR,
 182                         (errcode(ERRCODE_UNDEFINED_OBJECT),
 183                          errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
 184                                         lang, GetDatabaseEncodingName())));
 185 }
 186
 187 Datum
 188 dsnowball_init(PG_FUNCTION_ARGS)
 189 {
 190         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
 191         DictSnowball *d;
 192         bool            stoploaded = false;
 193         ListCell   *l;
 194
 195         d = (DictSnowball *) palloc0(sizeof(DictSnowball));
 196
 197         foreach(l, dictoptions)
 198         {
 199                 DefElem    *defel = (DefElem *) lfirst(l);
 200
 201                 if (pg_strcasecmp("StopWords", defel->defname) == 0)
 202                 {
 203                         if (stoploaded)
 204                                 ereport(ERROR,
 205                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 206                                                  errmsg("multiple StopWords parameters")));
 207                         readstoplist(defGetString(defel), &d->stoplist, lowerstr);
 208                         stoploaded = true;
 209                 }
 210                 else if (pg_strcasecmp("Language", defel->defname) == 0)
 211                 {
 212                         if (d->stem)
 213                                 ereport(ERROR,
 214                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 215                                                  errmsg("multiple Language parameters")));
 216                         locate_stem_module(d, defGetString(defel));
 217                 }
 218                 else
 219                 {
 220                         ereport(ERROR,
 221                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 222                                          errmsg("unrecognized Snowball parameter: \"%s\"",
 223                                                         defel->defname)));
 224                 }
 225         }
 226
 227         if (!d->stem)
 228                 ereport(ERROR,
 229                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 230                                  errmsg("missing Language parameter")));
 231
 232         d->dictCtx = CurrentMemoryContext;
 233
 234         PG_RETURN_POINTER(d);
 235 }
 236
 237 Datum
 238 dsnowball_lexize(PG_FUNCTION_ARGS)
 239 {
 240         DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
 241         char       *in = (char *) PG_GETARG_POINTER(1);
 242         int32           len = PG_GETARG_INT32(2);
 243         char       *txt = lowerstr_with_len(in, len);
 244         TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
 245
 246         if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 247         {
 248                 pfree(txt);
 249         }
 250         else
 251         {
 252                 MemoryContext saveCtx;
 253
 254                 /*
 255                  * recode to utf8 if stemmer is utf8 and doesn't match server encoding
 256                  */
 257                 if (d->needrecode)
 258                 {
 259                         char       *recoded;
 260
 261                         recoded = (char *) pg_do_encoding_conversion((unsigned char *) txt,
 262                                                                                                                  strlen(txt),
 263                                                                                                            GetDatabaseEncoding(),
 264                                                                                                                  PG_UTF8);
 265                         if (recoded != txt)
 266                         {
 267                                 pfree(txt);
 268                                 txt = recoded;
 269                         }
 270                 }
 271
 272                 /* see comment about d->dictCtx */
 273                 saveCtx = MemoryContextSwitchTo(d->dictCtx);
 274                 SN_set_current(d->z, strlen(txt), (symbol *) txt);
 275                 d->stem(d->z);
 276                 MemoryContextSwitchTo(saveCtx);
 277
 278                 if (d->z->p && d->z->l)
 279                 {
 280                         txt = repalloc(txt, d->z->l + 1);
 281                         memcpy(txt, d->z->p, d->z->l);
 282                         txt[d->z->l] = '\0';
 283                 }
 284
 285                 /* back recode if needed */
 286                 if (d->needrecode)
 287                 {
 288                         char       *recoded;
 289
 290                         recoded = (char *) pg_do_encoding_conversion((unsigned char *) txt,
 291                                                                                                                  strlen(txt),
 292                                                                                                                  PG_UTF8,
 293                                                                                                           GetDatabaseEncoding());
 294                         if (recoded != txt)
 295                         {
 296                                 pfree(txt);
 297                                 txt = recoded;
 298                         }
 299                 }
 300
 301                 res->lexeme = txt;
 302         }
 303
 304         PG_RETURN_POINTER(res);
 305 }