Fix oversight in previous error-reporting patch; mustn't pfree path string
[PostgreSQL.git] / src / backend / snowball / dict_snowball.c
blob438bc1c1a01309884e0c280b94a013b380548979
1 /*-------------------------------------------------------------------------
3 * dict_snowball.c
4 * Snowball dictionary
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
8 * IDENTIFICATION
9 * $PostgreSQL$
11 *-------------------------------------------------------------------------
13 #include "postgres.h"
15 #include "commands/defrem.h"
16 #include "fmgr.h"
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_public.h"
19 #include "tsearch/ts_utils.h"
20 #include "utils/builtins.h"
22 /* Some platforms define MAXINT and/or MININT, causing conflicts */
23 #ifdef MAXINT
24 #undef MAXINT
25 #endif
26 #ifdef MININT
27 #undef MININT
28 #endif
30 /* Now we can include the original Snowball header.h */
31 #include "snowball/libstemmer/header.h"
32 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
33 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
34 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
35 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
36 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
37 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
38 #include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
39 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
40 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
41 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
42 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
43 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
44 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
45 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
46 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
47 #include "snowball/libstemmer/stem_UTF_8_danish.h"
48 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
49 #include "snowball/libstemmer/stem_UTF_8_english.h"
50 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
51 #include "snowball/libstemmer/stem_UTF_8_french.h"
52 #include "snowball/libstemmer/stem_UTF_8_german.h"
53 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
54 #include "snowball/libstemmer/stem_UTF_8_italian.h"
55 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
56 #include "snowball/libstemmer/stem_UTF_8_porter.h"
57 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
58 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
59 #include "snowball/libstemmer/stem_UTF_8_russian.h"
60 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
61 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
62 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
65 PG_MODULE_MAGIC;
67 PG_FUNCTION_INFO_V1(dsnowball_init);
68 Datum dsnowball_init(PG_FUNCTION_ARGS);
70 PG_FUNCTION_INFO_V1(dsnowball_lexize);
71 Datum dsnowball_lexize(PG_FUNCTION_ARGS);
73 /* List of supported modules */
74 typedef struct stemmer_module
76 const char *name;
77 pg_enc enc;
78 struct SN_env *(*create) (void);
79 void (*close) (struct SN_env *);
80 int (*stem) (struct SN_env *);
81 } stemmer_module;
83 static const stemmer_module stemmer_modules[] =
86 * Stemmers list from Snowball distribution
88 {"danish", PG_LATIN1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
89 {"dutch", PG_LATIN1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
90 {"english", PG_LATIN1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
91 {"finnish", PG_LATIN1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
92 {"french", PG_LATIN1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
93 {"german", PG_LATIN1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
94 {"hungarian", PG_LATIN1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
95 {"italian", PG_LATIN1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
96 {"norwegian", PG_LATIN1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
97 {"porter", PG_LATIN1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
98 {"portuguese", PG_LATIN1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
99 {"spanish", PG_LATIN1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
100 {"swedish", PG_LATIN1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
101 {"romanian", PG_LATIN2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
102 {"russian", PG_KOI8R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
103 {"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
104 {"dutch", PG_UTF8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
105 {"english", PG_UTF8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
106 {"finnish", PG_UTF8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
107 {"french", PG_UTF8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
108 {"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
109 {"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
110 {"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
111 {"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
112 {"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
113 {"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
114 {"romanian", PG_UTF8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
115 {"russian", PG_UTF8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
116 {"spanish", PG_UTF8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
117 {"swedish", PG_UTF8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
118 {"turkish", PG_UTF8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
121 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
122 * encoding
124 {"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
126 {NULL, 0, NULL, NULL, NULL} /* list end marker */
130 typedef struct DictSnowball
132 struct SN_env *z;
133 StopList stoplist;
134 bool needrecode; /* needs recoding before/after call stem */
135 int (*stem) (struct SN_env * z);
138 * snowball saves alloced memory between calls, so we should run it in our
139 * private memory context. Note, init function is executed in long lived
140 * context, so we just remember CurrentMemoryContext
142 MemoryContext dictCtx;
143 } DictSnowball;
146 static void
147 locate_stem_module(DictSnowball *d, char *lang)
149 const stemmer_module *m;
152 * First, try to find exact match of stemmer module. Stemmer with
153 * PG_SQL_ASCII encoding is treated as working with any server encoding
155 for (m = stemmer_modules; m->name; m++)
157 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
158 pg_strcasecmp(m->name, lang) == 0)
160 d->stem = m->stem;
161 d->z = m->create();
162 d->needrecode = false;
163 return;
168 * Second, try to find stemmer for needed language for UTF8 encoding.
170 for (m = stemmer_modules; m->name; m++)
172 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
174 d->stem = m->stem;
175 d->z = m->create();
176 d->needrecode = true;
177 return;
181 ereport(ERROR,
182 (errcode(ERRCODE_UNDEFINED_OBJECT),
183 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
184 lang, GetDatabaseEncodingName())));
187 Datum
188 dsnowball_init(PG_FUNCTION_ARGS)
190 List *dictoptions = (List *) PG_GETARG_POINTER(0);
191 DictSnowball *d;
192 bool stoploaded = false;
193 ListCell *l;
195 d = (DictSnowball *) palloc0(sizeof(DictSnowball));
197 foreach(l, dictoptions)
199 DefElem *defel = (DefElem *) lfirst(l);
201 if (pg_strcasecmp("StopWords", defel->defname) == 0)
203 if (stoploaded)
204 ereport(ERROR,
205 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
206 errmsg("multiple StopWords parameters")));
207 readstoplist(defGetString(defel), &d->stoplist, lowerstr);
208 stoploaded = true;
210 else if (pg_strcasecmp("Language", defel->defname) == 0)
212 if (d->stem)
213 ereport(ERROR,
214 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
215 errmsg("multiple Language parameters")));
216 locate_stem_module(d, defGetString(defel));
218 else
220 ereport(ERROR,
221 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
222 errmsg("unrecognized Snowball parameter: \"%s\"",
223 defel->defname)));
227 if (!d->stem)
228 ereport(ERROR,
229 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
230 errmsg("missing Language parameter")));
232 d->dictCtx = CurrentMemoryContext;
234 PG_RETURN_POINTER(d);
237 Datum
238 dsnowball_lexize(PG_FUNCTION_ARGS)
240 DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
241 char *in = (char *) PG_GETARG_POINTER(1);
242 int32 len = PG_GETARG_INT32(2);
243 char *txt = lowerstr_with_len(in, len);
244 TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
246 if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
248 pfree(txt);
250 else
252 MemoryContext saveCtx;
255 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
257 if (d->needrecode)
259 char *recoded;
261 recoded = (char *) pg_do_encoding_conversion((unsigned char *) txt,
262 strlen(txt),
263 GetDatabaseEncoding(),
264 PG_UTF8);
265 if (recoded != txt)
267 pfree(txt);
268 txt = recoded;
272 /* see comment about d->dictCtx */
273 saveCtx = MemoryContextSwitchTo(d->dictCtx);
274 SN_set_current(d->z, strlen(txt), (symbol *) txt);
275 d->stem(d->z);
276 MemoryContextSwitchTo(saveCtx);
278 if (d->z->p && d->z->l)
280 txt = repalloc(txt, d->z->l + 1);
281 memcpy(txt, d->z->p, d->z->l);
282 txt[d->z->l] = '\0';
285 /* back recode if needed */
286 if (d->needrecode)
288 char *recoded;
290 recoded = (char *) pg_do_encoding_conversion((unsigned char *) txt,
291 strlen(txt),
292 PG_UTF8,
293 GetDatabaseEncoding());
294 if (recoded != txt)
296 pfree(txt);
297 txt = recoded;
301 res->lexeme = txt;
304 PG_RETURN_POINTER(res);