1 /*-------------------------------------------------------------------------
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
11 *-------------------------------------------------------------------------
15 #include "commands/defrem.h"
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_public.h"
19 #include "tsearch/ts_utils.h"
20 #include "utils/builtins.h"
22 /* Some platforms define MAXINT and/or MININT, causing conflicts */
30 /* Now we can include the original Snowball header.h */
31 #include "snowball/libstemmer/header.h"
32 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
33 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
34 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
35 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
36 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
37 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
38 #include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
39 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
40 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
41 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
42 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
43 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
44 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
45 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
46 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
47 #include "snowball/libstemmer/stem_UTF_8_danish.h"
48 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
49 #include "snowball/libstemmer/stem_UTF_8_english.h"
50 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
51 #include "snowball/libstemmer/stem_UTF_8_french.h"
52 #include "snowball/libstemmer/stem_UTF_8_german.h"
53 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
54 #include "snowball/libstemmer/stem_UTF_8_italian.h"
55 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
56 #include "snowball/libstemmer/stem_UTF_8_porter.h"
57 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
58 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
59 #include "snowball/libstemmer/stem_UTF_8_russian.h"
60 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
61 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
62 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
/*
 * Function-manager entry points of the Snowball dictionary:
 * dsnowball_init and dsnowball_lexize.  Each one is announced with
 * PG_FUNCTION_INFO_V1 and prototyped here as taking PG_FUNCTION_ARGS and
 * returning Datum; the definitions appear later in this file.
 */
67 PG_FUNCTION_INFO_V1(dsnowball_init
);
68 Datum
dsnowball_init(PG_FUNCTION_ARGS
);
70 PG_FUNCTION_INFO_V1(dsnowball_lexize
);
71 Datum
dsnowball_lexize(PG_FUNCTION_ARGS
);
73 /* List of supported modules */
/*
 * stemmer_module: descriptor for one Snowball stemmer implementation.
 *
 * The members visible here are three function pointers that all operate
 * on a struct SN_env.  Other code in this file also reads ->name and
 * ->enc through pointers to this struct; the declarations of those two
 * members are not visible in this excerpt.
 */
74 typedef struct stemmer_module
/* create: takes no arguments and returns a struct SN_env pointer */
78 struct SN_env
*(*create
) (void);
/* close: receives a struct SN_env pointer and returns nothing */
79 void (*close
) (struct SN_env
*);
/* stem: receives a struct SN_env pointer and returns an int */
80 int (*stem
) (struct SN_env
*);
/*
 * stemmer_modules: constant table of the available stemmers.
 *
 * Every row gives a language name, an encoding constant, and the
 * create_env / close_env / stem functions generated by Snowball for that
 * language and encoding.  The last three initializers line up with the
 * create, close and stem members of stemmer_module; the first two are
 * presumably its name and enc members (their declarations are not
 * visible in this excerpt).
 *
 * locate_stem_module() walks the table from the top and stops at the row
 * whose name is NULL, so that end-marker row has to stay last.
 */
83 static const stemmer_module stemmer_modules
[] =
86 * Stemmers list from Snowball distribution
/* Rows bound to PG_LATIN1, using the ISO_8859_1 stemmers */
88 {"danish", PG_LATIN1
, danish_ISO_8859_1_create_env
, danish_ISO_8859_1_close_env
, danish_ISO_8859_1_stem
},
89 {"dutch", PG_LATIN1
, dutch_ISO_8859_1_create_env
, dutch_ISO_8859_1_close_env
, dutch_ISO_8859_1_stem
},
90 {"english", PG_LATIN1
, english_ISO_8859_1_create_env
, english_ISO_8859_1_close_env
, english_ISO_8859_1_stem
},
91 {"finnish", PG_LATIN1
, finnish_ISO_8859_1_create_env
, finnish_ISO_8859_1_close_env
, finnish_ISO_8859_1_stem
},
92 {"french", PG_LATIN1
, french_ISO_8859_1_create_env
, french_ISO_8859_1_close_env
, french_ISO_8859_1_stem
},
93 {"german", PG_LATIN1
, german_ISO_8859_1_create_env
, german_ISO_8859_1_close_env
, german_ISO_8859_1_stem
},
94 {"hungarian", PG_LATIN1
, hungarian_ISO_8859_1_create_env
, hungarian_ISO_8859_1_close_env
, hungarian_ISO_8859_1_stem
},
95 {"italian", PG_LATIN1
, italian_ISO_8859_1_create_env
, italian_ISO_8859_1_close_env
, italian_ISO_8859_1_stem
},
96 {"norwegian", PG_LATIN1
, norwegian_ISO_8859_1_create_env
, norwegian_ISO_8859_1_close_env
, norwegian_ISO_8859_1_stem
},
97 {"porter", PG_LATIN1
, porter_ISO_8859_1_create_env
, porter_ISO_8859_1_close_env
, porter_ISO_8859_1_stem
},
98 {"portuguese", PG_LATIN1
, portuguese_ISO_8859_1_create_env
, portuguese_ISO_8859_1_close_env
, portuguese_ISO_8859_1_stem
},
99 {"spanish", PG_LATIN1
, spanish_ISO_8859_1_create_env
, spanish_ISO_8859_1_close_env
, spanish_ISO_8859_1_stem
},
100 {"swedish", PG_LATIN1
, swedish_ISO_8859_1_create_env
, swedish_ISO_8859_1_close_env
, swedish_ISO_8859_1_stem
},
/* Romanian is the only row bound to PG_LATIN2 (ISO_8859_2 stemmer) */
101 {"romanian", PG_LATIN2
, romanian_ISO_8859_2_create_env
, romanian_ISO_8859_2_close_env
, romanian_ISO_8859_2_stem
},
/* Russian is the only row bound to PG_KOI8R (KOI8_R stemmer) */
102 {"russian", PG_KOI8R
, russian_KOI8_R_create_env
, russian_KOI8_R_close_env
, russian_KOI8_R_stem
},
/*
 * Rows bound to PG_UTF8.  These are the rows that the second pass of
 * locate_stem_module() can select.  Turkish appears only in this group.
 */
103 {"danish", PG_UTF8
, danish_UTF_8_create_env
, danish_UTF_8_close_env
, danish_UTF_8_stem
},
104 {"dutch", PG_UTF8
, dutch_UTF_8_create_env
, dutch_UTF_8_close_env
, dutch_UTF_8_stem
},
105 {"english", PG_UTF8
, english_UTF_8_create_env
, english_UTF_8_close_env
, english_UTF_8_stem
},
106 {"finnish", PG_UTF8
, finnish_UTF_8_create_env
, finnish_UTF_8_close_env
, finnish_UTF_8_stem
},
107 {"french", PG_UTF8
, french_UTF_8_create_env
, french_UTF_8_close_env
, french_UTF_8_stem
},
108 {"german", PG_UTF8
, german_UTF_8_create_env
, german_UTF_8_close_env
, german_UTF_8_stem
},
109 {"hungarian", PG_UTF8
, hungarian_UTF_8_create_env
, hungarian_UTF_8_close_env
, hungarian_UTF_8_stem
},
110 {"italian", PG_UTF8
, italian_UTF_8_create_env
, italian_UTF_8_close_env
, italian_UTF_8_stem
},
111 {"norwegian", PG_UTF8
, norwegian_UTF_8_create_env
, norwegian_UTF_8_close_env
, norwegian_UTF_8_stem
},
112 {"porter", PG_UTF8
, porter_UTF_8_create_env
, porter_UTF_8_close_env
, porter_UTF_8_stem
},
113 {"portuguese", PG_UTF8
, portuguese_UTF_8_create_env
, portuguese_UTF_8_close_env
, portuguese_UTF_8_stem
},
114 {"romanian", PG_UTF8
, romanian_UTF_8_create_env
, romanian_UTF_8_close_env
, romanian_UTF_8_stem
},
115 {"russian", PG_UTF8
, russian_UTF_8_create_env
, russian_UTF_8_close_env
, russian_UTF_8_stem
},
116 {"spanish", PG_UTF8
, spanish_UTF_8_create_env
, spanish_UTF_8_close_env
, spanish_UTF_8_stem
},
117 {"swedish", PG_UTF8
, swedish_UTF_8_create_env
, swedish_UTF_8_close_env
, swedish_UTF_8_stem
},
118 {"turkish", PG_UTF8
, turkish_UTF_8_create_env
, turkish_UTF_8_close_env
, turkish_UTF_8_stem
},
/*
 * The PG_SQL_ASCII row reuses the ISO_8859_1 English stemmer.  The first
 * pass of locate_stem_module() accepts a PG_SQL_ASCII row whatever the
 * database encoding is.
 */
121 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
124 {"english", PG_SQL_ASCII
, english_ISO_8859_1_create_env
, english_ISO_8859_1_close_env
, english_ISO_8859_1_stem
},
/* End marker: the NULL name stops the scans in locate_stem_module() */
126 {NULL
, 0, NULL
, NULL
, NULL
} /* list end marker */
/*
 * DictSnowball: state of one Snowball dictionary.  dsnowball_init()
 * allocates it zero-filled and hands it back as a pointer;
 * dsnowball_lexize() receives that pointer as its first argument.
 *
 * Code in this file also uses the members ->stoplist and ->z, whose
 * declarations are not visible in this excerpt.
 */
130 typedef struct DictSnowball
/*
 * needrecode is set by locate_stem_module(): false when a stemmer was
 * found in its first pass (PG_SQL_ASCII or database encoding), true
 * when it was found in the PG_UTF8 pass.
 */
134 bool needrecode
; /* needs recoding before/after call stem */
/* stem: function pointer with the same signature as stemmer_module.stem */
135 int (*stem
) (struct SN_env
* z
);
138 * snowball saves alloced memory between calls, so we should run it in our
139 * private memory context. Note, init function is executed in long lived
140 * context, so we just remember CurrentMemoryContext
/*
 * dictCtx is assigned CurrentMemoryContext at the end of
 * dsnowball_init(); dsnowball_lexize() switches to it around its
 * SN_set_current() call.
 */
142 MemoryContext dictCtx
;
/*
 * locate_stem_module
 *
 * Search stemmer_modules for a stemmer for the language named by lang and
 * record in d->needrecode whether text must be recoded around stemmer
 * calls.
 *
 * The table is scanned twice.  Both scans stop at the row whose name is
 * NULL and compare names case-insensitively with pg_strcasecmp():
 *   1. rows whose enc is PG_SQL_ASCII or equals GetDatabaseEncoding();
 *      a match sets d->needrecode to false.
 *   2. rows whose enc is PG_UTF8; a match sets d->needrecode to true.
 * The last visible statement reports ERRCODE_UNDEFINED_OBJECT with a
 * message naming the language and the database encoding.
 *
 * NOTE(review): the return type, the braces, the statements that store
 * the chosen module in *d and the exits from the two loops are not
 * visible in this excerpt -- confirm them against the complete file.
 */
147 locate_stem_module(DictSnowball
*d
, char *lang
)
149 const stemmer_module
*m
;
152 * First, try to find exact match of stemmer module. Stemmer with
153 * PG_SQL_ASCII encoding is treated as working with any server encoding
/* Pass 1: encoding-compatible rows, no recoding needed on a match */
155 for (m
= stemmer_modules
; m
->name
; m
++)
157 if ((m
->enc
== PG_SQL_ASCII
|| m
->enc
== GetDatabaseEncoding()) &&
158 pg_strcasecmp(m
->name
, lang
) == 0)
162 d
->needrecode
= false;
168 * Second, try to find stemmer for needed language for UTF8 encoding.
/* Pass 2: PG_UTF8 rows, a match means recoding is needed */
170 for (m
= stemmer_modules
; m
->name
; m
++)
172 if (m
->enc
== PG_UTF8
&& pg_strcasecmp(m
->name
, lang
) == 0)
176 d
->needrecode
= true;
/* Error report naming the requested language and the database encoding */
182 (errcode(ERRCODE_UNDEFINED_OBJECT
),
183 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
184 lang
, GetDatabaseEncodingName())));
/*
 * dsnowball_init
 *
 * Build a DictSnowball from a list of dictionary options.
 *
 * Argument 0 is fetched as a List pointer (dictoptions) and each element
 * is read as a DefElem.  Option names are compared case-insensitively:
 *   "StopWords" - the value is passed to readstoplist() together with
 *                 &d->stoplist and lowerstr
 *   "Language"  - the value is passed to locate_stem_module()
 * Four ERRCODE_INVALID_PARAMETER_VALUE reports are visible: multiple
 * StopWords parameters, multiple Language parameters, an unrecognized
 * parameter, and a missing Language parameter.
 *
 * Returns the zero-initialised, then filled-in, DictSnowball as a
 * pointer Datum.
 *
 * NOTE(review): the conditions guarding the error reports, the updates
 * of stoploaded and the declarations of d and l are not visible in this
 * excerpt -- confirm them against the complete file.
 */
188 dsnowball_init(PG_FUNCTION_ARGS
)
190 List
*dictoptions
= (List
*) PG_GETARG_POINTER(0);
192 bool stoploaded
= false;
/* palloc0 zero-fills, so every member starts out as 0 / NULL / false */
195 d
= (DictSnowball
*) palloc0(sizeof(DictSnowball
));
197 foreach(l
, dictoptions
)
199 DefElem
*defel
= (DefElem
*) lfirst(l
);
201 if (pg_strcasecmp("StopWords", defel
->defname
) == 0)
205 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
206 errmsg("multiple StopWords parameters")));
207 readstoplist(defGetString(defel
), &d
->stoplist
, lowerstr
);
210 else if (pg_strcasecmp("Language", defel
->defname
) == 0)
214 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
215 errmsg("multiple Language parameters")));
216 locate_stem_module(d
, defGetString(defel
));
221 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
222 errmsg("unrecognized Snowball parameter: \"%s\"",
229 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
230 errmsg("missing Language parameter")));
/* Remember the current memory context; dsnowball_lexize() switches to it */
232 d
->dictCtx
= CurrentMemoryContext
;
234 PG_RETURN_POINTER(d
);
/*
 * dsnowball_lexize
 *
 * Arguments: 0 = DictSnowball pointer, 1 = pointer to the input text,
 * 2 = int32 length of that text.
 * Returns a pointer to an array of two TSLexeme allocated with palloc0
 * (so both elements start zero-filled).
 *
 * Visible processing steps:
 *   - the input is converted with lowerstr_with_len() into txt
 *   - an empty txt, or one found by searchstoplist() in d->stoplist,
 *     takes the first branch of the if
 *   - otherwise txt may be converted with pg_do_encoding_conversion(),
 *     is handed to SN_set_current() while d->dictCtx is the current
 *     memory context, and, when d->z->p and d->z->l are both non-zero,
 *     is resized to d->z->l + 1 bytes and overwritten with d->z->l
 *     bytes from d->z->p
 *   - a second pg_do_encoding_conversion() call follows under the
 *     "back recode" comment
 *
 * NOTE(review): the body of the stop-word branch, the conditions around
 * both conversions, the call of the stemmer itself, the termination of
 * txt and the assignment into res are not visible in this excerpt --
 * confirm them against the complete file.
 */
238 dsnowball_lexize(PG_FUNCTION_ARGS
)
240 DictSnowball
*d
= (DictSnowball
*) PG_GETARG_POINTER(0);
241 char *in
= (char *) PG_GETARG_POINTER(1);
242 int32 len
= PG_GETARG_INT32(2);
243 char *txt
= lowerstr_with_len(in
, len
);
244 TSLexeme
*res
= palloc0(sizeof(TSLexeme
) * 2);
/* Empty input and stop words are separated from the stemming path */
246 if (*txt
== '\0' || searchstoplist(&(d
->stoplist
), txt
))
252 MemoryContext saveCtx
;
255 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
261 recoded
= (char *) pg_do_encoding_conversion((unsigned char *) txt
,
263 GetDatabaseEncoding(),
272 /* see comment about d->dictCtx */
/* The previous context is saved in saveCtx and restored below */
273 saveCtx
= MemoryContextSwitchTo(d
->dictCtx
);
274 SN_set_current(d
->z
, strlen(txt
), (symbol
*) txt
);
276 MemoryContextSwitchTo(saveCtx
);
/* Copy out of d->z only when both the buffer and its length are non-zero */
278 if (d
->z
->p
&& d
->z
->l
)
/* One extra byte is requested beyond the d->z->l bytes that are copied */
280 txt
= repalloc(txt
, d
->z
->l
+ 1);
281 memcpy(txt
, d
->z
->p
, d
->z
->l
);
285 /* back recode if needed */
290 recoded
= (char *) pg_do_encoding_conversion((unsigned char *) txt
,
293 GetDatabaseEncoding());
304 PG_RETURN_POINTER(res
);