4 * Affix stripping stemming algorithm for Tamil
5 * By Damodharan Rajalingam
// Named string escapes for single Tamil code points. Each name can be used
// inside a string literal as '{name}'.
// aytham is U+0B83 (TAMIL SIGN VISARGA in Unicode naming).
11 stringdef aytham '{U+0B83}'
13 /* Uyir - independent vowels, U+0B85 through U+0B94 */
14 stringdef a '{U+0B85}'
15 stringdef aa '{U+0B86}'
16 stringdef i '{U+0B87}'
17 stringdef ii '{U+0B88}'
18 stringdef u '{U+0B89}'
19 stringdef uu '{U+0B8A}'
20 stringdef e '{U+0B8E}'
21 stringdef ee '{U+0B8F}'
22 stringdef ai '{U+0B90}'
23 stringdef o '{U+0B92}'
24 stringdef oo '{U+0B93}'
25 stringdef au '{U+0B94}'
// Consonant letters, U+0B95 through U+0BB5.
// NOTE(review): two code points are defined under two names each:
//   ta  and tha  -> U+0BA4
//   llla and zha -> U+0BB4
// Both spellings are used by the rules below, so these look like deliberate
// aliases rather than typos - confirm before "fixing" either one.
28 stringdef ka '{U+0B95}'
29 stringdef nga '{U+0B99}'
30 stringdef ca '{U+0B9A}'
31 stringdef ja '{U+0B9C}'
32 stringdef nya '{U+0B9E}'
33 stringdef tta '{U+0B9F}'
34 stringdef nna '{U+0BA3}'
35 stringdef ta '{U+0BA4}'
36 stringdef tha '{U+0BA4}'
37 stringdef na '{U+0BA8}'
38 stringdef nnna '{U+0BA9}'
39 stringdef pa '{U+0BAA}'
40 stringdef ma '{U+0BAE}'
41 stringdef ya '{U+0BAF}'
42 stringdef ra '{U+0BB0}'
43 stringdef rra '{U+0BB1}'
44 stringdef la '{U+0BB2}'
45 stringdef lla '{U+0BB3}'
46 stringdef llla '{U+0BB4}'
47 stringdef zha '{U+0BB4}'
48 stringdef va '{U+0BB5}'
50 /* Vatamozi - borrowed consonants, U+0BB6 through U+0BB9 */
// NOTE(review): 'ja' (U+0B9C) is defined with the consonants above rather than
// in this group - confirm whether that placement is intentional.
51 stringdef sha '{U+0BB6}'
52 stringdef ssa '{U+0BB7}'
53 stringdef sa '{U+0BB8}'
54 stringdef ha '{U+0BB9}'
57 /* Dependent vowel signs (kombu etc.), U+0BBE through U+0BCC */
// Each vs_X name mirrors the independent vowel name X defined earlier
// (for example vs_aa / aa, vs_oo / oo). There is no vs_a entry.
58 stringdef vs_aa '{U+0BBE}'
59 stringdef vs_i '{U+0BBF}'
60 stringdef vs_ii '{U+0BC0}'
61 stringdef vs_u '{U+0BC1}'
62 stringdef vs_uu '{U+0BC2}'
63 stringdef vs_e '{U+0BC6}'
64 stringdef vs_ee '{U+0BC7}'
65 stringdef vs_ai '{U+0BC8}'
66 stringdef vs_o '{U+0BCA}'
67 stringdef vs_oo '{U+0BCB}'
68 stringdef vs_au '{U+0BCC}'
// pulli is U+0BCD (TAMIL SIGN VIRAMA in Unicode naming). In the rules below it
// appears directly after consonant escapes, e.g. '{ka}{lla}{pulli}'.
71 stringdef pulli '{U+0BCD}'
// au_lmark is U+0BD7 (TAMIL AU LENGTH MARK in Unicode naming).
// NOTE(review): no rule visible in this listing references au_lmark.
74 stringdef au_lmark '{U+0BD7}'
// Routine names. Each name listed here has a matching `define <name> as (`
// further down in this file.
// NOTE(review): presumably these sit inside the `routines ( ... )` declaration;
// its opening line is not visible in this listing - confirm.
79 remove_question_suffixes
80 remove_question_prefixes
81 remove_pronoun_prefixes
82 remove_command_suffixes
84 remove_vetrumai_urupukal
90 remove_common_word_endings
// has_min_length: NOTE(review) - judging by the name alone this is a guard on
// word length; the body is not visible in this listing, so confirm the actual
// threshold before relying on it.
101 define has_min_length as (
// fix_va_start: rewrites a 'va' + vowel-sign pair found at the cursor into the
// matching independent vowel:
//   va+vs_oo -> oo,  va+vs_o -> o,  va+vs_u -> u,  va+vs_uu -> uu
// The four alternatives are joined with `or`, so they are tried top to bottom
// and the first one that succeeds ends the routine.
// NOTE(review): `try` always signals success, so the `try '...' and` prefix on
// each line does not filter anything; the bracketed literal match that follows
// is what actually decides whether an alternative applies. It looks redundant -
// confirm before removing.
105 define fix_va_start as (
106 (try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or
107 (try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or
108 (try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or
109 (try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' )
// fix_endings (plural). Do not confuse with the separately defined routine
// fix_ending (singular) further down.
// NOTE(review): the body is not visible in this listing; presumably it drives
// fix_ending - confirm.
112 define fix_endings as (
// remove_question_prefixes: marks a slice made of the independent vowel 'e',
// then one consonant from the list (ka ca tha va na pa ma ya nga nya), then
// pulli, and deletes that slice. Fails, changing nothing, if that three-part
// sequence is not present at the cursor.
116 define remove_question_prefixes as (
117 [ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
121 // Gives signal t if an ending was fixed, signal f otherwise.
// Each parenthesised entry below marks a slice with [ ... ] and then either
// deletes it or replaces it (`<-`). Entries that replace with '{pulli}' keep
// only the pulli; entries that replace with a consonant + pulli swap the
// marked cluster for that consonant.
// NOTE(review): the connectors between the entries and the direction the
// strings are matched in are not visible in this listing.
122 define fix_ending as (
125 ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
// Deleted only when the marked 'ya'+pulli is adjacent to vs_ai, vs_i or vs_ii
// (the `test` looks at the neighbour without consuming it).
127 ( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
129 ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
131 ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
// The commented-out line is an earlier, wider variant of the rule right after it.
133 // ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
134 ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
136 ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
// Applies only when the boolean found_vetrumai_urupu is set.
// NOTE(review): there is a second `]` after the replacement. It only re-marks
// the slice end and looks like a stray token - confirm before removing.
138 ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' ] )
140 ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
142 ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
// NOTE(review): same literal and same replacement as the first alternative of
// the '{vs_u}{ka}{pulli}' entry two rules up. If the entries are chained with
// `or`, this one can never be the first to match - confirm.
144 ( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' )
146 ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
// pulli + (one of ya ra la va zha lla, or one of nga nya nna na ma nnna) + pulli
// collapses to a single pulli.
148 ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
150 ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
// 'nnna'+vs_u is deleted only when the neighbouring character is NOT one of the
// listed vowel signs.
152 ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
// 'nga'+pulli: replaced by 'ma'+pulli unless vs_ai is adjacent ...
154 ( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' )
// ... otherwise this entry deletes it outright.
156 ( [ '{nga}{pulli}' ] delete )
// A pulli is deleted when it is adjacent to one of the listed vowel signs or to
// another pulli.
158 ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
// remove_pronoun_prefixes: marks a slice made of one independent vowel from
// (a, i, u), then one consonant from the list (ka ca tha va na pa ma ya nga
// nya), then pulli, and deletes it. Uses the same consonant list as
// remove_question_prefixes; only the leading vowel set differs.
162 define remove_pronoun_prefixes as (
164 [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
// remove_plural_suffix: four alternatives joined by `or`, tried in order. All
// four patterns end in 'ka'+'lla'+pulli; the first three also rewrite the
// characters in front of it, and the last one is the plain fallback.
//   vs_u nga pulli ka lla pulli -> pulli   (only when the neighbouring
//                                           character is not ka/ca/tta/tha/pa/rra)
//   rra pulli ka lla pulli      -> la pulli
//   tta pulli ka lla pulli      -> lla pulli
//   ka lla pulli                -> (deleted)
169 define remove_plural_suffix as (
172 ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or
173 ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or
174 ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or
175 ( [ '{ka}{lla}{pulli}' ] delete )
// remove_question_suffixes: marks a single vowel sign - vs_oo, vs_ee or vs_aa -
// and replaces it with pulli.
180 define remove_question_suffixes as (
185 [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}'
// remove_command_suffixes: marks 'pa'+vs_i or 'va'+vs_i and deletes it.
192 define remove_command_suffixes as (
196 [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete
// remove_um: working backwards from the end of the word, marks
// vs_u + 'ma' + pulli and replaces it with pulli alone.
201 define remove_um as (
204 backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
// remove_common_word_endings: two groups of endings are visible. The first is
// an `or` chain of literals that all begin with a vowel sign (plus one
// 'la'+pulli+'la' case); a match is replaced with pulli. The second is an
// among(...) list of longer word-like endings.
210 define remove_common_word_endings as (
211 // Strictly speaking these are not suffixes: they are
212 // separate words that get attached to other words,
213 // and they can be removed for stemming purposes.
// `test` restores the cursor afterwards; the slice replacement itself is kept.
217 test ( [ '{vs_u}{tta}{nnna}{pulli}' or
218 '{vs_i}{la}{pulli}{la}{vs_ai}' or
219 '{vs_i}{tta}{ma}{pulli}' or
220 '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
221 '{vs_aa}{ka}{vs_i}' or
222 '{vs_aa}{ka}{vs_i}{ya}' or
223 '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
224 '{vs_u}{lla}{pulli}{lla}' or
225 '{vs_u}{tta}{vs_ai}{ya}' or
226 '{vs_u}{tta}{vs_ai}' or
227 '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
// 'la'+pulli+'la' counts only when the neighbouring character is not one of the
// listed vowel signs.
228 ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
// NOTE(review): '{vs_aa}{ka}{vs_i}' already appears earlier in this same `or`
// chain, so this final alternative can never be the one that matches. It looks
// like a leftover duplicate - confirm before removing.
230 '{vs_aa}{ka}{vs_i}' ] <- '{pulli}'
// Second group: longer endings, matched through among(...).
// NOTE(review): the action applied to this group is not visible in this listing.
234 test ( [ among('{pa}{tta}{vs_u}'
235 '{pa}{tta}{pulli}{tta}'
236 '{pa}{tta}{pulli}{tta}{vs_u}'
237 '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
238 '{pa}{tta}{pulli}{tta}{nna}'
239 '{ka}{vs_u}{ra}{vs_i}{ya}'
240 '{pa}{rra}{pulli}{rra}{vs_i}'
241 '{va}{vs_i}{tta}{vs_u}'
242 '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
243 '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
246 '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}')
// remove_vetrumai_urupukal: clears the boolean found_vetrumai_urupu, then tries
// several groups of endings (each wrapped in `test`), and further down sets
// found_vetrumai_urupu again. fix_ending reads that boolean in one of its rules.
// NOTE(review): the condition under which the flag is set, and the action
// applied to the later groups, are not visible in this listing.
254 define remove_vetrumai_urupukal as (
256 unset found_vetrumai_urupu
// Group 1: 'nnna'+vs_ai is deleted outright.
260 test ( ['{nnna}{vs_ai}'] delete )
// Group 2: vs_ai based endings. A bare vs_ai is accepted either when the
// neighbouring character is not one of ka/ca/tta/tha/pa/rra, or when that
// neighbour is one of those consonants together with a pulli.
262 test ([ ( '{vs_i}{nnna}{vs_ai}' or
263 '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
264 ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
// Group 3: endings that begin with a vowel sign.
269 '{vs_o}{tta}{vs_u}' or
270 '{vs_oo}{tta}{vs_u}' or
271 '{vs_i}{la}{pulli}' or
272 '{vs_i}{rra}{pulli}' or
// vs_i+'nnna'+pulli is skipped when the neighbouring character is 'ma'.
273 ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
274 '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
275 '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
// This ending is only considered when the word length is at least 7.
277 ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
278 '{vs_aa}{la}{pulli}' or
279 '{vs_u}{tta}{vs_ai}' or
280 '{vs_aa}{ma}{la}{pulli}' or
// 'la'+pulli counts only when the neighbouring character is not one of the
// listed vowel signs.
281 ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
// Group 4: endings that begin with a consonant.
287 '{ka}{nna}{pulli}' or
288 '{ma}{vs_u}{nnna}{pulli}' or
289 '{ma}{vs_ee}{la}{pulli}' or
290 '{ma}{vs_ee}{rra}{pulli}' or
291 '{ka}{vs_ii}{llla}{pulli}' or
292 '{pa}{vs_i}{nnna}{pulli}' or
293 ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
// Shortens a long vs_ii to vs_i.
297 test ([ '{vs_ii}' ] <- '{vs_i}')
300 (set found_vetrumai_urupu)
// Final clean-up under `do`, so a non-match here does not make the routine
// fail: vs_i+'nnna'+pulli becomes pulli.
301 do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
// remove_tense_suffixes: loop driver. Each iteration first tests the boolean
// found_a_match and then runs remove_tense_suffix under `do` (which always
// signals success). The `repeat` therefore stops as soon as found_a_match is
// false when tested.
// NOTE(review): termination depends on remove_tense_suffix clearing
// found_a_match when nothing matches; that code is not visible in this listing.
306 define remove_tense_suffixes as (
308 repeat ( found_a_match (do remove_tense_suffix) )
// remove_tense_suffix: one pass of tense-ending removal, called repeatedly by
// remove_tense_suffixes. The visible lines are lists of ending literals joined
// with `or`.
// NOTE(review): the group boundaries and the action applied to each group
// (delete or replace) are mostly not visible in this listing.
311 define remove_tense_suffix as (
317 '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
// Endings that begin with a consonant.
324 '{ma}{vs_aa}{ra}{pulli}' or
325 '{ma}{vs_i}{nnna}{pulli}' or
326 '{nnna}{nnna}{pulli}' or
327 '{nnna}{vs_aa}{nnna}{pulli}' or
328 '{nnna}{vs_aa}{lla}{pulli}' or
329 '{nnna}{vs_aa}{ra}{pulli}' or
// 'va'+'nnna'+pulli counts only when the neighbouring character is not an
// independent vowel.
330 ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
331 '{nnna}{lla}{pulli}' or
332 '{va}{lla}{pulli}' or
333 '{nnna}{ra}{pulli}' or
// Single bare consonants are accepted as endings here.
335 '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
336 '{pa}{nnna}{pulli}' or
337 '{pa}{lla}{pulli}' or
339 ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
340 '{vs_i}{rra}{pulli}{rra}{vs_u}' or
342 '{nnna}{ma}{pulli}' or
343 '{ta}{vs_u}{ma}{pulli}' or
344 '{rra}{vs_u}{ma}{pulli}' or
345 '{ka}{vs_u}{ma}{pulli}' or
346 '{nnna}{vs_e}{nnna}{pulli}' or
// Endings that begin with a vowel sign.
// vs_aa+'nnna'+pulli is skipped when the neighbouring character is 'ca'.
354 ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
355 '{vs_aa}{lla}{pulli}' or
356 '{vs_aa}{ra}{pulli}' or
357 '{vs_ee}{nnna}{pulli}' or
359 '{vs_aa}{ma}{pulli}' or
360 '{vs_e}{ma}{pulli}' or
361 '{vs_ee}{ma}{pulli}' or
362 '{vs_oo}{ma}{pulli}' or
// NOTE(review): '{ka}{vs_u}{ma}{pulli}', '{ta}{vs_u}{ma}{pulli}',
// '{rra}{vs_u}{ma}{pulli}' and '{nnna}{vs_e}{nnna}{pulli}' below also appear
// earlier in this routine. If the earlier occurrence is tried first and
// succeeds, these repeats never fire - confirm whether they are intentional.
363 '{ka}{vs_u}{ma}{pulli}' or
364 '{ta}{vs_u}{ma}{pulli}' or
365 '{tta}{vs_u}{ma}{pulli}' or
366 '{rra}{vs_u}{ma}{pulli}' or
367 '{vs_aa}{ya}{pulli}' or
368 '{nnna}{vs_e}{nnna}{pulli}' or
369 '{nnna}{vs_i}{ra}{pulli}' or
370 '{vs_ii}{ra}{pulli}' or
371 '{vs_ii}{ya}{ra}{pulli}'
// 'ka'+vs_u or 'ta'+vs_u is deleted only when a pulli is adjacent.
// NOTE(review): the slice opens inside the inner parentheses and closes
// outside them ("([ ... ) ... ]"). The slice markers are ordinary commands, so
// this is accepted, but it is easy to misread.
376 test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete
// Longer endings, each listed both without and with a trailing pulli.
381 '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
382 '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
383 '{ka}{vs_i}{nnna}{pulli}{rra}'
384 '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
386 '{ka}{vs_i}{rra}{pulli}'
// Driver sequence: clears found_vetrumai_urupu, then runs the stripping
// routines in the order listed. Every call is wrapped in `do`, so a routine
// that finds nothing to strip does not abort the rest of the sequence.
// Prefix routines run before the suffix routines.
// NOTE(review): presumably this is the body of the stemmer's entry routine;
// its `define` line is not visible in this listing - confirm.
395 unset found_vetrumai_urupu
398 do remove_question_prefixes
399 do remove_pronoun_prefixes
400 do remove_question_suffixes
402 do remove_common_word_endings
403 do remove_vetrumai_urupukal
404 do remove_plural_suffix
405 do remove_command_suffixes
406 do remove_tense_suffixes