4 namespace Lucene
.Net
.Analysis
.Ru
6 /* ====================================================================
7 * The Apache Software License, Version 1.1
9 * Copyright (c) 2001 The Apache Software Foundation. All rights
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
24 * 3. The end-user documentation included with the redistribution,
25 * if any, must include the following acknowledgment:
26 * "This product includes software developed by the
27 * Apache Software Foundation (http://www.apache.org/)."
28 * Alternately, this acknowledgment may appear in the software itself,
29 * if and wherever such third-party acknowledgments normally appear.
31 * 4. The names "Apache" and "Apache Software Foundation" and
32 * "Apache Lucene" must not be used to endorse or promote products
33 * derived from this software without prior written permission. For
34 * written permission, please contact apache@apache.org.
36 * 5. Products derived from this software may not be called "Apache",
37 * "Apache Lucene", nor may "Apache" appear in their name, without
38 * prior written permission of the Apache Software Foundation.
40 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
44 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * ====================================================================
54 * This software consists of voluntary contributions made by many
55 * individuals on behalf of the Apache Software Foundation. For more
56 * information on the Apache Software Foundation, please see
57 * <http://www.apache.org/>.
61 /// Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
63 /// <author>Boris Okner, b.okner@rogers.com</author>
64 /// <version>$Id: RussianStemmer.cs,v 1.1.1.1 2004/04/29 22:53:51 trow Exp $</version>
65 public class RussianStemmer
// Lookup table that turns the letter index constants of this class (A .. IA) into the
// actual characters; every comparison against the input goes through charset[index].
67 private char[] charset
;
70 /// positions of RV, R1 and R2 respectively
// Stem() cuts the input at RV and Derivational() compares R2 - RV with the zone length.
72 private int RV
, R1
, R2
;
// Symbolic letter indices with the values 0..31, one per letter known to the stemmer.
// They are not characters themselves: the rest of the class turns an index into the
// real character with charset[INDEX] (for example charset[I] or charset[SOFT]).
// NOTE(review): the names look like Latin transliterations of the Russian alphabet
// (I_ presumably the short I, HARD/SOFT the hard and soft signs) - confirm against the
// charset tables supplied by callers.
77 private static char A
= (char)0;
78 private static char B
= (char)1;
79 private static char V
= (char)2;
80 private static char G
= (char)3;
81 private static char D
= (char)4;
82 private static char E
= (char)5;
83 private static char ZH
= (char)6;
84 private static char Z
= (char)7;
85 private static char I
= (char)8;
86 private static char I_
= (char)9;
87 private static char K
= (char)10;
88 private static char L
= (char)11;
89 private static char M
= (char)12;
90 private static char N
= (char)13;
91 private static char O
= (char)14;
92 private static char P
= (char)15;
93 private static char R
= (char)16;
94 private static char S
= (char)17;
95 private static char T
= (char)18;
96 private static char U
= (char)19;
97 private static char F
= (char)20;
98 private static char X
= (char)21;
99 private static char TS
= (char)22;
100 private static char CH
= (char)23;
101 private static char SH
= (char)24;
102 private static char SHCH
= (char)25;
103 private static char HARD
= (char)26;
104 private static char Y
= (char)27;
105 private static char SOFT
= (char)28;
106 private static char AE
= (char)29;
107 private static char IU
= (char)30;
108 private static char IA
= (char)31;
// Letter indices that IsVowel() treats as vowels.
113 private static char[] vowels
= { A, E, I, O, U, Y, AE, IU, IA }
;
// Perfective gerund endings, group 1. PerfectiveGerund() passes this table together with
// perfectiveGerund1Predessors, so a match is only removed after a predecessor letter.
// NOTE(review): the embedded line numbers jump (116, 119-120 absent), so this listing may
// not show every entry or the closing of the initializer.
115 private static char[][] perfectiveGerundEndings1
= {
117 new char[] { V, SH, I }
,
118 new char[] { V, SH, I, S, SOFT }
// Letters that must precede a group-1 perfective gerund ending.
// NOTE(review): the entries of this table are not visible in this listing.
121 private static char[][] perfectiveGerund1Predessors
= {
// Perfective gerund endings, group 2; PerfectiveGerund() removes them without a
// predecessor check.
126 private static char[][] perfectiveGerundEndings2
= {
129 new char[] {I, V, SH, I }
,
130 new char[] {Y, V, SH, I }
,
131 new char[] {I, V, SH, I, S, SOFT }
,
132 new char[] {Y, V, SH, I, S, SOFT }
// Adjective endings; Adjectival() looks for one of these first.
// NOTE(review): the embedded line numbers jump (136-139, 144-149 absent), so some entries
// are probably not shown in this listing.
135 private static char[][] adjectiveEndings
= {
140 new char[] { E, I_ }
,
141 new char[] { I, I_ }
,
142 new char[] { Y, I_ }
,
143 new char[] { O, I_ }
,
150 new char[] { U, IU }
,
151 new char[] { IU, IU }
,
152 new char[] { A, IA }
,
153 new char[] { IA, IA }
,
154 new char[] { O, IU }
,
155 new char[] { E, IU }
,
156 new char[] { I, M, I }
,
157 new char[] { Y, M, I }
,
158 new char[] { E, G, O }
,
159 new char[] { O, G, O }
,
160 new char[] { E, M, U }
,
161 new char[] {O, M, U }
// Participle endings, group 1; Adjectival() removes them only when preceded by an entry
// of participle1Predessors.
164 private static char[][] participleEndings1
= {
168 new char[] { V, SH }
,
169 new char[] { IU, SHCH }
// Participle endings, group 2; Adjectival() removes them without a predecessor check.
172 private static char[][] participleEndings2
= {
173 new char[] { I, V, SH }
,
174 new char[] { Y, V, SH }
,
175 new char[] { U, IU, SHCH }
// Letters that must precede a group-1 participle ending.
// NOTE(review): the entries of this table are not visible in this listing.
178 private static char[][] participle1Predessors
= {
// Reflexive endings removed by Reflexive().
183 private static char[][] reflexiveEndings
= {
184 new char[] { S, IA }
,
185 new char[] { S, SOFT }
// Verb endings, group 1.
// NOTE(review): the embedded line numbers jump (189-194, 196-200 absent), so some entries
// are probably not shown; the call that consumes this table inside Verb() is not visible
// in this listing either.
188 private static char[][] verbEndings1
= {
195 new char[] { IU, T }
,
201 new char[] { E, T, E }
,
202 new char[] { I_, T, E }
,
203 new char[] { T, SOFT }
,
204 new char[] { E, SH, SOFT }
,
205 new char[] { N, N, O }
// Verb endings, group 2; Verb() removes them without a predecessor check.
208 private static char[][] verbEndings2
= {
210 new char[] { U, IU }
,
212 new char[] { E, I_ }
,
213 new char[] { IA, T }
,
214 new char[] { U, I_ }
,
221 new char[] { I, L, A }
,
222 new char[] { Y, L, A }
,
223 new char[] { E, N, A }
,
224 new char[] { I, T, E }
,
225 new char[] { I, L, I }
,
226 new char[] { Y, L, I }
,
227 new char[] { I, L, O }
,
228 new char[] { Y, L, O }
,
229 new char[] { E, N, O }
,
230 new char[] { U, E, T }
,
231 new char[] { U, IU, T }
,
232 new char[] { E, N, Y }
,
233 new char[] { I, T, SOFT }
,
234 new char[] { Y, T, SOFT }
,
235 new char[] { I, SH, SOFT }
,
236 new char[] { E, I_, T, E }
,
237 new char[] { U, I_, T, E }
// Presumably the letters that must precede a group-1 verb ending (by analogy with the
// other *Predessors tables).
// NOTE(review): neither the entries nor the use of this table are visible in this listing.
240 private static char[][] verb1Predessors
= {
// Noun endings removed by Noun().
// NOTE(review): the embedded line numbers jump (246-258, 262-263, 266-269 absent), so
// some entries are probably not shown in this listing.
245 private static char[][] nounEndings
= {
259 new char[] { SOFT, E }
,
260 new char[] { IA, X }
,
261 new char[] { I, IU }
,
264 new char[] { E, I_ }
,
265 new char[] { O, I_ }
,
270 new char[] { SOFT, IU }
,
271 new char[] { I, IA }
,
272 new char[] { SOFT, IA }
,
273 new char[] { I, I_ }
,
274 new char[] { IA, M }
,
275 new char[] { IA, M, I }
,
276 new char[] { A, M, I }
,
277 new char[] { I, E, I_ }
,
278 new char[] { I, IA, M }
,
279 new char[] { I, E, M }
,
280 new char[] { I, IA, X }
,
281 new char[] { I, IA, M, I }
// Superlative endings removed by Superlative().
284 private static char[][] superlativeEndings
= {
285 new char[] { E, I_, SH }
,
286 new char[] { E, I_, SH, E }
// Derivational endings; Derivational() removes one only when it lies inside R2.
289 private static char[][] derivationalEndings
= {
290 new char[] { O, S, T }
,
291 new char[] { O, S, T, SOFT }
295 /// RussianStemmer constructor comment.
/// Parameterless constructor; the static Stem(String, char[]) helper uses it and then
/// supplies the charset through SetCharset().
// NOTE(review): the constructor body is not visible in this listing.
297 public RussianStemmer()
302 /// RussianStemmer constructor comment.
/// Creates a stemmer that uses the given charset table.
304 /// <param name="charset">table indexed by the letter constants (A .. IA) that yields the actual characters</param>
305 public RussianStemmer(char[] charset
)
// the array reference is stored as-is, no copy is made
307 this.charset
= charset
;
311 /// Adjectival ending is an adjective ending,
312 /// optionally preceded by participle ending.
313 /// Creation date: (17/03/2002 12:14:58 AM)
315 /// <param name="stemmingZone">StringBuilder holding the part of the word being stemmed; matched endings are cut from it in place</param>
316 /// <returns>presumably true when an adjective ending was removed (Stem() falls through to Verb()/Noun() otherwise) - the return statements are not visible in this listing</returns>
317 private bool Adjectival(StringBuilder stemmingZone
)
319 // look for adjective ending in a stemming zone
320 if (!FindAndRemoveEnding(stemmingZone
, adjectiveEndings
))
// NOTE(review): the branch taken when no adjective ending is found is not visible here.
322 // if adjective ending was found, try for participle ending
// group-1 participle endings are only removed after a participle1Predessors letter
324 FindAndRemoveEnding(stemmingZone
, participleEndings1
, participle1Predessors
)
// NOTE(review): whatever joins these two calls (original line 325) is not visible here.
326 FindAndRemoveEnding(stemmingZone
, participleEndings2
);
331 /// Derivational endings
/// Looks for a derivational ending at the end of the zone and cuts it off only when the
/// ending starts at or after the R2 position.
332 /// Creation date: (17/03/2002 12:14:58 AM)
334 /// <param name="stemmingZone">StringBuilder holding the part of the word being stemmed; truncated in place</param>
335 /// <returns>NOTE(review): the return statements are not visible in this listing</returns>
336 private bool Derivational(StringBuilder stemmingZone
)
338 int endingLength
= FindEnding(stemmingZone
, derivationalEndings
);
339 if (endingLength
== 0)
340 // no derivational ending found
344 // Ensure that the ending locates in R2
// Stem() builds the zone from input.Substring(RV), so R2 - RV is the offset of R2 inside
// the zone and Length - endingLength is the offset where the ending starts.
345 if (R2
- RV
<= stemmingZone
.Length
- endingLength
)
// shrinking Length truncates the builder, which drops the ending
347 stemmingZone
.Length
= stemmingZone
.Length
- endingLength
;
358 /// Finds ending among given ending class and returns the length of ending found(0, if not found).
359 /// Creation date: (17/03/2002 8:18:34 PM)
361 /// <param name="stemmingZone">text that is searched; it is only read here</param>
362 /// <param name="startIndex">index in stemmingZone of the last character that takes part in the comparison</param>
363 /// <param name="theEndingClass">candidate endings, each an array of letter indices</param>
364 /// <returns>length of the matching ending (0 if none, per the summary above; that return is not visible in this listing)</returns>
365 private int FindEnding(StringBuilder stemmingZone
, int startIndex
, char[][] theEndingClass
)
// candidates are tried from the last table entry down to the first
368 for (int i
= theEndingClass
.Length
- 1; i
>= 0; i
--)
370 char[] theEnding
= theEndingClass
[i
];
371 // check if the ending is bigger than stemming zone
372 if (startIndex
< theEnding
.Length
- 1)
// NOTE(review): the statements at original lines 373-377 are not visible in this listing.
// walk the ending and the zone backwards in lock-step
378 int stemmingIndex
= startIndex
;
379 for (int j
= theEnding
.Length
- 1; j
>= 0; j
--)
// the ending stores letter indices, so each one is translated through charset first
381 if (stemmingZone
[stemmingIndex
--] != charset
[theEnding
[j
]])
387 // check if ending was found
390 return theEndingClass
[i
].Length
; // cut ending
// Convenience overload: searches for an ending at the very end of the stemming zone by
// passing the index of its last character as the start index.
396 private int FindEnding(StringBuilder stemmingZone
, char[][] theEndingClass
)
398 return FindEnding(stemmingZone
, stemmingZone
.Length
- 1, theEndingClass
);
402 /// Finds the ending among the given class of endings and removes it from stemming zone.
403 /// Creation date: (17/03/2002 8:18:34 PM)
405 /// <param name="stemmingZone">text to search; truncated in place when an ending is cut</param>
406 /// <param name="theEndingClass">candidate endings, each an array of letter indices</param>
407 /// <returns>NOTE(review): the return statements are not visible in this listing; callers combine the result with || as a "something was removed" flag</returns>
408 private bool FindAndRemoveEnding(StringBuilder stemmingZone
, char[][] theEndingClass
)
410 int endingLength
= FindEnding(stemmingZone
, theEndingClass
);
// FindEnding reports 0 when none of the endings matched
411 if (endingLength
== 0)
// shrinking Length truncates the builder by the length of the matched ending
416 stemmingZone
.Length
= stemmingZone
.Length
- endingLength
;
417 // cut the ending found
423 /// Finds the ending among the given class of endings, then checks if this ending was
424 /// preceded by any of given predecessors, and if so, removes it from stemming zone.
425 /// Creation date: (17/03/2002 8:18:34 PM)
427 /// <param name="stemmingZone">text to search; truncated in place when an ending is cut</param>
428 /// <param name="theEndingClass">candidate endings, each an array of letter indices</param>
429 /// <param name="thePredessors">letter sequences of which one must directly precede the ending</param>
430 /// <returns>NOTE(review): the return statements are not visible in this listing</returns>
431 private bool FindAndRemoveEnding(StringBuilder stemmingZone
,
432 char[][] theEndingClass
, char[][] thePredessors
)
434 int endingLength
= FindEnding(stemmingZone
, theEndingClass
);
// FindEnding reports 0 when none of the endings matched
435 if (endingLength
== 0)
// second search: start just before the ending that was found
440 int predessorLength
=
441 FindEnding(stemmingZone
,
442 stemmingZone
.Length
- endingLength
- 1,
// NOTE(review): the last argument of this call (original line 443) is not visible in
// this listing; per the summary it should be the predecessor table.
444 if (predessorLength
== 0)
// only the ending is cut; the predecessor length is not subtracted here
448 stemmingZone
.Length
= stemmingZone
.Length
- endingLength
;
449 // cut the ending found
457 /// Marks positions of RV, R1 and R2 in a given word.
/// The word is scanned left to right in alternating runs of non-vowels and vowels; the
/// scan stops early as soon as the end of the word is reached.
458 /// Creation date: (16/03/2002 3:40:11 PM)
460 /// <param name="word">word to analyse; characters are classified with IsVowel()</param>
461 private void MarkPositions(String word
)
// NOTE(review): the declaration of i and the assignments to RV, R1 and R2 are not
// visible in this listing (original lines 462-467, 474-475, 482-483, ... are absent).
// skip the leading non-vowels
468 while (word
.Length
> i
&& !IsVowel(word
[i
]))
// ++i steps past the character the loop stopped on before testing for end of word
472 if (word
.Length
- 1 < ++i
)
473 return; // RV zone is empty
// skip the following run of vowels
476 while (word
.Length
> i
&& IsVowel(word
[i
]))
480 if (word
.Length
- 1 < ++i
)
481 return; // R1 zone is empty
// skip the next run of non-vowels
484 while (word
.Length
> i
&& !IsVowel(word
[i
]))
488 if (word
.Length
- 1 < ++i
)
489 return; // R2 zone is empty
// skip the next run of vowels
490 while (word
.Length
> i
&& IsVowel(word
[i
]))
494 if (word
.Length
- 1 < ++i
)
495 return; // R2 zone is empty
500 /// Checks if character is a vowel..
501 /// Creation date: (16/03/2002 10:47:03 PM)
503 /// <param name="letter">character from the word, in the encoding described by charset</param>
504 /// <returns>NOTE(review): the return statements are not visible in this listing; MarkPositions() uses the result as "letter is a vowel"</returns>
505 private bool IsVowel(char letter
)
// linear scan over the vowel table
507 for (int i
= 0; i
< vowels
.Length
; i
++)
// vowels holds letter indices, so each one is translated through charset before comparing
509 if (letter
== charset
[vowels
[i
]])
/// Noun endings: removes an ending listed in nounEndings from the end of the zone.
517 /// Creation date: (17/03/2002 12:14:58 AM)
519 /// <param name="stemmingZone">text to search; truncated in place when an ending is cut</param>
520 /// <returns>the result of FindAndRemoveEnding for the noun ending table</returns>
521 private bool Noun(StringBuilder stemmingZone
)
523 return FindAndRemoveEnding(stemmingZone
, nounEndings
);
527 /// Perfective gerund endings.
/// Tries the group-1 endings (which require a predecessor letter) first and falls back
/// to the group-2 endings only when that attempt returns false.
528 /// Creation date: (17/03/2002 12:14:58 AM)
530 /// <param name="stemmingZone">text to search; truncated in place when an ending is cut</param>
531 /// <returns>true when either FindAndRemoveEnding call returns true</returns>
532 private bool PerfectiveGerund(StringBuilder stemmingZone
)
534 return FindAndRemoveEnding(
// NOTE(review): the first argument (original line 535) is not visible in this listing.
536 perfectiveGerundEndings1
,
537 perfectiveGerund1Predessors
)
538 || FindAndRemoveEnding(stemmingZone
, perfectiveGerundEndings2
);
542 /// Reflexive endings.
/// Removes an ending listed in reflexiveEndings from the end of the zone.
543 /// Creation date: (17/03/2002 12:14:58 AM)
545 /// <param name="stemmingZone">text to search; truncated in place when an ending is cut</param>
546 /// <returns>the result of FindAndRemoveEnding for the reflexive ending table</returns>
547 private bool Reflexive(StringBuilder stemmingZone
)
549 return FindAndRemoveEnding(stemmingZone
, reflexiveEndings
);
553 /// Removes the last character of the stemming zone when it is the letter I (charset[I]).
554 /// Creation date: (17/03/2002 12:14:58 AM)
556 /// <param name="stemmingZone">text to inspect; shortened by one character on a match</param>
557 /// <returns>NOTE(review): the return statements are not visible in this listing</returns>
558 private bool RemoveI(StringBuilder stemmingZone
)
// the length test guards the index expression against an empty zone
560 if (stemmingZone
.Length
> 0
561 && stemmingZone
[stemmingZone
.Length
- 1] == charset
[I
])
// shrinking Length by one drops the trailing character
563 stemmingZone
.Length
= stemmingZone
.Length
- 1;
573 /// Removes the last character of the stemming zone when it is the SOFT letter (charset[SOFT]).
574 /// Creation date: (17/03/2002 12:14:58 AM)
576 /// <param name="stemmingZone">text to inspect; shortened by one character on a match</param>
577 /// <returns>NOTE(review): the return statements are not visible in this listing</returns>
578 private bool RemoveSoft(StringBuilder stemmingZone
)
// the length test guards the index expression against an empty zone
580 if (stemmingZone
.Length
> 0
581 && stemmingZone
[stemmingZone
.Length
- 1] == charset
[SOFT
])
// shrinking Length by one drops the trailing character
583 stemmingZone
.Length
= stemmingZone
.Length
- 1;
593 /// Replaces the charset table used to translate letter indices into characters.
594 /// Creation date: (16/03/2002 10:58:42 PM)
596 /// <param name="newCharset">new table indexed by the letter constants (A .. IA); the reference is stored without copying</param>
597 public void SetCharset(char[] newCharset
)
599 charset
= newCharset
;
603 /// Set ending definition as in Russian stemming algorithm.
/// Assigns freshly allocated arrays to the vowel table and to every ending table.
604 /// Creation date: (16/03/2002 11:16:36 PM)
// The assigned fields are static, so a call replaces the tables for every instance.
// NOTE(review): no call to SetEndings is visible in this listing, and several table rows
// and closing braces are absent (the embedded line numbers jump).
606 private void SetEndings()
608 vowels
= new char[] { A, E, I, O, U, Y, AE, IU, IA }
;
// perfective gerund, group 1 (needs a predecessor letter)
610 perfectiveGerundEndings1
= new char[][] {
611 new char[] { V }
, new char[] { V, SH, I }
, new char[] { V, SH, I, S, SOFT }
// letters allowed in front of a group-1 perfective gerund ending
614 perfectiveGerund1Predessors
= new char[][] {
615 new char[] { A }
, new char[] { IA }
// perfective gerund, group 2
618 perfectiveGerundEndings2
= new char[][] {
621 new char[] { I, V, SH, I }
,
622 new char[] { Y, V, SH, I }
,
623 new char[] { I, V, SH, I, S, SOFT }
,
624 new char[] { Y, V, SH, I, S, SOFT }
// adjective endings
627 adjectiveEndings
= new char[][] {
632 new char[] { E, I_ }
,
633 new char[] { I, I_ }
,
634 new char[] { Y, I_ }
,
635 new char[] { O, I_ }
,
642 new char[] { U, IU }
,
643 new char[] { IU, IU }
,
644 new char[] { A, IA }
,
645 new char[] { IA, IA }
,
646 new char[] { O, IU }
,
647 new char[] { E, IU }
,
648 new char[] { I, M, I }
,
649 new char[] { Y, M, I }
,
650 new char[] { E, G, O }
,
651 new char[] { O, G, O }
,
652 new char[] { E, M, U }
,
653 new char[] { O, M, U }
// participle endings, group 1 (needs a predecessor letter)
656 participleEndings1
= new char[][] {
660 new char[] { V, SH }
,
661 new char[] { IU, SHCH }
// participle endings, group 2
664 participleEndings2
= new char[][] {
665 new char[] { I, V, SH }
,
666 new char[] { Y, V, SH }
,
667 new char[] { U, IU, SHCH }
// NOTE(review): the entries of participle1Predessors are not visible in this listing.
670 participle1Predessors
= new char[][] {
// reflexive endings
675 reflexiveEndings
= new char[][] {
676 new char[] { S, IA }
,
677 new char[] { S, SOFT }
// verb endings, group 1
680 verbEndings1
= new char[][] {
687 new char[] { IU, T }
,
693 new char[] { E, T, E }
,
694 new char[] { I_, T, E }
,
695 new char[] { T, SOFT }
,
696 new char[] { E, SH, SOFT }
,
697 new char[] { N, N, O }
// verb endings, group 2
700 verbEndings2
= new char[][] {
702 new char[] { U, IU }
,
704 new char[] { E, I_ }
,
705 new char[] { IA, T }
,
706 new char[] { U, I_ }
,
713 new char[] { I, L, A }
,
714 new char[] { Y, L, A }
,
715 new char[] { E, N, A }
,
716 new char[] { I, T, E }
,
717 new char[] { I, L, I }
,
718 new char[] { Y, L, I }
,
719 new char[] { I, L, O }
,
720 new char[] { Y, L, O }
,
721 new char[] { E, N, O }
,
722 new char[] { U, E, T }
,
723 new char[] { U, IU, T }
,
724 new char[] { E, N, Y }
,
725 new char[] { I, T, SOFT }
,
726 new char[] { Y, T, SOFT }
,
727 new char[] { I, SH, SOFT }
,
728 new char[] { E, I_, T, E }
,
729 new char[] { U, I_, T, E }
// NOTE(review): the entries of verb1Predessors are not visible in this listing.
732 verb1Predessors
= new char[][] {
// noun endings
737 nounEndings
= new char[][] {
751 new char[] { SOFT, E }
,
752 new char[] { IA, X }
,
753 new char[] { I, IU }
,
756 new char[] { E, I_ }
,
757 new char[] { O, I_ }
,
762 new char[] { SOFT, IU }
,
763 new char[] { I, IA }
,
764 new char[] { SOFT, IA }
,
765 new char[] { I, I_ }
,
766 new char[] { IA, M }
,
767 new char[] { IA, M, I }
,
768 new char[] { A, M, I }
,
769 new char[] { I, E, I_ }
,
770 new char[] { I, IA, M }
,
771 new char[] { I, E, M }
,
772 new char[] { I, IA, X }
,
773 new char[] { I, IA, M, I }
// superlative endings
776 superlativeEndings
= new char[][] {
777 new char[] { E, I_, SH }
,
778 new char[] { E, I_, SH, E }
// derivational endings
781 derivationalEndings
= new char[][] {
782 new char[] { O, S, T }
,
783 new char[] { O, S, T, SOFT }
788 /// Finds the stem for given Russian word.
/// Only the part of the word from position RV onwards (the stemming zone) is modified;
/// the prefix before RV is copied to the result unchanged.
789 /// Creation date: (16/03/2002 3:36:48 PM)
791 /// <param name="input">word to stem, in the encoding described by the current charset</param>
792 /// <returns>the input itself when no RV zone was detected, otherwise input[0..RV) followed by the reduced stemming zone</returns>
793 public String
Stem(String input
)
795 MarkPositions(input
);
// NOTE(review): the condition guarding this early return (original line 796) is not
// visible in this listing.
797 return input
; //RV wasn't detected, nothing to stem
798 StringBuilder stemmingZone
= new StringBuilder(input
.Substring(RV
));
799 // stemming goes on in RV
// each helper below cuts its ending from stemmingZone in place
// NOTE(review): braces and connecting lines between these steps (original lines
// 800-801, 803, 805, 809-810, ...) are not visible, so the exact nesting is unconfirmed.
802 if (!PerfectiveGerund(stemmingZone
))
804 Reflexive(stemmingZone
);
// || short-circuits: Verb is tried only if Adjectival returns false, Noun only if both do
806 Adjectival(stemmingZone
)
807 || Verb(stemmingZone
)
808 || Noun(stemmingZone
);
811 RemoveI(stemmingZone
);
813 Derivational(stemmingZone
);
815 Superlative(stemmingZone
);
816 UndoubleN(stemmingZone
);
817 RemoveSoft(stemmingZone
);
// re-attach the untouched prefix in front of the reduced zone
819 return input
.Substring(0, RV
) + stemmingZone
.ToString();
823 /// Superlative endings.
/// Removes an ending listed in superlativeEndings from the end of the zone.
824 /// Creation date: (17/03/2002 12:14:58 AM)
826 /// <param name="stemmingZone">text to search; truncated in place when an ending is cut</param>
827 /// <returns>the result of FindAndRemoveEnding for the superlative ending table</returns>
828 private bool Superlative(StringBuilder stemmingZone
)
830 return FindAndRemoveEnding(stemmingZone
, superlativeEndings
);
/// Drops one trailing character when the zone ends with an ending from doubleN.
835 /// Creation date: (17/03/2002 12:14:58 AM)
837 /// <param name="stemmingZone">text to inspect; shortened by one character on a match</param>
838 /// <returns>NOTE(review): the return statements are not visible in this listing</returns>
839 private bool UndoubleN(StringBuilder stemmingZone
)
// NOTE(review): the declaration of doubleN (original lines 840-843) is not visible in
// this listing; judging by the method name it presumably holds the pair N, N.
844 if (FindEnding(stemmingZone
, doubleN
) != 0)
// only one character is removed, whatever the length of the matched ending
846 stemmingZone
.Length
= stemmingZone
.Length
- 1;
/// Verb endings: tries a first FindAndRemoveEnding call and, when that returns false,
/// removes a group-2 verb ending.
857 /// Creation date: (17/03/2002 12:14:58 AM)
859 /// <param name="stemmingZone">text to search; truncated in place when an ending is cut</param>
860 /// <returns>true when either FindAndRemoveEnding call returns true</returns>
861 private bool Verb(StringBuilder stemmingZone
)
863 return FindAndRemoveEnding(
// NOTE(review): the arguments of the first call (original lines 864-866) are not visible
// in this listing; presumably stemmingZone, verbEndings1 and verb1Predessors.
867 || FindAndRemoveEnding(stemmingZone
, verbEndings2
);
871 /// Static method for stemming with different charsets
/// Creates a new stemmer for every call, installs the charset and stems the word.
873 /// <param name="theWord">word to stem</param>
874 /// <param name="charset">charset table handed to SetCharset() before stemming</param>
875 /// <returns>the result of the instance method Stem(String) for theWord</returns>
876 public static String
Stem(String theWord
, char[] charset
)
878 RussianStemmer stemmer
= new RussianStemmer();
// the parameterless constructor is used, so the charset has to be installed separately
879 stemmer
.SetCharset(charset
);
880 return stemmer
.Stem(theWord
);