4 using System
.Collections
;
5 using Lucene
.Net
.Analysis
;
7 namespace Lucene
.Net
.Analysis
.Ru
9 /* ====================================================================
10 * The Apache Software License, Version 1.1
12 * Copyright (c) 2001 The Apache Software Foundation. All rights
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in
24 * the documentation and/or other materials provided with the
27 * 3. The end-user documentation included with the redistribution,
28 * if any, must include the following acknowledgment:
29 * "This product includes software developed by the
30 * Apache Software Foundation (http://www.apache.org/)."
31 * Alternately, this acknowledgment may appear in the software itself,
32 * if and wherever such third-party acknowledgments normally appear.
34 * 4. The names "Apache" and "Apache Software Foundation" and
35 * "Apache Lucene" must not be used to endorse or promote products
36 * derived from this software without prior written permission. For
37 * written permission, please contact apache@apache.org.
39 * 5. Products derived from this software may not be called "Apache",
40 * "Apache Lucene", nor may "Apache" appear in their name, without
41 * prior written permission of the Apache Software Foundation.
43 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
44 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
45 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
47 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
48 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
49 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
50 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
51 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
52 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
53 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * ====================================================================
57 * This software consists of voluntary contributions made by many
58 * individuals on behalf of the Apache Software Foundation. For more
59 * information on the Apache Software Foundation, please see
60 * <http://www.apache.org/>.
/// <summary>
/// Analyzer for the Russian language. Supports an external list of stopwords (words that
/// will not be indexed at all).
/// A default set of stopwords is used unless an alternative list is specified.
/// </summary>
68 /// <author>Boris Okner, b.okner@rogers.com</author>
69 /// <version>$Id: RussianAnalyzer.cs,v 1.1.1.1 2004/04/29 22:53:51 trow Exp $</version>
71 public sealed class RussianAnalyzer
: Analyzer
// Letter positions used to spell the stop words below. Each value is an index
// into a charset table (MakeStopWords looks up charset[position]), running
// from 0 to 31 in the order the letters are declared here.
// They are declared const because they are fixed lookup indices that must
// never be reassigned at run time.
private const char A = (char)0;
private const char B = (char)1;
private const char V = (char)2;
private const char G = (char)3;
private const char D = (char)4;
private const char E = (char)5;
private const char ZH = (char)6;
private const char Z = (char)7;
private const char I = (char)8;
private const char I_ = (char)9;
private const char K = (char)10;
private const char L = (char)11;
private const char M = (char)12;
private const char N = (char)13;
private const char O = (char)14;
private const char P = (char)15;
private const char R = (char)16;
private const char S = (char)17;
private const char T = (char)18;
private const char U = (char)19;
private const char F = (char)20;
private const char X = (char)21;
private const char TS = (char)22;
private const char CH = (char)23;
private const char SH = (char)24;
private const char SHCH = (char)25;
private const char HARD = (char)26;
private const char Y = (char)27;
private const char SOFT = (char)28;
private const char AE = (char)29;
private const char IU = (char)30;
private const char IA = (char)31;
/// <summary>
/// List of typical Russian stopwords. Each word is spelled as a sequence of
/// letter positions (the constants A..IA) rather than concrete characters, so
/// it can be translated into any charset by MakeStopWords.
/// </summary>
/// <remarks>
/// The field is readonly so the table reference cannot be swapped out after
/// type initialization; the arrays themselves remain ordinary mutable arrays.
/// </remarks>
private static readonly char[][] RUSSIAN_STOP_WORDS =
{
    new char[] {B, E, Z},
    new char[] {B, O, L, E, E},
    new char[] {B, Y, L},
    new char[] {B, Y, L, A},
    new char[] {B, Y, L, I},
    new char[] {B, Y, L, O},
    new char[] {B, Y, T, SOFT},
    new char[] {V, A, M},
    new char[] {V, A, S},
    new char[] {V, E, S, SOFT},
    new char[] {V, O, T},
    new char[] {V, S, E},
    new char[] {V, S, E, G, O},
    new char[] {V, S, E, X},
    new char[] {G, D, E},
    new char[] {D, A, ZH, E},
    new char[] {D, L, IA},
    new char[] {E, G, O},
    new char[] {E, S, L, I},
    new char[] {E, S, T, SOFT},
    new char[] {E, SHCH, E},
    new char[] {Z, D, E, S, SOFT},
    new char[] {I, L, I},
    new char[] {K, A, K},
    new char[] {K, O, G, D, A},
    new char[] {K, T, O},
    new char[] {L, I, B, O},
    new char[] {M, N, E},
    new char[] {M, O, ZH, E, T},
    new char[] {N, A, D, O},
    new char[] {N, A, SH},
    new char[] {N, E, G, O},
    new char[] {N, E, E},
    new char[] {N, E, T},
    new char[] {N, I, X},
    new char[] {O, D, N, A, K, O},
    new char[] {O, N, A},
    new char[] {O, N, I},
    new char[] {O, N, O},
    new char[] {O, CH, E, N, SOFT},
    new char[] {P, O, D},
    new char[] {P, R, I},
    new char[] {T, A, K},
    new char[] {T, A, K, ZH, E},
    new char[] {T, A, K, O, I_},
    new char[] {T, A, M},
    new char[] {T, E, M},
    new char[] {T, O, G, O},
    new char[] {T, O, ZH, E},
    new char[] {T, O, I_},
    new char[] {T, O, L, SOFT, K, O},
    new char[] {T, O, M},
    new char[] {U, ZH, E},
    new char[] {X, O, T, IA},
    new char[] {CH, E, G, O},
    new char[] {CH, E, I_},
    new char[] {CH, E, M},
    new char[] {CH, T, O},
    new char[] {CH, T, O, B, Y},
    new char[] {CH, SOFT, E},
    new char[] {CH, SOFT, IA},
    new char[] {AE, T, A},
    new char[] {AE, T, I},
    new char[] {AE, T, O},
};
/// <summary>
/// Charset for Russian letters: the encoding of the 32 lowercase Russian
/// letters. Predefined charsets can be taken from the RussianCharsets class.
/// The same table is passed to the tokenizer and filters built in TokenStream.
/// </summary>
private char[] charset;

/// <summary>
/// Stop words handed to the StopFilter when a token stream is built.
/// </summary>
private Hashtable stoptable = new Hashtable();
/// <summary>
/// Builds an analyzer that uses the RussianCharsets.UnicodeRussian charset and
/// the built-in stop word list translated into that charset.
/// </summary>
public RussianAnalyzer()
{
    this.charset = RussianCharsets.UnicodeRussian;

    // Spell the built-in stop words in the Unicode charset, then index them.
    String[] defaultWords = MakeStopWords(RussianCharsets.UnicodeRussian);
    this.stoptable = StopFilter.MakeStopTable(defaultWords);
}
/// <summary>
/// Builds an analyzer for the given charset, using the built-in stop word
/// list translated into that charset.
/// </summary>
/// <param name="charset">Charset table; every letter position used by the built-in stop words is looked up in it.</param>
public RussianAnalyzer(char[] charset)
{
    // Spell the built-in stop words with the caller's charset before indexing them.
    String[] translated = MakeStopWords(charset);
    this.stoptable = StopFilter.MakeStopTable(translated);
    this.charset = charset;
}
/// <summary>
/// Builds an analyzer with the given charset and a caller-supplied stop word list.
/// </summary>
/// <param name="charset">Charset table stored for use when token streams are built.</param>
/// <param name="stopwords">Stop words, passed as-is to StopFilter.MakeStopTable.</param>
public RussianAnalyzer(char[] charset, String[] stopwords)
{
    Hashtable table = StopFilter.MakeStopTable(stopwords);
    this.charset = charset;
    this.stoptable = table;
}
/// <summary>
/// Translates the built-in Russian stop words into strings, using the given charset.
/// </summary>
/// <param name="charset">Charset table; each letter position of a stop word is replaced by charset[position].</param>
/// <returns>One string per entry of RUSSIAN_STOP_WORDS, in the same order.</returns>
private static String[] MakeStopWords(char[] charset)
{
    String[] translated = new String[RUSSIAN_STOP_WORDS.Length];
    int slot = 0;
    foreach (char[] letterPositions in RUSSIAN_STOP_WORDS)
    {
        // Replace every letter position with the concrete character of the charset.
        char[] letters = new char[letterPositions.Length];
        for (int k = 0; k < letters.Length; k++)
        {
            letters[k] = charset[letterPositions[k]];
        }
        translated[slot++] = new String(letters);
    }
    return translated;
}
/// <summary>
/// Builds an analyzer with the given charset and a ready-made stop word table.
/// </summary>
/// <param name="charset">Charset table stored for use when token streams are built.</param>
/// <param name="stopwords">Stop word table, stored by reference without copying.</param>
public RussianAnalyzer(char[] charset, Hashtable stopwords)
{
    this.stoptable = stopwords;
    this.charset = charset;
}
/// <summary>
/// Creates a TokenStream which tokenizes all the text in the provided TextReader.
/// </summary>
/// <param name="fieldName"></param>
/// <param name="reader"></param>
/// <returns>
/// A TokenStream built from a RussianLetterTokenizer filtered with
/// RussianLowerCaseFilter, StopFilter, and RussianStemFilter.
/// </returns>
299 public override TokenStream
TokenStream(String fieldName
, TextReader reader
)
301 TokenStream result
= new RussianLetterTokenizer(reader
, charset
);
302 result
= new RussianLowerCaseFilter(result
, charset
);
303 result
= new StopFilter(result
, stoptable
);
304 result
= new RussianStemFilter(result
, charset
);