First post!
[beagle.git] / Lucene.Net / Analysis / RU / RussianAnalyzer.cs
blobb32095593ab887778fc112cca5ac9cb34d91f1df
1 using System;
2 using System.Text;
3 using System.IO;
4 using System.Collections;
5 using Lucene.Net.Analysis;
7 namespace Lucene.Net.Analysis.Ru
9 /* ====================================================================
10 * The Apache Software License, Version 1.1
12 * Copyright (c) 2001 The Apache Software Foundation. All rights
13 * reserved.
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in
24 * the documentation and/or other materials provided with the
25 * distribution.
27 * 3. The end-user documentation included with the redistribution,
28 * if any, must include the following acknowledgment:
29 * "This product includes software developed by the
30 * Apache Software Foundation (http://www.apache.org/)."
31 * Alternately, this acknowledgment may appear in the software itself,
32 * if and wherever such third-party acknowledgments normally appear.
34 * 4. The names "Apache" and "Apache Software Foundation" and
35 * "Apache Lucene" must not be used to endorse or promote products
36 * derived from this software without prior written permission. For
37 * written permission, please contact apache@apache.org.
39 * 5. Products derived from this software may not be called "Apache",
40 * "Apache Lucene", nor may "Apache" appear in their name, without
41 * prior written permission of the Apache Software Foundation.
43 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
44 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
45 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
47 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
48 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
49 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
50 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
51 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
52 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
53 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * SUCH DAMAGE.
55 * ====================================================================
57 * This software consists of voluntary contributions made by many
58 * individuals on behalf of the Apache Software Foundation. For more
59 * information on the Apache Software Foundation, please see
60 * <http://www.apache.org/>.
63 /// <summary>
64 /// Analyzer for Russian language. Supports an external list of stopwords (words that
65 /// will not be indexed at all).
66 /// A default set of stopwords is used unless an alternative list is specified.
67 /// </summary>
68 /// <author>Boris Okner, b.okner@rogers.com</author>
69 /// <version>$Id: RussianAnalyzer.cs,v 1.1.1.1 2004/04/29 22:53:51 trow Exp $</version>
70 ///
71 public sealed class RussianAnalyzer : Analyzer
// Letter indices. Each constant is a position (0..31) in a charset table;
// MakeStopWords uses them to index the charset it is given.
// NOTE(review): the names look like Latin transliterations of the Russian
// letters in alphabetical order - confirm against RussianCharsets.
// Declared const because the values are fixed and never reassigned.
private const char A = (char)0;
private const char B = (char)1;
private const char V = (char)2;
private const char G = (char)3;
private const char D = (char)4;
private const char E = (char)5;
private const char ZH = (char)6;
private const char Z = (char)7;
private const char I = (char)8;
private const char I_ = (char)9;
private const char K = (char)10;
private const char L = (char)11;
private const char M = (char)12;
private const char N = (char)13;
private const char O = (char)14;
private const char P = (char)15;
private const char R = (char)16;
private const char S = (char)17;
private const char T = (char)18;
private const char U = (char)19;
private const char F = (char)20;
private const char X = (char)21;
private const char TS = (char)22;
private const char CH = (char)23;
private const char SH = (char)24;
private const char SHCH = (char)25;
private const char HARD = (char)26;
private const char Y = (char)27;
private const char SOFT = (char)28;
private const char AE = (char)29;
private const char IU = (char)30;
private const char IA = (char)31;
/// <summary>
/// List of typical Russian stopwords.
/// Each entry spells one word as a sequence of letter-index constants
/// (A .. IA); MakeStopWords turns every entry into a string by looking the
/// indices up in a charset. The reference is readonly because the table is
/// built once and never replaced.
/// </summary>
private static readonly char[][] RUSSIAN_STOP_WORDS = {
    new char[] {A},
    new char[] {B, E, Z},
    new char[] {B, O, L, E, E},
    new char[] {B, Y},
    new char[] {B, Y, L},
    new char[] {B, Y, L, A},
    new char[] {B, Y, L, I},
    new char[] {B, Y, L, O},
    new char[] {B, Y, T, SOFT},
    new char[] {V},
    new char[] {V, A, M},
    new char[] {V, A, S},
    new char[] {V, E, S, SOFT},
    new char[] {V, O},
    new char[] {V, O, T},
    new char[] {V, S, E},
    new char[] {V, S, E, G, O},
    new char[] {V, S, E, X},
    new char[] {V, Y},
    new char[] {G, D, E},
    new char[] {D, A},
    new char[] {D, A, ZH, E},
    new char[] {D, L, IA},
    new char[] {D, O},
    new char[] {E, G, O},
    new char[] {E, E},
    new char[] {E, I_},
    new char[] {E, IU},
    new char[] {E, S, L, I},
    new char[] {E, S, T, SOFT},
    new char[] {E, SHCH, E},
    new char[] {ZH, E},
    new char[] {Z, A},
    new char[] {Z, D, E, S, SOFT},
    new char[] {I},
    new char[] {I, Z},
    new char[] {I, L, I},
    new char[] {I, M},
    new char[] {I, X},
    new char[] {K},
    new char[] {K, A, K},
    new char[] {K, O},
    new char[] {K, O, G, D, A},
    new char[] {K, T, O},
    new char[] {L, I},
    new char[] {L, I, B, O},
    new char[] {M, N, E},
    new char[] {M, O, ZH, E, T},
    new char[] {M, Y},
    new char[] {N, A},
    new char[] {N, A, D, O},
    new char[] {N, A, SH},
    new char[] {N, E},
    new char[] {N, E, G, O},
    new char[] {N, E, E},
    new char[] {N, E, T},
    new char[] {N, I},
    new char[] {N, I, X},
    new char[] {N, O},
    new char[] {N, U},
    new char[] {O},
    new char[] {O, B},
    new char[] {O, D, N, A, K, O},
    new char[] {O, N},
    new char[] {O, N, A},
    new char[] {O, N, I},
    new char[] {O, N, O},
    new char[] {O, T},
    new char[] {O, CH, E, N, SOFT},
    new char[] {P, O},
    new char[] {P, O, D},
    new char[] {P, R, I},
    new char[] {S},
    new char[] {S, O},
    new char[] {T, A, K},
    new char[] {T, A, K, ZH, E},
    new char[] {T, A, K, O, I_},
    new char[] {T, A, M},
    new char[] {T, E},
    new char[] {T, E, M},
    new char[] {T, O},
    new char[] {T, O, G, O},
    new char[] {T, O, ZH, E},
    new char[] {T, O, I_},
    new char[] {T, O, L, SOFT, K, O},
    new char[] {T, O, M},
    new char[] {T, Y},
    new char[] {U},
    new char[] {U, ZH, E},
    new char[] {X, O, T, IA},
    new char[] {CH, E, G, O},
    new char[] {CH, E, I_},
    new char[] {CH, E, M},
    new char[] {CH, T, O},
    new char[] {CH, T, O, B, Y},
    new char[] {CH, SOFT, E},
    new char[] {CH, SOFT, IA},
    new char[] {AE, T, A},
    new char[] {AE, T, I},
    new char[] {AE, T, O},
    new char[] {IA}
};
/// <summary>
/// Contains the stopwords used with the StopFilter.
/// Every constructor assigns this field, so no placeholder table is
/// allocated here; it is readonly because nothing reassigns it afterwards.
/// </summary>
private readonly Hashtable stoptable;

/// <summary>
/// Charset for Russian letters.
/// Represents encoding for 32 lowercase Russian letters.
/// Predefined charsets can be taken from the RussianCharsets class.
/// Set once by the constructor and passed to the tokenizer and filters
/// created in TokenStream.
/// </summary>
private readonly char[] charset;
/// <summary>
/// Creates an analyzer that uses the RussianCharsets.UnicodeRussian charset
/// together with the built-in stop word list translated through that charset.
/// </summary>
public RussianAnalyzer()
{
    this.charset = RussianCharsets.UnicodeRussian;

    // Translate the built-in list first, then let StopFilter build the table.
    String[] defaultStopWords = MakeStopWords(RussianCharsets.UnicodeRussian);
    this.stoptable = StopFilter.MakeStopTable(defaultStopWords);
}
/// <summary>
/// Creates an analyzer for the given charset, using the built-in stop word
/// list translated through that charset.
/// </summary>
/// <param name="charset">Charset table used to translate the stop words and
/// handed to the tokenizer and filters.</param>
public RussianAnalyzer(char[] charset)
{
    this.charset = charset;

    // The built-in words are stored as letter indices; spell them in this charset.
    String[] translatedStopWords = MakeStopWords(charset);
    this.stoptable = StopFilter.MakeStopTable(translatedStopWords);
}
/// <summary>
/// Creates an analyzer for the given charset with a caller-supplied stop
/// word list instead of the built-in one.
/// </summary>
/// <param name="charset">Charset table handed to the tokenizer and filters.</param>
/// <param name="stopwords">Stop words; converted to a table by
/// StopFilter.MakeStopTable.</param>
public RussianAnalyzer(char[] charset, String[] stopwords)
{
    Hashtable customTable = StopFilter.MakeStopTable(stopwords);

    this.charset = charset;
    this.stoptable = customTable;
}
/// <summary>
/// Spells every entry of RUSSIAN_STOP_WORDS as a string by replacing each
/// letter index with the character found at that index in the charset.
/// </summary>
/// <param name="charset">Table indexed by the letter-index constants.</param>
/// <returns>One string per built-in stop word, in table order.</returns>
private static String[] MakeStopWords(char[] charset)
{
    int wordCount = RUSSIAN_STOP_WORDS.Length;
    String[] translated = new String[wordCount];

    for (int w = 0; w < wordCount; w++)
    {
        char[] letterIndices = RUSSIAN_STOP_WORDS[w];

        // Look up each letter index in the charset, left to right.
        char[] letters = new char[letterIndices.Length];
        for (int k = 0; k < letters.Length; k++)
        {
            letters[k] = charset[letterIndices[k]];
        }

        translated[w] = new String(letters);
    }

    return translated;
}
/// <summary>
/// Creates an analyzer for the given charset with a ready-made stop word
/// table. The table is stored as given, not copied.
/// </summary>
/// <param name="charset">Charset table handed to the tokenizer and filters.</param>
/// <param name="stopwords">Stop word table passed to the StopFilter.</param>
public RussianAnalyzer(char[] charset, Hashtable stopwords)
{
    this.stoptable = stopwords;
    this.charset = charset;
}
/// <summary>
/// Creates a TokenStream which tokenizes all the text in the provided TextReader.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A TokenStream built from a RussianLetterTokenizer filtered with
/// RussianLowerCaseFilter, StopFilter, and RussianStemFilter, in that order.
/// </returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    // One named stage per step of the chain; each wraps the previous one.
    TokenStream letters = new RussianLetterTokenizer(reader, charset);
    TokenStream lowered = new RussianLowerCaseFilter(letters, charset);
    TokenStream withoutStopWords = new StopFilter(lowered, stoptable);
    return new RussianStemFilter(withoutStopWords, charset);
}