Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / Lucene.Net / Analysis / RU / RussianAnalyzer.cs
blobe0e95b159fe2c20e446be2d100e24ffc87c8cc4a
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 using System;
17 using Analyzer = Lucene.Net.Analysis.Analyzer;
18 using StopFilter = Lucene.Net.Analysis.StopFilter;
19 using TokenStream = Lucene.Net.Analysis.TokenStream;
namespace Lucene.Net.Analysis.RU
{
	/// <summary> Analyzer for Russian language. Supports an external list of stopwords (words that
	/// will not be indexed at all).
	/// A default set of stopwords is used unless an alternative list is specified.
	///
	/// </summary>
	/// <author>  Boris Okner, b.okner@rogers.com
	/// </author>
	/// <version>  $Id: RussianAnalyzer.cs,v 1.2 2005/01/17 19:54:27 joeshaw Exp $
	/// </version>
	public sealed class RussianAnalyzer : Analyzer
	{
		// Letters. Each constant is a position inside a charset array (see the
		// "charset" field), not a character code: the built-in stop words below
		// are spelled with these positions and translated through the charset
		// by makeStopWords().
		private const char A = (char) 0;
		private const char B = (char) 1;
		private const char V = (char) 2;
		private const char G = (char) 3;
		private const char D = (char) 4;
		private const char E = (char) 5;
		private const char ZH = (char) 6;
		private const char Z = (char) 7;
		private const char I = (char) 8;
		private const char I_ = (char) 9;
		private const char K = (char) 10;
		private const char L = (char) 11;
		private const char M = (char) 12;
		private const char N = (char) 13;
		private const char O = (char) 14;
		private const char P = (char) 15;
		private const char R = (char) 16;
		private const char S = (char) 17;
		private const char T = (char) 18;
		private const char U = (char) 19;
		private const char F = (char) 20;
		private const char X = (char) 21;
		private const char TS = (char) 22;
		private const char CH = (char) 23;
		private const char SH = (char) 24;
		private const char SHCH = (char) 25;
		private const char HARD = (char) 26;
		private const char Y = (char) 27;
		private const char SOFT = (char) 28;
		private const char AE = (char) 29;
		private const char IU = (char) 30;
		private const char IA = (char) 31;

		/// <summary> Number of letter positions (A..IA) a charset must provide so that every
		/// built-in stop word can be translated.</summary>
		private const int LETTER_COUNT = 32;

		/// <summary> List of typical Russian stopwords, spelled as sequences of letter positions.</summary>
		private static readonly char[][] RUSSIAN_STOP_WORDS = new char[][]
		{
			new char[]{A}, new char[]{B, E, Z}, new char[]{B, O, L, E, E}, new char[]{B, Y},
			new char[]{B, Y, L}, new char[]{B, Y, L, A}, new char[]{B, Y, L, I}, new char[]{B, Y, L, O},
			new char[]{B, Y, T, SOFT}, new char[]{V}, new char[]{V, A, M}, new char[]{V, A, S},
			new char[]{V, E, S, SOFT}, new char[]{V, O}, new char[]{V, O, T}, new char[]{V, S, E},
			new char[]{V, S, E, G, O}, new char[]{V, S, E, X}, new char[]{V, Y}, new char[]{G, D, E},
			new char[]{D, A}, new char[]{D, A, ZH, E}, new char[]{D, L, IA}, new char[]{D, O},
			new char[]{E, G, O}, new char[]{E, E}, new char[]{E, I_}, new char[]{E, IU},
			new char[]{E, S, L, I}, new char[]{E, S, T, SOFT}, new char[]{E, SHCH, E}, new char[]{ZH, E},
			new char[]{Z, A}, new char[]{Z, D, E, S, SOFT}, new char[]{I}, new char[]{I, Z},
			new char[]{I, L, I}, new char[]{I, M}, new char[]{I, X}, new char[]{K},
			new char[]{K, A, K}, new char[]{K, O}, new char[]{K, O, G, D, A}, new char[]{K, T, O},
			new char[]{L, I}, new char[]{L, I, B, O}, new char[]{M, N, E}, new char[]{M, O, ZH, E, T},
			new char[]{M, Y}, new char[]{N, A}, new char[]{N, A, D, O}, new char[]{N, A, SH},
			new char[]{N, E}, new char[]{N, E, G, O}, new char[]{N, E, E}, new char[]{N, E, T},
			new char[]{N, I}, new char[]{N, I, X}, new char[]{N, O}, new char[]{N, U},
			new char[]{O}, new char[]{O, B}, new char[]{O, D, N, A, K, O}, new char[]{O, N},
			new char[]{O, N, A}, new char[]{O, N, I}, new char[]{O, N, O}, new char[]{O, T},
			new char[]{O, CH, E, N, SOFT}, new char[]{P, O}, new char[]{P, O, D}, new char[]{P, R, I},
			new char[]{S}, new char[]{S, O}, new char[]{T, A, K}, new char[]{T, A, K, ZH, E},
			new char[]{T, A, K, O, I_}, new char[]{T, A, M}, new char[]{T, E}, new char[]{T, E, M},
			new char[]{T, O}, new char[]{T, O, G, O}, new char[]{T, O, ZH, E}, new char[]{T, O, I_},
			new char[]{T, O, L, SOFT, K, O}, new char[]{T, O, M}, new char[]{T, Y}, new char[]{U},
			new char[]{U, ZH, E}, new char[]{X, O, T, IA}, new char[]{CH, E, G, O}, new char[]{CH, E, I_},
			new char[]{CH, E, M}, new char[]{CH, T, O}, new char[]{CH, T, O, B, Y}, new char[]{CH, SOFT, E},
			new char[]{CH, SOFT, IA}, new char[]{AE, T, A}, new char[]{AE, T, I}, new char[]{AE, T, O},
			new char[]{IA}
		};

		/// <summary> Contains the stopwords used with the StopFilter.
		/// Assigned exactly once by whichever constructor runs.</summary>
		private readonly System.Collections.Hashtable stopSet;

		/// <summary> Charset for Russian letters.
		/// Represents encoding for 32 lowercase Russian letters.
		/// Predefined charsets can be taken from RussianCharSets class
		/// </summary>
		private readonly char[] charset;

		/// <summary> Builds an analyzer that uses RussianCharsets.UnicodeRussian and the
		/// default stop words.</summary>
		public RussianAnalyzer() : this(RussianCharsets.UnicodeRussian)
		{
		}

		/// <summary> Builds an analyzer with the default stop words, translated through the
		/// given charset.</summary>
		/// <param name="charset"> charset used to translate the default stop words and handed
		/// to the tokenizer and filters created by TokenStream
		/// </param>
		/// <exception cref="System.ArgumentNullException"> if charset is null </exception>
		/// <exception cref="System.ArgumentException"> if charset has fewer than 32 entries </exception>
		public RussianAnalyzer(char[] charset)
		{
			this.charset = charset;
			stopSet = StopFilter.MakeStopSet(makeStopWords(charset));
		}

		/// <summary> Builds an analyzer with the given stop words.</summary>
		/// <param name="charset"> charset handed to the tokenizer and filters created by TokenStream
		/// </param>
		/// <param name="stopwords"> stop words, passed as-is to StopFilter.MakeStopSet
		/// </param>
		public RussianAnalyzer(char[] charset, System.String[] stopwords)
		{
			this.charset = charset;
			stopSet = StopFilter.MakeStopSet(stopwords);
		}

		// Takes russian stop words and translates them to a String array, using
		// the given charset. Each stop word is stored as a sequence of letter
		// positions; every position is replaced by charset[position].
		private static System.String[] makeStopWords(char[] charset)
		{
			// Fail with a descriptive argument exception instead of a
			// NullReferenceException / IndexOutOfRangeException from inside the loop.
			if (charset == null)
			{
				throw new System.ArgumentNullException("charset");
			}
			if (charset.Length < LETTER_COUNT)
			{
				throw new System.ArgumentException("charset must contain at least " + LETTER_COUNT + " letters", "charset");
			}

			System.String[] res = new System.String[RUSSIAN_STOP_WORDS.Length];
			for (int i = 0; i < res.Length; i++)
			{
				char[] theStopWord = RUSSIAN_STOP_WORDS[i];
				// translate the word, using the charset
				System.Text.StringBuilder theWord = new System.Text.StringBuilder(theStopWord.Length);
				for (int j = 0; j < theStopWord.Length; j++)
				{
					theWord.Append(charset[theStopWord[j]]);
				}
				res[i] = theWord.ToString();
			}
			return res;
		}

		/// <summary> Builds an analyzer with the given stop words.</summary>
		/// <param name="charset"> charset handed to the tokenizer and filters created by TokenStream
		/// </param>
		/// <param name="stopwords"> stop word table; it is copied, so later changes made by the
		/// caller do not affect this analyzer
		/// </param>
		/// <todo>  create a Set version of this ctor </todo>
		public RussianAnalyzer(char[] charset, System.Collections.Hashtable stopwords)
		{
			this.charset = charset;
			// A single defensive copy is sufficient.
			stopSet = new System.Collections.Hashtable(stopwords);
		}

		/// <summary> Creates a TokenStream which tokenizes all the text in the provided Reader.
		///
		/// </summary>
		/// <param name="fieldName"> name of the field being analyzed; not used by this analyzer
		/// </param>
		/// <param name="reader"> source of the text to tokenize
		/// </param>
		/// <returns>  A TokenStream build from a RussianLetterTokenizer filtered with
		/// RussianLowerCaseFilter, StopFilter, and RussianStemFilter
		/// </returns>
		public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
		{
			// Order matters: lower-casing runs before stop-word removal, and
			// stemming runs last, on the tokens that survived the stop filter.
			TokenStream result = new RussianLetterTokenizer(reader, charset);
			result = new RussianLowerCaseFilter(result, charset);
			result = new StopFilter(result, stopSet);
			result = new RussianStemFilter(result, charset);
			return result;
		}
	}
}