/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using Analyzer = Lucene.Net.Analysis.Analyzer;
using StopFilter = Lucene.Net.Analysis.StopFilter;
using TokenStream = Lucene.Net.Analysis.TokenStream;
namespace Lucene.Net.Analysis.RU
{
    /// <summary>
    /// Analyzer for Russian language. Supports an external list of stopwords
    /// (words that will not be indexed at all).
    /// A default set of stopwords is used unless an alternative list is specified.
    /// </summary>
    /// <author> Boris Okner, b.okner@rogers.com </author>
    /// <version> $Id: RussianAnalyzer.cs,v 1.2 2005/01/17 19:54:27 joeshaw Exp $ </version>
    public sealed class RussianAnalyzer : Analyzer
    {
        // Symbolic names for positions 0..31 of a charset table. The default stop
        // words below are spelled as sequences of these positions and are turned
        // into real strings by makeStopWords, which looks each position up in the
        // charset supplied to the constructor.
        private const char A = (char) 0;
        private const char B = (char) 1;
        private const char V = (char) 2;
        private const char G = (char) 3;
        private const char D = (char) 4;
        private const char E = (char) 5;
        private const char ZH = (char) 6;
        private const char Z = (char) 7;
        private const char I = (char) 8;
        private const char I_ = (char) 9;
        private const char K = (char) 10;
        private const char L = (char) 11;
        private const char M = (char) 12;
        private const char N = (char) 13;
        private const char O = (char) 14;
        private const char P = (char) 15;
        private const char R = (char) 16;
        private const char S = (char) 17;
        private const char T = (char) 18;
        private const char U = (char) 19;
        private const char F = (char) 20;
        private const char X = (char) 21;
        private const char TS = (char) 22;
        private const char CH = (char) 23;
        private const char SH = (char) 24;
        private const char SHCH = (char) 25;
        private const char HARD = (char) 26;
        private const char Y = (char) 27;
        private const char SOFT = (char) 28;
        private const char AE = (char) 29;
        private const char IU = (char) 30;
        private const char IA = (char) 31;

        /// <summary> List of typical Russian stopwords.</summary>
        private static readonly char[][] RUSSIAN_STOP_WORDS = new char[][]
        {
            new char[]{A},
            new char[]{B, E, Z},
            new char[]{B, O, L, E, E},
            new char[]{B, Y},
            new char[]{B, Y, L},
            new char[]{B, Y, L, A},
            new char[]{B, Y, L, I},
            new char[]{B, Y, L, O},
            new char[]{B, Y, T, SOFT},
            new char[]{V},
            new char[]{V, A, M},
            new char[]{V, A, S},
            new char[]{V, E, S, SOFT},
            new char[]{V, O},
            new char[]{V, O, T},
            new char[]{V, S, E},
            new char[]{V, S, E, G, O},
            new char[]{V, S, E, X},
            new char[]{V, Y},
            new char[]{G, D, E},
            new char[]{D, A},
            new char[]{D, A, ZH, E},
            new char[]{D, L, IA},
            new char[]{D, O},
            new char[]{E, G, O},
            new char[]{E, E},
            new char[]{E, I_},
            new char[]{E, IU},
            new char[]{E, S, L, I},
            new char[]{E, S, T, SOFT},
            new char[]{E, SHCH, E},
            new char[]{ZH, E},
            new char[]{Z, A},
            new char[]{Z, D, E, S, SOFT},
            new char[]{I},
            new char[]{I, Z},
            new char[]{I, L, I},
            new char[]{I, M},
            new char[]{I, X},
            new char[]{K},
            new char[]{K, A, K},
            new char[]{K, O},
            new char[]{K, O, G, D, A},
            new char[]{K, T, O},
            new char[]{L, I},
            new char[]{L, I, B, O},
            new char[]{M, N, E},
            new char[]{M, O, ZH, E, T},
            new char[]{M, Y},
            new char[]{N, A},
            new char[]{N, A, D, O},
            new char[]{N, A, SH},
            new char[]{N, E},
            new char[]{N, E, G, O},
            new char[]{N, E, E},
            new char[]{N, E, T},
            new char[]{N, I},
            new char[]{N, I, X},
            new char[]{N, O},
            new char[]{N, U},
            new char[]{O},
            new char[]{O, B},
            new char[]{O, D, N, A, K, O},
            new char[]{O, N},
            new char[]{O, N, A},
            new char[]{O, N, I},
            new char[]{O, N, O},
            new char[]{O, T},
            new char[]{O, CH, E, N, SOFT},
            new char[]{P, O},
            new char[]{P, O, D},
            new char[]{P, R, I},
            new char[]{S},
            new char[]{S, O},
            new char[]{T, A, K},
            new char[]{T, A, K, ZH, E},
            new char[]{T, A, K, O, I_},
            new char[]{T, A, M},
            new char[]{T, E},
            new char[]{T, E, M},
            new char[]{T, O},
            new char[]{T, O, G, O},
            new char[]{T, O, ZH, E},
            new char[]{T, O, I_},
            new char[]{T, O, L, SOFT, K, O},
            new char[]{T, O, M},
            new char[]{T, Y},
            new char[]{U},
            new char[]{U, ZH, E},
            new char[]{X, O, T, IA},
            new char[]{CH, E, G, O},
            new char[]{CH, E, I_},
            new char[]{CH, E, M},
            new char[]{CH, T, O},
            new char[]{CH, T, O, B, Y},
            new char[]{CH, SOFT, E},
            new char[]{CH, SOFT, IA},
            new char[]{AE, T, A},
            new char[]{AE, T, I},
            new char[]{AE, T, O},
            new char[]{IA}
        };

        /// <summary> Contains the stopwords used with the StopFilter.</summary>
        // Assigned exactly once by every constructor, so no field initializer is needed.
        private readonly System.Collections.Hashtable stopSet;

        /// <summary>
        /// Charset for Russian letters.
        /// Represents encoding for 32 lowercase Russian letters.
        /// Predefined charsets can be taken from RussianCharSets class
        /// </summary>
        private readonly char[] charset;

        /// <summary>
        /// Builds an analyzer that uses RussianCharsets.UnicodeRussian as its charset
        /// and the default stop words translated through that charset.
        /// </summary>
        public RussianAnalyzer() : this(RussianCharsets.UnicodeRussian)
        {
        }

        /// <summary> Builds an analyzer.</summary>
        /// <param name="charset">
        /// Charset table used both to translate the default stop words and by the
        /// tokenizer and filters created in TokenStream.
        /// </param>
        public RussianAnalyzer(char[] charset)
        {
            this.charset = charset;
            stopSet = StopFilter.MakeStopSet(makeStopWords(charset));
        }

        /// <summary> Builds an analyzer with the given stop words.</summary>
        /// <param name="charset">Charset table handed to the tokenizer and filters.</param>
        /// <param name="stopwords">Stop words, passed unchanged to StopFilter.MakeStopSet.</param>
        public RussianAnalyzer(char[] charset, System.String[] stopwords)
        {
            this.charset = charset;
            stopSet = StopFilter.MakeStopSet(stopwords);
        }

        /// <summary>
        /// Takes russian stop words and translates them to a String array, using
        /// the given charset.
        /// </summary>
        /// <param name="charset">
        /// Lookup table indexed by the letter positions stored in RUSSIAN_STOP_WORDS.
        /// </param>
        /// <returns>One string per entry of RUSSIAN_STOP_WORDS, in the same order.</returns>
        private static System.String[] makeStopWords(char[] charset)
        {
            System.String[] res = new System.String[RUSSIAN_STOP_WORDS.Length];
            for (int i = 0; i < res.Length; i++)
            {
                char[] theStopWord = RUSSIAN_STOP_WORDS[i];

                // Translate the word position by position using the charset. The
                // length is known up front, so a plain buffer replaces StringBuilder.
                char[] translated = new char[theStopWord.Length];
                for (int j = 0; j < theStopWord.Length; j++)
                {
                    translated[j] = charset[theStopWord[j]];
                }
                res[i] = new System.String(translated);
            }
            return res;
        }

        /// <summary> Builds an analyzer with the given stop words.</summary>
        /// <todo> create a Set version of this ctor </todo>
        /// <param name="charset">Charset table handed to the tokenizer and filters.</param>
        /// <param name="stopwords">
        /// Stop word table; its entries are copied, so later changes to the
        /// caller's table do not affect this analyzer.
        /// </param>
        public RussianAnalyzer(char[] charset, System.Collections.Hashtable stopwords)
        {
            this.charset = charset;
            // One copy is enough to detach from the caller's table.
            stopSet = new System.Collections.Hashtable(stopwords);
        }

        /// <summary>
        /// Creates a TokenStream which tokenizes all the text in the provided Reader.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// A TokenStream build from a RussianLetterTokenizer filtered with
        /// RussianLowerCaseFilter, StopFilter, and RussianStemFilter
        /// </returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new RussianLetterTokenizer(reader, charset);
            result = new RussianLowerCaseFilter(result, charset);
            result = new StopFilter(result, stopSet);
            result = new RussianStemFilter(result, charset);
            return result;
        }
    }
}