beagled/Lucene.Net/Analysis/RU/RussianAnalyzer.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 using System;
  17 using Analyzer = Lucene.Net.Analysis.Analyzer;
  18 using StopFilter = Lucene.Net.Analysis.StopFilter;
  19 using TokenStream = Lucene.Net.Analysis.TokenStream;
  20 namespace Lucene.Net.Analysis.RU
  21 {
  22
  23         /// <summary> Analyzer for Russian language. Supports an external list of stopwords (words that
  24         /// will not be indexed at all).
  25         /// A default set of stopwords is used unless an alternative list is specified.
  26         ///
  27         /// </summary>
  28         /// <author>   Boris Okner, b.okner@rogers.com
  29         /// </author>
  30         /// <version>  $Id: RussianAnalyzer.cs,v 1.2 2005/01/17 19:54:27 joeshaw Exp $
  31         /// </version>
  32         public sealed class RussianAnalyzer : Analyzer
  33         {
  34                 // letters
  35                 private static char A = (char) (0);
  36                 private static char B = (char) (1);
  37                 private static char V = (char) (2);
  38                 private static char G = (char) (3);
  39                 private static char D = (char) (4);
  40                 private static char E = (char) (5);
  41                 private static char ZH = (char) (6);
  42                 private static char Z = (char) (7);
  43                 private static char I = (char) (8);
  44                 private static char I_ = (char) (9);
  45                 private static char K = (char) (10);
  46                 private static char L = (char) (11);
  47                 private static char M = (char) (12);
  48                 private static char N = (char) (13);
  49                 private static char O = (char) (14);
  50                 private static char P = (char) (15);
  51                 private static char R = (char) (16);
  52                 private static char S = (char) (17);
  53                 private static char T = (char) (18);
  54                 private static char U = (char) (19);
  55                 private static char F = (char) (20);
  56                 private static char X = (char) (21);
  57                 private static char TS = (char) (22);
  58                 private static char CH = (char) (23);
  59                 private static char SH = (char) (24);
  60                 private static char SHCH = (char) (25);
  61                 private static char HARD = (char) (26);
  62                 private static char Y = (char) (27);
  63                 private static char SOFT = (char) (28);
  64                 private static char AE = (char) (29);
  65                 private static char IU = (char) (30);
  66                 private static char IA = (char) (31);
  67
  68                 /// <summary> List of typical Russian stopwords.</summary>
  69                 private static char[][] RUSSIAN_STOP_WORDS = new char[][]{new char[]{A}, new char[]{B, E, Z}, new char[]{B, O, L, E, E}, new char[]{B, Y}, new char[]{B, Y, L}, new char[]{B, Y, L, A}, new char[]{B, Y, L, I}, new char[]{B, Y, L, O}, new char[]{B, Y, T, SOFT}, new char[]{V}, new char[]{V, A, M}, new char[]{V, A, S}, new char[]{V, E, S, SOFT}, new char[]{V, O}, new char[]{V, O, T}, new char[]{V, S, E}, new char[]{V, S, E, G, O}, new char[]{V, S, E, X}, new char[]{V, Y}, new char[]{G, D, E}, new char[]{D, A}, new char[]{D, A, ZH, E}, new char[]{D, L, IA}, new char[]{D, O}, new char[]{E, G, O}, new char[]{E, E}, new char[]{E, I_}, new char[]{E, IU}, new char[]{E, S, L, I}, new char[]{E, S, T, SOFT}, new char[]{E, SHCH, E}, new char[]{ZH, E}, new char[]{Z, A}, new char[]{Z, D, E, S, SOFT}, new char[]{I}, new char[]{I, Z}, new char[]{I, L, I}, new char[]{I, M}, new char[]{I, X}, new char[]{K}, new char[]{K, A, K}, new char[]{K, O}, new char[]{K, O, G, D, A}, new char[]{K, T, O}, new char[]{L, I}, new char[]{L, I, B, O}, new char[]{M, N, E}, new char[]{M, O, ZH, E, T}, new char[]{M, Y}, new char[]{N, A}, new char[]{N, A, D, O}, new char[]{N, A, SH}, new char[]{N, E}, new char[]{N, E, G, O}, new char[]{N, E, E}, new char[]{N, E, T}, new char[]{N, I}, new char[]{N, I, X}, new char[]{N, O}, new char[]{N, U}, new char[]{O}, new char[]{O, B}, new char[]{O, D, N, A, K, O}, new char[]{O, N}, new char[]{O, N, A}, new char[]{O, N, I}, new char[]{O, N, O}, new char[]{O, T}, new char[]{O, CH, E, N, SOFT}, new char[]{P, O}, new char[]{P, O, D}, new char[]{P, R, I}, new char[]{S}, new char[]{S, O}, new char[]{T, A, K}, new char[]{T, A, K, ZH, E}, new char[]{T, A, K, O, I_}, new char[]{T, A, M}, new char[]{T, E}, new char[]{T, E, M}, new char[]{T, O}, new char[]{T, O, G, O}, new char[]{T, O, ZH, E}, new char[]{T, O, I_}, new char[]{T, O, L, SOFT, K, O}, new char[]{T, O, M}, new char[]{T, Y}, new char[]{U}, new char[]{U, ZH, E}, new char[]{X, O, T, IA}, new char[]{CH, E, G, O}, new char[]{CH, E, I_}, new char[]{CH, E, M},
  70                         new char[]{CH, T, O}, new char[]{CH, T, O, B, Y}, new char[]{CH, SOFT, E}, new char[]{CH, SOFT, IA}, new char[]{AE, T, A}, new char[]{AE, T, I}, new char[]{AE, T, O}, new char[]{IA}};
  71
  72                 /// <summary> Contains the stopwords used with the StopFilter.</summary>
  73                 private System.Collections.Hashtable stopSet = new System.Collections.Hashtable();
  74
  75                 /// <summary> Charset for Russian letters.
  76                 /// Represents encoding for 32 lowercase Russian letters.
  77                 /// Predefined charsets can be taken from RussianCharSets class
  78                 /// </summary>
  79                 private char[] charset;
  80
  81
  82                 public RussianAnalyzer()
  83                 {
  84                         charset = RussianCharsets.UnicodeRussian;
  85                         stopSet = StopFilter.MakeStopSet(makeStopWords(RussianCharsets.UnicodeRussian));
  86                 }
  87
  88                 /// <summary> Builds an analyzer.</summary>
  89                 public RussianAnalyzer(char[] charset)
  90                 {
  91                         this.charset = charset;
  92                         stopSet = StopFilter.MakeStopSet(makeStopWords(charset));
  93                 }
  94
  95                 /// <summary> Builds an analyzer with the given stop words.</summary>
  96                 public RussianAnalyzer(char[] charset, System.String[] stopwords)
  97                 {
  98                         this.charset = charset;
  99                         stopSet = StopFilter.MakeStopSet(stopwords);
 100                 }
 101
 102                 // Takes russian stop words and translates them to a String array, using
 103                 // the given charset
 104                 private static System.String[] makeStopWords(char[] charset)
 105                 {
 106                         System.String[] res = new System.String[RUSSIAN_STOP_WORDS.Length];
 107                         for (int i = 0; i < res.Length; i++)
 108                         {
 109                                 char[] theStopWord = RUSSIAN_STOP_WORDS[i];
 110                                 // translate the word,using the charset
 111                                 System.Text.StringBuilder theWord = new System.Text.StringBuilder();
 112                                 for (int j = 0; j < theStopWord.Length; j++)
 113                                 {
 114                                         theWord.Append(charset[theStopWord[j]]);
 115                                 }
 116                                 res[i] = theWord.ToString();
 117                         }
 118                         return res;
 119                 }
 120
 121                 /// <summary> Builds an analyzer with the given stop words.</summary>
 122                 /// <todo>  create a Set version of this ctor </todo>
 123                 public RussianAnalyzer(char[] charset, System.Collections.Hashtable stopwords)
 124                 {
 125                         this.charset = charset;
 126                         stopSet = new System.Collections.Hashtable(new System.Collections.Hashtable(stopwords));
 127                 }
 128
 129                 /// <summary> Creates a TokenStream which tokenizes all the text in the provided Reader.
 130                 ///
 131                 /// </summary>
 132                 /// <returns>  A TokenStream build from a RussianLetterTokenizer filtered with
 133                 /// RussianLowerCaseFilter, StopFilter, and RussianStemFilter
 134                 /// </returns>
 135                 public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
 136                 {
 137                         TokenStream result = new RussianLetterTokenizer(reader, charset);
 138                         result = new RussianLowerCaseFilter(result, charset);
 139                         result = new StopFilter(result, stopSet);
 140                         result = new RussianStemFilter(result, charset);
 141                         return result;
 142                 }
 143         }
 144 }