Lucene.Net/Analysis/RU/RussianAnalyzer.cs

   1 using System;
   2 using System.Text;
   3 using System.IO;
   4 using System.Collections;
   5 using Lucene.Net.Analysis;
   6
   7 namespace Lucene.Net.Analysis.Ru
   8 {
   9         /* ====================================================================
  10          * The Apache Software License, Version 1.1
  11          *
  12          * Copyright (c) 2001 The Apache Software Foundation.  All rights
  13          * reserved.
  14          *
  15          * Redistribution and use in source and binary forms, with or without
  16          * modification, are permitted provided that the following conditions
  17          * are met:
  18          *
  19          * 1. Redistributions of source code must retain the above copyright
  20          *    notice, this list of conditions and the following disclaimer.
  21          *
  22          * 2. Redistributions in binary form must reproduce the above copyright
  23          *    notice, this list of conditions and the following disclaimer in
  24          *    the documentation and/or other materials provided with the
  25          *    distribution.
  26          *
  27          * 3. The end-user documentation included with the redistribution,
  28          *    if any, must include the following acknowledgment:
  29          *       "This product includes software developed by the
  30          *        Apache Software Foundation (http://www.apache.org/)."
  31          *    Alternately, this acknowledgment may appear in the software itself,
  32          *    if and wherever such third-party acknowledgments normally appear.
  33          *
  34          * 4. The names "Apache" and "Apache Software Foundation" and
  35          *    "Apache Lucene" must not be used to endorse or promote products
  36          *    derived from this software without prior written permission. For
  37          *    written permission, please contact apache@apache.org.
  38          *
  39          * 5. Products derived from this software may not be called "Apache",
  40          *    "Apache Lucene", nor may "Apache" appear in their name, without
  41          *    prior written permission of the Apache Software Foundation.
  42          *
  43          * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  44          * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  45          * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  46          * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  47          * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  48          * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  49          * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  50          * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  51          * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  52          * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  53          * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  54          * SUCH DAMAGE.
  55          * ====================================================================
  56          *
  57          * This software consists of voluntary contributions made by many
  58          * individuals on behalf of the Apache Software Foundation.  For more
  59          * information on the Apache Software Foundation, please see
  60          * <http://www.apache.org/>.
  61          */
  62
  63         /// <summary>
  64         /// Analyzer for Russian language. Supports an external list of stopwords (words that
  65         /// will not be indexed at all).
  66         /// A default set of stopwords is used unless an alternative list is specified.
  67         /// </summary>
  68         /// <author>Boris Okner, b.okner@rogers.com</author>
  69         /// <version>$Id: RussianAnalyzer.cs,v 1.1.1.1 2004/04/29 22:53:51 trow Exp $</version>
  70         ///
  71         public sealed class RussianAnalyzer : Analyzer
  72         {
  73                 // letters
  74                 private static char A = (char)0;
  75                 private static char B = (char)1;
  76                 private static char V = (char)2;
  77                 private static char G = (char)3;
  78                 private static char D = (char)4;
  79                 private static char E = (char)5;
  80                 private static char ZH = (char)6;
  81                 private static char Z = (char)7;
  82                 private static char I = (char)8;
  83                 private static char I_ = (char)9;
  84                 private static char K = (char)10;
  85                 private static char L = (char)11;
  86                 private static char M = (char)12;
  87                 private static char N = (char)13;
  88                 private static char O = (char)14;
  89                 private static char P = (char)15;
  90                 private static char R = (char)16;
  91                 private static char S = (char)17;
  92                 private static char T = (char)18;
  93                 private static char U = (char)19;
  94                 private static char F = (char)20;
  95                 private static char X = (char)21;
  96                 private static char TS = (char)22;
  97                 private static char CH = (char)23;
  98                 private static char SH = (char)24;
  99                 private static char SHCH = (char)25;
 100                 private static char HARD = (char)26;
 101                 private static char Y = (char)27;
 102                 private static char SOFT = (char)28;
 103                 private static char AE = (char)29;
 104                 private static char IU = (char)30;
 105                 private static char IA = (char)31;
 106
 107                 /// <summary>
 108                 /// List of typical Russian stopwords.
 109                 /// </summary>
 110                 private static char[][] RUSSIAN_STOP_WORDS = {
 111                 new char[] {A},
 112                 new char[] {B, E, Z},
 113                 new char[] {B, O, L, E, E},
 114                 new char[] {B, Y},
 115                 new char[] {B, Y, L},
 116                 new char[] {B, Y, L, A},
 117                 new char[] {B, Y, L, I},
 118                 new char[] {B, Y, L, O},
 119                 new char[] {B, Y, T, SOFT},
 120                 new char[] {V},
 121                 new char[] {V, A, M},
 122                 new char[] {V, A, S},
 123                 new char[] {V, E, S, SOFT},
 124                 new char[] {V, O},
 125                 new char[] {V, O, T},
 126                 new char[] {V, S, E},
 127                 new char[] {V, S, E, G, O},
 128                 new char[] {V, S, E, X},
 129                 new char[] {V, Y},
 130                 new char[] {G, D, E},
 131                 new char[] {D, A},
 132                 new char[] {D, A, ZH, E},
 133                 new char[] {D, L, IA},
 134                 new char[] {D, O},
 135                 new char[] {E, G, O},
 136                 new char[] {E, E},
 137                 new char[] {E, I_,},
 138                 new char[] {E, IU},
 139                 new char[] {E, S, L, I},
 140                 new char[] {E, S, T, SOFT},
 141                 new char[] {E, SHCH, E},
 142                 new char[] {ZH, E},
 143                 new char[] {Z, A},
 144                 new char[] {Z, D, E, S, SOFT},
 145                 new char[] {I},
 146                 new char[] {I, Z},
 147                 new char[] {I, L, I},
 148                 new char[] {I, M},
 149                 new char[] {I, X},
 150                 new char[] {K},
 151                 new char[] {K, A, K},
 152                 new char[] {K, O},
 153                 new char[] {K, O, G, D, A},
 154                 new char[] {K, T, O},
 155                 new char[] {L, I},
 156                 new char[] {L, I, B, O},
 157                 new char[] {M, N, E},
 158                 new char[] {M, O, ZH, E, T},
 159                 new char[] {M, Y},
 160                 new char[] {N, A},
 161                 new char[] {N, A, D, O},
 162                 new char[] {N, A, SH},
 163                 new char[] {N, E},
 164                 new char[] {N, E, G, O},
 165                 new char[] {N, E, E},
 166                 new char[] {N, E, T},
 167                 new char[] {N, I},
 168                 new char[] {N, I, X},
 169                 new char[] {N, O},
 170                 new char[] {N, U},
 171                 new char[] {O},
 172                 new char[] {O, B},
 173                 new char[] {O, D, N, A, K, O},
 174                 new char[] {O, N},
 175                 new char[] {O, N, A},
 176                 new char[] {O, N, I},
 177                 new char[] {O, N, O},
 178                 new char[] {O, T},
 179                 new char[] {O, CH, E, N, SOFT},
 180                 new char[] {P, O},
 181                 new char[] {P, O, D},
 182                 new char[] {P, R, I},
 183                 new char[] {S},
 184                 new char[] {S, O},
 185                 new char[] {T, A, K},
 186                 new char[] {T, A, K, ZH, E},
 187                 new char[] {T, A, K, O, I_},
 188                 new char[] {T, A, M},
 189                 new char[] {T, E},
 190                 new char[] {T, E, M},
 191                 new char[] {T, O},
 192                 new char[] {T, O, G, O},
 193                 new char[] {T, O, ZH, E},
 194                 new char[] {T, O, I_},
 195                 new char[] {T, O, L, SOFT, K, O},
 196                 new char[] {T, O, M},
 197                 new char[] {T, Y},
 198                 new char[] {U},
 199                 new char[] {U, ZH, E},
 200                 new char[] {X, O, T, IA},
 201                 new char[] {CH, E, G, O},
 202                 new char[] {CH, E, I_},
 203                 new char[] {CH, E, M},
 204                 new char[] {CH, T, O},
 205                 new char[] {CH, T, O, B, Y},
 206                 new char[] {CH, SOFT, E},
 207                 new char[] {CH, SOFT, IA},
 208                 new char[] {AE, T, A},
 209                 new char[] {AE, T, I},
 210                 new char[] {AE, T, O},
 211                 new char[] {IA}
 212                                                                                                          };
 213
 214                 /// <summary>
 215                 /// Contains the stopwords used with the StopFilter.
 216                 /// </summary>
 217                 private Hashtable stoptable = new Hashtable();
 218
 219                 /// <summary>
 220                 /// Charset for Russian letters.
 221             /// Represents encoding for 32 lowercase Russian letters.
 222                 /// Predefined charsets can be taken from RussianCharSets class
 223                 /// </summary>
 224                 private char[] charset;
 225
 226                 /// <summary>
 227                 /// Builds an analyzer.
 228                 /// </summary>
 229                 public RussianAnalyzer()
 230                 {
 231                         this.charset = RussianCharsets.UnicodeRussian;
 232                         stoptable = StopFilter.MakeStopTable(MakeStopWords(RussianCharsets.UnicodeRussian));
 233                 }
 234
 235                 /// <summary>
 236                 /// Builds an analyzer.
 237                 /// </summary>
 238                 /// <param name="charset"></param>
 239                 public RussianAnalyzer(char[] charset)
 240                 {
 241                         this.charset = charset;
 242                         stoptable = StopFilter.MakeStopTable(MakeStopWords(charset));
 243                 }
 244
 245                 /// <summary>
 246                 /// Builds an analyzer with the given stop words.
 247                 /// </summary>
 248                 /// <param name="charset"></param>
 249                 /// <param name="stopwords"></param>
 250                 public RussianAnalyzer(char[] charset, String[] stopwords)
 251                 {
 252                         this.charset = charset;
 253                         stoptable = StopFilter.MakeStopTable(stopwords);
 254                 }
 255
 256                 /// <summary>
 257                 /// Takes russian stop words and translates them to a String array, using
 258                 /// the given charset
 259                 /// </summary>
 260                 /// <param name="charset"></param>
 261                 /// <returns></returns>
 262                 private static String[] MakeStopWords(char[] charset)
 263                 {
 264                         String[] res = new String[RUSSIAN_STOP_WORDS.Length];
 265                         for (int i = 0; i < res.Length; i++)
 266                         {
 267                                 char[] theStopWord = RUSSIAN_STOP_WORDS[i];
 268                                 // translate the word,using the charset
 269                                 StringBuilder theWord = new StringBuilder();
 270                                 for (int j = 0; j < theStopWord.Length; j++)
 271                                 {
 272                                         theWord.Append(charset[theStopWord[j]]);
 273                                 }
 274                                 res[i] = theWord.ToString();
 275                         }
 276                         return res;
 277                 }
 278
 279                 /// <summary>
 280                 /// Builds an analyzer with the given stop words.
 281                 /// </summary>
 282                 /// <param name="charset"></param>
 283                 /// <param name="stopwords"></param>
 284                 public RussianAnalyzer(char[] charset, Hashtable stopwords)
 285                 {
 286                         this.charset = charset;
 287                         stoptable = stopwords;
 288                 }
 289
 290                 /// <summary>
 291                 /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
 292                 /// </summary>
 293                 /// <param name="fieldName"></param>
 294                 /// <param name="reader"></param>
 295                 /// <returns>
 296                 ///             A TokenStream build from a RussianLetterTokenizer filtered with
 297                 ///     RussianLowerCaseFilter, StopFilter, and RussianStemFilter
 298                 ///  </returns>
 299                 public override TokenStream TokenStream(String fieldName, TextReader reader)
 300                 {
 301                         TokenStream result = new RussianLetterTokenizer(reader, charset);
 302                         result = new RussianLowerCaseFilter(result, charset);
 303                         result = new StopFilter(result, stoptable);
 304                         result = new RussianStemFilter(result, charset);
 305                         return result;
 306                 }
 307         }
 308 }