Lucene.Net/Analysis/RU/RussianCharsets.cs

   1 using System;
   2
   3 namespace Lucene.Net.Analysis.Ru
   4 {
   5         /* ====================================================================
   6          * The Apache Software License, Version 1.1
   7          *
   8          * Copyright (c) 2001 The Apache Software Foundation.  All rights
   9          * reserved.
  10          *
  11          * Redistribution and use in source and binary forms, with or without
  12          * modification, are permitted provided that the following conditions
  13          * are met:
  14          *
  15          * 1. Redistributions of source code must retain the above copyright
  16          *    notice, this list of conditions and the following disclaimer.
  17          *
  18          * 2. Redistributions in binary form must reproduce the above copyright
  19          *    notice, this list of conditions and the following disclaimer in
  20          *    the documentation and/or other materials provided with the
  21          *    distribution.
  22          *
  23          * 3. The end-user documentation included with the redistribution,
  24          *    if any, must include the following acknowledgment:
  25          *       "This product includes software developed by the
  26          *        Apache Software Foundation (http://www.apache.org/)."
  27          *    Alternately, this acknowledgment may appear in the software itself,
  28          *    if and wherever such third-party acknowledgments normally appear.
  29          *
  30          * 4. The names "Apache" and "Apache Software Foundation" and
  31          *    "Apache Lucene" must not be used to endorse or promote products
  32          *    derived from this software without prior written permission. For
  33          *    written permission, please contact apache@apache.org.
  34          *
  35          * 5. Products derived from this software may not be called "Apache",
  36          *    "Apache Lucene", nor may "Apache" appear in their name, without
  37          *    prior written permission of the Apache Software Foundation.
  38          *
  39          * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  40          * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  41          * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  42          * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  43          * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  44          * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  45          * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  46          * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  47          * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  48          * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  49          * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  50          * SUCH DAMAGE.
  51          * ====================================================================
  52          *
  53          * This software consists of voluntary contributions made by many
  54          * individuals on behalf of the Apache Software Foundation.  For more
  55          * information on the Apache Software Foundation, please see
  56          * <http://www.apache.org/>.
  57          */
  58
  59         /// <summary>
  60         /// RussianCharsets class contains encodings schemes (charsets) and ToLowerCase() method implementation
  61         /// for russian characters in Unicode, KOI8 and CP1252.
  62         /// Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
  63         /// One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
  64         /// and adding logic to ToLowerCase() method for that charset.
  65         /// </summary>
  66         /// <author>Boris Okner, b.okner@rogers.com</author>
  67         /// <version>$Id: RussianCharsets.cs,v 1.1.1.1 2004/04/29 22:53:51 trow Exp $</version>
  68         public class RussianCharsets
  69         {
  70                 /// <summary>
  71                 /// Unicode Russian charset (lowercase letters only)
  72                 /// </summary>
  73                 public static char[] UnicodeRussian = {
  74                                                                                                   '\u0430',
  75                                                                                                   '\u0431',
  76                                                                                                   '\u0432',
  77                                                                                                   '\u0433',
  78                                                                                                   '\u0434',
  79                                                                                                   '\u0435',
  80                                                                                                   '\u0436',
  81                                                                                                   '\u0437',
  82                                                                                                   '\u0438',
  83                                                                                                   '\u0439',
  84                                                                                                   '\u043A',
  85                                                                                                   '\u043B',
  86                                                                                                   '\u043C',
  87                                                                                                   '\u043D',
  88                                                                                                   '\u043E',
  89                                                                                                   '\u043F',
  90                                                                                                   '\u0440',
  91                                                                                                   '\u0441',
  92                                                                                                   '\u0442',
  93                                                                                                   '\u0443',
  94                                                                                                   '\u0444',
  95                                                                                                   '\u0445',
  96                                                                                                   '\u0446',
  97                                                                                                   '\u0447',
  98                                                                                                   '\u0448',
  99                                                                                                   '\u0449',
 100                                                                                                   '\u044A',
 101                                                                                                   '\u044B',
 102                                                                                                   '\u044C',
 103                                                                                                   '\u044D',
 104                                                                                                   '\u044E',
 105                                                                                                   '\u044F',
 106                                                                                                   // upper case
 107                                                                                                   '\u0410',
 108                                                                                                   '\u0411',
 109                                                                                                   '\u0412',
 110                                                                                                   '\u0413',
 111                                                                                                   '\u0414',
 112                                                                                                   '\u0415',
 113                                                                                                   '\u0416',
 114                                                                                                   '\u0417',
 115                                                                                                   '\u0418',
 116                                                                                                   '\u0419',
 117                                                                                                   '\u041A',
 118                                                                                                   '\u041B',
 119                                                                                                   '\u041C',
 120                                                                                                   '\u041D',
 121                                                                                                   '\u041E',
 122                                                                                                   '\u041F',
 123                                                                                                   '\u0420',
 124                                                                                                   '\u0421',
 125                                                                                                   '\u0422',
 126                                                                                                   '\u0423',
 127                                                                                                   '\u0424',
 128                                                                                                   '\u0425',
 129                                                                                                   '\u0426',
 130                                                                                                   '\u0427',
 131                                                                                                   '\u0428',
 132                                                                                                   '\u0429',
 133                                                                                                   '\u042A',
 134                                                                                                   '\u042B',
 135                                                                                                   '\u042C',
 136                                                                                                   '\u042D',
 137                                                                                                   '\u042E',
 138                                                                                                   '\u042F'
 139                                                                                           };
 140
 141                 /// <summary>
 142                 /// KOI8 charset
 143                 /// </summary>
 144                 public static char[] KOI8 = {
 145                                                                                 (char)0xc1,
 146                                                                                 (char)0xc2,
 147                                                                                 (char)0xd7,
 148                                                                                 (char)0xc7,
 149                                                                                 (char)0xc4,
 150                                                                                 (char)0xc5,
 151                                                                                 (char)0xd6,
 152                                                                                 (char)0xda,
 153                                                                                 (char)0xc9,
 154                                                                                 (char)0xca,
 155                                                                                 (char)0xcb,
 156                                                                                 (char)0xcc,
 157                                                                                 (char)0xcd,
 158                                                                                 (char)0xce,
 159                                                                                 (char)0xcf,
 160                                                                                 (char)0xd0,
 161                                                                                 (char)0xd2,
 162                                                                                 (char)0xd3,
 163                                                                                 (char)0xd4,
 164                                                                                 (char)0xd5,
 165                                                                                 (char)0xc6,
 166                                                                                 (char)0xc8,
 167                                                                                 (char)0xc3,
 168                                                                                 (char)0xde,
 169                                                                                 (char)0xdb,
 170                                                                                 (char)0xdd,
 171                                                                                 (char)0xdf,
 172                                                                                 (char)0xd9,
 173                                                                                 (char)0xd8,
 174                                                                                 (char)0xdc,
 175                                                                                 (char)0xc0,
 176                                                                                 (char)0xd1,
 177                                                                                 // upper case
 178                                                                                 (char)0xe1,
 179                                                                                 (char)0xe2,
 180                                                                                 (char)0xf7,
 181                                                                                 (char)0xe7,
 182                                                                                 (char)0xe4,
 183                                                                                 (char)0xe5,
 184                                                                                 (char)0xf6,
 185                                                                                 (char)0xfa,
 186                                                                                 (char)0xe9,
 187                                                                                 (char)0xea,
 188                                                                                 (char)0xeb,
 189                                                                                 (char)0xec,
 190                                                                                 (char)0xed,
 191                                                                                 (char)0xee,
 192                                                                                 (char)0xef,
 193                                                                                 (char)0xf0,
 194                                                                                 (char)0xf2,
 195                                                                                 (char)0xf3,
 196                                                                                 (char)0xf4,
 197                                                                                 (char)0xf5,
 198                                                                                 (char)0xe6,
 199                                                                                 (char)0xe8,
 200                                                                                 (char)0xe3,
 201                                                                                 (char)0xfe,
 202                                                                                 (char)0xfb,
 203                                                                                 (char)0xfd,
 204                                                                                 (char)0xff,
 205                                                                                 (char)0xf9,
 206                                                                                 (char)0xf8,
 207                                                                                 (char)0xfc,
 208                                                                                 (char)0xe0,
 209                                                                                 (char)0xf1
 210                                                                         };
 211
 212                 /// <summary>
 213                 /// CP1251 Charset
 214                 /// </summary>
 215                 public static char[] CP1251 = {
 216                                                                                   (char)0xE0,
 217                                                                                   (char)0xE1,
 218                                                                                   (char)0xE2,
 219                                                                                   (char)0xE3,
 220                                                                                   (char)0xE4,
 221                                                                                   (char)0xE5,
 222                                                                                   (char)0xE6,
 223                                                                                   (char)0xE7,
 224                                                                                   (char)0xE8,
 225                                                                                   (char)0xE9,
 226                                                                                   (char)0xEA,
 227                                                                                   (char)0xEB,
 228                                                                                   (char)0xEC,
 229                                                                                   (char)0xED,
 230                                                                                   (char)0xEE,
 231                                                                                   (char)0xEF,
 232                                                                                   (char)0xF0,
 233                                                                                   (char)0xF1,
 234                                                                                   (char)0xF2,
 235                                                                                   (char)0xF3,
 236                                                                                   (char)0xF4,
 237                                                                                   (char)0xF5,
 238                                                                                   (char)0xF6,
 239                                                                                   (char)0xF7,
 240                                                                                   (char)0xF8,
 241                                                                                   (char)0xF9,
 242                                                                                   (char)0xFA,
 243                                                                                   (char)0xFB,
 244                                                                                   (char)0xFC,
 245                                                                                   (char)0xFD,
 246                                                                                   (char)0xFE,
 247                                                                                   (char)0xFF,
 248                                                                                   // upper case
 249                                                                                   (char)0xC0,
 250                                                                                   (char)0xC1,
 251                                                                                   (char)0xC2,
 252                                                                                   (char)0xC3,
 253                                                                                   (char)0xC4,
 254                                                                                   (char)0xC5,
 255                                                                                   (char)0xC6,
 256                                                                                   (char)0xC7,
 257                                                                                   (char)0xC8,
 258                                                                                   (char)0xC9,
 259                                                                                   (char)0xCA,
 260                                                                                   (char)0xCB,
 261                                                                                   (char)0xCC,
 262                                                                                   (char)0xCD,
 263                                                                                   (char)0xCE,
 264                                                                                   (char)0xCF,
 265                                                                                   (char)0xD0,
 266                                                                                   (char)0xD1,
 267                                                                                   (char)0xD2,
 268                                                                                   (char)0xD3,
 269                                                                                   (char)0xD4,
 270                                                                                   (char)0xD5,
 271                                                                                   (char)0xD6,
 272                                                                                   (char)0xD7,
 273                                                                                   (char)0xD8,
 274                                                                                   (char)0xD9,
 275                                                                                   (char)0xDA,
 276                                                                                   (char)0xDB,
 277                                                                                   (char)0xDC,
 278                                                                                   (char)0xDD,
 279                                                                                   (char)0xDE,
 280                                                                                   (char)0xDF
 281                                                                           };
 282
 283                 public static char ToLowerCase(char letter, char[] charset)
 284                 {
 285                         if (charset == UnicodeRussian)
 286                         {
 287                                 if (letter >= '\u0430' && letter <= '\u044F')
 288                                 {
 289                                         return letter;
 290                                 }
 291                                 if (letter >= '\u0410' && letter <= '\u042F')
 292                                 {
 293                                         return (char) (letter + 32);
 294                                 }
 295                         }
 296
 297                         if (charset == KOI8)
 298                         {
 299                                 if (letter >= 0xe0 && letter <= 0xff)
 300                                 {
 301                                         return (char) (letter - 32);
 302                                 }
 303                                 if (letter >= 0xc0 && letter <= 0xdf)
 304                                 {
 305                                         return letter;
 306                                 }
 307
 308                         }
 309
 310                         if (charset == CP1251)
 311                         {
 312                                 if (letter >= 0xC0 && letter <= 0xDF)
 313                                 {
 314                                         return (char) (letter + 32);
 315                                 }
 316                                 if (letter >= 0xE0 && letter <= 0xFF)
 317                                 {
 318                                         return letter;
 319                                 }
 320
 321                         }
 322
 323                         return Char.ToLower(letter);
 324                 }
 325         }
 326 }