Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / Lucene.Net / Analysis / RU / RussianCharsets.cs
blobf3ba2224ff90665449072ac3e41cc844c2260b94
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 using System;
17 namespace Lucene.Net.Analysis.RU
/// <summary> RussianCharsets contains encoding schemes (charsets) and a ToLowerCase() implementation
/// for Russian characters in Unicode, KOI8 and CP1251.
/// Each charset table holds the 32 lowercase letters (positions 0-31) followed by the
/// 32 uppercase letters (positions 32-63), in the same alphabetical order.
/// Other encoding schemes (like ISO-8859-5 or a customized one) can be supported by adding a new
/// charset table and adding the matching logic to ToLowerCase().
/// </summary>
/// <author> Boris Okner, b.okner@rogers.com
/// </author>
/// <version> $Id: RussianCharsets.cs,v 1.2 2005/01/17 19:54:28 joeshaw Exp $
/// </version>
public class RussianCharsets
{
    // Unicode Russian charset: lowercase U+0430..U+044F, then uppercase U+0410..U+042F.
    public static char[] UnicodeRussian = new char[]
    {
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
    };

    // KOI8 charset: single-byte codes stored as chars; lowercase codes lie in 0xC0..0xDF,
    // uppercase codes in 0xE0..0xFF.
    public static char[] KOI8 = new char[]
    {
        (char) 0xc1, (char) 0xc2, (char) 0xd7, (char) 0xc7, (char) 0xc4, (char) 0xc5, (char) 0xd6, (char) 0xda,
        (char) 0xc9, (char) 0xca, (char) 0xcb, (char) 0xcc, (char) 0xcd, (char) 0xce, (char) 0xcf, (char) 0xd0,
        (char) 0xd2, (char) 0xd3, (char) 0xd4, (char) 0xd5, (char) 0xc6, (char) 0xc8, (char) 0xc3, (char) 0xde,
        (char) 0xdb, (char) 0xdd, (char) 0xdf, (char) 0xd9, (char) 0xd8, (char) 0xdc, (char) 0xc0, (char) 0xd1,
        (char) 0xe1, (char) 0xe2, (char) 0xf7, (char) 0xe7, (char) 0xe4, (char) 0xe5, (char) 0xf6, (char) 0xfa,
        (char) 0xe9, (char) 0xea, (char) 0xeb, (char) 0xec, (char) 0xed, (char) 0xee, (char) 0xef, (char) 0xf0,
        (char) 0xf2, (char) 0xf3, (char) 0xf4, (char) 0xf5, (char) 0xe6, (char) 0xe8, (char) 0xe3, (char) 0xfe,
        (char) 0xfb, (char) 0xfd, (char) 0xff, (char) 0xf9, (char) 0xf8, (char) 0xfc, (char) 0xe0, (char) 0xf1
    };

    // CP1251 charset: single-byte codes stored as chars; lowercase codes 0xE0..0xFF,
    // then uppercase codes 0xC0..0xDF.
    public static char[] CP1251 = new char[]
    {
        (char) 0xE0, (char) 0xE1, (char) 0xE2, (char) 0xE3, (char) 0xE4, (char) 0xE5, (char) 0xE6, (char) 0xE7,
        (char) 0xE8, (char) 0xE9, (char) 0xEA, (char) 0xEB, (char) 0xEC, (char) 0xED, (char) 0xEE, (char) 0xEF,
        (char) 0xF0, (char) 0xF1, (char) 0xF2, (char) 0xF3, (char) 0xF4, (char) 0xF5, (char) 0xF6, (char) 0xF7,
        (char) 0xF8, (char) 0xF9, (char) 0xFA, (char) 0xFB, (char) 0xFC, (char) 0xFD, (char) 0xFE, (char) 0xFF,
        (char) 0xC0, (char) 0xC1, (char) 0xC2, (char) 0xC3, (char) 0xC4, (char) 0xC5, (char) 0xC6, (char) 0xC7,
        (char) 0xC8, (char) 0xC9, (char) 0xCA, (char) 0xCB, (char) 0xCC, (char) 0xCD, (char) 0xCE, (char) 0xCF,
        (char) 0xD0, (char) 0xD1, (char) 0xD2, (char) 0xD3, (char) 0xD4, (char) 0xD5, (char) 0xD6, (char) 0xD7,
        (char) 0xD8, (char) 0xD9, (char) 0xDA, (char) 0xDB, (char) 0xDC, (char) 0xDD, (char) 0xDE, (char) 0xDF
    };

    /// <summary> Converts a character to lowercase according to the given charset.
    /// The charset is recognised by reference, so callers must pass one of the static tables
    /// of this class (UnicodeRussian, KOI8 or CP1251). Characters outside the charset's
    /// Russian letter range, and any unrecognised (or null) charset, fall back to
    /// culture-invariant lowercasing.
    /// </summary>
    /// <param name="letter">The character (or single-byte code stored in a char) to convert.</param>
    /// <param name="charset">The charset table that <paramref name="letter"/> is encoded in.</param>
    /// <returns>The lowercase form of <paramref name="letter"/> in the same charset.</returns>
    public static char ToLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F')
            {
                // Already lowercase.
                return letter;
            }
            if (letter >= '\u0410' && letter <= '\u042F')
            {
                // Lowercase letters sit 32 code points above their uppercase forms.
                return (char) (letter + 32);
            }
        }

        if (charset == KOI8)
        {
            if (letter >= 0xe0 && letter <= 0xff)
            {
                // In KOI8 the uppercase codes are the higher ones, so lowering subtracts 32.
                return (char) (letter - 32);
            }
            if (letter >= 0xc0 && letter <= 0xdf)
            {
                // Already lowercase.
                return letter;
            }
        }

        if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF)
            {
                // Lowercase codes sit 32 above their uppercase forms.
                return (char) (letter + 32);
            }
            if (letter >= 0xE0 && letter <= 0xFF)
            {
                // Already lowercase.
                return letter;
            }
        }

        // Use the invariant culture so the result does not depend on the current thread
        // culture (e.g. under tr-TR, 'I' would otherwise lower to dotless 'ı').
        return System.Char.ToLower(letter, System.Globalization.CultureInfo.InvariantCulture);
    }
}