First post!
[beagle.git] / Lucene.Net / Analysis / RU / RussianCharsets.cs
blobba7313409a4353a0cb345ed5a1f8d1e6b993f05f
1 using System;
3 namespace Lucene.Net.Analysis.Ru
5 /* ====================================================================
6 * The Apache Software License, Version 1.1
8 * Copyright (c) 2001 The Apache Software Foundation. All rights
9 * reserved.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in
20 * the documentation and/or other materials provided with the
21 * distribution.
23 * 3. The end-user documentation included with the redistribution,
24 * if any, must include the following acknowledgment:
25 * "This product includes software developed by the
26 * Apache Software Foundation (http://www.apache.org/)."
27 * Alternately, this acknowledgment may appear in the software itself,
28 * if and wherever such third-party acknowledgments normally appear.
30 * 4. The names "Apache" and "Apache Software Foundation" and
31 * "Apache Lucene" must not be used to endorse or promote products
32 * derived from this software without prior written permission. For
33 * written permission, please contact apache@apache.org.
35 * 5. Products derived from this software may not be called "Apache",
36 * "Apache Lucene", nor may "Apache" appear in their name, without
37 * prior written permission of the Apache Software Foundation.
39 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 * ====================================================================
53 * This software consists of voluntary contributions made by many
54 * individuals on behalf of the Apache Software Foundation. For more
55 * information on the Apache Software Foundation, please see
56 * <http://www.apache.org/>.
/// <summary>
/// Holds the character tables (charsets) for Russian letters in Unicode, KOI8 and CP1251,
/// together with a <see cref="ToLowerCase"/> implementation that understands them.
/// Each table stores the lowercase letters at indices 0-31 and the uppercase letters at
/// indices 32-63.
/// Another encoding scheme (ISO-8859-5, a customized one, ...) can be supported by adding a
/// new table and a matching branch in <see cref="ToLowerCase"/>.
/// </summary>
/// <author>Boris Okner, b.okner@rogers.com</author>
/// <version>$Id: RussianCharsets.cs,v 1.1.1.1 2004/04/29 22:53:51 trow Exp $</version>
public class RussianCharsets
{
    /// <summary>
    /// Distance between the uppercase and the lowercase code of the same letter.
    /// It is 32 in all three tables below; only the direction differs.
    /// </summary>
    private const int CaseOffset = 32;

    /// <summary>
    /// Unicode Russian charset: lowercase letters U+0430..U+044F (indices 0-31)
    /// followed by uppercase letters U+0410..U+042F (indices 32-63).
    /// </summary>
    public static char[] UnicodeRussian = {
        // lower case
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
    };

    /// <summary>
    /// KOI8 charset: lowercase codes (indices 0-31, all within 0xC0..0xDF)
    /// followed by uppercase codes (indices 32-63, all within 0xE0..0xFF).
    /// Each entry is the single-byte code widened to a <c>char</c>.
    /// </summary>
    public static char[] KOI8 = {
        // lower case
        (char)0xc1, (char)0xc2, (char)0xd7, (char)0xc7, (char)0xc4, (char)0xc5, (char)0xd6, (char)0xda,
        (char)0xc9, (char)0xca, (char)0xcb, (char)0xcc, (char)0xcd, (char)0xce, (char)0xcf, (char)0xd0,
        (char)0xd2, (char)0xd3, (char)0xd4, (char)0xd5, (char)0xc6, (char)0xc8, (char)0xc3, (char)0xde,
        (char)0xdb, (char)0xdd, (char)0xdf, (char)0xd9, (char)0xd8, (char)0xdc, (char)0xc0, (char)0xd1,
        // upper case
        (char)0xe1, (char)0xe2, (char)0xf7, (char)0xe7, (char)0xe4, (char)0xe5, (char)0xf6, (char)0xfa,
        (char)0xe9, (char)0xea, (char)0xeb, (char)0xec, (char)0xed, (char)0xee, (char)0xef, (char)0xf0,
        (char)0xf2, (char)0xf3, (char)0xf4, (char)0xf5, (char)0xe6, (char)0xe8, (char)0xe3, (char)0xfe,
        (char)0xfb, (char)0xfd, (char)0xff, (char)0xf9, (char)0xf8, (char)0xfc, (char)0xe0, (char)0xf1
    };

    /// <summary>
    /// CP1251 charset: lowercase codes 0xE0..0xFF (indices 0-31)
    /// followed by uppercase codes 0xC0..0xDF (indices 32-63).
    /// Each entry is the single-byte code widened to a <c>char</c>.
    /// </summary>
    public static char[] CP1251 = {
        // lower case
        (char)0xE0, (char)0xE1, (char)0xE2, (char)0xE3, (char)0xE4, (char)0xE5, (char)0xE6, (char)0xE7,
        (char)0xE8, (char)0xE9, (char)0xEA, (char)0xEB, (char)0xEC, (char)0xED, (char)0xEE, (char)0xEF,
        (char)0xF0, (char)0xF1, (char)0xF2, (char)0xF3, (char)0xF4, (char)0xF5, (char)0xF6, (char)0xF7,
        (char)0xF8, (char)0xF9, (char)0xFA, (char)0xFB, (char)0xFC, (char)0xFD, (char)0xFE, (char)0xFF,
        // upper case
        (char)0xC0, (char)0xC1, (char)0xC2, (char)0xC3, (char)0xC4, (char)0xC5, (char)0xC6, (char)0xC7,
        (char)0xC8, (char)0xC9, (char)0xCA, (char)0xCB, (char)0xCC, (char)0xCD, (char)0xCE, (char)0xCF,
        (char)0xD0, (char)0xD1, (char)0xD2, (char)0xD3, (char)0xD4, (char)0xD5, (char)0xD6, (char)0xD7,
        (char)0xD8, (char)0xD9, (char)0xDA, (char)0xDB, (char)0xDC, (char)0xDD, (char)0xDE, (char)0xDF
    };

    /// <summary>
    /// Converts <paramref name="letter"/> to lower case according to the given charset.
    /// </summary>
    /// <param name="letter">
    /// The character to convert. For <see cref="KOI8"/> and <see cref="CP1251"/> its numeric
    /// value is compared against the single-byte code ranges of those tables.
    /// </param>
    /// <param name="charset">
    /// One of <see cref="UnicodeRussian"/>, <see cref="KOI8"/> or <see cref="CP1251"/>.
    /// The table is recognised by reference, not by content: a copy of one of these arrays,
    /// any other array, or <c>null</c> selects the generic fallback.
    /// </param>
    /// <returns>
    /// The lowercase counterpart when <paramref name="letter"/> is an uppercase Russian letter
    /// of the charset; the letter itself when it is already a lowercase Russian letter of the
    /// charset; otherwise the culture-invariant lowercase form of <paramref name="letter"/>.
    /// </returns>
    public static char ToLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F')
            {
                return letter;
            }
            if (letter >= '\u0410' && letter <= '\u042F')
            {
                // Unicode: lowercase block sits 32 code points above the uppercase block.
                return (char) (letter + CaseOffset);
            }
        }

        if (charset == KOI8)
        {
            if (letter >= 0xe0 && letter <= 0xff)
            {
                // KOI8: uppercase codes are the higher ones, so lowering subtracts.
                return (char) (letter - CaseOffset);
            }
            if (letter >= 0xc0 && letter <= 0xdf)
            {
                return letter;
            }
        }

        if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF)
            {
                // CP1251: uppercase codes are the lower ones, so lowering adds.
                return (char) (letter + CaseOffset);
            }
            if (letter >= 0xE0 && letter <= 0xFF)
            {
                return letter;
            }
        }

        // Anything outside the Russian ranges (or an unknown charset) gets generic lowercasing.
        // The invariant culture is used on purpose: the result must not depend on the
        // thread's current culture (e.g. Turkish maps 'I' to dotless 'ı'), otherwise text
        // analysed on machines with different locales would yield different tokens.
        return Char.ToLower(letter, System.Globalization.CultureInfo.InvariantCulture);
    }
}