cvsimport
[beagle.git] / beagled / Lucene.Net / Analysis / ISOLatin1AccentFilter.cs
blob9540485dcb41d1f28b5c74c86b282462d9f861a4
/*
 * Copyright 2004-2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 using System;
19 namespace Lucene.Net.Analysis
/// <summary> A filter that replaces accented characters in the ISO Latin 1 character set
/// (ISO-8859-1) by their unaccented equivalents. The case will not be altered.
/// <para>
/// For instance, U+00E0 (lowercase a with grave accent) will be replaced by 'a'.
/// </para>
/// </summary>
28 public class ISOLatin1AccentFilter : TokenFilter
/// <summary>Creates an accent-removing filter over the given token stream.</summary>
/// <param name="input">The token stream handed to the base filter constructor;
/// <see cref="Next"/> reads its tokens from it.</param>
public ISOLatin1AccentFilter(TokenStream input) : base(input)
{
}
/// <summary>
/// Reads the next token from the wrapped stream and returns a copy whose term
/// text has been passed through <see cref="RemoveAccents"/>.
/// </summary>
/// <returns>
/// <c>null</c> when the wrapped stream returns <c>null</c>; otherwise a new token
/// with the filtered text and the same start offset, end offset, type and
/// position increment as the token that was read.
/// </returns>
public override Token Next()
{
    Token t = input.Next();
    if (t == null)
    {
        return null;
    }

    Token filtered = new Token(RemoveAccents(t.TermText()), t.StartOffset(), t.EndOffset(), t.Type());

    // The constructor above takes only text, offsets and type, so the position
    // increment of the source token has to be copied explicitly; otherwise a
    // gap recorded by an earlier filter in the chain would be lost here.
    filtered.SetPositionIncrement(t.GetPositionIncrement());
    return filtered;
}
/// <summary>
/// Replaces accented characters in a string by their unaccented equivalents.
/// </summary>
/// <remarks>
/// Most mapped characters become a single unaccented letter of the same case.
/// A few expand to two letters (U+00C6 to "AE", U+0152 to "OE", U+00DE to "TH",
/// U+00DF to "ss" and their lowercase counterparts), so the result can be longer
/// than the input. Characters without a mapping are copied unchanged.
/// </remarks>
/// <param name="input">The text to filter. Must not be <c>null</c>.</param>
/// <returns>The filtered text.</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="input"/> is <c>null</c>.
/// </exception>
public static System.String RemoveAccents(System.String input)
{
    if (input == null)
    {
        throw new System.ArgumentNullException("input");
    }

    // Presize to the input length: nearly every character maps one-to-one, so
    // the builder only has to grow when a two-letter expansion occurs.
    System.Text.StringBuilder output = new System.Text.StringBuilder(input.Length);

    for (int i = 0; i < input.Length; i++)
    {
        char c = input[i];

        switch (c)
        {
            // U+00C0..U+00C5: A with grave, acute, circumflex, tilde, diaeresis, ring
            case '\u00C0':
            case '\u00C1':
            case '\u00C2':
            case '\u00C3':
            case '\u00C4':
            case '\u00C5':
                output.Append('A');
                break;

            // U+00C6: ligature AE
            case '\u00C6':
                output.Append("AE");
                break;

            // U+00C7: C with cedilla
            case '\u00C7':
                output.Append('C');
                break;

            // U+00C8..U+00CB: E with grave, acute, circumflex, diaeresis
            case '\u00C8':
            case '\u00C9':
            case '\u00CA':
            case '\u00CB':
                output.Append('E');
                break;

            // U+00CC..U+00CF: I with grave, acute, circumflex, diaeresis
            case '\u00CC':
            case '\u00CD':
            case '\u00CE':
            case '\u00CF':
                output.Append('I');
                break;

            // U+00D0: capital eth
            case '\u00D0':
                output.Append('D');
                break;

            // U+00D1: N with tilde
            case '\u00D1':
                output.Append('N');
                break;

            // U+00D2..U+00D6: O with grave, acute, circumflex, tilde, diaeresis;
            // U+00D8: O with stroke. U+00D7 (multiplication sign) is deliberately absent.
            case '\u00D2':
            case '\u00D3':
            case '\u00D4':
            case '\u00D5':
            case '\u00D6':
            case '\u00D8':
                output.Append('O');
                break;

            // U+0152: ligature OE
            case '\u0152':
                output.Append("OE");
                break;

            // U+00DE: capital thorn
            case '\u00DE':
                output.Append("TH");
                break;

            // U+00D9..U+00DC: U with grave, acute, circumflex, diaeresis
            case '\u00D9':
            case '\u00DA':
            case '\u00DB':
            case '\u00DC':
                output.Append('U');
                break;

            // U+00DD: Y with acute; U+0178: Y with diaeresis
            case '\u00DD':
            case '\u0178':
                output.Append('Y');
                break;

            // U+00E0..U+00E5: a with grave, acute, circumflex, tilde, diaeresis, ring
            case '\u00E0':
            case '\u00E1':
            case '\u00E2':
            case '\u00E3':
            case '\u00E4':
            case '\u00E5':
                output.Append('a');
                break;

            // U+00E6: ligature ae
            case '\u00E6':
                output.Append("ae");
                break;

            // U+00E7: c with cedilla
            case '\u00E7':
                output.Append('c');
                break;

            // U+00E8..U+00EB: e with grave, acute, circumflex, diaeresis
            case '\u00E8':
            case '\u00E9':
            case '\u00EA':
            case '\u00EB':
                output.Append('e');
                break;

            // U+00EC..U+00EF: i with grave, acute, circumflex, diaeresis
            case '\u00EC':
            case '\u00ED':
            case '\u00EE':
            case '\u00EF':
                output.Append('i');
                break;

            // U+00F0: small eth
            case '\u00F0':
                output.Append('d');
                break;

            // U+00F1: n with tilde
            case '\u00F1':
                output.Append('n');
                break;

            // U+00F2..U+00F6: o with grave, acute, circumflex, tilde, diaeresis;
            // U+00F8: o with stroke. U+00F7 (division sign) is deliberately absent.
            case '\u00F2':
            case '\u00F3':
            case '\u00F4':
            case '\u00F5':
            case '\u00F6':
            case '\u00F8':
                output.Append('o');
                break;

            // U+0153: ligature oe
            case '\u0153':
                output.Append("oe");
                break;

            // U+00DF: sharp s
            case '\u00DF':
                output.Append("ss");
                break;

            // U+00FE: small thorn
            case '\u00FE':
                output.Append("th");
                break;

            // U+00F9..U+00FC: u with grave, acute, circumflex, diaeresis
            case '\u00F9':
            case '\u00FA':
            case '\u00FB':
            case '\u00FC':
                output.Append('u');
                break;

            // U+00FD: y with acute; U+00FF: y with diaeresis
            case '\u00FD':
            case '\u00FF':
                output.Append('y');
                break;

            // No mapping: keep the character as it is.
            default:
                output.Append(c);
                break;
        }
    }

    return output.ToString();
}