Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / Lucene.Net / Analysis / DE / GermanStemmer.cs
blob8846d73db8a6c22f7ced549d2c6208a463458e6d
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 using System;
namespace Lucene.Net.Analysis.DE
{
    /// <summary> A stemmer for German words. The algorithm is based on the report
    /// "A Fast and Simple Stemming Algorithm for German Words" by J&#246;rg
    /// Caumanns (joerg.caumanns@isst.fhg.de).
    ///
    /// An instance reuses one internal buffer and one substitution counter across
    /// calls, so a single instance must not be used from several threads at once.
    /// </summary>
    /// <author> Gerhard Schwarz
    /// </author>
    /// <version> $Id: GermanStemmer.cs,v 1.2 2005/01/17 19:54:27 joeshaw Exp $
    /// </version>
    public class GermanStemmer
    {
        // Placeholder characters used while stemming. None of them is a letter, and
        // only all-letter terms are stemmed, so they can never collide with input.
        // The non-ASCII ones are written as escapes so that a wrong source-file
        // encoding cannot silently corrupt them.
        private const char TokenDouble = '*';      // second char of a doubled pair
        private const char TokenSch = '$';         // "sch"
        private const char TokenCh = '\u00A7';     // "ch"  (section sign)
        private const char TokenEi = '%';          // "ei"
        private const char TokenIe = '&';          // "ie"
        private const char TokenIg = '#';          // "ig"
        private const char TokenSt = '!';          // "st"

        private const char UmlautA = '\u00E4';     // a with diaeresis
        private const char UmlautO = '\u00F6';     // o with diaeresis
        private const char UmlautU = '\u00FC';     // u with diaeresis
        private const char SharpS = '\u00DF';      // sharp s (eszett)

        /// <summary> Buffer for the terms while stemming them.</summary>
        private System.Text.StringBuilder sb = new System.Text.StringBuilder();

        /// <summary> Net number of characters by which <c>Substitute()</c> changed the
        /// term length; <c>Strip()</c> adds it back so that its length thresholds
        /// refer to the original word length.</summary>
        private int substCount = 0;

        /// <summary> Stems the given term to a unique <c>discriminator</c>.
        /// </summary>
        /// <param name="term"> The term that should be stemmed.
        /// </param>
        /// <returns> Discriminator for <c>term</c>. Terms containing any non-letter
        /// character are returned lower-cased but otherwise unchanged.
        /// </returns>
        protected internal virtual System.String Stem(System.String term)
        {
            // Lower-case with the invariant culture so the result does not depend on
            // the locale of the machine (e.g. Turkish dotless i).
            term = term.ToLower(System.Globalization.CultureInfo.InvariantCulture);
            if (!IsStemmable(term))
            {
                return term;
            }

            // Reset the shared buffer and load the term.
            sb.Length = 0;
            sb.Append(term);

            // Stemming starts here. The order matters: placeholders must be in
            // place before stripping and must be expanded again before the
            // "gege" search, which looks at real characters.
            Substitute(sb);
            Strip(sb);
            Optimize(sb);
            Resubstitute(sb);
            RemoveParticleDenotion(sb);
            return sb.ToString();
        }

        /// <summary> Checks if a term could be stemmed.
        /// </summary>
        /// <returns> true if, and only if, every character of the term is a letter
        /// (an empty term therefore counts as stemmable).
        /// </returns>
        private bool IsStemmable(System.String term)
        {
            for (int c = 0; c < term.Length; c++)
            {
                if (!System.Char.IsLetter(term[c]))
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary> Tells whether the buffer currently ends with <c>suffix</c>,
        /// without allocating a substring.</summary>
        private static bool EndsWith(System.Text.StringBuilder buffer, System.String suffix)
        {
            int offset = buffer.Length - suffix.Length;
            if (offset < 0)
            {
                return false;
            }
            for (int i = 0; i < suffix.Length; i++)
            {
                if (buffer[offset + i] != suffix[i])
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary> Suffix stripping (stemming) on the current term. The stripping is
        /// reduced to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and
        /// "nd", from which all regular suffixes are built. The simplification causes
        /// some overstemming and more irregular stems, but still provides unique
        /// discriminators in most of those cases.
        /// The algorithm is context free, except for the length restrictions.
        /// </summary>
        private void Strip(System.Text.StringBuilder buffer)
        {
            bool doMore = true;
            while (doMore && buffer.Length > 3)
            {
                // Length of the word as it was before placeholders shortened or
                // lengthened it.
                int originalLength = buffer.Length + substCount;
                char last = buffer[buffer.Length - 1];

                if (originalLength > 5 && EndsWith(buffer, "nd"))
                {
                    buffer.Remove(buffer.Length - 2, 2);
                }
                else if (originalLength > 4 && (EndsWith(buffer, "em") || EndsWith(buffer, "er")))
                {
                    buffer.Remove(buffer.Length - 2, 2);
                }
                else if (last == 'e' || last == 's' || last == 'n' || last == 't')
                {
                    // "t" occurs only as suffix of verbs.
                    buffer.Remove(buffer.Length - 1, 1);
                }
                else
                {
                    doMore = false;
                }
            }
        }

        /// <summary> Does some contextual optimizations on the term.
        /// </summary>
        private void Optimize(System.Text.StringBuilder buffer)
        {
            // Additional step for female plurals of professions and inhabitants:
            // "...erinn(en)" has become "...erin*" at this point.
            if (buffer.Length > 5 && EndsWith(buffer, "erin*"))
            {
                buffer.Remove(buffer.Length - 1, 1);
                Strip(buffer);
            }

            // Additional step for irregular plural nouns like "Matrizen -> Matrix".
            // The length guard keeps an empty term from indexing out of range.
            if (buffer.Length > 0 && buffer[buffer.Length - 1] == 'z')
            {
                buffer[buffer.Length - 1] = 'x';
            }
        }

        /// <summary> Removes the first "ge" of the first "gege" sequence found in the
        /// term (participle marker); terms of four characters or fewer are left alone.
        /// </summary>
        private void RemoveParticleDenotion(System.Text.StringBuilder buffer)
        {
            if (buffer.Length <= 4)
            {
                return;
            }
            for (int c = 0; c < buffer.Length - 3; c++)
            {
                // ToString takes (startIndex, length), not (start, end).
                if (buffer.ToString(c, 4) == "gege")
                {
                    buffer.Remove(c, 2);
                    return;
                }
            }
        }

        /// <summary> Replaces the character at <c>index</c> by <c>token</c>, drops the
        /// following <c>removed</c> characters and records the shrinkage.</summary>
        private void Mask(System.Text.StringBuilder buffer, int index, char token, int removed)
        {
            buffer[index] = token;
            buffer.Remove(index + 1, removed);
            substCount += removed;
        }

        /// <summary> Does some substitutions on the term to reduce overstemming:
        ///
        /// - Substitute umlauts with their corresponding vowel (a/o/u); the sharp s
        ///   is substituted by "ss", also at the end of the term.
        /// - Substitute the second char of a pair of equal characters with
        ///   an asterisk: ?? -> ?*
        /// - Substitute some common character combinations with a one-character
        ///   token: sch/ch/ei/ie/ig/st.
        /// </summary>
        private void Substitute(System.Text.StringBuilder buffer)
        {
            substCount = 0;
            for (int c = 0; c < buffer.Length; c++)
            {
                char current = buffer[c];

                if (c > 0 && current == buffer[c - 1])
                {
                    // Second char of a pair of equal characters.
                    buffer[c] = TokenDouble;
                }
                else if (current == UmlautA)
                {
                    buffer[c] = 'a';
                }
                else if (current == UmlautO)
                {
                    buffer[c] = 'o';
                }
                else if (current == UmlautU)
                {
                    buffer[c] = 'u';
                }
                else if (current == SharpS)
                {
                    // Handled here rather than with the pairs below so that a
                    // sharp s in final position is replaced as well.
                    buffer[c] = 's';
                    buffer.Insert(c + 1, 's');
                    substCount++;
                }

                // The combinations need at least one character to the right.
                if (c < buffer.Length - 1)
                {
                    char first = buffer[c];
                    char second = buffer[c + 1];

                    if (c < buffer.Length - 2 && first == 's' && second == 'c' && buffer[c + 2] == 'h')
                    {
                        Mask(buffer, c, TokenSch, 2);
                    }
                    else if (first == 'c' && second == 'h')
                    {
                        Mask(buffer, c, TokenCh, 1);
                    }
                    else if (first == 'e' && second == 'i')
                    {
                        Mask(buffer, c, TokenEi, 1);
                    }
                    else if (first == 'i' && second == 'e')
                    {
                        Mask(buffer, c, TokenIe, 1);
                    }
                    else if (first == 'i' && second == 'g')
                    {
                        Mask(buffer, c, TokenIg, 1);
                    }
                    else if (first == 's' && second == 't')
                    {
                        Mask(buffer, c, TokenSt, 1);
                    }
                }
            }
        }

        /// <summary> Writes <c>first</c> at <c>index</c> and inserts <c>rest</c> right
        /// after it, expanding a one-character token back to its letters.</summary>
        private static void Expand(System.Text.StringBuilder buffer, int index, char first, System.String rest)
        {
            buffer[index] = first;
            buffer.Insert(index + 1, rest);
        }

        /// <summary> Undoes the changes made by <c>Substitute()</c>, i.e. character
        /// pairs and character combinations. Umlauts remain as their corresponding
        /// vowel, and the sharp s remains as "ss".
        /// </summary>
        private void Resubstitute(System.Text.StringBuilder buffer)
        {
            for (int c = 0; c < buffer.Length; c++)
            {
                switch (buffer[c])
                {
                    case TokenDouble:
                        // Substitute() never places the marker at index 0; the
                        // guard only protects against indexing before the start.
                        if (c > 0)
                        {
                            buffer[c] = buffer[c - 1];
                        }
                        break;
                    case TokenSch:
                        Expand(buffer, c, 's', "ch");
                        break;
                    case TokenCh:
                        Expand(buffer, c, 'c', "h");
                        break;
                    case TokenEi:
                        Expand(buffer, c, 'e', "i");
                        break;
                    case TokenIe:
                        Expand(buffer, c, 'i', "e");
                        break;
                    case TokenIg:
                        Expand(buffer, c, 'i', "g");
                        break;
                    case TokenSt:
                        Expand(buffer, c, 's', "t");
                        break;
                }
            }
        }
    }
}