First post!
[beagle.git] / Lucene.Net / Analysis / DE / GermanStemmer.cs
blob6c22cb44db60852723aac801808b9481bf95e53e
1 using System;
2 using System.IO;
3 using System.Text;
4 using System.Collections;
6 namespace Lucene.Net.Analysis.De
8 /* ====================================================================
9 * The Apache Software License, Version 1.1
11 * Copyright (c) 2001 The Apache Software Foundation. All rights
12 * reserved.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in
23 * the documentation and/or other materials provided with the
24 * distribution.
26 * 3. The end-user documentation included with the redistribution,
27 * if any, must include the following acknowledgment:
28 * "This product includes software developed by the
29 * Apache Software Foundation (http://www.apache.org/)."
30 * Alternately, this acknowledgment may appear in the software itself,
31 * if and wherever such third-party acknowledgments normally appear.
33 * 4. The names "Apache" and "Apache Software Foundation" and
34 * "Apache Lucene" must not be used to endorse or promote products
35 * derived from this software without prior written permission. For
36 * written permission, please contact apache@apache.org.
38 * 5. Products derived from this software may not be called "Apache",
39 * "Apache Lucene", nor may "Apache" appear in their name, without
40 * prior written permission of the Apache Software Foundation.
42 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
43 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
44 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
46 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
49 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
50 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
51 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
52 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 * ====================================================================
56 * This software consists of voluntary contributions made by many
57 * individuals on behalf of the Apache Software Foundation. For more
58 * information on the Apache Software Foundation, please see
59 * <http://www.apache.org/>.
/// <summary>
/// A stemmer for German words. The algorithm is based on the report
/// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
/// Caumanns (joerg.caumanns@isst.fhg.de).
/// </summary>
/// <remarks>
/// <see cref="Stem"/> mutates instance fields (the work buffer, the noun flag and
/// the substitution counter) without any synchronization, so a single instance
/// must not be used by several threads at the same time.
/// </remarks>
/// <author>Gerhard Schwarz</author>
/// <version>$Id: GermanStemmer.cs,v 1.1.1.1 2004/04/29 22:53:51 trow Exp $</version>
public class GermanStemmer
{
    /// <summary>
    /// Buffer for the term while it is being stemmed.
    /// </summary>
    private readonly StringBuilder sb = new StringBuilder();

    /// <summary>
    /// Indicates whether the current term is handled as a noun
    /// (its first character was upper case).
    /// </summary>
    private bool uppercase = false;

    /// <summary>
    /// Number of characters accounted for by <c>Substitute()</c> while stemming;
    /// added to the buffer length by <c>Strip()</c> for its length restrictions.
    /// </summary>
    private int substCount = 0;

    /// <summary>
    /// Stems the given term to a unique <c>discriminator</c>.
    /// </summary>
    /// <param name="term">The term that should be stemmed.</param>
    /// <returns>
    /// The discriminator for <c>term</c>. A null or empty term is returned
    /// unchanged; a term containing a non-letter character is returned
    /// lower-cased but otherwise unstemmed.
    /// </returns>
    internal String Stem( String term )
    {
        // Nothing to stem; also protects the term[0] access below.
        if ( term == null || term.Length == 0 )
        {
            return term;
        }

        // Mark a possible noun.
        uppercase = Char.IsUpper( term[0] );

        // Lower-case with the invariant culture so the result does not depend
        // on the current thread culture (e.g. Turkish dotless i).
        term = term.ToLower( System.Globalization.CultureInfo.InvariantCulture );
        if ( !IsStemmable( term ) )
        {
            return term;
        }

        // Reset the StringBuilder.
        sb.Length = 0;
        sb.Append( term );

        // Stemming starts here...
        Substitute( sb );
        Strip( sb );
        Optimize( sb );
        Resubstitute( sb );
        RemoveParticleDenotion( sb );
        return sb.ToString();
    }

    /// <summary>
    /// Checks if a term could be stemmed.
    /// </summary>
    /// <param name="term">The term to check.</param>
    /// <returns>true if, and only if, the given term consists of letters only.</returns>
    private bool IsStemmable( String term )
    {
        for ( int c = 0; c < term.Length; c++ )
        {
            if ( !Char.IsLetter( term[c] ) )
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Tests whether the buffer ends with the given suffix using plain
    /// character comparison, without allocating an intermediate string.
    /// </summary>
    /// <param name="buffer">The buffer to inspect.</param>
    /// <param name="suffix">The suffix to look for.</param>
    /// <returns>true if the last characters of <c>buffer</c> equal <c>suffix</c>.</returns>
    private static bool EndsWith( StringBuilder buffer, String suffix )
    {
        int offset = buffer.Length - suffix.Length;
        if ( offset < 0 )
        {
            return false;
        }
        for ( int i = 0; i < suffix.Length; i++ )
        {
            if ( buffer[offset + i] != suffix[i] )
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Suffix stripping (stemming) on the current term. The stripping is reduced
    /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
    /// from which all regular suffixes are built. The simplification causes
    /// some overstemming, and way more irregular stems, but still provides unique
    /// discriminators in most of those cases.
    /// The algorithm is context free, except for the length restrictions.
    /// </summary>
    /// <param name="buffer">The buffer holding the substituted term; modified in place.</param>
    private void Strip( StringBuilder buffer )
    {
        while ( buffer.Length > 3 )
        {
            // Length restrictions take the substitution counter into account.
            int effectiveLength = buffer.Length + substCount;
            char last = buffer[buffer.Length - 1];

            if ( effectiveLength > 5 && EndsWith( buffer, "nd" ) )
            {
                // StringBuilder.Remove takes (startIndex, length), not (start, end).
                buffer.Remove( buffer.Length - 2, 2 );
            }
            else if ( effectiveLength > 4 &&
                ( EndsWith( buffer, "em" ) || EndsWith( buffer, "er" ) ) )
            {
                buffer.Remove( buffer.Length - 2, 2 );
            }
            else if ( last == 'e' || last == 's' || last == 'n' ||
                // "t" occurs only as suffix of verbs.
                ( last == 't' && !uppercase ) )
            {
                buffer.Remove( buffer.Length - 1, 1 );
            }
            else
            {
                break;
            }
        }
    }

    /// <summary>
    /// Does some optimizations on the term. These optimizations are contextual.
    /// </summary>
    /// <param name="buffer">The buffer holding the stripped term; modified in place.</param>
    private void Optimize( StringBuilder buffer )
    {
        // Additional step for female plurals of professions and inhabitants.
        if ( buffer.Length > 5 && EndsWith( buffer, "erin*" ) )
        {
            buffer.Remove( buffer.Length - 1, 1 );
            Strip( buffer );
        }

        // Additional step for irregular plural nouns like "Matrizen -> Matrix".
        if ( buffer.Length > 0 && buffer[buffer.Length - 1] == 'z' )
        {
            buffer[buffer.Length - 1] = 'x';
        }
    }

    /// <summary>
    /// Removes a particle denotation ("ge") from a term: the first occurrence
    /// of "gege" is shortened to "ge". Terms of four characters or fewer are
    /// left untouched.
    /// </summary>
    /// <param name="buffer">The buffer holding the resubstituted term; modified in place.</param>
    private void RemoveParticleDenotion( StringBuilder buffer )
    {
        if ( buffer.Length <= 4 )
        {
            return;
        }

        for ( int c = 0; c < buffer.Length - 3; c++ )
        {
            if ( buffer[c] == 'g' && buffer[c + 1] == 'e' &&
                buffer[c + 2] == 'g' && buffer[c + 3] == 'e' )
            {
                buffer.Remove( c, 2 );
                return;
            }
        }
    }

    /// <summary>
    /// Do some substitutions for the term to reduce overstemming:
    ///
    /// - Substitute Umlauts with their corresponding vowel: äöü -> aou,
    ///   "ß" is substituted by "ss"
    /// - Substitute a second char of a pair of equal characters with
    ///   an asterisk: ?? -> ?*
    /// - Substitute some common character combinations with a token:
    ///   sch/ch/ei/ie/ig/st -> $/§/%/&amp;/#/!
    /// </summary>
    /// <param name="buffer">The buffer holding the lower-cased term; modified in place.</param>
    private void Substitute( StringBuilder buffer )
    {
        substCount = 0;
        for ( int c = 0; c < buffer.Length; c++ )
        {
            // Replace the second char of a pair of equal characters with an asterisk.
            if ( c > 0 && buffer[c] == buffer[c - 1] )
            {
                buffer[c] = '*';
            }
            // Substitute Umlauts.
            else if ( buffer[c] == '\u00e4' )   // a-umlaut
            {
                buffer[c] = 'a';
            }
            else if ( buffer[c] == '\u00f6' )   // o-umlaut
            {
                buffer[c] = 'o';
            }
            else if ( buffer[c] == '\u00fc' )   // u-umlaut
            {
                buffer[c] = 'u';
            }

            // Sharp s becomes "ss". This is handled at every position, including
            // the last one; the inserted 's' is turned into '*' by the pair rule
            // on the next iteration.
            if ( buffer[c] == '\u00df' )
            {
                buffer[c] = 's';
                buffer.Insert( c + 1, 's' );
                substCount++;
            }
            // The combinations below need at least one character to the right
            // of the current one.
            else if ( c < buffer.Length - 1 )
            {
                // Masking several common character combinations with a token.
                if ( ( c < buffer.Length - 2 ) && buffer[c] == 's' &&
                    buffer[c + 1] == 'c' && buffer[c + 2] == 'h' )
                {
                    buffer[c] = '$';
                    buffer.Remove( c + 1, 2 );
                    // Accumulate (the former "=+ 2" overwrote the counter).
                    substCount += 2;
                }
                else if ( buffer[c] == 'c' && buffer[c + 1] == 'h' )
                {
                    buffer[c] = '\u00a7';       // section sign
                    buffer.Remove( c + 1, 1 );
                    substCount++;
                }
                else if ( buffer[c] == 'e' && buffer[c + 1] == 'i' )
                {
                    buffer[c] = '%';
                    buffer.Remove( c + 1, 1 );
                    substCount++;
                }
                else if ( buffer[c] == 'i' && buffer[c + 1] == 'e' )
                {
                    buffer[c] = '&';
                    buffer.Remove( c + 1, 1 );
                    substCount++;
                }
                else if ( buffer[c] == 'i' && buffer[c + 1] == 'g' )
                {
                    buffer[c] = '#';
                    buffer.Remove( c + 1, 1 );
                    substCount++;
                }
                else if ( buffer[c] == 's' && buffer[c + 1] == 't' )
                {
                    buffer[c] = '!';
                    buffer.Remove( c + 1, 1 );
                    substCount++;
                }
            }
        }
    }

    /// <summary>
    /// Undoes the changes made by Substitute(), i.e. the masked character pairs
    /// and character combinations. Umlauts remain as their corresponding vowel,
    /// and "ß" remains as "ss".
    /// </summary>
    /// <param name="buffer">The buffer holding the stemmed term; modified in place.</param>
    private void Resubstitute( StringBuilder buffer )
    {
        for ( int c = 0; c < buffer.Length; c++ )
        {
            switch ( buffer[c] )
            {
                case '*':
                    // Restore the doubled character from its predecessor.
                    if ( c > 0 )
                    {
                        buffer[c] = buffer[c - 1];
                    }
                    break;
                case '$':
                    buffer[c] = 's';
                    buffer.Insert( c + 1, "ch" );
                    break;
                case '\u00a7':                  // section sign
                    buffer[c] = 'c';
                    buffer.Insert( c + 1, 'h' );
                    break;
                case '%':
                    buffer[c] = 'e';
                    buffer.Insert( c + 1, 'i' );
                    break;
                case '&':
                    buffer[c] = 'i';
                    buffer.Insert( c + 1, 'e' );
                    break;
                case '#':
                    buffer[c] = 'i';
                    buffer.Insert( c + 1, 'g' );
                    break;
                case '!':
                    buffer[c] = 's';
                    buffer.Insert( c + 1, 't' );
                    break;
                default:
                    break;
            }
        }
    }
}