4 using System
.Collections
;
6 namespace Lucene
.Net
.Analysis
.De
8 /* ====================================================================
9 * The Apache Software License, Version 1.1
11 * Copyright (c) 2001 The Apache Software Foundation. All rights
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in
23 * the documentation and/or other materials provided with the
26 * 3. The end-user documentation included with the redistribution,
27 * if any, must include the following acknowledgment:
28 * "This product includes software developed by the
29 * Apache Software Foundation (http://www.apache.org/)."
30 * Alternately, this acknowledgment may appear in the software itself,
31 * if and wherever such third-party acknowledgments normally appear.
33 * 4. The names "Apache" and "Apache Software Foundation" and
34 * "Apache Lucene" must not be used to endorse or promote products
35 * derived from this software without prior written permission. For
36 * written permission, please contact apache@apache.org.
38 * 5. Products derived from this software may not be called "Apache",
39 * "Apache Lucene", nor may "Apache" appear in their name, without
40 * prior written permission of the Apache Software Foundation.
42 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
43 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
44 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
46 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
49 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
50 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
51 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
52 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * ====================================================================
56 * This software consists of voluntary contributions made by many
57 * individuals on behalf of the Apache Software Foundation. For more
58 * information on the Apache Software Foundation, please see
59 * <http://www.apache.org/>.
63 /// A stemmer for German words. The algorithm is based on the report
64 /// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
65 /// Caumanns (joerg.caumanns@isst.fhg.de).
67 /// <author>Gerhard Schwarz</author>
68 /// <version>$Id: GermanStemmer.cs,v 1.1.1.1 2004/04/29 22:53:51 trow Exp $</version>
69 public class GermanStemmer
/// <summary>
/// Scratch buffer that holds the term while it is being stemmed.
/// </summary>
private StringBuilder sb = new StringBuilder();

/// <summary>
/// Tells whether the current term is handled as a noun.
/// </summary>
private bool uppercase;

/// <summary>
/// Amount of characters that are removed with <tt>Substitute()</tt> while stemming.
/// </summary>
private int substCount;
/// <summary>
/// Stems the given term to a unique <tt>discriminator</tt>.
/// </summary>
/// <param name="term">The term that should be stemmed.</param>
/// <returns>
/// Discriminator for <tt>term</tt>. A term that is empty or contains a
/// non-letter character is returned without stemming (lowercased in the
/// latter case).
/// </returns>
/// <exception cref="ArgumentNullException"><tt>term</tt> is null.</exception>
internal String Stem( String term )
{
    if ( term == null )
    {
        throw new ArgumentNullException( "term" );
    }
    // An empty term has no first character to inspect and nothing to stem.
    if ( term.Length == 0 )
    {
        return term;
    }

    // Mark a possible noun.
    uppercase = Char.IsUpper( term[0] );

    // Use lowercase for medium stemming. The invariant culture keeps the result
    // independent of the thread culture (e.g. Turkish maps 'I' to a dotless 'i').
    term = term.ToLower( System.Globalization.CultureInfo.InvariantCulture );
    if ( !IsStemmable( term ) )
    {
        return term;
    }

    // Reset the StringBuilder and load the term into it.
    sb.Length = 0;
    sb.Append( term );

    // Stemming starts here. Order matters: Substitute sets substCount, which
    // Strip reads; Optimize looks at the '*' token, so it has to run before
    // Resubstitute expands the tokens again.
    Substitute( sb );
    Strip( sb );
    Optimize( sb );
    Resubstitute( sb );
    RemoveParticleDenotion( sb );
    return sb.ToString();
}
/// <summary>
/// Tells whether a term is eligible for stemming.
/// </summary>
/// <param name="term">The term to inspect.</param>
/// <returns>false as soon as a non-letter character is found, true otherwise.</returns>
private bool IsStemmable( String term )
{
    foreach ( char symbol in term )
    {
        if ( Char.IsLetter( symbol ) )
        {
            continue;
        }
        return false;
    }
    return true;
}
/// <summary>
/// Suffix stripping (stemming) on the current term. The stripping is reduced
/// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
/// from which all regular suffixes are built. The simplification causes
/// some overstemming, and way more irregular stems, but still provides unique
/// discriminators in most of those cases.
/// The algorithm is context free, except for the length restrictions.
/// </summary>
/// <param name="buffer">The term buffer; matching suffixes are removed in place.</param>
private void Strip( StringBuilder buffer )
{
    // Keep stripping until no rule matches or only three characters are left.
    while ( buffer.Length > 3 )
    {
        int length = buffer.Length;
        char last = buffer[length - 1];
        char secondLast = buffer[length - 2];
        // The two-character rules are gated on the buffer length plus substCount.
        int weightedLength = length + substCount;

        if ( weightedLength > 5 && secondLast == 'n' && last == 'd' )
        {
            // StringBuilder.Remove takes (startIndex, length), not (start, end).
            buffer.Remove( length - 2, 2 );
        }
        else if ( weightedLength > 4 && secondLast == 'e' && ( last == 'm' || last == 'r' ) )
        {
            buffer.Remove( length - 2, 2 );
        }
        else if ( last == 'e' || last == 's' || last == 'n' )
        {
            buffer.Remove( length - 1, 1 );
        }
        // "t" occurs only as suffix of verbs.
        else if ( last == 't' && !uppercase )
        {
            buffer.Remove( length - 1, 1 );
        }
        else
        {
            // No rule applied: the term is fully stripped.
            break;
        }
    }
}
/// <summary>
/// Does some optimizations on the term. These optimizations are contextual.
/// </summary>
/// <param name="buffer">The term buffer, modified in place. An empty buffer is left untouched.</param>
private void Optimize( StringBuilder buffer )
{
    int length = buffer.Length;

    // Additional step for female plurals of professions and inhabitants:
    // the buffer ends in "erin*" ('*' is the token for a doubled character).
    if ( length > 5
        && buffer[length - 5] == 'e'
        && buffer[length - 4] == 'r'
        && buffer[length - 3] == 'i'
        && buffer[length - 2] == 'n'
        && buffer[length - 1] == '*' )
    {
        buffer.Remove( length - 1, 1 );
        // NOTE(review): re-running Strip after dropping the '*' follows the
        // upstream Java stemmer; confirm against the complete file.
        Strip( buffer );
    }

    // Additional step for irregular plural nouns like "Matrizen -> Matrix".
    // The length guard keeps an empty buffer from being indexed at -1.
    if ( buffer.Length > 0 && buffer[buffer.Length - 1] == 'z' )
    {
        buffer[buffer.Length - 1] = 'x';
    }
}
/// <summary>
/// Removes a particle denotion ("ge") from a term: the first "ge" of the
/// first "gege" sequence is dropped. Terms of four characters or fewer are
/// left untouched.
/// </summary>
/// <param name="buffer">The term buffer, modified in place.</param>
private void RemoveParticleDenotion( StringBuilder buffer )
{
    if ( buffer.Length <= 4 )
    {
        return;
    }

    // Compare characters in place instead of materialising the whole buffer
    // as a string on every step.
    for ( int c = 0; c < buffer.Length - 3; c++ )
    {
        if ( buffer[c] == 'g' && buffer[c + 1] == 'e'
            && buffer[c + 2] == 'g' && buffer[c + 3] == 'e' )
        {
            // StringBuilder.Remove takes (startIndex, length): drop exactly "ge".
            buffer.Remove( c, 2 );
            return;
        }
    }
}
217 /// Performs some substitutions on the term to reduce overstemming:
219 /// - Substitute umlauts with their corresponding vowel: äöü -> aou,
220 /// "ß" is substituted by "ss"
221 /// - Substitute the second char of a pair of equal characters with
222 /// an asterisk: ?? -> ?*
223 /// - Substitute some common character combinations with a token:
224 /// sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
226 /// <param name="buffer">The term buffer; characters are inserted and removed in place.</param>
227 private void Substitute( StringBuilder buffer
)
230 for ( int c
= 0; c
< buffer
.Length
; c
++ )
232 // Replace the second char of a pair of equal characters with an asterisk.
233 if ( c
> 0 && buffer
[c
] == buffer
[c
- 1])
237 // Substitute umlauts.
238 else if ( buffer
[c
] == 'ä' )
242 else if ( buffer
[c
] == 'ö' )
246 else if ( buffer
[c
] == 'ü' )
250 // Make sure at least one character is left to the right of the current one.
251 if ( c
< buffer
.Length
- 1 )
253 if ( buffer
[c
] == 'ß' )
256 buffer
.Insert(c
+ 1, 's');
259 // Mask several common character combinations with a token ("sch" needs two following characters).
260 else if ( ( c
< buffer
.Length
- 2 ) && buffer
[c
] == 's' &&
261 buffer
[c
+ 1] == 'c' && buffer
[c
+ 2] == 'h' )
264 buffer
.Remove(c
+ 1, 2);
267 else if ( buffer
[c
] == 'c' && buffer
[c
+ 1] == 'h' )
270 buffer
.Remove(c
+ 1, 1);
273 else if ( buffer
[c
] == 'e' && buffer
[c
+ 1] == 'i' )
276 buffer
.Remove(c
+ 1, 1);
279 else if ( buffer
[c
] == 'i' && buffer
[c
+ 1] == 'e' )
282 buffer
.Remove(c
+ 1, 1);
285 else if ( buffer
[c
] == 'i' && buffer
[c
+ 1] == 'g' )
288 buffer
.Remove(c
+ 1, 1);
291 else if ( buffer
[c
] == 's' && buffer
[c
+ 1] == 't' )
294 buffer
.Remove(c
+ 1, 1);
/// <summary>
/// Reverts the masking applied by Substitute(): doubled characters and the
/// character-combination tokens are expanded again. Umlauts stay replaced by
/// their corresponding vowel, and "ß" stays "ss".
/// </summary>
/// <param name="buffer">The term buffer, expanded in place.</param>
private void Resubstitute( StringBuilder buffer )
{
    int position = 0;
    // buffer.Length is re-read on every pass because the inserts grow the buffer.
    while ( position < buffer.Length )
    {
        switch ( buffer[position] )
        {
            case '*':
                // A doubled character: repeat the one before it.
                buffer[position] = buffer[position - 1];
                break;
            case '$':
                buffer[position] = 's';
                buffer.Insert( position + 1, "ch" );
                break;
            case '§':
                buffer[position] = 'c';
                buffer.Insert( position + 1, 'h' );
                break;
            case '%':
                buffer[position] = 'e';
                buffer.Insert( position + 1, 'i' );
                break;
            case '&':
                buffer[position] = 'i';
                buffer.Insert( position + 1, 'e' );
                break;
            case '#':
                buffer[position] = 'i';
                buffer.Insert( position + 1, 'g' );
                break;
            case '!':
                buffer[position] = 's';
                buffer.Insert( position + 1, 't' );
                break;
        }
        position++;
    }
}