/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
namespace Lucene.Net.Analysis.DE
{
    /// <summary>
    /// A stemmer for German words. The algorithm is based on the report
    /// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
    /// Caumanns (joerg.caumanns@isst.fhg.de).
    /// </summary>
    /// <remarks>
    /// Author: Gerhard Schwarz.
    /// Version: $Id: GermanStemmer.cs,v 1.2 2005/01/17 19:54:27 joeshaw Exp $
    /// <para>
    /// <see cref="Stem"/> mutates the instance fields <c>sb</c> and
    /// <c>substCount</c> without any synchronization, so a single instance
    /// must not be used by several threads at the same time.
    /// </para>
    /// </remarks>
    public class GermanStemmer
    {
        /// <summary>Buffer for the terms while stemming them.</summary>
        private System.Text.StringBuilder sb = new System.Text.StringBuilder();

        /// <summary>
        /// Number of characters that were removed by <c>Substitute()</c> while
        /// stemming; <c>Strip()</c> adds it to the buffer length when applying
        /// its length restrictions.
        /// </summary>
        private int substCount = 0;

        /// <summary>Stems the given term to a unique <c>discriminator</c>.</summary>
        /// <param name="term">The term that should be stemmed.</param>
        /// <returns>
        /// Discriminator for <c>term</c>. A term that contains anything other
        /// than letters is returned lowercased but otherwise untouched.
        /// </returns>
        protected internal virtual System.String Stem(System.String term)
        {
            // Use lowercase for medium stemming. The invariant culture keeps the
            // result independent of the current thread culture (e.g. Turkish 'I').
            term = term.ToLower(System.Globalization.CultureInfo.InvariantCulture);
            if (!IsStemmable(term))
            {
                return term;
            }

            // Reset the shared buffer and load the term into it.
            sb.Length = 0;
            sb.Append(term);

            // Stemming starts here...
            Substitute(sb);
            Strip(sb);
            Optimize(sb);
            Resubstitute(sb);
            RemoveParticleDenotion(sb);
            return sb.ToString();
        }

        /// <summary>Checks if a term could be stemmed.</summary>
        /// <param name="term">The (lowercased) term to inspect.</param>
        /// <returns>
        /// true if, and only if, the given term consists of letters only
        /// (an empty term therefore also yields true).
        /// </returns>
        private bool IsStemmable(System.String term)
        {
            foreach (char ch in term)
            {
                if (!System.Char.IsLetter(ch))
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Suffix stripping (stemming) on the current term. The stripping is
        /// reduced to the seven "base" suffixes "e", "s", "n", "t", "em", "er"
        /// and "nd", from which all regular suffixes are built. The
        /// simplification causes some overstemming, and way more irregular
        /// stems, but still provides unique discriminators in most of those
        /// cases. The algorithm is context free, except for the length
        /// restrictions.
        /// </summary>
        /// <param name="buffer">The buffer holding the substituted term; modified in place.</param>
        private void Strip(System.Text.StringBuilder buffer)
        {
            // Never strip below three characters.
            while (buffer.Length > 3)
            {
                // Length restrictions count the characters Substitute() removed.
                int effectiveLength = buffer.Length + substCount;
                char last = buffer[buffer.Length - 1];

                if (effectiveLength > 5 && EndsWith(buffer, "nd"))
                {
                    buffer.Length -= 2;
                }
                else if (effectiveLength > 4 && (EndsWith(buffer, "em") || EndsWith(buffer, "er")))
                {
                    buffer.Length -= 2;
                }
                // "t" occurs only as suffix of verbs.
                else if (last == 'e' || last == 's' || last == 'n' || last == 't')
                {
                    buffer.Length -= 1;
                }
                else
                {
                    // No base suffix left: done.
                    break;
                }
            }
        }

        /// <summary>
        /// Does some optimizations on the term: handles female plurals of
        /// professions and inhabitants, and maps a trailing "z" to "x" for
        /// irregular plural nouns.
        /// </summary>
        /// <param name="buffer">The buffer holding the stripped term; modified in place.</param>
        private void Optimize(System.Text.StringBuilder buffer)
        {
            // Additional step for female plurals of professions and inhabitants.
            // ("erin*" is "erinn" after Substitute() masked the doubled 'n'.)
            if (buffer.Length > 5 && EndsWith(buffer, "erin*"))
            {
                buffer.Length -= 1;
                Strip(buffer);
            }

            // Additional step for irregular plural nouns like "Matrizen -> Matrix".
            // The length check protects against an empty buffer (empty input term).
            if (buffer.Length > 0 && buffer[buffer.Length - 1] == 'z')
            {
                buffer[buffer.Length - 1] = 'x';
            }
        }

        /// <summary>Removes a particle denotion ("ge") from a term.</summary>
        /// <param name="buffer">The buffer holding the resubstituted term; modified in place.</param>
        private void RemoveParticleDenotion(System.Text.StringBuilder buffer)
        {
            if (buffer.Length <= 4)
            {
                return;
            }

            for (int c = 0; c < buffer.Length - 3; c++)
            {
                if (RegionMatches(buffer, c, "gege"))
                {
                    // Drop the first "ge" of the first "gege" and stop.
                    buffer.Remove(c, 2);
                    return;
                }
            }
        }

        /// <summary>
        /// Do some substitutions for the term to reduce overstemming:
        /// <para>- Substitute Umlauts with their corresponding vowel: äöü -> aou,
        /// "ß" is substituted by "ss"</para>
        /// <para>- Substitute a second char of a pair of equal characters with
        /// an asterisk: ?? -> ?*</para>
        /// <para>- Substitute some common character combinations with a token:
        /// sch/ch/ei/ie/ig/st -> $/§/%/&amp;/#/!</para>
        /// </summary>
        /// <param name="buffer">The buffer holding the lowercased term; modified in place.</param>
        private void Substitute(System.Text.StringBuilder buffer)
        {
            substCount = 0;
            for (int c = 0; c < buffer.Length; c++)
            {
                char current = buffer[c];

                // Replace the second char of a pair of equal characters with an asterisk.
                if (c > 0 && current == buffer[c - 1])
                {
                    buffer[c] = '*';
                }
                // Substitute Umlauts.
                else if (current == '\u00E4') // 'ä'
                {
                    buffer[c] = 'a';
                }
                else if (current == '\u00F6') // 'ö'
                {
                    buffer[c] = 'o';
                }
                else if (current == '\u00FC') // 'ü'
                {
                    buffer[c] = 'u';
                }
                // 'ß' -> "ss"; handled outside the length guard below so that a
                // 'ß' at the very end of a word is replaced as well.
                else if (current == '\u00DF') // 'ß'
                {
                    buffer[c] = 's';
                    buffer.Insert(c + 1, 's');
                    substCount++;
                }

                // The combinations below need at least one character to the
                // right of the current one.
                if (c < buffer.Length - 1)
                {
                    char first = buffer[c];
                    char second = buffer[c + 1];

                    // Masking several common character combinations with a token.
                    if (c < buffer.Length - 2 && first == 's' && second == 'c' && buffer[c + 2] == 'h')
                    {
                        buffer[c] = '$';
                        buffer.Remove(c + 1, 2);
                        substCount += 2;
                    }
                    else if (first == 'c' && second == 'h')
                    {
                        MaskPair(buffer, c, '\u00A7'); // '§'
                    }
                    else if (first == 'e' && second == 'i')
                    {
                        MaskPair(buffer, c, '%');
                    }
                    else if (first == 'i' && second == 'e')
                    {
                        MaskPair(buffer, c, '&');
                    }
                    else if (first == 'i' && second == 'g')
                    {
                        MaskPair(buffer, c, '#');
                    }
                    else if (first == 's' && second == 't')
                    {
                        MaskPair(buffer, c, '!');
                    }
                }
            }
        }

        /// <summary>
        /// Undoes the changes made by <c>Substitute()</c>, i.e. character pairs
        /// and character combinations. Umlauts will remain as their
        /// corresponding vowel, as "ß" remains as "ss".
        /// </summary>
        /// <param name="buffer">The buffer holding the stemmed term; modified in place.</param>
        private void Resubstitute(System.Text.StringBuilder buffer)
        {
            for (int c = 0; c < buffer.Length; c++)
            {
                switch (buffer[c])
                {
                    case '*':
                        // Restore the second character of a doubled pair.
                        buffer[c] = buffer[c - 1];
                        break;
                    case '$':
                        buffer[c] = 's';
                        buffer.Insert(c + 1, "ch");
                        break;
                    case '\u00A7': // '§'
                        buffer[c] = 'c';
                        buffer.Insert(c + 1, 'h');
                        break;
                    case '%':
                        buffer[c] = 'e';
                        buffer.Insert(c + 1, 'i');
                        break;
                    case '&':
                        buffer[c] = 'i';
                        buffer.Insert(c + 1, 'e');
                        break;
                    case '#':
                        buffer[c] = 'i';
                        buffer.Insert(c + 1, 'g');
                        break;
                    case '!':
                        buffer[c] = 's';
                        buffer.Insert(c + 1, 't');
                        break;
                }
            }
        }

        /// <summary>
        /// Replaces the two-character combination starting at <paramref name="index"/>
        /// with a single token character and counts the removed character.
        /// </summary>
        private void MaskPair(System.Text.StringBuilder buffer, int index, char token)
        {
            buffer[index] = token;
            buffer.Remove(index + 1, 1);
            substCount++;
        }

        /// <summary>Tells whether <paramref name="buffer"/> ends with <paramref name="suffix"/>.</summary>
        private static bool EndsWith(System.Text.StringBuilder buffer, string suffix)
        {
            return RegionMatches(buffer, buffer.Length - suffix.Length, suffix);
        }

        /// <summary>
        /// Tells whether the characters of <paramref name="buffer"/> starting at
        /// <paramref name="offset"/> equal <paramref name="text"/>; returns false
        /// (instead of throwing) when the region does not fit into the buffer.
        /// </summary>
        private static bool RegionMatches(System.Text.StringBuilder buffer, int offset, string text)
        {
            if (offset < 0 || offset + text.Length > buffer.Length)
            {
                return false;
            }
            for (int i = 0; i < text.Length; i++)
            {
                if (buffer[offset + i] != text[i])
                {
                    return false;
                }
            }
            return true;
        }
    }
}