beagled/Lucene.Net/Analysis/DE/GermanStemmer.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 using System;
  17 namespace Lucene.Net.Analysis.DE
  18 {
  19         /// <summary> A stemmer for German words. The algorithm is based on the report
  20         /// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
  21         /// Caumanns (joerg.caumanns@isst.fhg.de).
  22         ///
  23         /// </summary>
  24         /// <author>     Gerhard Schwarz
  25         /// </author>
  26         /// <version>    $Id: GermanStemmer.cs,v 1.2 2005/01/17 19:54:27 joeshaw Exp $
  27         /// </version>
  28         public class GermanStemmer
  29         {
  30                 /// <summary> Buffer for the terms while stemming them.</summary>
  31                 private System.Text.StringBuilder sb = new System.Text.StringBuilder();
  32
  33                 /// <summary> Amount of characters that are removed with <tt>substitute()</tt> while stemming.</summary>
  34                 private int substCount = 0;
  35
  36                 /// <summary> Stemms the given term to an unique <tt>discriminator</tt>.
  37                 ///
  38                 /// </summary>
  39                 /// <param name="term"> The term that should be stemmed.
  40                 /// </param>
  41                 /// <returns>      Discriminator for <tt>term</tt>
  42                 /// </returns>
  43                 protected internal virtual System.String Stem(System.String term)
  44                 {
  45                         // Use lowercase for medium stemming.
  46                         term = term.ToLower();
  47                         if (!IsStemmable(term))
  48                                 return term;
  49                         // Reset the StringBuffer.
  50                         sb.Remove(0, sb.Length - 0);
  51                         sb.Insert(0, term);
  52                         // Stemming starts here...
  53                         Substitute(sb);
  54                         Strip(sb);
  55                         Optimize(sb);
  56                         Resubstitute(sb);
  57                         RemoveParticleDenotion(sb);
  58                         return sb.ToString();
  59                 }
  60
  61                 /// <summary> Checks if a term could be stemmed.
  62                 ///
  63                 /// </summary>
  64                 /// <returns>  true if, and only if, the given term consists in letters.
  65                 /// </returns>
  66                 private bool IsStemmable(System.String term)
  67                 {
  68                         for (int c = 0; c < term.Length; c++)
  69                         {
  70                                 if (!System.Char.IsLetter(term[c]))
  71                                         return false;
  72                         }
  73                         return true;
  74                 }
  75
  76                 /// <summary> suffix stripping (stemming) on the current term. The stripping is reduced
  77                 /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
  78                 /// from which all regular suffixes are build of. The simplification causes
  79                 /// some overstemming, and way more irregular stems, but still provides unique.
  80                 /// discriminators in the most of those cases.
  81                 /// The algorithm is context free, except of the length restrictions.
  82                 /// </summary>
  83                 private void  Strip(System.Text.StringBuilder buffer)
  84                 {
  85                         bool doMore = true;
  86                         while (doMore && buffer.Length > 3)
  87                         {
  88                                 if ((buffer.Length + substCount > 5) && buffer.ToString(buffer.Length - 2, buffer.Length).Equals("nd"))
  89                                 {
  90                                         buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
  91                                 }
  92                                 else if ((buffer.Length + substCount > 4) && buffer.ToString(buffer.Length - 2, buffer.Length).Equals("em"))
  93                                 {
  94                                         buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
  95                                 }
  96                                 else if ((buffer.Length + substCount > 4) && buffer.ToString(buffer.Length - 2, buffer.Length).Equals("er"))
  97                                 {
  98                                         buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
  99                                 }
 100                                 else if (buffer[buffer.Length - 1] == 'e')
 101                                 {
 102                                         buffer.Remove(buffer.Length - 1, 1);
 103                                 }
 104                                 else if (buffer[buffer.Length - 1] == 's')
 105                                 {
 106                                         buffer.Remove(buffer.Length - 1, 1);
 107                                 }
 108                                 else if (buffer[buffer.Length - 1] == 'n')
 109                                 {
 110                                         buffer.Remove(buffer.Length - 1, 1);
 111                                 }
 112                                 // "t" occurs only as suffix of verbs.
 113                                 else if (buffer[buffer.Length - 1] == 't')
 114                                 {
 115                                         buffer.Remove(buffer.Length - 1, 1);
 116                                 }
 117                                 else
 118                                 {
 119                                         doMore = false;
 120                                 }
 121                         }
 122                 }
 123
 124                 /// <summary> Does some optimizations on the term. This optimisations are
 125                 /// contextual.
 126                 /// </summary>
 127                 private void  Optimize(System.Text.StringBuilder buffer)
 128                 {
 129                         // Additional step for female plurals of professions and inhabitants.
 130                         if (buffer.Length > 5 && buffer.ToString(buffer.Length - 5, buffer.Length).Equals("erin*"))
 131                         {
 132                                 buffer.Remove(buffer.Length - 1, 1);
 133                                 Strip(buffer);
 134                         }
 135                         // Additional step for irregular plural nouns like "Matrizen -> Matrix".
 136                         if (buffer[buffer.Length - 1] == ('z'))
 137                         {
 138                                 buffer[buffer.Length - 1] = 'x';
 139                         }
 140                 }
 141
 142                 /// <summary> Removes a particle denotion ("ge") from a term.</summary>
 143                 private void  RemoveParticleDenotion(System.Text.StringBuilder buffer)
 144                 {
 145                         if (buffer.Length > 4)
 146                         {
 147                                 for (int c = 0; c < buffer.Length - 3; c++)
 148                                 {
 149                                         if (buffer.ToString(c, c + 4).Equals("gege"))
 150                                         {
 151                                                 buffer.Remove(c, c + 2 - c);
 152                                                 return ;
 153                                         }
 154                                 }
 155                         }
 156                 }
 157
 158                 /// <summary> Do some substitutions for the term to reduce overstemming:
 159                 ///
 160                 /// - Substitute Umlauts with their corresponding vowel: äöü -> aou,
 161                 /// "ß" is substituted by "ss"
 162                 /// - Substitute a second char of a pair of equal characters with
 163                 /// an asterisk: ?? -> ?*
 164                 /// - Substitute some common character combinations with a token:
 165                 /// sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
 166                 /// </summary>
 167                 private void  Substitute(System.Text.StringBuilder buffer)
 168                 {
 169                         substCount = 0;
 170                         for (int c = 0; c < buffer.Length; c++)
 171                         {
 172                                 // Replace the second char of a pair of the equal characters with an asterisk
 173                                 if (c > 0 && buffer[c] == buffer[c - 1])
 174                                 {
 175                                         buffer[c] = '*';
 176                                 }
 177                                 // Substitute Umlauts.
 178                                 else if (buffer[c] == 'A') //// 'Ã¤')
 179                                 {
 180                                         //'ä' ) {
 181                                         buffer[c] = 'a';
 182                                 }
 183                                 else if (buffer[c] == 'A') //// 'Ã¶')
 184                                 {
 185                                         //'ö' ) {
 186                                         buffer[c] = 'o';
 187                                 }
 188                                 else if (buffer[c] == 'A') //// 'Ã¼')
 189                                 {
 190                                         // 'ü' ) {
 191                                         buffer[c] = 'u';
 192                                 }
 193                                 // Take care that at least one character is left left side from the current one
 194                                 if (c < buffer.Length - 1)
 195                                 {
 196                                         if (buffer[c] == 'A') //// 'ÃŸ')
 197                                         {
 198                                                 //'ß' ) {
 199                                                 buffer[c] = 's';
 200                                                 buffer.Insert(c + 1, 's');
 201                                                 substCount++;
 202                                         }
 203                                         // Masking several common character combinations with an token
 204                                         else if ((c < buffer.Length - 2) && buffer[c] == 's' && buffer[c + 1] == 'c' && buffer[c + 2] == 'h')
 205                                         {
 206                                                 buffer[c] = '$';
 207                                                 buffer.Remove(c + 1, c + 3 - (c + 1));
 208                                                 substCount = + 2;
 209                                         }
 210                                         else if (buffer[c] == 'c' && buffer[c + 1] == 'h')
 211                                         {
 212                                                 buffer[c] = 'A'; //// 'Â§';
 213                                                 buffer.Remove(c + 1, 1);
 214                                                 substCount++;
 215                                         }
 216                                         else if (buffer[c] == 'e' && buffer[c + 1] == 'i')
 217                                         {
 218                                                 buffer[c] = '%';
 219                                                 buffer.Remove(c + 1, 1);
 220                                                 substCount++;
 221                                         }
 222                                         else if (buffer[c] == 'i' && buffer[c + 1] == 'e')
 223                                         {
 224                                                 buffer[c] = '&';
 225                                                 buffer.Remove(c + 1, 1);
 226                                                 substCount++;
 227                                         }
 228                                         else if (buffer[c] == 'i' && buffer[c + 1] == 'g')
 229                                         {
 230                                                 buffer[c] = '#';
 231                                                 buffer.Remove(c + 1, 1);
 232                                                 substCount++;
 233                                         }
 234                                         else if (buffer[c] == 's' && buffer[c + 1] == 't')
 235                                         {
 236                                                 buffer[c] = '!';
 237                                                 buffer.Remove(c + 1, 1);
 238                                                 substCount++;
 239                                         }
 240                                 }
 241                         }
 242                 }
 243
 244                 /// <summary> Undoes the changes made by substitute(). That are character pairs and
 245                 /// character combinations. Umlauts will remain as their corresponding vowel,
 246                 /// as "ß" remains as "ss".
 247                 /// </summary>
 248                 private void  Resubstitute(System.Text.StringBuilder buffer)
 249                 {
 250                         for (int c = 0; c < buffer.Length; c++)
 251                         {
 252                                 if (buffer[c] == '*')
 253                                 {
 254                                         char x = buffer[c - 1];
 255                                         buffer[c] = x;
 256                                 }
 257                                 else if (buffer[c] == '$')
 258                                 {
 259                                         buffer[c] = 's';
 260                                         buffer.Insert(c + 1, new char[]{'c', 'h'}, 0, 2);
 261                                 }
 262                                 else if (buffer[c] == 'A') //// 'Â§')
 263                                 {
 264                                         // '§' ) {
 265                                         buffer[c] = 'c';
 266                                         buffer.Insert(c + 1, 'h');
 267                                 }
 268                                 else if (buffer[c] == '%')
 269                                 {
 270                                         buffer[c] = 'e';
 271                                         buffer.Insert(c + 1, 'i');
 272                                 }
 273                                 else if (buffer[c] == '&')
 274                                 {
 275                                         buffer[c] = 'i';
 276                                         buffer.Insert(c + 1, 'e');
 277                                 }
 278                                 else if (buffer[c] == '#')
 279                                 {
 280                                         buffer[c] = 'i';
 281                                         buffer.Insert(c + 1, 'g');
 282                                 }
 283                                 else if (buffer[c] == '!')
 284                                 {
 285                                         buffer[c] = 's';
 286                                         buffer.Insert(c + 1, 't');
 287                                 }
 288                         }
 289                 }
 290         }
 291 }