Lucene.Net/Analysis/DE/GermanStemmer.cs

   1 using System;
   2 using System.IO;
   3 using System.Text;
   4 using System.Collections;
   5
   6 namespace Lucene.Net.Analysis.De
   7 {
   8         /* ====================================================================
   9          * The Apache Software License, Version 1.1
  10          *
  11          * Copyright (c) 2001 The Apache Software Foundation.  All rights
  12          * reserved.
  13          *
  14          * Redistribution and use in source and binary forms, with or without
  15          * modification, are permitted provided that the following conditions
  16          * are met:
  17          *
  18          * 1. Redistributions of source code must retain the above copyright
  19          *    notice, this list of conditions and the following disclaimer.
  20          *
  21          * 2. Redistributions in binary form must reproduce the above copyright
  22          *    notice, this list of conditions and the following disclaimer in
  23          *    the documentation and/or other materials provided with the
  24          *    distribution.
  25          *
  26          * 3. The end-user documentation included with the redistribution,
  27          *    if any, must include the following acknowledgment:
  28          *       "This product includes software developed by the
  29          *        Apache Software Foundation (http://www.apache.org/)."
  30          *    Alternately, this acknowledgment may appear in the software itself,
  31          *    if and wherever such third-party acknowledgments normally appear.
  32          *
  33          * 4. The names "Apache" and "Apache Software Foundation" and
  34          *    "Apache Lucene" must not be used to endorse or promote products
  35          *    derived from this software without prior written permission. For
  36          *    written permission, please contact apache@apache.org.
  37          *
  38          * 5. Products derived from this software may not be called "Apache",
  39          *    "Apache Lucene", nor may "Apache" appear in their name, without
  40          *    prior written permission of the Apache Software Foundation.
  41          *
  42          * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  43          * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  44          * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  45          * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  46          * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  47          * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  48          * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  49          * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  50          * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  51          * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  52          * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  53          * SUCH DAMAGE.
  54          * ====================================================================
  55          *
  56          * This software consists of voluntary contributions made by many
  57          * individuals on behalf of the Apache Software Foundation.  For more
  58          * information on the Apache Software Foundation, please see
  59          * <http://www.apache.org/>.
  60          */
  61
  62         /// <summary>
  63         /// A stemmer for German words. The algorithm is based on the report
  64         /// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
  65         /// Caumanns (joerg.caumanns@isst.fhg.de).
  66         /// </summary>
  67         /// <author>Gerhard Schwarz</author>
  68         /// <version>$Id: GermanStemmer.cs,v 1.1.1.1 2004/04/29 22:53:51 trow Exp $</version>
  69         public class GermanStemmer
  70         {
  71                 /// <summary>
  72                 /// Buffer for the terms while stemming them.
  73                 /// </summary>
  74                 private StringBuilder sb = new StringBuilder();
  75
  76                 /// <summary>
  77                 /// Indicates if a term is handled as a noun.
  78                 /// </summary>
  79                 private bool uppercase = false;
  80
  81                 /// <summary>
  82                 /// Amount of characters that are removed with <tt>Substitute()</tt> while stemming.
  83                 /// </summary>
  84                 private int substCount = 0;
  85
  86                 /// <summary>
  87                 /// Stemms the given term to an unique <tt>discriminator</tt>.
  88                 /// </summary>
  89                 /// <param name="term">The term that should be stemmed.</param>
  90                 /// <returns>Discriminator for <tt>term</tt></returns>
  91                 internal String Stem( String term )
  92                 {
  93                         // Mark a possible noun.
  94                         uppercase = Char.IsUpper( term[0] );
  95                         // Use lowercase for medium stemming.
  96                         term = term.ToLower();
  97                         if ( !IsStemmable( term ) )
  98                                 return term;
  99                         // Reset the StringBuilder.
 100                         sb.Remove(0, sb.Length);
 101                         sb.Insert(0, term);
 102                         // Stemming starts here...
 103                         Substitute( sb );
 104                         Strip( sb );
 105                         Optimize( sb );
 106                         Resubstitute( sb );
 107                         RemoveParticleDenotion( sb );
 108                         return sb.ToString();
 109                 }
 110
 111                 /// <summary>
 112                 /// Checks if a term could be stemmed.
 113                 /// </summary>
 114                 /// <param name="term"></param>
 115                 /// <returns>true if, and only if, the given term consists in letters.</returns>
 116                 private bool IsStemmable( String term )
 117                 {
 118                         for ( int c = 0; c < term.Length; c++ )
 119                         {
 120                                 if ( !Char.IsLetter(term[c])) return false;
 121                         }
 122                         return true;
 123                 }
 124
 125                 /// <summary>
 126                 /// Suffix stripping (stemming) on the current term. The stripping is reduced
 127                 /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
 128                 /// from which all regular suffixes are build of. The simplification causes
 129                 /// some overstemming, and way more irregular stems, but still provides unique.
 130                 /// discriminators in the most of those cases.
 131                 /// The algorithm is context free, except of the length restrictions.
 132                 /// </summary>
 133                 /// <param name="buffer"></param>
 134                 private void Strip( StringBuilder buffer )
 135                 {
 136                         bool doMore = true;
 137                         while ( doMore && buffer.Length > 3 )
 138                         {
 139                                 if ( ( buffer.Length + substCount > 5 ) &&
 140                                         buffer.ToString().Substring(buffer.Length - 2, 2).Equals( "nd" ) )
 141                                 {
 142                                         buffer.Remove( buffer.Length - 2, buffer.Length );
 143                                 }
 144                                 else if ( ( buffer.Length + substCount > 4 ) &&
 145                                         buffer.ToString().Substring( buffer.Length - 2, 2).Equals( "em" ) )
 146                                 {
 147                                         buffer.Remove( buffer.Length - 2, buffer.Length );
 148                                 }
 149                                 else if ( ( buffer.Length + substCount > 4 ) &&
 150                                         buffer.ToString().Substring( buffer.Length - 2, 2).Equals( "er" ) )
 151                                 {
 152                                         buffer.Remove( buffer.Length - 2, buffer.Length );
 153                                 }
 154                                 else if ( buffer[buffer.Length - 1] == 'e' )
 155                                 {
 156                                         buffer.Remove(buffer.Length - 1, 1);
 157                                 }
 158                                 else if ( buffer[buffer.Length - 1] == 's' )
 159                                 {
 160                                         buffer.Remove(buffer.Length - 1, 1);
 161                                 }
 162                                 else if ( buffer[buffer.Length - 1] == 'n' )
 163                                 {
 164                                         buffer.Remove(buffer.Length - 1, 1);
 165                                 }
 166                                         // "t" occurs only as suffix of verbs.
 167                                 else if ( buffer[buffer.Length - 1] == 't' && !uppercase )
 168                                 {
 169                                         buffer.Remove(buffer.Length - 1, 1);
 170                                 }
 171                                 else
 172                                 {
 173                                         doMore = false;
 174                                 }
 175                         }
 176                 }
 177
 178                 /// <summary>
 179                 /// Does some optimizations on the term. This optimisations are contextual.
 180                 /// </summary>
 181                 /// <param name="buffer"></param>
 182                 private void Optimize( StringBuilder buffer )
 183                 {
 184                         // Additional step for female plurals of professions and inhabitants.
 185                         if ( buffer.Length > 5 && buffer.ToString().Substring(buffer.Length - 5, 5).Equals( "erin*" ))
 186                         {
 187                                 buffer.Remove(buffer.Length - 1, 1);
 188                                 Strip(buffer);
 189                         }
 190                         // Additional step for irregular plural nouns like "Matrizen -> Matrix".
 191                         if ( buffer[buffer.Length - 1] == ('z') )
 192                         {
 193                                 buffer[buffer.Length - 1] = 'x';
 194                         }
 195                 }
 196
 197                 /// <summary>
 198                 /// Removes a particle denotion ("ge") from a term.
 199                 /// </summary>
 200                 /// <param name="buffer"></param>
 201                 private void RemoveParticleDenotion( StringBuilder buffer )
 202                 {
 203                         if ( buffer.Length > 4 )
 204                         {
 205                                 for ( int c = 0; c < buffer.Length - 3; c++ )
 206                                 {
 207                                         if ( buffer.ToString().Substring( c, 4 ).Equals( "gege" ) )
 208                                         {
 209                                                 buffer.Remove(c, 2);
 210                                                 return;
 211                                         }
 212                                 }
 213                         }
 214                 }
 215
 216                 /// <summary>
 217                 /// Do some substitutions for the term to reduce overstemming:
 218                 ///
 219                 /// - Substitute Umlauts with their corresponding vowel: äöü -> aou,
 220                 ///   "ß" is substituted by "ss"
 221                 /// - Substitute a second char of a pair of equal characters with
 222                 /// an asterisk: ?? -> ?*
 223                 /// - Substitute some common character combinations with a token:
 224                 ///   sch/ch/ei/ie/ig/st -> $/§/%/&amp;/#/!
 225                 /// </summary>
 226                 /// <param name="buffer"></param>
 227                 private void Substitute( StringBuilder buffer )
 228                 {
 229                         substCount = 0;
 230                         for ( int c = 0; c < buffer.Length; c++ )
 231                         {
 232                                 // Replace the second char of a pair of the equal characters with an asterisk
 233                                 if ( c > 0 && buffer[c] == buffer[c - 1])
 234                                 {
 235                                         buffer[c] = '*';
 236                                 }
 237                                         // Substitute Umlauts.
 238                                 else if ( buffer[c] == 'ä' )
 239                                 {
 240                                         buffer[c] = 'a';
 241                                 }
 242                                 else if ( buffer[c] == 'ö' )
 243                                 {
 244                                         buffer[c] = 'o';
 245                                 }
 246                                 else if ( buffer[c] == 'ü' )
 247                                 {
 248                                         buffer[c] = 'u';
 249                                 }
 250                                 // Take care that at least one character is left left side from the current one
 251                                 if ( c < buffer.Length - 1 )
 252                                 {
 253                                         if ( buffer[c] == 'ß' )
 254                                         {
 255                                                 buffer[c] = 's';
 256                                                 buffer.Insert(c + 1, 's');
 257                                                 substCount++;
 258                                         }
 259                                                 // Masking several common character combinations with an token
 260                                         else if ( ( c < buffer.Length - 2 ) && buffer[c] == 's' &&
 261                                                 buffer[c + 1] == 'c' && buffer[c + 2] == 'h' )
 262                                         {
 263                                                 buffer[c] = '$';
 264                                                 buffer.Remove(c + 1, 2);
 265                                                 substCount =+ 2;
 266                                         }
 267                                         else if ( buffer[c] == 'c' && buffer[c + 1] == 'h' )
 268                                         {
 269                                                 buffer[c] = '§';
 270                                                 buffer.Remove(c + 1, 1);
 271                                                 substCount++;
 272                                         }
 273                                         else if ( buffer[c] == 'e' && buffer[c + 1] == 'i' )
 274                                         {
 275                                                 buffer[c] = '%';
 276                                                 buffer.Remove(c + 1, 1);
 277                                                 substCount++;
 278                                         }
 279                                         else if ( buffer[c] == 'i' && buffer[c + 1] == 'e' )
 280                                         {
 281                                                 buffer[c] = '&';
 282                                                 buffer.Remove(c + 1, 1);
 283                                                 substCount++;
 284                                         }
 285                                         else if ( buffer[c] == 'i' && buffer[c + 1] == 'g' )
 286                                         {
 287                                                 buffer[c] = '#';
 288                                                 buffer.Remove(c + 1, 1);
 289                                                 substCount++;
 290                                         }
 291                                         else if ( buffer[c] == 's' && buffer[c + 1] == 't' )
 292                                         {
 293                                                 buffer[c] = '!';
 294                                                 buffer.Remove(c + 1, 1);
 295                                                 substCount++;
 296                                         }
 297                                 }
 298                         }
 299                 }
 300
 301                 /// <summary>
 302                 /// Undoes the changes made by Substitute(). That are character pairs and
 303                 /// character combinations. Umlauts will remain as their corresponding vowel,
 304                 /// as "ß" remains as "ss".
 305                 /// </summary>
 306                 /// <param name="buffer"></param>
 307                 private void Resubstitute( StringBuilder buffer )
 308                 {
 309                         for ( int c = 0; c < buffer.Length; c++ )
 310                         {
 311                                 if ( buffer[c] == '*' )
 312                                 {
 313                                         char x = buffer[c - 1];
 314                                         buffer[c] = x;
 315                                 }
 316                                 else if ( buffer[c] == '$' )
 317                                 {
 318                                         buffer[c] = 's';
 319                                         buffer.Insert( c + 1, new char[]{'c', 'h'}, 0, 2);
 320                                 }
 321                                 else if ( buffer[c] == '§' )
 322                                 {
 323                                         buffer[c] = 'c';
 324                                         buffer.Insert( c + 1, 'h' );
 325                                 }
 326                                 else if ( buffer[c] == '%' )
 327                                 {
 328                                         buffer[c] = 'e';
 329                                         buffer.Insert( c + 1, 'i' );
 330                                 }
 331                                 else if ( buffer[c] == '&' )
 332                                 {
 333                                         buffer[c] = 'i';
 334                                         buffer.Insert( c + 1, 'e' );
 335                                 }
 336                                 else if ( buffer[c] == '#' )
 337                                 {
 338                                         buffer[c] = 'i';
 339                                         buffer.Insert( c + 1, 'g' );
 340                                 }
 341                                 else if ( buffer[c] == '!' )
 342                                 {
 343                                         buffer[c] = 's';
 344                                         buffer.Insert( c + 1, 't' );
 345                                 }
 346                         }
 347                 }
 348         }
 349 }