Lucene.Net/Analysis/PorterStemmer.cs

   1 using System;
   2 using System.IO;
   3
   4 namespace Lucene.Net.Analysis
   5 {
   6         /* ====================================================================
   7          * The Apache Software License, Version 1.1
   8          *
   9          * Copyright (c) 2001 The Apache Software Foundation.  All rights
  10          * reserved.
  11          *
  12          * Redistribution and use _in source and binary forms, with or without
  13          * modification, are permitted provided that the following conditions
  14          * are met:
  15          *
  16          * 1. Redistributions of source code must retain the above copyright
  17          *    notice, this list of conditions and the following disclaimer.
  18          *
  19          * 2. Redistributions _in binary form must reproduce the above copyright
  20          *    notice, this list of conditions and the following disclaimer _in
  21          *    the documentation and/or other materials provided with the
  22          *    distribution.
  23          *
  24          * 3. The end-user documentation included with the redistribution,
  25          *    if any, must include the following acknowledgment:
  26          *       "This product includes software developed by the
  27          *        Apache Software Foundation (http://www.apache.org/)."
  28          *    Alternately, this acknowledgment may appear _in the software itself,
  29          *    if and wherever such third-party acknowledgments normally appear.
  30          *
  31          * 4. The names "Apache" and "Apache Software Foundation" and
  32          *    "Apache Lucene" must not be used to endorse or promote products
  33          *    derived from this software without prior written permission. For
  34          *    written permission, please contact apache@apache.org.
  35          *
  36          * 5. Products derived from this software may not be called "Apache",
  37          *    "Apache Lucene", nor may "Apache" appear _in their name, without
  38          *    prior written permission of the Apache Software Foundation.
  39          *
  40          * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  41          * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  42          * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  43          * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  44          * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  45          * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  46          * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  47          * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  48          * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  49          * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  50          * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  51          * SUCH DAMAGE.
  52          * ====================================================================
  53          *
  54          * This software consists of voluntary contributions made by many
  55          * individuals on behalf of the Apache Software Foundation.  For more
  56          * information on the Apache Software Foundation, please see
  57          * <http://www.apache.org/>.
  58          */
  59
  60         /*
  61            Porter stemmer in Java. The original paper is in
  62
  63                    Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  64                    no. 3, pp 130-137,
  65
  66            See also http://www.muscat.com/~martin/stem.html
  67
  68            Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
  69            Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
  70            is then out outside the bounds of b.
  71
  72            Similarly,
  73
  74            Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
  75            'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
  76            b[j] is then outside the bounds of b.
  77
  78            Release 3.
  79
  80            [ This version is derived from Release 3, modified by Brian Goetz to
  81                  optimize for fewer object creations.  ]
  82
  83         */
  84
  85         /// <summary>
  86         /// Stemmer, implementing the Porter Stemming Algorithm
  87         ///
  88         /// The Stemmer class transforms a word into its root form.  The input
  89         /// word can be provided a character at time (by calling Add()), or at once
  90         /// by calling one of the various Stem(something) methods.
  91         /// </summary>
  92         class PorterStemmer
  93         {
  94                 private char[] b;
  95                 private int i,    /* offset into b */
  96                         j, k, k0;
  97                 private bool dirty = false;
  98                 private const int INC = 50; /* unit of size whereby b is increased */
  99                 private const int EXTRA = 1;
 100
 101                 public PorterStemmer()
 102                 {
 103                         b = new char[INC];
 104                         i = 0;
 105                 }
 106
 107                 /// <summary>
 108                 /// Resets the stemmer so it can stem another word.  If you invoke
 109                 /// the stemmer by calling Add(char) and then Stem(), you must call Reset()
 110                 /// before starting another word.
 111                 /// </summary>
 112                 public void Reset() { i = 0; dirty = false; }
 113
 114                 /// <summary>
 115                 /// Add a character to the word being stemmed.  When you are finished
 116                 /// adding characters, you can call Stem(void) to process the word.
 117                 /// </summary>
 118                 /// <param name="ch"></param>
 119                 public void Add(char ch)
 120                 {
 121                         if (b.Length <= i + EXTRA)
 122                         {
 123                                 char[] new_b = new char[b.Length+INC];
 124                                 for (int c = 0; c < b.Length; c++)
 125                                         new_b[c] = b[c];
 126                                 b = new_b;
 127                         }
 128                         b[i++] = ch;
 129                 }
 130
 131                 /// <summary>
 132                 /// After a word has been stemmed, it can be retrieved by ToString(),
 133                 /// or a reference to the internal buffer can be retrieved by GetResultBuffer
 134                 /// and GetResultLength (which is generally more efficient.)
 135                 /// </summary>
 136                 /// <returns></returns>
 137                 public override String ToString() { return new String(b,0,i); }
 138
 139                 /// <summary>
 140                 /// Returns the length of the word resulting from the stemming process.
 141                 /// </summary>
 142                 /// <returns></returns>
 143                 virtual public int GetResultLength() { return i; }
 144
 145                 /// <summary>
 146                 /// Returns a reference to a character buffer containing the results of
 147                 /// the stemming process.  You also need to consult GetResultLength()
 148                 /// to determine the length of the result.
 149                 /// </summary>
 150                 /// <returns></returns>
 151                 public char[] GetResultBuffer() { return b; }
 152
 153                 /// <summary>
 154                 /// Ñons(i) is true &lt;=&gt; b[i] is a consonant.
 155                 /// </summary>
 156                 /// <param name="i"></param>
 157                 /// <returns></returns>
 158                 private bool Cons(int i)
 159                 {
 160                         switch (b[i])
 161                         {
 162                                 case 'a':
 163                                         return false;
 164                                 case 'e':
 165                                         return false;
 166                                 case 'i':
 167                                         return false;
 168                                 case 'o':
 169                                     return false;
 170                                 case 'u':
 171                                         return false;
 172                                 case 'y':
 173                                         return (i==k0) ? true : !Cons(i-1);
 174                                 default:
 175                                         return true;
 176                         }
 177                 }
 178
 179                 /// <summary>
 180                 /// M() measures the number of consonant sequences between k0 and j. if c is
 181                 /// a consonant sequence and v a vowel sequence, and &lt;..&gt; indicates arbitrary
 182                 /// presence,
 183                 ///             &lt;c&gt;&lt;v&gt;       gives 0
 184                 ///             &lt;c&gt;vc&lt;v&gt;     gives 1
 185                 ///             &lt;c&gt;vcvc&lt;v&gt;   gives 2
 186                 ///             &lt;c&gt;vcvcvc&lt;v&gt; gives 3
 187                 ///             ....
 188                 /// </summary>
 189                 /// <returns></returns>
 190                 private int M()
 191                 {
 192                         int n = 0;
 193                         int i = k0;
 194                         while(true)
 195                         {
 196                                 if (i > j)
 197                                         return n;
 198                                 if (! Cons(i))
 199                                         break;
 200                                 i++;
 201                         }
 202                         i++;
 203                         while(true)
 204                         {
 205                                 while(true)
 206                                 {
 207                                         if (i > j)
 208                                                 return n;
 209                                         if (Cons(i))
 210                                                 break;
 211                                         i++;
 212                                 }
 213                                 i++;
 214                                 n++;
 215                                 while(true)
 216                                 {
 217                                         if (i > j)
 218                                                 return n;
 219                                         if (! Cons(i))
 220                                                 break;
 221                                         i++;
 222                                 }
 223                                 i++;
 224                         }
 225                 }
 226
 227                 /// <summary>
 228                 /// Vowelinstem() is true &lt;=&gt; k0,...j contains a vowel
 229                 /// </summary>
 230                 /// <returns></returns>
 231                 private bool Vowelinstem()
 232                 {
 233                         int i;
 234                         for (i = k0; i <= j; i++)
 235                                 if (! Cons(i))
 236                                         return true;
 237                         return false;
 238                 }
 239
 240                 /// <summary>
 241                 /// Doublec(j) is true &lt;=&gt; j,(j-1) contain a double consonant.
 242                 /// </summary>
 243                 /// <param name="j"></param>
 244                 /// <returns></returns>
 245                 private bool Doublec(int j)
 246                 {
 247                         if (j < k0+1)
 248                                 return false;
 249                         if (b[j] != b[j-1])
 250                                 return false;
 251                         return Cons(j);
 252                 }
 253
 254                 /// <summary>
 255                 /// Cvc(i) is true &lt;=&gt; i-2,i-1,i has the form consonant - vowel - consonant
 256                 /// and also if the second c is not w,x or y. this is used when trying to
 257                 /// restore an e at the end of a short word. e.g.
 258                 ///
 259                 ///                             Cav(e), Lov(e), Hop(e), Crim(e), but
 260                 ///                             snow, box, tray.
 261                 /// </summary>
 262                 /// <param name="i"></param>
 263                 /// <returns></returns>
 264                 private bool Cvc(int i)
 265                 {
 266                         if (i < k0+2 || !Cons(i) || Cons(i-1) || !Cons(i-2))
 267                                 return false;
 268                         else
 269                         {
 270                                 int ch = b[i];
 271                                 if (ch == 'w' || ch == 'x' || ch == 'y') return false;
 272                         }
 273                         return true;
 274                 }
 275
 276                 private bool Ends(String s)
 277                 {
 278                         int l = s.Length;
 279                         int o = k-l+1;
 280                         if (o < k0)
 281                                 return false;
 282                         for (int i = 0; i < l; i++)
 283                                 if (b[o+i] != s[i])
 284                                         return false;
 285                         j = k-l;
 286                         return true;
 287                 }
 288
 289                 /// <summary>
 290                 /// Setto(s) sets (j+1),...k to the characters in the string s, readjusting k.
 291                 /// </summary>
 292                 /// <param name="s"></param>
 293                 internal virtual void Setto(String s)
 294                 {
 295                         int l = s.Length;
 296                         int o = j+1;
 297                         for (int i = 0; i < l; i++)
 298                                 b[o+i] = s[i];
 299                         k = j+l;
 300                         dirty = true;
 301                 }
 302
 303                 /// <summary>
 304                 /// R(s) is used further down.
 305                 /// </summary>
 306                 /// <param name="s"></param>
 307                 internal virtual void R(String s) { if (M() > 0) Setto(s); }
 308
 309                 /// <summary>
 310                 /// Step1() gets rid of plurals and -ed or -ing. e.g.
 311                 ///              caresses  ->  caress
 312                 ///              ponies    ->  poni
 313                 ///              ties      ->  ti
 314                 ///              caress    ->  caress
 315                 ///              cats      ->  cat
 316                 ///
 317                 ///              feed      ->  feed
 318                 ///              agreed    ->  agree
 319                 ///              disabled  ->  disable
 320                 ///
 321                 ///              matting   ->  mat
 322                 ///              mating    ->  mate
 323                 ///              meeting   ->  meet
 324                 ///              milling   ->  mill
 325                 ///              messing   ->  mess
 326                 ///
 327                 ///              meetings  ->  meet
 328                 /// </summary>
 329                 private void Step1()
 330                 {
 331                         if (b[k] == 's')
 332                         {
 333                                 if (Ends("sses")) k -= 2;
 334                                 else if (Ends("ies")) Setto("i");
 335                                 else if (b[k-1] != 's') k--;
 336                         }
 337                         if (Ends("eed"))
 338                         {
 339                                 if (M() > 0)
 340                                         k--;
 341                         }
 342                         else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
 343                         {
 344                                 k = j;
 345                                 if (Ends("at")) Setto("ate");
 346                                 else if (Ends("bl")) Setto("ble");
 347                                 else if (Ends("iz")) Setto("ize");
 348                                 else if (Doublec(k))
 349                                 {
 350                                         int ch = b[k--];
 351                                         if (ch == 'l' || ch == 's' || ch == 'z')
 352                                                 k++;
 353                                 }
 354                                 else if (M() == 1 && Cvc(k))
 355                                         Setto("e");
 356                         }
 357                 }
 358
 359                 /// <summary>
 360                 /// Step2() turns terminal y to i when there is another vowel in the stem.
 361                 /// </summary>
 362                 private void Step2()
 363                 {
 364                         if (Ends("y") && Vowelinstem())
 365                         {
 366                                 b[k] = 'i';
 367                                 dirty = true;
 368                         }
 369                 }
 370
 371
 372                 /// <summary>
 373                 /// Step3() maps double suffices to single ones. so -ization ( = -ize plus
 374                 /// -ation) maps to -ize etc. note that the string before the suffix must give
 375                 /// M() > 0.
 376                 /// </summary>
 377                 private void Step3()
 378                 {
 379                         if (k == k0) return; /* For Bug 1 */
 380                         switch (b[k-1])
 381                         {
 382                                 case 'a':
 383                                         if (Ends("ational")) { R("ate"); break; }
 384                                         if (Ends("tional")) { R("tion"); break; }
 385                                         break;
 386                                 case 'c':
 387                                         if (Ends("enci")) { R("ence"); break; }
 388                                         if (Ends("anci")) { R("ance"); break; }
 389                                         break;
 390                                 case 'e':
 391                                         if (Ends("izer")) { R("ize"); break; }
 392                                         break;
 393                                 case 'l':
 394                                         if (Ends("bli")) { R("ble"); break; }
 395                                         if (Ends("alli")) { R("al"); break; }
 396                                         if (Ends("entli")) { R("ent"); break; }
 397                                         if (Ends("eli")) { R("e"); break; }
 398                                         if (Ends("ousli")) { R("ous"); break; }
 399                                         break;
 400                                 case 'o':
 401                                         if (Ends("ization")) { R("ize"); break; }
 402                                         if (Ends("ation")) { R("ate"); break; }
 403                                         if (Ends("ator")) { R("ate"); break; }
 404                                         break;
 405                                 case 's':
 406                                         if (Ends("alism")) { R("al"); break; }
 407                                         if (Ends("iveness")) { R("ive"); break; }
 408                                         if (Ends("fulness")) { R("ful"); break; }
 409                                         if (Ends("ousness")) { R("ous"); break; }
 410                                         break;
 411                                 case 't':
 412                                         if (Ends("aliti")) { R("al"); break; }
 413                                         if (Ends("iviti")) { R("ive"); break; }
 414                                         if (Ends("biliti")) { R("ble"); break; }
 415                                         break;
 416                                 case 'g':
 417                                         if (Ends("logi")) { R("log"); break; }
 418                                 break;
 419                         }
 420                 }
 421
 422                 /// <summary>
 423                 /// Step4() deals with -ic-, -full, -ness etc. similar strategy to Step3.
 424                 /// </summary>
 425                 private void Step4()
 426                 {
 427                         switch (b[k])
 428                         {
 429                                 case 'e':
 430                                         if (Ends("icate")) { R("ic"); break; }
 431                                         if (Ends("ative")) { R(""); break; }
 432                                         if (Ends("alize")) { R("al"); break; }
 433                                         break;
 434                                 case 'i':
 435                                         if (Ends("iciti")) { R("ic"); break; }
 436                                         break;
 437                                 case 'l':
 438                                         if (Ends("ical")) { R("ic"); break; }
 439                                         if (Ends("ful")) { R(""); break; }
 440                                         break;
 441                                 case 's':
 442                                         if (Ends("ness")) { R(""); break; }
 443                                         break;
 444                         }
 445                 }
 446
 447                 /// <summary>
 448                 /// Step5() takes off -ant, -ence etc., in context &lt;c&gt;vcvc&lt;v&gt;.
 449                 /// </summary>
 450                 private void Step5()
 451                 {
 452                         if (k == k0) return; /* for Bug 1 */
 453                         switch (b[k-1])
 454                         {
 455                                 case 'a':
 456                                         if (Ends("al")) break;
 457                                         return;
 458                                 case 'c':
 459                                         if (Ends("ance")) break;
 460                                         if (Ends("ence")) break;
 461                                         return;
 462                                 case 'e':
 463                                         if (Ends("er")) break; return;
 464                                 case 'i':
 465                                         if (Ends("ic")) break; return;
 466                                 case 'l':
 467                                         if (Ends("able")) break;
 468                                         if (Ends("ible")) break; return;
 469                                 case 'n':
 470                                         if (Ends("ant")) break;
 471                                         if (Ends("ement")) break;
 472                                         if (Ends("ment")) break;
 473                                         /* element etc. not stripped before the m */
 474                                         if (Ends("ent")) break;
 475                                         return;
 476                                 case 'o':
 477                                         if (Ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
 478                                         /* j >= 0 fixes Bug 2 */
 479                                         if (Ends("ou")) break;
 480                                         return;
 481                                         /* takes care of -ous */
 482                                 case 's':
 483                                         if (Ends("ism")) break;
 484                                         return;
 485                                 case 't':
 486                                         if (Ends("ate")) break;
 487                                         if (Ends("iti")) break;
 488                                         return;
 489                                 case 'u':
 490                                         if (Ends("ous")) break;
 491                                         return;
 492                                 case 'v':
 493                                         if (Ends("ive")) break;
 494                                         return;
 495                                 case 'z':
 496                                         if (Ends("ize")) break;
 497                                         return;
 498                                 default:
 499                                         return;
 500                         }
 501                         if (M() > 1)
 502                                 k = j;
 503                 }
 504
 505                 /// <summary>
 506                 /// Step6() removes a final -e if M() > 1.
 507                 /// </summary>
 508                 private void Step6()
 509                 {
 510                         j = k;
 511                         if (b[k] == 'e')
 512                         {
 513                                 int a = M();
 514                                 if (a > 1 || a == 1 && !Cvc(k-1))
 515                                         k--;
 516                         }
 517                         if (b[k] == 'l' && Doublec(k) && M() > 1)
 518                                 k--;
 519                 }
 520
 521                 /// <summary>
 522                 /// Stem a word provided as a String.  Returns the result as a String.
 523                 /// </summary>
 524                 /// <param name="s"></param>
 525                 /// <returns></returns>
 526                 public String Stem(String s)
 527                 {
 528                         if (Stem(s.ToCharArray(), s.Length))
 529                                 return ToString();
 530                         else
 531                                 return s;
 532                 }
 533
 534                 /// <summary>
 535                 /// Stem a word contained in a char[].  Returns true if the stemming process
 536                 /// resulted in a word different from the input.  You can retrieve the
 537                 /// result with GetResultLength()/GetResultBuffer() or ToString().
 538                 /// </summary>
 539                 /// <param name="word"></param>
 540                 /// <returns></returns>
 541                 public virtual bool Stem(char[] word)
 542                 {
 543                         return Stem(word, word.Length);
 544                 }
 545
 546                 /// <summary>
 547                 /// Stem a word contained in a portion of a char[] array.  Returns
 548                 /// true if the stemming process resulted in a word different from
 549                 /// the input.  You can retrieve the result with
 550                 /// GetResultLength()/GetResultBuffer() or ToString().
 551                 /// </summary>
 552                 /// <param name="wordBuffer"></param>
 553                 /// <param name="offset"></param>
 554                 /// <param name="wordLen"></param>
 555                 /// <returns></returns>
 556                 public virtual bool Stem(char[] wordBuffer, int offset, int wordLen)
 557                 {
 558                         Reset();
 559                         if (b.Length < wordLen)
 560                         {
 561                                 char[] new_b = new char[wordLen + EXTRA];
 562                                 b = new_b;
 563                         }
 564                         for (int j=0; j<wordLen; j++)
 565                                 b[j] = wordBuffer[offset+j];
 566                         i = wordLen;
 567                         return Stem(0);
 568                 }
 569
 570                 /// <summary>
 571                 /// Stem a word contained in a leading portion of a char[] array.
 572                 /// Returns true if the stemming process resulted in a word different
 573                 /// from the input. You can retrieve the result with
 574                 /// GetResultLength()/GetResultBuffer() or ToString().
 575                 /// </summary>
 576                 /// <param name="word"></param>
 577                 /// <param name="wordLen"></param>
 578                 /// <returns></returns>
 579                 public virtual bool Stem(char[] word, int wordLen)
 580                 {
 581                         return Stem(word, 0, wordLen);
 582                 }
 583
 584                 /// Stem the word placed into the Stemmer buffer through calls to Add().
 585                 /// Returns true if the stemming process resulted in a word different
 586                 /// from the input.  You can retrieve the result with
 587                 /// GetResultLength()/GetResultBuffer() or ToString().
 588                 public virtual bool Stem()
 589                 {
 590                         return Stem(0);
 591                 }
 592
 593                 public virtual bool Stem(int i0)
 594                 {
 595                         k = i - 1;
 596                         k0 = i0;
 597                         if (k > k0+1)
 598                         {
 599                                 Step1(); Step2(); Step3(); Step4(); Step5(); Step6();
 600                         }
 601                         // Also, a word is considered dirty if we lopped off letters
 602                         // Thanks to Ifigenia Vairelles for pointing this out.
 603                         if (i != k+1)
 604                                 dirty = true;
 605                         i = k+1;
 606                         return dirty;
 607                 }
 608
 609                 /// <summary>
 610                 /// Test program for demonstrating the Stemmer.  It reads a file and
 611                 /// stems each word, writing the result to standard out.
 612                 /// Usage: Stemmer file-name
 613                 /// </summary>
 614                 /// <param name="args"></param>
 615                 public static void Main(String[] args)
 616                 {
 617                         PorterStemmer s = new PorterStemmer();
 618
 619                         for (int i = 0; i < args.Length; i++)
 620                         {
 621                                 try
 622                                 {
 623                                         // fix: 1.3.2.2 - FileAccess.Read
 624                                         FileStream _in = new FileStream(args[i], FileMode.Open, FileAccess.Read);
 625                                         byte[] buffer = new byte[1024];
 626                                         int bufferLen, offset, ch;
 627
 628                                         bufferLen = _in.Read(buffer, 0, buffer.Length);
 629                                         offset = 0;
 630                                         s.Reset();
 631
 632                                         while(true)
 633                                         {
 634                                                 if (offset < bufferLen)
 635                                                         ch = buffer[offset++];
 636                                                 else
 637                                                 {
 638                                                         bufferLen = _in.Read(buffer, 0, buffer.Length);
 639                                                         offset = 0;
 640                                                         if (bufferLen < 0)
 641                                                                 ch = -1;
 642                                                         else
 643                                                                 ch = buffer[offset++];
 644                                                 }
 645
 646                                                 if (Char.IsLetter((char) ch))
 647                                                 {
 648                                                         s.Add(Char.ToLower((char) ch));
 649                                                 }
 650                                                 else
 651                                                 {
 652                                                         s.Stem();
 653                                                         Console.Write(s.ToString());
 654                                                         s.Reset();
 655                                                         if (ch < 0)
 656                                                                 break;
 657                                                         else
 658                                                         {
 659                                                                 Console.Write((char) ch);
 660                                                         }
 661                                                 }
 662                                         }
 663
 664                                         _in.Close();
 665                                 }
 666                                 catch (IOException)
 667                                 {
 668                                         Console.WriteLine("error reading " + args[i]);
 669                                 }
 670                         }
 671                 }
 672         }
 673 }