beagled/Lucene.Net/Analysis/PorterStemmer.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18
  19 Porter stemmer in Java. The original paper is in
  20
  21 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  22 no. 3, pp 130-137,
  23
  24 See also http://www.tartarus.org/~martin/PorterStemmer/index.html
  25
  26 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
  27 Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
  28 is then out outside the bounds of b.
  29
  30 Similarly,
  31
  32 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
  33 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
  34 b[j] is then outside the bounds of b.
  35
  36 Release 3.
  37
  38 [ This version is derived from Release 3, modified by Brian Goetz to
  39 optimize for fewer object creations.  ]
  40 */
  41
  42 using System;
  43
  44 namespace Lucene.Net.Analysis
  45 {
  46
  47         /// <summary>
  48         /// Stemmer, implementing the Porter Stemming Algorithm
  49         ///
  50         /// The Stemmer class transforms a word into its root form.  The input
  51         /// word can be provided a character at time (by calling add()), or at once
  52         /// by calling one of the various stem(something) methods.
  53         /// </summary>
  54
  55         class PorterStemmer
  56         {
  57                 private char[] b;
  58                 private int i, j, k, k0;
  59                 private bool dirty = false;
  60                 private const int INC = 50; /* unit of size whereby b is increased */
  61                 private const int EXTRA = 1;
  62
  63                 public PorterStemmer()
  64                 {
  65                         b = new char[INC];
  66                         i = 0;
  67                 }
  68
  69                 /// <summary> reset() resets the stemmer so it can stem another word.  If you invoke
  70                 /// the stemmer by calling add(char) and then Stem(), you must call reset()
  71                 /// before starting another word.
  72                 /// </summary>
  73                 public virtual void  Reset()
  74                 {
  75                         i = 0; dirty = false;
  76                 }
  77
  78                 /// <summary> Add a character to the word being stemmed.  When you are finished
  79                 /// adding characters, you can call Stem(void) to process the word.
  80                 /// </summary>
  81                 public virtual void  Add(char ch)
  82                 {
  83                         if (b.Length <= i + EXTRA)
  84                         {
  85                                 char[] new_b = new char[b.Length + INC];
  86                                 for (int c = 0; c < b.Length; c++)
  87                                         new_b[c] = b[c];
  88                                 b = new_b;
  89                         }
  90                         b[i++] = ch;
  91                 }
  92
  93                 /// <summary> After a word has been stemmed, it can be retrieved by toString(),
  94                 /// or a reference to the internal buffer can be retrieved by getResultBuffer
  95                 /// and getResultLength (which is generally more efficient.)
  96                 /// </summary>
  97                 public override System.String ToString()
  98                 {
  99                         return new System.String(b, 0, i);
 100                 }
 101
 102                 /// <summary> Returns the length of the word resulting from the stemming process.</summary>
 103                 public virtual int GetResultLength()
 104                 {
 105                         return i;
 106                 }
 107
 108                 /// <summary> Returns a reference to a character buffer containing the results of
 109                 /// the stemming process.  You also need to consult getResultLength()
 110                 /// to determine the length of the result.
 111                 /// </summary>
 112                 public virtual char[] GetResultBuffer()
 113                 {
 114                         return b;
 115                 }
 116
 117                 /* cons(i) is true <=> b[i] is a consonant. */
 118
 119                 private bool Cons(int i)
 120                 {
 121                         switch (b[i])
 122                         {
 123
 124                                 case 'a':
 125                                 case 'e':
 126                                 case 'i':
 127                                 case 'o':
 128                                 case 'u':
 129                                         return false;
 130
 131                                 case 'y':
 132                                         return (i == k0)?true:!Cons(i - 1);
 133
 134                                 default:
 135                                         return true;
 136
 137                         }
 138                 }
 139
 140                 /* m() measures the number of consonant sequences between k0 and j. if c is
 141                 a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
 142                 presence,
 143
 144                 <c><v>       gives 0
 145                 <c>vc<v>     gives 1
 146                 <c>vcvc<v>   gives 2
 147                 <c>vcvcvc<v> gives 3
 148                 ....
 149                 */
 150
 151                 private int M()
 152                 {
 153                         int n = 0;
 154                         int i = k0;
 155                         while (true)
 156                         {
 157                                 if (i > j)
 158                                         return n;
 159                                 if (!Cons(i))
 160                                         break;
 161                                 i++;
 162                         }
 163                         i++;
 164                         while (true)
 165                         {
 166                                 while (true)
 167                                 {
 168                                         if (i > j)
 169                                                 return n;
 170                                         if (Cons(i))
 171                                                 break;
 172                                         i++;
 173                                 }
 174                                 i++;
 175                                 n++;
 176                                 while (true)
 177                                 {
 178                                         if (i > j)
 179                                                 return n;
 180                                         if (!Cons(i))
 181                                                 break;
 182                                         i++;
 183                                 }
 184                                 i++;
 185                         }
 186                 }
 187
 188                 /* vowelinstem() is true <=> k0,...j contains a vowel */
 189
 190                 private bool Vowelinstem()
 191                 {
 192                         int i;
 193                         for (i = k0; i <= j; i++)
 194                                 if (!Cons(i))
 195                                         return true;
 196                         return false;
 197                 }
 198
 199                 /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
 200
 201                 private bool Doublec(int j)
 202                 {
 203                         if (j < k0 + 1)
 204                                 return false;
 205                         if (b[j] != b[j - 1])
 206                                 return false;
 207                         return Cons(j);
 208                 }
 209
 210                 /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
 211                 and also if the second c is not w,x or y. this is used when trying to
 212                 restore an e at the end of a short word. e.g.
 213
 214                 cav(e), lov(e), hop(e), crim(e), but
 215                 snow, box, tray.
 216
 217                 */
 218
 219                 private bool Cvc(int i)
 220                 {
 221                         if (i < k0 + 2 || !Cons(i) || Cons(i - 1) || !Cons(i - 2))
 222                                 return false;
 223                         else
 224                         {
 225                                 int ch = b[i];
 226                                 if (ch == 'w' || ch == 'x' || ch == 'y')
 227                                         return false;
 228                         }
 229                         return true;
 230                 }
 231
 232                 private bool Ends(System.String s)
 233                 {
 234                         int l = s.Length;
 235                         int o = k - l + 1;
 236                         if (o < k0)
 237                                 return false;
 238                         for (int i = 0; i < l; i++)
 239                                 if (b[o + i] != s[i])
 240                                         return false;
 241                         j = k - l;
 242                         return true;
 243                 }
 244
 245                 /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
 246                 k. */
 247
 248                 internal virtual void  Setto(System.String s)
 249                 {
 250                         int l = s.Length;
 251                         int o = j + 1;
 252                         for (int i = 0; i < l; i++)
 253                                 b[o + i] = s[i];
 254                         k = j + l;
 255                         dirty = true;
 256                 }
 257
 258                 /* r(s) is used further down. */
 259
 260                 internal virtual void  R(System.String s)
 261                 {
 262                         if (M() > 0)
 263                                 Setto(s);
 264                 }
 265
 266                 /* step1() gets rid of plurals and -ed or -ing. e.g.
 267
 268                 caresses  ->  caress
 269                 ponies    ->  poni
 270                 ties      ->  ti
 271                 caress    ->  caress
 272                 cats      ->  cat
 273
 274                 feed      ->  feed
 275                 agreed    ->  agree
 276                 disabled  ->  disable
 277
 278                 matting   ->  mat
 279                 mating    ->  mate
 280                 meeting   ->  meet
 281                 milling   ->  mill
 282                 messing   ->  mess
 283
 284                 meetings  ->  meet
 285
 286                 */
 287
 288                 private void  Step1()
 289                 {
 290                         if (b[k] == 's')
 291                         {
 292                                 if (Ends("sses"))
 293                                         k -= 2;
 294                                 else if (Ends("ies"))
 295                                         Setto("i");
 296                                 else if (b[k - 1] != 's')
 297                                         k--;
 298                         }
 299                         if (Ends("eed"))
 300                         {
 301                                 if (M() > 0)
 302                                         k--;
 303                         }
 304                         else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
 305                         {
 306                                 k = j;
 307                                 if (Ends("at"))
 308                                         Setto("ate");
 309                                 else if (Ends("bl"))
 310                                         Setto("ble");
 311                                 else if (Ends("iz"))
 312                                         Setto("ize");
 313                                 else if (Doublec(k))
 314                                 {
 315                                         int ch = b[k--];
 316                                         if (ch == 'l' || ch == 's' || ch == 'z')
 317                                                 k++;
 318                                 }
 319                                 else if (M() == 1 && Cvc(k))
 320                                         Setto("e");
 321                         }
 322                 }
 323
 324                 /* step2() turns terminal y to i when there is another vowel in the stem. */
 325
 326                 private void  Step2()
 327                 {
 328                         if (Ends("y") && Vowelinstem())
 329                         {
 330                                 b[k] = 'i';
 331                                 dirty = true;
 332                         }
 333                 }
 334
 335                 /* step3() maps double suffices to single ones. so -ization ( = -ize plus
 336                 -ation) maps to -ize etc. note that the string before the suffix must give
 337                 m() > 0. */
 338
 339                 private void  Step3()
 340                 {
 341                         if (k == k0)
 342                                 return ; /* For Bug 1 */
 343                         switch (b[k - 1])
 344                         {
 345
 346                                 case 'a':
 347                                         if (Ends("ational"))
 348                                         {
 349                                                 R("ate"); break;
 350                                         }
 351                                         if (Ends("tional"))
 352                                         {
 353                                                 R("tion"); break;
 354                                         }
 355                                         break;
 356
 357                                 case 'c':
 358                                         if (Ends("enci"))
 359                                         {
 360                                                 R("ence"); break;
 361                                         }
 362                                         if (Ends("anci"))
 363                                         {
 364                                                 R("ance"); break;
 365                                         }
 366                                         break;
 367
 368                                 case 'e':
 369                                         if (Ends("izer"))
 370                                         {
 371                                                 R("ize"); break;
 372                                         }
 373                                         break;
 374
 375                                 case 'l':
 376                                         if (Ends("bli"))
 377                                         {
 378                                                 R("ble"); break;
 379                                         }
 380                                         if (Ends("alli"))
 381                                         {
 382                                                 R("al"); break;
 383                                         }
 384                                         if (Ends("entli"))
 385                                         {
 386                                                 R("ent"); break;
 387                                         }
 388                                         if (Ends("eli"))
 389                                         {
 390                                                 R("e"); break;
 391                                         }
 392                                         if (Ends("ousli"))
 393                                         {
 394                                                 R("ous"); break;
 395                                         }
 396                                         break;
 397
 398                                 case 'o':
 399                                         if (Ends("ization"))
 400                                         {
 401                                                 R("ize"); break;
 402                                         }
 403                                         if (Ends("ation"))
 404                                         {
 405                                                 R("ate"); break;
 406                                         }
 407                                         if (Ends("ator"))
 408                                         {
 409                                                 R("ate"); break;
 410                                         }
 411                                         break;
 412
 413                                 case 's':
 414                                         if (Ends("alism"))
 415                                         {
 416                                                 R("al"); break;
 417                                         }
 418                                         if (Ends("iveness"))
 419                                         {
 420                                                 R("ive"); break;
 421                                         }
 422                                         if (Ends("fulness"))
 423                                         {
 424                                                 R("ful"); break;
 425                                         }
 426                                         if (Ends("ousness"))
 427                                         {
 428                                                 R("ous"); break;
 429                                         }
 430                                         break;
 431
 432                                 case 't':
 433                                         if (Ends("aliti"))
 434                                         {
 435                                                 R("al"); break;
 436                                         }
 437                                         if (Ends("iviti"))
 438                                         {
 439                                                 R("ive"); break;
 440                                         }
 441                                         if (Ends("biliti"))
 442                                         {
 443                                                 R("ble"); break;
 444                                         }
 445                                         break;
 446
 447                                 case 'g':
 448                                         if (Ends("logi"))
 449                                         {
 450                                                 R("log"); break;
 451                                         }
 452                                         break;
 453                                 }
 454                 }
 455
 456                 /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
 457
 458                 private void  Step4()
 459                 {
 460                         switch (b[k])
 461                         {
 462
 463                                 case 'e':
 464                                         if (Ends("icate"))
 465                                         {
 466                                                 R("ic"); break;
 467                                         }
 468                                         if (Ends("ative"))
 469                                         {
 470                                                 R(""); break;
 471                                         }
 472                                         if (Ends("alize"))
 473                                         {
 474                                                 R("al"); break;
 475                                         }
 476                                         break;
 477
 478                                 case 'i':
 479                                         if (Ends("iciti"))
 480                                         {
 481                                                 R("ic"); break;
 482                                         }
 483                                         break;
 484
 485                                 case 'l':
 486                                         if (Ends("ical"))
 487                                         {
 488                                                 R("ic"); break;
 489                                         }
 490                                         if (Ends("ful"))
 491                                         {
 492                                                 R(""); break;
 493                                         }
 494                                         break;
 495
 496                                 case 's':
 497                                         if (Ends("ness"))
 498                                         {
 499                                                 R(""); break;
 500                                         }
 501                                         break;
 502                                 }
 503                 }
 504
 505                 /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
 506
 507                 private void  Step5()
 508                 {
 509                         if (k == k0)
 510                                 return ; /* for Bug 1 */
 511                         switch (b[k - 1])
 512                         {
 513
 514                                 case 'a':
 515                                         if (Ends("al"))
 516                                                 break;
 517                                         return ;
 518
 519                                 case 'c':
 520                                         if (Ends("ance"))
 521                                                 break;
 522                                         if (Ends("ence"))
 523                                                 break;
 524                                         return ;
 525
 526                                 case 'e':
 527                                         if (Ends("er"))
 528                                                 break; return ;
 529
 530                                 case 'i':
 531                                         if (Ends("ic"))
 532                                                 break; return ;
 533
 534                                 case 'l':
 535                                         if (Ends("able"))
 536                                                 break;
 537                                         if (Ends("ible"))
 538                                                 break; return ;
 539
 540                                 case 'n':
 541                                         if (Ends("ant"))
 542                                                 break;
 543                                         if (Ends("ement"))
 544                                                 break;
 545                                         if (Ends("ment"))
 546                                                 break;
 547                                         /* element etc. not stripped before the m */
 548                                         if (Ends("ent"))
 549                                                 break;
 550                                         return ;
 551
 552                                 case 'o':
 553                                         if (Ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
 554                                                 break;
 555                                         /* j >= 0 fixes Bug 2 */
 556                                         if (Ends("ou"))
 557                                                 break;
 558                                         return ;
 559                                         /* takes care of -ous */
 560
 561                                 case 's':
 562                                         if (Ends("ism"))
 563                                                 break;
 564                                         return ;
 565
 566                                 case 't':
 567                                         if (Ends("ate"))
 568                                                 break;
 569                                         if (Ends("iti"))
 570                                                 break;
 571                                         return ;
 572
 573                                 case 'u':
 574                                         if (Ends("ous"))
 575                                                 break;
 576                                         return ;
 577
 578                                 case 'v':
 579                                         if (Ends("ive"))
 580                                                 break;
 581                                         return ;
 582
 583                                 case 'z':
 584                                         if (Ends("ize"))
 585                                                 break;
 586                                         return ;
 587
 588                                 default:
 589                                         return ;
 590
 591                         }
 592                         if (M() > 1)
 593                                 k = j;
 594                 }
 595
 596                 /* step6() removes a final -e if m() > 1. */
 597
 598                 private void  Step6()
 599                 {
 600                         j = k;
 601                         if (b[k] == 'e')
 602                         {
 603                                 int a = M();
 604                                 if (a > 1 || a == 1 && !Cvc(k - 1))
 605                                         k--;
 606                         }
 607                         if (b[k] == 'l' && Doublec(k) && M() > 1)
 608                                 k--;
 609                 }
 610
 611
 612                 /// <summary> Stem a word provided as a String.  Returns the result as a String.</summary>
 613                 public virtual System.String Stem(System.String s)
 614                 {
 615                         if (Stem(s.ToCharArray(), s.Length))
 616                         {
 617                                 return ToString();
 618                         }
 619                         else
 620                                 return s;
 621                 }
 622
 623                 /// <summary>Stem a word contained in a char[].  Returns true if the stemming process
 624                 /// resulted in a word different from the input.  You can retrieve the
 625                 /// result with getResultLength()/getResultBuffer() or toString().
 626                 /// </summary>
 627                 public virtual bool Stem(char[] word)
 628                 {
 629                         return Stem(word, word.Length);
 630                 }
 631
 632                 /// <summary>Stem a word contained in a portion of a char[] array.  Returns
 633                 /// true if the stemming process resulted in a word different from
 634                 /// the input.  You can retrieve the result with
 635                 /// getResultLength()/getResultBuffer() or toString().
 636                 /// </summary>
 637                 public virtual bool Stem(char[] wordBuffer, int offset, int wordLen)
 638                 {
 639                         Reset();
 640                         if (b.Length < wordLen)
 641                         {
 642                                 char[] new_b = new char[wordLen + EXTRA];
 643                                 b = new_b;
 644                         }
 645                         for (int j = 0; j < wordLen; j++)
 646                                 b[j] = wordBuffer[offset + j];
 647                         i = wordLen;
 648                         return Stem(0);
 649                 }
 650
 651                 /// <summary>Stem a word contained in a leading portion of a char[] array.
 652                 /// Returns true if the stemming process resulted in a word different
 653                 /// from the input.  You can retrieve the result with
 654                 /// getResultLength()/getResultBuffer() or toString().
 655                 /// </summary>
 656                 public virtual bool Stem(char[] word, int wordLen)
 657                 {
 658                         return Stem(word, 0, wordLen);
 659                 }
 660
 661                 /// <summary>Stem the word placed into the Stemmer buffer through calls to add().
 662                 /// Returns true if the stemming process resulted in a word different
 663                 /// from the input.  You can retrieve the result with
 664                 /// getResultLength()/getResultBuffer() or toString().
 665                 /// </summary>
 666                 public virtual bool Stem()
 667                 {
 668                         return Stem(0);
 669                 }
 670
 671                 public virtual bool Stem(int i0)
 672                 {
 673                         k = i - 1;
 674                         k0 = i0;
 675                         if (k > k0 + 1)
 676                         {
 677                                 Step1(); Step2(); Step3(); Step4(); Step5(); Step6();
 678                         }
 679                         // Also, a word is considered dirty if we lopped off letters
 680                         // Thanks to Ifigenia Vairelles for pointing this out.
 681                         if (i != k + 1)
 682                                 dirty = true;
 683                         i = k + 1;
 684                         return dirty;
 685                 }
 686
 687                 /// <summary>Test program for demonstrating the Stemmer.  It reads a file and
 688                 /// stems each word, writing the result to standard out.
 689                 /// Usage: Stemmer file-name
 690                 /// </summary>
 691                 [STAThread]
 692                 public static void  Main(System.String[] args)
 693                 {
 694                         PorterStemmer s = new PorterStemmer();
 695
 696                         for (int i = 0; i < args.Length; i++)
 697                         {
 698                                 try
 699                                 {
 700                     System.IO.BinaryReader in_Renamed = new System.IO.BinaryReader(System.IO.File.Open(args[i], System.IO.FileMode.Open, System.IO.FileAccess.Read));
 701                                         byte[] buffer = new byte[1024];
 702                                         int bufferLen, offset, ch;
 703
 704                                         bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
 705                                         offset = 0;
 706                                         s.Reset();
 707
 708                                         while (true)
 709                                         {
 710                                                 if (offset < bufferLen)
 711                                                         ch = buffer[offset++];
 712                                                 else
 713                                                 {
 714                                                         bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
 715                                                         offset = 0;
 716                                                         if (bufferLen <= 0)
 717                                                                 ch = - 1;
 718                                                         else
 719                                                                 ch = buffer[offset++];
 720                                                 }
 721
 722                                                 if (System.Char.IsLetter((char) ch))
 723                                                 {
 724                                                         s.Add(System.Char.ToLower((char) ch));
 725                                                 }
 726                                                 else
 727                                                 {
 728                                                         s.Stem();
 729                                                         System.Console.Out.Write(s.ToString());
 730                                                         s.Reset();
 731                                                         if (ch < 0)
 732                                                                 break;
 733                                                         else
 734                                                         {
 735                                                                 System.Console.Out.Write((char) ch);
 736                                                         }
 737                                                 }
 738                                         }
 739
 740                                         in_Renamed.Close();
 741                                 }
 742                                 catch (System.IO.IOException)
 743                                 {
 744                                         System.Console.Out.WriteLine("error reading " + args[i]);
 745                                 }
 746                         }
 747                 }
 748         }
 749 }