Change the GC.GetTotalMemory() threshold to 10%; otherwise there are just too many...
[beagle.git] / beagled / Lucene.Net / Analysis / PorterStemmer.cs
blob3085ce8a1a0bf90372c082be7c788f0e650e3d0d
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 Porter stemmer in Java. The original paper is in
21 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
22 no. 3, pp 130-137,
24 See also http://www.tartarus.org/~martin/PorterStemmer/index.html
26 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
27 Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
28 is then out outside the bounds of b.
30 Similarly,
32 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
33 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
34 b[j] is then outside the bounds of b.
36 Release 3.
38 [ This version is derived from Release 3, modified by Brian Goetz to
39 optimize for fewer object creations. ]
42 using System;
44 namespace Lucene.Net.Analysis
47 /// <summary>
48 /// Stemmer, implementing the Porter Stemming Algorithm
49 ///
50 /// The Stemmer class transforms a word into its root form. The input
51 /// word can be provided a character at time (by calling add()), or at once
52 /// by calling one of the various stem(something) methods.
53 /// </summary>
55 class PorterStemmer
57 private char[] b;
58 private int i, j, k, k0;
59 private bool dirty = false;
60 private const int INC = 50; /* unit of size whereby b is increased */
61 private const int EXTRA = 1;
63 public PorterStemmer()
65 b = new char[INC];
66 i = 0;
69 /// <summary> reset() resets the stemmer so it can stem another word. If you invoke
70 /// the stemmer by calling add(char) and then Stem(), you must call reset()
71 /// before starting another word.
72 /// </summary>
73 public virtual void Reset()
75 i = 0; dirty = false;
78 /// <summary> Add a character to the word being stemmed. When you are finished
79 /// adding characters, you can call Stem(void) to process the word.
80 /// </summary>
81 public virtual void Add(char ch)
83 if (b.Length <= i + EXTRA)
85 char[] new_b = new char[b.Length + INC];
86 for (int c = 0; c < b.Length; c++)
87 new_b[c] = b[c];
88 b = new_b;
90 b[i++] = ch;
93 /// <summary> After a word has been stemmed, it can be retrieved by toString(),
94 /// or a reference to the internal buffer can be retrieved by getResultBuffer
95 /// and getResultLength (which is generally more efficient.)
96 /// </summary>
97 public override System.String ToString()
99 return new System.String(b, 0, i);
102 /// <summary> Returns the length of the word resulting from the stemming process.</summary>
103 public virtual int GetResultLength()
105 return i;
108 /// <summary> Returns a reference to a character buffer containing the results of
109 /// the stemming process. You also need to consult getResultLength()
110 /// to determine the length of the result.
111 /// </summary>
112 public virtual char[] GetResultBuffer()
114 return b;
117 /* cons(i) is true <=> b[i] is a consonant. */
119 private bool Cons(int i)
121 switch (b[i])
124 case 'a':
125 case 'e':
126 case 'i':
127 case 'o':
128 case 'u':
129 return false;
131 case 'y':
132 return (i == k0)?true:!Cons(i - 1);
134 default:
135 return true;
140 /* m() measures the number of consonant sequences between k0 and j. if c is
141 a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
142 presence,
144 <c><v> gives 0
145 <c>vc<v> gives 1
146 <c>vcvc<v> gives 2
147 <c>vcvcvc<v> gives 3
148 ....
151 private int M()
153 int n = 0;
154 int i = k0;
155 while (true)
157 if (i > j)
158 return n;
159 if (!Cons(i))
160 break;
161 i++;
163 i++;
164 while (true)
166 while (true)
168 if (i > j)
169 return n;
170 if (Cons(i))
171 break;
172 i++;
174 i++;
175 n++;
176 while (true)
178 if (i > j)
179 return n;
180 if (!Cons(i))
181 break;
182 i++;
184 i++;
188 /* vowelinstem() is true <=> k0,...j contains a vowel */
190 private bool Vowelinstem()
192 int i;
193 for (i = k0; i <= j; i++)
194 if (!Cons(i))
195 return true;
196 return false;
199 /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
201 private bool Doublec(int j)
203 if (j < k0 + 1)
204 return false;
205 if (b[j] != b[j - 1])
206 return false;
207 return Cons(j);
210 /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
211 and also if the second c is not w,x or y. this is used when trying to
212 restore an e at the end of a short word. e.g.
214 cav(e), lov(e), hop(e), crim(e), but
215 snow, box, tray.
219 private bool Cvc(int i)
221 if (i < k0 + 2 || !Cons(i) || Cons(i - 1) || !Cons(i - 2))
222 return false;
223 else
225 int ch = b[i];
226 if (ch == 'w' || ch == 'x' || ch == 'y')
227 return false;
229 return true;
232 private bool Ends(System.String s)
234 int l = s.Length;
235 int o = k - l + 1;
236 if (o < k0)
237 return false;
238 for (int i = 0; i < l; i++)
239 if (b[o + i] != s[i])
240 return false;
241 j = k - l;
242 return true;
245 /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
246 k. */
248 internal virtual void Setto(System.String s)
250 int l = s.Length;
251 int o = j + 1;
252 for (int i = 0; i < l; i++)
253 b[o + i] = s[i];
254 k = j + l;
255 dirty = true;
258 /* r(s) is used further down. */
260 internal virtual void R(System.String s)
262 if (M() > 0)
263 Setto(s);
266 /* step1() gets rid of plurals and -ed or -ing. e.g.
268 caresses -> caress
269 ponies -> poni
270 ties -> ti
271 caress -> caress
272 cats -> cat
274 feed -> feed
275 agreed -> agree
276 disabled -> disable
278 matting -> mat
279 mating -> mate
280 meeting -> meet
281 milling -> mill
282 messing -> mess
284 meetings -> meet
288 private void Step1()
290 if (b[k] == 's')
292 if (Ends("sses"))
293 k -= 2;
294 else if (Ends("ies"))
295 Setto("i");
296 else if (b[k - 1] != 's')
297 k--;
299 if (Ends("eed"))
301 if (M() > 0)
302 k--;
304 else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
306 k = j;
307 if (Ends("at"))
308 Setto("ate");
309 else if (Ends("bl"))
310 Setto("ble");
311 else if (Ends("iz"))
312 Setto("ize");
313 else if (Doublec(k))
315 int ch = b[k--];
316 if (ch == 'l' || ch == 's' || ch == 'z')
317 k++;
319 else if (M() == 1 && Cvc(k))
320 Setto("e");
324 /* step2() turns terminal y to i when there is another vowel in the stem. */
326 private void Step2()
328 if (Ends("y") && Vowelinstem())
330 b[k] = 'i';
331 dirty = true;
335 /* step3() maps double suffices to single ones. so -ization ( = -ize plus
336 -ation) maps to -ize etc. note that the string before the suffix must give
337 m() > 0. */
339 private void Step3()
341 if (k == k0)
342 return ; /* For Bug 1 */
343 switch (b[k - 1])
346 case 'a':
347 if (Ends("ational"))
349 R("ate"); break;
351 if (Ends("tional"))
353 R("tion"); break;
355 break;
357 case 'c':
358 if (Ends("enci"))
360 R("ence"); break;
362 if (Ends("anci"))
364 R("ance"); break;
366 break;
368 case 'e':
369 if (Ends("izer"))
371 R("ize"); break;
373 break;
375 case 'l':
376 if (Ends("bli"))
378 R("ble"); break;
380 if (Ends("alli"))
382 R("al"); break;
384 if (Ends("entli"))
386 R("ent"); break;
388 if (Ends("eli"))
390 R("e"); break;
392 if (Ends("ousli"))
394 R("ous"); break;
396 break;
398 case 'o':
399 if (Ends("ization"))
401 R("ize"); break;
403 if (Ends("ation"))
405 R("ate"); break;
407 if (Ends("ator"))
409 R("ate"); break;
411 break;
413 case 's':
414 if (Ends("alism"))
416 R("al"); break;
418 if (Ends("iveness"))
420 R("ive"); break;
422 if (Ends("fulness"))
424 R("ful"); break;
426 if (Ends("ousness"))
428 R("ous"); break;
430 break;
432 case 't':
433 if (Ends("aliti"))
435 R("al"); break;
437 if (Ends("iviti"))
439 R("ive"); break;
441 if (Ends("biliti"))
443 R("ble"); break;
445 break;
447 case 'g':
448 if (Ends("logi"))
450 R("log"); break;
452 break;
456 /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
458 private void Step4()
460 switch (b[k])
463 case 'e':
464 if (Ends("icate"))
466 R("ic"); break;
468 if (Ends("ative"))
470 R(""); break;
472 if (Ends("alize"))
474 R("al"); break;
476 break;
478 case 'i':
479 if (Ends("iciti"))
481 R("ic"); break;
483 break;
485 case 'l':
486 if (Ends("ical"))
488 R("ic"); break;
490 if (Ends("ful"))
492 R(""); break;
494 break;
496 case 's':
497 if (Ends("ness"))
499 R(""); break;
501 break;
505 /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
507 private void Step5()
509 if (k == k0)
510 return ; /* for Bug 1 */
511 switch (b[k - 1])
514 case 'a':
515 if (Ends("al"))
516 break;
517 return ;
519 case 'c':
520 if (Ends("ance"))
521 break;
522 if (Ends("ence"))
523 break;
524 return ;
526 case 'e':
527 if (Ends("er"))
528 break; return ;
530 case 'i':
531 if (Ends("ic"))
532 break; return ;
534 case 'l':
535 if (Ends("able"))
536 break;
537 if (Ends("ible"))
538 break; return ;
540 case 'n':
541 if (Ends("ant"))
542 break;
543 if (Ends("ement"))
544 break;
545 if (Ends("ment"))
546 break;
547 /* element etc. not stripped before the m */
548 if (Ends("ent"))
549 break;
550 return ;
552 case 'o':
553 if (Ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
554 break;
555 /* j >= 0 fixes Bug 2 */
556 if (Ends("ou"))
557 break;
558 return ;
559 /* takes care of -ous */
561 case 's':
562 if (Ends("ism"))
563 break;
564 return ;
566 case 't':
567 if (Ends("ate"))
568 break;
569 if (Ends("iti"))
570 break;
571 return ;
573 case 'u':
574 if (Ends("ous"))
575 break;
576 return ;
578 case 'v':
579 if (Ends("ive"))
580 break;
581 return ;
583 case 'z':
584 if (Ends("ize"))
585 break;
586 return ;
588 default:
589 return ;
592 if (M() > 1)
593 k = j;
596 /* step6() removes a final -e if m() > 1. */
598 private void Step6()
600 j = k;
601 if (b[k] == 'e')
603 int a = M();
604 if (a > 1 || a == 1 && !Cvc(k - 1))
605 k--;
607 if (b[k] == 'l' && Doublec(k) && M() > 1)
608 k--;
612 /// <summary> Stem a word provided as a String. Returns the result as a String.</summary>
613 public virtual System.String Stem(System.String s)
615 if (Stem(s.ToCharArray(), s.Length))
617 return ToString();
619 else
620 return s;
623 /// <summary>Stem a word contained in a char[]. Returns true if the stemming process
624 /// resulted in a word different from the input. You can retrieve the
625 /// result with getResultLength()/getResultBuffer() or toString().
626 /// </summary>
627 public virtual bool Stem(char[] word)
629 return Stem(word, word.Length);
632 /// <summary>Stem a word contained in a portion of a char[] array. Returns
633 /// true if the stemming process resulted in a word different from
634 /// the input. You can retrieve the result with
635 /// getResultLength()/getResultBuffer() or toString().
636 /// </summary>
637 public virtual bool Stem(char[] wordBuffer, int offset, int wordLen)
639 Reset();
640 if (b.Length < wordLen)
642 char[] new_b = new char[wordLen + EXTRA];
643 b = new_b;
645 for (int j = 0; j < wordLen; j++)
646 b[j] = wordBuffer[offset + j];
647 i = wordLen;
648 return Stem(0);
651 /// <summary>Stem a word contained in a leading portion of a char[] array.
652 /// Returns true if the stemming process resulted in a word different
653 /// from the input. You can retrieve the result with
654 /// getResultLength()/getResultBuffer() or toString().
655 /// </summary>
656 public virtual bool Stem(char[] word, int wordLen)
658 return Stem(word, 0, wordLen);
661 /// <summary>Stem the word placed into the Stemmer buffer through calls to add().
662 /// Returns true if the stemming process resulted in a word different
663 /// from the input. You can retrieve the result with
664 /// getResultLength()/getResultBuffer() or toString().
665 /// </summary>
666 public virtual bool Stem()
668 return Stem(0);
671 public virtual bool Stem(int i0)
673 k = i - 1;
674 k0 = i0;
675 if (k > k0 + 1)
677 Step1(); Step2(); Step3(); Step4(); Step5(); Step6();
679 // Also, a word is considered dirty if we lopped off letters
680 // Thanks to Ifigenia Vairelles for pointing this out.
681 if (i != k + 1)
682 dirty = true;
683 i = k + 1;
684 return dirty;
687 /// <summary>Test program for demonstrating the Stemmer. It reads a file and
688 /// stems each word, writing the result to standard out.
689 /// Usage: Stemmer file-name
690 /// </summary>
691 [STAThread]
692 public static void Main(System.String[] args)
694 PorterStemmer s = new PorterStemmer();
696 for (int i = 0; i < args.Length; i++)
700 System.IO.BinaryReader in_Renamed = new System.IO.BinaryReader(System.IO.File.Open(args[i], System.IO.FileMode.Open, System.IO.FileAccess.Read));
701 byte[] buffer = new byte[1024];
702 int bufferLen, offset, ch;
704 bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
705 offset = 0;
706 s.Reset();
708 while (true)
710 if (offset < bufferLen)
711 ch = buffer[offset++];
712 else
714 bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
715 offset = 0;
716 if (bufferLen <= 0)
717 ch = - 1;
718 else
719 ch = buffer[offset++];
722 if (System.Char.IsLetter((char) ch))
724 s.Add(System.Char.ToLower((char) ch));
726 else
728 s.Stem();
729 System.Console.Out.Write(s.ToString());
730 s.Reset();
731 if (ch < 0)
732 break;
733 else
735 System.Console.Out.Write((char) ch);
740 in_Renamed.Close();
742 catch (System.IO.IOException)
744 System.Console.Out.WriteLine("error reading " + args[i]);