First post!
[beagle.git] / Lucene.Net / Analysis / PorterStemmer.cs
blob7638811124c5dcc4ee87d698ebaa43dec54c24a5
1 using System;
2 using System.IO;
4 namespace Lucene.Net.Analysis
6 /* ====================================================================
7 * The Apache Software License, Version 1.1
9 * Copyright (c) 2001 The Apache Software Foundation. All rights
10 * reserved.
12 * Redistribution and use _in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions _in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer _in
21 * the documentation and/or other materials provided with the
22 * distribution.
24 * 3. The end-user documentation included with the redistribution,
25 * if any, must include the following acknowledgment:
26 * "This product includes software developed by the
27 * Apache Software Foundation (http://www.apache.org/)."
28 * Alternately, this acknowledgment may appear _in the software itself,
29 * if and wherever such third-party acknowledgments normally appear.
31 * 4. The names "Apache" and "Apache Software Foundation" and
32 * "Apache Lucene" must not be used to endorse or promote products
33 * derived from this software without prior written permission. For
34 * written permission, please contact apache@apache.org.
36 * 5. Products derived from this software may not be called "Apache",
37 * "Apache Lucene", nor may "Apache" appear _in their name, without
38 * prior written permission of the Apache Software Foundation.
40 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
44 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 * ====================================================================
54 * This software consists of voluntary contributions made by many
55 * individuals on behalf of the Apache Software Foundation. For more
56 * information on the Apache Software Foundation, please see
57 * <http://www.apache.org/>.
61 Porter stemmer in Java. The original paper is in
63 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
64 no. 3, pp 130-137,
66 See also http://www.muscat.com/~martin/stem.html
68 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
69 Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
70 is then out outside the bounds of b.
72 Similarly,
74 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
75 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
76 b[j] is then outside the bounds of b.
78 Release 3.
80 [ This version is derived from Release 3, modified by Brian Goetz to
81 optimize for fewer object creations. ]
85 /// <summary>
86 /// Stemmer, implementing the Porter Stemming Algorithm
87 ///
88 /// The Stemmer class transforms a word into its root form. The input
89 /// word can be provided a character at time (by calling Add()), or at once
90 /// by calling one of the various Stem(something) methods.
91 /// </summary>
92 class PorterStemmer
94 private char[] b;
95 private int i, /* offset into b */
96 j, k, k0;
97 private bool dirty = false;
98 private const int INC = 50; /* unit of size whereby b is increased */
99 private const int EXTRA = 1;
101 public PorterStemmer()
103 b = new char[INC];
104 i = 0;
107 /// <summary>
108 /// Resets the stemmer so it can stem another word. If you invoke
109 /// the stemmer by calling Add(char) and then Stem(), you must call Reset()
110 /// before starting another word.
111 /// </summary>
112 public void Reset() { i = 0; dirty = false; }
114 /// <summary>
115 /// Add a character to the word being stemmed. When you are finished
116 /// adding characters, you can call Stem(void) to process the word.
117 /// </summary>
118 /// <param name="ch"></param>
119 public void Add(char ch)
121 if (b.Length <= i + EXTRA)
123 char[] new_b = new char[b.Length+INC];
124 for (int c = 0; c < b.Length; c++)
125 new_b[c] = b[c];
126 b = new_b;
128 b[i++] = ch;
131 /// <summary>
132 /// After a word has been stemmed, it can be retrieved by ToString(),
133 /// or a reference to the internal buffer can be retrieved by GetResultBuffer
134 /// and GetResultLength (which is generally more efficient.)
135 /// </summary>
136 /// <returns></returns>
137 public override String ToString() { return new String(b,0,i); }
139 /// <summary>
140 /// Returns the length of the word resulting from the stemming process.
141 /// </summary>
142 /// <returns></returns>
143 virtual public int GetResultLength() { return i; }
145 /// <summary>
146 /// Returns a reference to a character buffer containing the results of
147 /// the stemming process. You also need to consult GetResultLength()
148 /// to determine the length of the result.
149 /// </summary>
150 /// <returns></returns>
151 public char[] GetResultBuffer() { return b; }
153 /// <summary>
154 /// Ñons(i) is true &lt;=&gt; b[i] is a consonant.
155 /// </summary>
156 /// <param name="i"></param>
157 /// <returns></returns>
158 private bool Cons(int i)
160 switch (b[i])
162 case 'a':
163 return false;
164 case 'e':
165 return false;
166 case 'i':
167 return false;
168 case 'o':
169 return false;
170 case 'u':
171 return false;
172 case 'y':
173 return (i==k0) ? true : !Cons(i-1);
174 default:
175 return true;
179 /// <summary>
180 /// M() measures the number of consonant sequences between k0 and j. if c is
181 /// a consonant sequence and v a vowel sequence, and &lt;..&gt; indicates arbitrary
182 /// presence,
183 /// &lt;c&gt;&lt;v&gt; gives 0
184 /// &lt;c&gt;vc&lt;v&gt; gives 1
185 /// &lt;c&gt;vcvc&lt;v&gt; gives 2
186 /// &lt;c&gt;vcvcvc&lt;v&gt; gives 3
187 /// ....
188 /// </summary>
189 /// <returns></returns>
190 private int M()
192 int n = 0;
193 int i = k0;
194 while(true)
196 if (i > j)
197 return n;
198 if (! Cons(i))
199 break;
200 i++;
202 i++;
203 while(true)
205 while(true)
207 if (i > j)
208 return n;
209 if (Cons(i))
210 break;
211 i++;
213 i++;
214 n++;
215 while(true)
217 if (i > j)
218 return n;
219 if (! Cons(i))
220 break;
221 i++;
223 i++;
227 /// <summary>
228 /// Vowelinstem() is true &lt;=&gt; k0,...j contains a vowel
229 /// </summary>
230 /// <returns></returns>
231 private bool Vowelinstem()
233 int i;
234 for (i = k0; i <= j; i++)
235 if (! Cons(i))
236 return true;
237 return false;
240 /// <summary>
241 /// Doublec(j) is true &lt;=&gt; j,(j-1) contain a double consonant.
242 /// </summary>
243 /// <param name="j"></param>
244 /// <returns></returns>
245 private bool Doublec(int j)
247 if (j < k0+1)
248 return false;
249 if (b[j] != b[j-1])
250 return false;
251 return Cons(j);
254 /// <summary>
255 /// Cvc(i) is true &lt;=&gt; i-2,i-1,i has the form consonant - vowel - consonant
256 /// and also if the second c is not w,x or y. this is used when trying to
257 /// restore an e at the end of a short word. e.g.
259 /// Cav(e), Lov(e), Hop(e), Crim(e), but
260 /// snow, box, tray.
261 /// </summary>
262 /// <param name="i"></param>
263 /// <returns></returns>
264 private bool Cvc(int i)
266 if (i < k0+2 || !Cons(i) || Cons(i-1) || !Cons(i-2))
267 return false;
268 else
270 int ch = b[i];
271 if (ch == 'w' || ch == 'x' || ch == 'y') return false;
273 return true;
276 private bool Ends(String s)
278 int l = s.Length;
279 int o = k-l+1;
280 if (o < k0)
281 return false;
282 for (int i = 0; i < l; i++)
283 if (b[o+i] != s[i])
284 return false;
285 j = k-l;
286 return true;
289 /// <summary>
290 /// Setto(s) sets (j+1),...k to the characters in the string s, readjusting k.
291 /// </summary>
292 /// <param name="s"></param>
293 internal virtual void Setto(String s)
295 int l = s.Length;
296 int o = j+1;
297 for (int i = 0; i < l; i++)
298 b[o+i] = s[i];
299 k = j+l;
300 dirty = true;
303 /// <summary>
304 /// R(s) is used further down.
305 /// </summary>
306 /// <param name="s"></param>
307 internal virtual void R(String s) { if (M() > 0) Setto(s); }
309 /// <summary>
310 /// Step1() gets rid of plurals and -ed or -ing. e.g.
311 /// caresses -> caress
312 /// ponies -> poni
313 /// ties -> ti
314 /// caress -> caress
315 /// cats -> cat
317 /// feed -> feed
318 /// agreed -> agree
319 /// disabled -> disable
321 /// matting -> mat
322 /// mating -> mate
323 /// meeting -> meet
324 /// milling -> mill
325 /// messing -> mess
327 /// meetings -> meet
328 /// </summary>
329 private void Step1()
331 if (b[k] == 's')
333 if (Ends("sses")) k -= 2;
334 else if (Ends("ies")) Setto("i");
335 else if (b[k-1] != 's') k--;
337 if (Ends("eed"))
339 if (M() > 0)
340 k--;
342 else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
344 k = j;
345 if (Ends("at")) Setto("ate");
346 else if (Ends("bl")) Setto("ble");
347 else if (Ends("iz")) Setto("ize");
348 else if (Doublec(k))
350 int ch = b[k--];
351 if (ch == 'l' || ch == 's' || ch == 'z')
352 k++;
354 else if (M() == 1 && Cvc(k))
355 Setto("e");
359 /// <summary>
360 /// Step2() turns terminal y to i when there is another vowel in the stem.
361 /// </summary>
362 private void Step2()
364 if (Ends("y") && Vowelinstem())
366 b[k] = 'i';
367 dirty = true;
372 /// <summary>
373 /// Step3() maps double suffices to single ones. so -ization ( = -ize plus
374 /// -ation) maps to -ize etc. note that the string before the suffix must give
375 /// M() > 0.
376 /// </summary>
377 private void Step3()
379 if (k == k0) return; /* For Bug 1 */
380 switch (b[k-1])
382 case 'a':
383 if (Ends("ational")) { R("ate"); break; }
384 if (Ends("tional")) { R("tion"); break; }
385 break;
386 case 'c':
387 if (Ends("enci")) { R("ence"); break; }
388 if (Ends("anci")) { R("ance"); break; }
389 break;
390 case 'e':
391 if (Ends("izer")) { R("ize"); break; }
392 break;
393 case 'l':
394 if (Ends("bli")) { R("ble"); break; }
395 if (Ends("alli")) { R("al"); break; }
396 if (Ends("entli")) { R("ent"); break; }
397 if (Ends("eli")) { R("e"); break; }
398 if (Ends("ousli")) { R("ous"); break; }
399 break;
400 case 'o':
401 if (Ends("ization")) { R("ize"); break; }
402 if (Ends("ation")) { R("ate"); break; }
403 if (Ends("ator")) { R("ate"); break; }
404 break;
405 case 's':
406 if (Ends("alism")) { R("al"); break; }
407 if (Ends("iveness")) { R("ive"); break; }
408 if (Ends("fulness")) { R("ful"); break; }
409 if (Ends("ousness")) { R("ous"); break; }
410 break;
411 case 't':
412 if (Ends("aliti")) { R("al"); break; }
413 if (Ends("iviti")) { R("ive"); break; }
414 if (Ends("biliti")) { R("ble"); break; }
415 break;
416 case 'g':
417 if (Ends("logi")) { R("log"); break; }
418 break;
422 /// <summary>
423 /// Step4() deals with -ic-, -full, -ness etc. similar strategy to Step3.
424 /// </summary>
425 private void Step4()
427 switch (b[k])
429 case 'e':
430 if (Ends("icate")) { R("ic"); break; }
431 if (Ends("ative")) { R(""); break; }
432 if (Ends("alize")) { R("al"); break; }
433 break;
434 case 'i':
435 if (Ends("iciti")) { R("ic"); break; }
436 break;
437 case 'l':
438 if (Ends("ical")) { R("ic"); break; }
439 if (Ends("ful")) { R(""); break; }
440 break;
441 case 's':
442 if (Ends("ness")) { R(""); break; }
443 break;
447 /// <summary>
448 /// Step5() takes off -ant, -ence etc., in context &lt;c&gt;vcvc&lt;v&gt;.
449 /// </summary>
450 private void Step5()
452 if (k == k0) return; /* for Bug 1 */
453 switch (b[k-1])
455 case 'a':
456 if (Ends("al")) break;
457 return;
458 case 'c':
459 if (Ends("ance")) break;
460 if (Ends("ence")) break;
461 return;
462 case 'e':
463 if (Ends("er")) break; return;
464 case 'i':
465 if (Ends("ic")) break; return;
466 case 'l':
467 if (Ends("able")) break;
468 if (Ends("ible")) break; return;
469 case 'n':
470 if (Ends("ant")) break;
471 if (Ends("ement")) break;
472 if (Ends("ment")) break;
473 /* element etc. not stripped before the m */
474 if (Ends("ent")) break;
475 return;
476 case 'o':
477 if (Ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
478 /* j >= 0 fixes Bug 2 */
479 if (Ends("ou")) break;
480 return;
481 /* takes care of -ous */
482 case 's':
483 if (Ends("ism")) break;
484 return;
485 case 't':
486 if (Ends("ate")) break;
487 if (Ends("iti")) break;
488 return;
489 case 'u':
490 if (Ends("ous")) break;
491 return;
492 case 'v':
493 if (Ends("ive")) break;
494 return;
495 case 'z':
496 if (Ends("ize")) break;
497 return;
498 default:
499 return;
501 if (M() > 1)
502 k = j;
505 /// <summary>
506 /// Step6() removes a final -e if M() > 1.
507 /// </summary>
508 private void Step6()
510 j = k;
511 if (b[k] == 'e')
513 int a = M();
514 if (a > 1 || a == 1 && !Cvc(k-1))
515 k--;
517 if (b[k] == 'l' && Doublec(k) && M() > 1)
518 k--;
521 /// <summary>
522 /// Stem a word provided as a String. Returns the result as a String.
523 /// </summary>
524 /// <param name="s"></param>
525 /// <returns></returns>
526 public String Stem(String s)
528 if (Stem(s.ToCharArray(), s.Length))
529 return ToString();
530 else
531 return s;
534 /// <summary>
535 /// Stem a word contained in a char[]. Returns true if the stemming process
536 /// resulted in a word different from the input. You can retrieve the
537 /// result with GetResultLength()/GetResultBuffer() or ToString().
538 /// </summary>
539 /// <param name="word"></param>
540 /// <returns></returns>
541 public virtual bool Stem(char[] word)
543 return Stem(word, word.Length);
546 /// <summary>
547 /// Stem a word contained in a portion of a char[] array. Returns
548 /// true if the stemming process resulted in a word different from
549 /// the input. You can retrieve the result with
550 /// GetResultLength()/GetResultBuffer() or ToString().
551 /// </summary>
552 /// <param name="wordBuffer"></param>
553 /// <param name="offset"></param>
554 /// <param name="wordLen"></param>
555 /// <returns></returns>
556 public virtual bool Stem(char[] wordBuffer, int offset, int wordLen)
558 Reset();
559 if (b.Length < wordLen)
561 char[] new_b = new char[wordLen + EXTRA];
562 b = new_b;
564 for (int j=0; j<wordLen; j++)
565 b[j] = wordBuffer[offset+j];
566 i = wordLen;
567 return Stem(0);
570 /// <summary>
571 /// Stem a word contained in a leading portion of a char[] array.
572 /// Returns true if the stemming process resulted in a word different
573 /// from the input. You can retrieve the result with
574 /// GetResultLength()/GetResultBuffer() or ToString().
575 /// </summary>
576 /// <param name="word"></param>
577 /// <param name="wordLen"></param>
578 /// <returns></returns>
579 public virtual bool Stem(char[] word, int wordLen)
581 return Stem(word, 0, wordLen);
584 /// Stem the word placed into the Stemmer buffer through calls to Add().
585 /// Returns true if the stemming process resulted in a word different
586 /// from the input. You can retrieve the result with
587 /// GetResultLength()/GetResultBuffer() or ToString().
588 public virtual bool Stem()
590 return Stem(0);
593 public virtual bool Stem(int i0)
595 k = i - 1;
596 k0 = i0;
597 if (k > k0+1)
599 Step1(); Step2(); Step3(); Step4(); Step5(); Step6();
601 // Also, a word is considered dirty if we lopped off letters
602 // Thanks to Ifigenia Vairelles for pointing this out.
603 if (i != k+1)
604 dirty = true;
605 i = k+1;
606 return dirty;
609 /// <summary>
610 /// Test program for demonstrating the Stemmer. It reads a file and
611 /// stems each word, writing the result to standard out.
612 /// Usage: Stemmer file-name
613 /// </summary>
614 /// <param name="args"></param>
615 public static void Main(String[] args)
617 PorterStemmer s = new PorterStemmer();
619 for (int i = 0; i < args.Length; i++)
621 try
623 // fix: 1.3.2.2 - FileAccess.Read
624 FileStream _in = new FileStream(args[i], FileMode.Open, FileAccess.Read);
625 byte[] buffer = new byte[1024];
626 int bufferLen, offset, ch;
628 bufferLen = _in.Read(buffer, 0, buffer.Length);
629 offset = 0;
630 s.Reset();
632 while(true)
634 if (offset < bufferLen)
635 ch = buffer[offset++];
636 else
638 bufferLen = _in.Read(buffer, 0, buffer.Length);
639 offset = 0;
640 if (bufferLen < 0)
641 ch = -1;
642 else
643 ch = buffer[offset++];
646 if (Char.IsLetter((char) ch))
648 s.Add(Char.ToLower((char) ch));
650 else
652 s.Stem();
653 Console.Write(s.ToString());
654 s.Reset();
655 if (ch < 0)
656 break;
657 else
659 Console.Write((char) ch);
664 _in.Close();
666 catch (IOException)
668 Console.WriteLine("error reading " + args[i]);