4 namespace Lucene
.Net
.Analysis
6 /* ====================================================================
7 * The Apache Software License, Version 1.1
9 * Copyright (c) 2001 The Apache Software Foundation. All rights
12 * Redistribution and use _in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions _in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer _in
21 * the documentation and/or other materials provided with the
24 * 3. The end-user documentation included with the redistribution,
25 * if any, must include the following acknowledgment:
26 * "This product includes software developed by the
27 * Apache Software Foundation (http://www.apache.org/)."
28 * Alternately, this acknowledgment may appear _in the software itself,
29 * if and wherever such third-party acknowledgments normally appear.
31 * 4. The names "Apache" and "Apache Software Foundation" and
32 * "Apache Lucene" must not be used to endorse or promote products
33 * derived from this software without prior written permission. For
34 * written permission, please contact apache@apache.org.
36 * 5. Products derived from this software may not be called "Apache",
37 * "Apache Lucene", nor may "Apache" appear _in their name, without
38 * prior written permission of the Apache Software Foundation.
40 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
44 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * ====================================================================
54 * This software consists of voluntary contributions made by many
55 * individuals on behalf of the Apache Software Foundation. For more
56 * information on the Apache Software Foundation, please see
57 * <http://www.apache.org/>.
61 Porter stemmer in Java. The original paper is in
63 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
66 See also http://www.muscat.com/~martin/stem.html
68 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
69 The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
70 is then outside the bounds of b.
74 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
75 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
76 b[j] is then outside the bounds of b.
80 [ This version is derived from Release 3, modified by Brian Goetz to
81 optimize for fewer object creations. ]
86 /// Stemmer, implementing the Porter Stemming Algorithm
88 /// The Stemmer class transforms a word into its root form. The input
89 /// word can be provided a character at a time (by calling Add()), or at once
90 /// by calling one of the various Stem(something) methods.
95 private int i
, /* offset into b */
/// <summary>Unit of size (in chars) whereby b is increased when it runs out of room.</summary>
private const int INC = 50;

/// <summary>Slack added to i when testing b's capacity, and to wordLen when sizing a new buffer.</summary>
private const int EXTRA = 1;

/// <summary>Cleared by Reset(). A bool field starts out false, so no explicit initializer is needed.</summary>
private bool dirty;
101 public PorterStemmer()
108 /// Resets the stemmer so it can stem another word. If you invoke
109 /// the stemmer by calling Add(char) and then Stem(), you must call Reset()
110 /// before starting another word.
public void Reset()
{
    // Forget any change recorded for the previous word ...
    dirty = false;
    // ... and rewind the write offset so the next Add() starts at the front of b.
    i = 0;
}
115 /// Add a character to the word being stemmed. When you are finished
116 /// adding characters, you can call Stem(void) to process the word.
118 /// <param name="ch"></param>
119 public void Add(char ch
)
121 if (b
.Length
<= i
+ EXTRA
)
123 char[] new_b
= new char[b
.Length
+INC
];
124 for (int c
= 0; c
< b
.Length
; c
++)
132 /// After a word has been stemmed, it can be retrieved by ToString(),
133 /// or a reference to the internal buffer can be retrieved by GetResultBuffer
134 /// and GetResultLength (which is generally more efficient.)
136 /// <returns></returns>
public override String ToString()
{
    // Only the first i characters of b belong to the current word;
    // anything beyond that is spare capacity and must not be copied.
    String word = new String(b, 0, i);
    return word;
}
140 /// Returns the length of the word resulting from the stemming process.
142 /// <returns></returns>
public virtual int GetResultLength()
{
    // i is the number of valid characters currently held in b.
    int resultLength = i;
    return resultLength;
}
146 /// Returns a reference to a character buffer containing the results of
147 /// the stemming process. You also need to consult GetResultLength()
148 /// to determine the length of the result.
150 /// <returns></returns>
public char[] GetResultBuffer()
{
    // Hands out the internal array itself (no copy is made), so callers
    // must pair it with GetResultLength() to know how much of it is valid.
    char[] buffer = b;
    return buffer;
}
154 /// Cons(i) is true <=> b[i] is a consonant.
156 /// <param name="i"></param>
157 /// <returns></returns>
158 private bool Cons(int i
)
173 return (i
==k0
) ? true : !Cons(i
-1);
180 /// M() measures the number of consonant sequences between k0 and j. if c is
181 /// a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
183 /// <c><v> gives 0
184 /// <c>vc<v> gives 1
185 /// <c>vcvc<v> gives 2
186 /// <c>vcvcvc<v> gives 3
189 /// <returns></returns>
228 /// Vowelinstem() is true <=> k0,...j contains a vowel
230 /// <returns></returns>
231 private bool Vowelinstem()
234 for (i
= k0
; i
<= j
; i
++)
241 /// Doublec(j) is true <=> j,(j-1) contain a double consonant.
243 /// <param name="j"></param>
244 /// <returns></returns>
245 private bool Doublec(int j
)
255 /// Cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
256 /// and also if the second c is not w,x or y. this is used when trying to
257 /// restore an e at the end of a short word. e.g.
259 /// Cav(e), Lov(e), Hop(e), Crim(e), but
262 /// <param name="i"></param>
263 /// <returns></returns>
264 private bool Cvc(int i
)
266 if (i
< k0
+2 || !Cons(i
) || Cons(i
-1) || !Cons(i
-2))
271 if (ch
== 'w' || ch
== 'x' || ch
== 'y') return false;
276 private bool Ends(String s
)
282 for (int i
= 0; i
< l
; i
++)
290 /// Setto(s) sets (j+1),...k to the characters in the string s, readjusting k.
292 /// <param name="s"></param>
293 internal virtual void Setto(String s
)
297 for (int i
= 0; i
< l
; i
++)
304 /// R(s) is used further down.
306 /// <param name="s"></param>
internal virtual void R(String s)
{
    // Leave the word untouched unless the measure M() is positive.
    if (M() <= 0)
    {
        return;
    }

    // Measure is positive: let Setto install s as the new ending.
    Setto(s);
}
310 /// Step1() gets rid of plurals and -ed or -ing. e.g.
311 /// caresses -> caress
319 /// disabled -> disable
333 if (Ends("sses")) k
-= 2;
334 else if (Ends("ies")) Setto("i");
335 else if (b
[k
-1] != 's') k
--;
342 else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
345 if (Ends("at")) Setto("ate");
346 else if (Ends("bl")) Setto("ble");
347 else if (Ends("iz")) Setto("ize");
351 if (ch
== 'l' || ch
== 's' || ch
== 'z')
354 else if (M() == 1 && Cvc(k
))
360 /// Step2() turns terminal y to i when there is another vowel in the stem.
364 if (Ends("y") && Vowelinstem())
373 /// Step3() maps double suffices to single ones. so -ization ( = -ize plus
374 /// -ation) maps to -ize etc. note that the string before the suffix must give
379 if (k
== k0
) return; /* For Bug 1 */
383 if (Ends("ational")) { R("ate"); break; }
384 if (Ends("tional")) { R("tion"); break; }
387 if (Ends("enci")) { R("ence"); break; }
388 if (Ends("anci")) { R("ance"); break; }
391 if (Ends("izer")) { R("ize"); break; }
394 if (Ends("bli")) { R("ble"); break; }
395 if (Ends("alli")) { R("al"); break; }
396 if (Ends("entli")) { R("ent"); break; }
397 if (Ends("eli")) { R("e"); break; }
398 if (Ends("ousli")) { R("ous"); break; }
401 if (Ends("ization")) { R("ize"); break; }
402 if (Ends("ation")) { R("ate"); break; }
403 if (Ends("ator")) { R("ate"); break; }
406 if (Ends("alism")) { R("al"); break; }
407 if (Ends("iveness")) { R("ive"); break; }
408 if (Ends("fulness")) { R("ful"); break; }
409 if (Ends("ousness")) { R("ous"); break; }
412 if (Ends("aliti")) { R("al"); break; }
413 if (Ends("iviti")) { R("ive"); break; }
414 if (Ends("biliti")) { R("ble"); break; }
417 if (Ends("logi")) { R("log"); break; }
423 /// Step4() deals with -ic-, -full, -ness etc. similar strategy to Step3.
430 if (Ends("icate")) { R("ic"); break; }
431 if (Ends("ative")) { R(""); break; }
432 if (Ends("alize")) { R("al"); break; }
435 if (Ends("iciti")) { R("ic"); break; }
438 if (Ends("ical")) { R("ic"); break; }
439 if (Ends("ful")) { R(""); break; }
442 if (Ends("ness")) { R(""); break; }
448 /// Step5() takes off -ant, -ence etc., in context <c>vcvc<v>.
452 if (k
== k0
) return; /* for Bug 1 */
456 if (Ends("al")) break;
459 if (Ends("ance")) break;
460 if (Ends("ence")) break;
463 if (Ends("er")) break; return;
465 if (Ends("ic")) break; return;
467 if (Ends("able")) break;
468 if (Ends("ible")) break; return;
470 if (Ends("ant")) break;
471 if (Ends("ement")) break;
472 if (Ends("ment")) break;
473 /* element etc. not stripped before the m */
474 if (Ends("ent")) break;
477 if (Ends("ion") && j
>= 0 && (b
[j
] == 's' || b
[j
] == 't')) break;
478 /* j >= 0 fixes Bug 2 */
479 if (Ends("ou")) break;
481 /* takes care of -ous */
483 if (Ends("ism")) break;
486 if (Ends("ate")) break;
487 if (Ends("iti")) break;
490 if (Ends("ous")) break;
493 if (Ends("ive")) break;
496 if (Ends("ize")) break;
506 /// Step6() removes a final -e if M() > 1.
514 if (a
> 1 || a
== 1 && !Cvc(k
-1))
517 if (b
[k
] == 'l' && Doublec(k
) && M() > 1)
522 /// Stem a word provided as a String. Returns the result as a String.
524 /// <param name="s"></param>
525 /// <returns></returns>
526 public String
Stem(String s
)
528 if (Stem(s
.ToCharArray(), s
.Length
))
535 /// Stem a word contained in a char[]. Returns true if the stemming process
536 /// resulted in a word different from the input. You can retrieve the
537 /// result with GetResultLength()/GetResultBuffer() or ToString().
539 /// <param name="word"></param>
540 /// <returns></returns>
541 public virtual bool Stem(char[] word
)
543 return Stem(word
, word
.Length
);
547 /// Stem a word contained in a portion of a char[] array. Returns
548 /// true if the stemming process resulted in a word different from
549 /// the input. You can retrieve the result with
550 /// GetResultLength()/GetResultBuffer() or ToString().
552 /// <param name="wordBuffer"></param>
553 /// <param name="offset"></param>
554 /// <param name="wordLen"></param>
555 /// <returns></returns>
556 public virtual bool Stem(char[] wordBuffer
, int offset
, int wordLen
)
559 if (b
.Length
< wordLen
)
561 char[] new_b
= new char[wordLen
+ EXTRA
];
564 for (int j
=0; j
<wordLen
; j
++)
565 b
[j
] = wordBuffer
[offset
+j
];
571 /// Stem a word contained in a leading portion of a char[] array.
572 /// Returns true if the stemming process resulted in a word different
573 /// from the input. You can retrieve the result with
574 /// GetResultLength()/GetResultBuffer() or ToString().
576 /// <param name="word"></param>
577 /// <param name="wordLen"></param>
578 /// <returns></returns>
579 public virtual bool Stem(char[] word
, int wordLen
)
581 return Stem(word
, 0, wordLen
);
584 /// Stem the word placed into the Stemmer buffer through calls to Add().
585 /// Returns true if the stemming process resulted in a word different
586 /// from the input. You can retrieve the result with
587 /// GetResultLength()/GetResultBuffer() or ToString().
588 public virtual bool Stem()
593 public virtual bool Stem(int i0
)
599 Step1(); Step2(); Step3(); Step4(); Step5(); Step6();
601 // Also, a word is considered dirty if we lopped off letters
602 // Thanks to Ifigenia Vairelles for pointing this out.
610 /// Test program for demonstrating the Stemmer. It reads a file and
611 /// stems each word, writing the result to standard out.
612 /// Usage: Stemmer file-name
614 /// <param name="args"></param>
615 public static void Main(String
[] args
)
617 PorterStemmer s
= new PorterStemmer();
619 for (int i
= 0; i
< args
.Length
; i
++)
623 // fix: 1.3.2.2 - FileAccess.Read
624 FileStream _in
= new FileStream(args
[i
], FileMode
.Open
, FileAccess
.Read
);
625 byte[] buffer
= new byte[1024];
626 int bufferLen
, offset
, ch
;
628 bufferLen
= _in
.Read(buffer
, 0, buffer
.Length
);
634 if (offset
< bufferLen
)
635 ch
= buffer
[offset
++];
638 bufferLen
= _in
.Read(buffer
, 0, buffer
.Length
);
643 ch
= buffer
[offset
++];
646 if (Char
.IsLetter((char) ch
))
648 s
.Add(Char
.ToLower((char) ch
));
653 Console
.Write(s
.ToString());
659 Console
.Write((char) ch
);
668 Console
.WriteLine("error reading " + args
[i
]);