2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 Porter stemmer in Java. The original paper is in
21 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
24 See also http://www.tartarus.org/~martin/PorterStemmer/index.html
26 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
27 The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
28 is then outside the bounds of b.
32 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
33 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
34 b[j] is then outside the bounds of b.
38 [ This version is derived from Release 3, modified by Brian Goetz to
39 optimize for fewer object creations. ]
44 namespace Lucene
.Net
.Analysis
48 /// Stemmer, implementing the Porter Stemming Algorithm
50 /// The Stemmer class transforms a word into its root form. The input
51 /// word can be provided a character at a time (by calling Add()), or at once
52 /// by calling one of the various stem(something) methods.
58 private int i
, j
, k
, k0
;
59 private bool dirty
= false;
60 private const int INC
= 50; /* unit of size whereby b is increased */
61 private const int EXTRA
= 1;
63 public PorterStemmer()
69 /// <summary> Reset() resets the stemmer so it can stem another word. If you invoke
70 /// the stemmer by calling Add(char) and then Stem(), you must call Reset()
71 /// before starting another word.
73 public virtual void Reset()
78 /// <summary> Add a character to the word being stemmed. When you are finished
79 /// adding characters, you can call Stem(void) to process the word.
81 public virtual void Add(char ch
)
83 if (b
.Length
<= i
+ EXTRA
)
85 char[] new_b
= new char[b
.Length
+ INC
];
86 for (int c
= 0; c
< b
.Length
; c
++)
93 /// <summary> After a word has been stemmed, it can be retrieved by ToString(),
94 /// or a reference to the internal buffer can be retrieved by GetResultBuffer()
95 /// and GetResultLength() (which is generally more efficient).
97 public override System
.String
ToString()
99 return new System
.String(b
, 0, i
);
102 /// <summary> Returns the length of the word resulting from the stemming process.</summary>
103 public virtual int GetResultLength()
108 /// <summary> Returns a reference to a character buffer containing the results of
109 /// the stemming process. You also need to consult getResultLength()
110 /// to determine the length of the result.
112 public virtual char[] GetResultBuffer()
117 /* cons(i) is true <=> b[i] is a consonant. */
119 private bool Cons(int i
)
132 return (i
== k0
)?true:!Cons(i
- 1);
140 /* m() measures the number of consonant sequences between k0 and j. if c is
141 a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
188 /* vowelinstem() is true <=> k0,...j contains a vowel */
190 private bool Vowelinstem()
193 for (i
= k0
; i
<= j
; i
++)
199 /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
201 private bool Doublec(int j
)
205 if (b
[j
] != b
[j
- 1])
210 /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
211 and also if the second c is not w,x or y. this is used when trying to
212 restore an e at the end of a short word. e.g.
214 cav(e), lov(e), hop(e), crim(e), but
219 private bool Cvc(int i
)
221 if (i
< k0
+ 2 || !Cons(i
) || Cons(i
- 1) || !Cons(i
- 2))
226 if (ch
== 'w' || ch
== 'x' || ch
== 'y')
232 private bool Ends(System
.String s
)
238 for (int i
= 0; i
< l
; i
++)
239 if (b
[o
+ i
] != s
[i
])
245 /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
248 internal virtual void Setto(System
.String s
)
252 for (int i
= 0; i
< l
; i
++)
258 /* r(s) is used further down. */
260 internal virtual void R(System
.String s
)
266 /* step1() gets rid of plurals and -ed or -ing. e.g.
294 else if (Ends("ies"))
296 else if (b
[k
- 1] != 's')
304 else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
316 if (ch
== 'l' || ch
== 's' || ch
== 'z')
319 else if (M() == 1 && Cvc(k
))
324 /* step2() turns terminal y to i when there is another vowel in the stem. */
328 if (Ends("y") && Vowelinstem())
335 /* step3() maps double suffixes to single ones. so -ization ( = -ize plus
336 -ation) maps to -ize etc. note that the string before the suffix must give
342 return ; /* For Bug 1 */
456 /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
505 /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
510 return ; /* for Bug 1 */
547 /* element etc. not stripped before the m */
553 if (Ends("ion") && j
>= 0 && (b
[j
] == 's' || b
[j
] == 't'))
555 /* j >= 0 fixes Bug 2 */
559 /* takes care of -ous */
596 /* step6() removes a final -e if m() > 1. */
604 if (a
> 1 || a
== 1 && !Cvc(k
- 1))
607 if (b
[k
] == 'l' && Doublec(k
) && M() > 1)
612 /// <summary> Stem a word provided as a String. Returns the result as a String.</summary>
613 public virtual System
.String
Stem(System
.String s
)
615 if (Stem(s
.ToCharArray(), s
.Length
))
623 /// <summary>Stem a word contained in a char[]. Returns true if the stemming process
624 /// resulted in a word different from the input. You can retrieve the
625 /// result with getResultLength()/getResultBuffer() or toString().
627 public virtual bool Stem(char[] word
)
629 return Stem(word
, word
.Length
);
632 /// <summary>Stem a word contained in a portion of a char[] array. Returns
633 /// true if the stemming process resulted in a word different from
634 /// the input. You can retrieve the result with
635 /// getResultLength()/getResultBuffer() or toString().
637 public virtual bool Stem(char[] wordBuffer
, int offset
, int wordLen
)
640 if (b
.Length
< wordLen
)
642 char[] new_b
= new char[wordLen
+ EXTRA
];
645 for (int j
= 0; j
< wordLen
; j
++)
646 b
[j
] = wordBuffer
[offset
+ j
];
651 /// <summary>Stem a word contained in a leading portion of a char[] array.
652 /// Returns true if the stemming process resulted in a word different
653 /// from the input. You can retrieve the result with
654 /// getResultLength()/getResultBuffer() or toString().
656 public virtual bool Stem(char[] word
, int wordLen
)
658 return Stem(word
, 0, wordLen
);
661 /// <summary>Stem the word placed into the Stemmer buffer through calls to add().
662 /// Returns true if the stemming process resulted in a word different
663 /// from the input. You can retrieve the result with
664 /// getResultLength()/getResultBuffer() or toString().
666 public virtual bool Stem()
671 public virtual bool Stem(int i0
)
677 Step1(); Step2(); Step3(); Step4(); Step5(); Step6();
679 // Also, a word is considered dirty if we lopped off letters
680 // Thanks to Ifigenia Vairelles for pointing this out.
687 /// <summary>Test program for demonstrating the Stemmer. It reads a file and
688 /// stems each word, writing the result to standard out.
689 /// Usage: Stemmer file-name
692 public static void Main(System
.String
[] args
)
694 PorterStemmer s
= new PorterStemmer();
696 for (int i
= 0; i
< args
.Length
; i
++)
700 System
.IO
.BinaryReader in_Renamed
= new System
.IO
.BinaryReader(System
.IO
.File
.Open(args
[i
], System
.IO
.FileMode
.Open
, System
.IO
.FileAccess
.Read
));
701 byte[] buffer
= new byte[1024];
702 int bufferLen
, offset
, ch
;
704 bufferLen
= in_Renamed
.Read(buffer
, 0, buffer
.Length
);
710 if (offset
< bufferLen
)
711 ch
= buffer
[offset
++];
714 bufferLen
= in_Renamed
.Read(buffer
, 0, buffer
.Length
);
719 ch
= buffer
[offset
++];
722 if (System
.Char
.IsLetter((char) ch
))
724 s
.Add(System
.Char
.ToLower((char) ch
));
729 System
.Console
.Out
.Write(s
.ToString());
735 System
.Console
.Out
.Write((char) ch
);
742 catch (System
.IO
.IOException
)
744 System
.Console
.Out
.WriteLine("error reading " + args
[i
]);