2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
20 //BUILD_PARSER = false;
22 USER_CHAR_STREAM = true;
23 OPTIMIZE_TOKEN_MANAGER = true;
24 //DEBUG_TOKEN_MANAGER = true;
26 PARSER_BEGIN(StandardTokenizer)
28 package org.apache.lucene.analysis.standard;
32 /** A grammar-based tokenizer constructed with JavaCC.
34 * <p> This should be a good tokenizer for most European-language documents:
37 * <li>Splits words at punctuation characters, removing punctuation. However, a
38 * dot that's not followed by whitespace is considered part of a token.
39 * <li>Splits words at hyphens, unless there's a number in the token, in which case
40 * the whole token is interpreted as a product number and is not split.
41 * <li>Recognizes email addresses and internet hostnames as one token.
44 * <p>Many applications have specific tokenizer needs. If this tokenizer does
45 * not suit your application, please consider copying this source code
46 * directory to your project and maintaining your own grammar-based tokenizer.
48 public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {
50 /** Constructs a tokenizer for this Reader. */
51 public StandardTokenizer(Reader reader) {
52 this(new FastCharStream(reader));
57 PARSER_END(StandardTokenizer)
59 TOKEN : { // token patterns
61 // basic word: a sequence of letters, digits & <KOREAN> characters
62 <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >
64 // internal apostrophes: O'Reilly, you're, O'Reilly's
65 // use a post-filter to remove possessives
66 | <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
68 // acronyms: U.S.A., I.B.M., etc.
69 // use a post-filter to remove dots
70 | <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
72 // company names like AT&T and Excite@Home.
73 | <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
76 | <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >
79 | <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
81 // floating point, serial, model numbers, ip addresses, etc.
82 // every other segment must have at least one digit
83 | <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
84 | <HAS_DIGIT> <P> <ALPHANUM>
85 | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
86 | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
87 | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
88 | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
91 | <#P: ("_"|"-"|"/"|"."|",") >
92 | <#HAS_DIGIT: // at least one digit
98 | < #ALPHA: (<LETTER>)+>
99 | < #LETTER: // unicode letters
109 | < CJ: // Chinese, Japanese
118 | < KOREAN: // Korean
124 | < #DIGIT: // unicode digits
145 SKIP : { // skip unrecognized chars
149 /** Returns the next token in the stream, or null at EOS.
150 * <p>The returned token's type is set to an element of {@link
151 * StandardTokenizerConstants#tokenImage}.
153 org.apache.lucene.analysis.Token next() throws IOException :
158 ( token = <ALPHANUM> |
159 token = <APOSTROPHE> |
169 if (token.kind == EOF) {
173 new org.apache.lucene.analysis.Token(token.image,
174 token.beginColumn,token.endColumn,
175 tokenImage[token.kind]);