1 /* ====================================================================
2 * The Apache Software License, Version 1.1
4 * Copyright (c) 2001 The Apache Software Foundation. All rights
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
19 * 3. The end-user documentation included with the redistribution,
20 * if any, must include the following acknowledgment:
21 * "This product includes software developed by the
22 * Apache Software Foundation (http://www.apache.org/)."
23 * Alternately, this acknowledgment may appear in the software itself,
24 * if and wherever such third-party acknowledgments normally appear.
26 * 4. The names "Apache" and "Apache Software Foundation" and
27 * "Apache Lucene" must not be used to endorse or promote products
28 * derived from this software without prior written permission. For
29 * written permission, please contact apache@apache.org.
31 * 5. Products derived from this software may not be called "Apache",
32 * "Apache Lucene", nor may "Apache" appear in their name, without
33 * prior written permission of the Apache Software Foundation.
35 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
36 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
37 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
38 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
42 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
43 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
44 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
45 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47 * ====================================================================
49 * This software consists of voluntary contributions made by many
50 * individuals on behalf of the Apache Software Foundation. For more
51 * information on the Apache Software Foundation, please see
52 * <http://www.apache.org/>.
58 //BUILD_PARSER = false;
60 USER_CHAR_STREAM = true;
61 OPTIMIZE_TOKEN_MANAGER = true;
62 //DEBUG_TOKEN_MANAGER = true;
64 PARSER_BEGIN(StandardTokenizer)
66 package org.apache.lucene.analysis.standard;
70 /** A grammar-based tokenizer constructed with JavaCC.
72 * <p> This should be a good tokenizer for most European-language documents.
74 * <p>Many applications have specific tokenizer needs. If this tokenizer does
75 * not suit your application, please consider copying this source code
76 * directory to your project and maintaining your own grammar-based tokenizer.
78 public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {
80 /** Constructs a tokenizer for this Reader. */
81 public StandardTokenizer(Reader reader) {
82 this(new FastCharStream(reader));
87 PARSER_END(StandardTokenizer)
89 TOKEN : { // token patterns
91 // basic word: a sequence of digits & letters
92 <ALPHANUM: (<LETTER>|<DIGIT>)+ >
94 // internal apostrophes: O'Reilly, you're, O'Reilly's
95 // use a post-filter to remove possessives
96 | <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
98 // acronyms: U.S.A., I.B.M., etc.
99 // use a post-filter to remove dots
100 | <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
102 // company names like AT&T and Excite@Home.
103 | <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
106 | <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >
109 | <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
111 // floating point, serial, model numbers, ip addresses, etc.
112 // every other segment must have at least one digit
113 | <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
114 | <HAS_DIGIT> <P> <ALPHANUM>
115 | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
116 | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
117 | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
118 | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
121 | <#P: ("_"|"-"|"/"|"."|",") >
122 | <#HAS_DIGIT: // at least one digit
128 | < #ALPHA: (<LETTER>)+>
129 | < #LETTER: // unicode letters
139 | < CJK: // non-alphabets
148 | < #DIGIT: // unicode digits
169 SKIP : { // skip unrecognized chars
173 /** Returns the next token in the stream, or null at EOS.
174 * <p>The returned token's type is set to an element of {@link
175 * StandardTokenizerConstants#tokenImage}.
177 org.apache.lucene.analysis.Token next() throws IOException :
182 ( token = <ALPHANUM> |
183 token = <APOSTROPHE> |
193 if (token.kind == EOF) {
197 new org.apache.lucene.analysis.Token(token.image,
198 token.beginColumn,token.endColumn,
199 tokenImage[token.kind]);