Lucene.Net/Analysis/Standard/StandardTokenizer.jj

   1 /* ====================================================================
   2  * The Apache Software License, Version 1.1
   3  *
   4  * Copyright (c) 2001 The Apache Software Foundation.  All rights
   5  * reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  *
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in
  16  *    the documentation and/or other materials provided with the
  17  *    distribution.
  18  *
  19  * 3. The end-user documentation included with the redistribution,
  20  *    if any, must include the following acknowledgment:
  21  *       "This product includes software developed by the
  22  *        Apache Software Foundation (http://www.apache.org/)."
  23  *    Alternately, this acknowledgment may appear in the software itself,
  24  *    if and wherever such third-party acknowledgments normally appear.
  25  *
  26  * 4. The names "Apache" and "Apache Software Foundation" and
  27  *    "Apache Lucene" must not be used to endorse or promote products
  28  *    derived from this software without prior written permission. For
  29  *    written permission, please contact apache@apache.org.
  30  *
  31  * 5. Products derived from this software may not be called "Apache",
  32  *    "Apache Lucene", nor may "Apache" appear in their name, without
  33  *    prior written permission of the Apache Software Foundation.
  34  *
  35  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  36  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  37  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  38  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  41  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  42  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  43  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  44  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  45  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  46  * SUCH DAMAGE.
  47  * ====================================================================
  48  *
  49  * This software consists of voluntary contributions made by many
  50  * individuals on behalf of the Apache Software Foundation.  For more
  51  * information on the Apache Software Foundation, please see
  52  * <http://www.apache.org/>.
  53  */
  54
   55 options {                                         // JavaCC generator settings
   56   STATIC = false;                                 // generate instance (non-static) members
   57 //IGNORE_CASE = true;
   58 //BUILD_PARSER = false;
   59   UNICODE_INPUT = true;                           // token manager handles Unicode input
   60   USER_CHAR_STREAM = true;                        // read from a caller-supplied CharStream (FastCharStream in the constructor)
   61   OPTIMIZE_TOKEN_MANAGER = true;                  // enable JavaCC's token manager optimizations
   62 //DEBUG_TOKEN_MANAGER = true;
   63 }
   64 PARSER_BEGIN(StandardTokenizer)
   65 // The Java code between PARSER_BEGIN and PARSER_END is the hand-written part of the generated class.
   66 package org.apache.lucene.analysis.standard;
   67
   68 import java.io.*;
   69
   70 /** A grammar-based tokenizer constructed with JavaCC.
   71  * <p>Recognized token kinds: ALPHANUM, APOSTROPHE, ACRONYM, COMPANY, EMAIL, HOST, NUM and SIGRAM.
   72  * <p> This should be a good tokenizer for most European-language documents.
   73  *
   74  * <p>Many applications have specific tokenizer needs.  If this tokenizer does
   75  * not suit your application, please consider copying this source code
   76  * directory to your project and maintaining your own grammar-based tokenizer.
   77  */
   78 public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {
   79
   80   /** Constructs a tokenizer for this Reader, reading it through a FastCharStream. */
   81   public StandardTokenizer(Reader reader) {
   82     this(new FastCharStream(reader));   // constructor not declared in this grammar; presumably the JavaCC-generated CharStream one
   83     this.input = reader;                // keep the raw Reader; `input` is not declared here, presumably inherited from Tokenizer
   84   }
   85 }
   86
   87 PARSER_END(StandardTokenizer)
  88
   89 TOKEN : {                                         // token patterns
   90   // NOTE: JavaCC takes the longest match; on a tie the pattern declared first wins, so declaration order matters.
   91   // basic word: a sequence of digits & letters
   92   <ALPHANUM: (<LETTER>|<DIGIT>)+ >
   93
   94   // internal apostrophes: O'Reilly, you're, O'Reilly's
   95   // use a post-filter to remove possessives
   96 | <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
   97
   98   // acronyms: U.S.A., I.B.M., etc.
   99   // use a post-filter to remove dots
  100 | <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
  101
  102   // company names like AT&T and Excite@Home.
  103 | <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
  104
  105   // email addresses
  106 | <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >
  107
  108   // hostname
  109 | <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
  110
  111   // floating point, serial, model numbers, ip addresses, etc.
  112   // every other segment must have at least one digit
  113 | <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
  114        | <HAS_DIGIT> <P> <ALPHANUM>
  115        | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
  116        | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
  117        | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
  118        | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
  119         )
  120   >
  121 | <#P: ("_"|"-"|"/"|"."|",") >                    // '#' = private helper, never emitted as a token; P separates NUM segments
  122 | <#HAS_DIGIT:                                    // at least one digit
  123     (<LETTER>|<DIGIT>)*
  124     <DIGIT>
  125     (<LETTER>|<DIGIT>)*
  126   >
  127   // a run of one or more consecutive CJK characters is emitted as a single SIGRAM token
  128 | < SIGRAM: (<CJK>)+ >
  129 | < #ALPHA: (<LETTER>)+>                          // one or more LETTER characters
  130 | < #LETTER:                                      // unicode letters
  131       [
  132        "\u0041"-"\u005a",                         // A-Z
  133        "\u0061"-"\u007a",                         // a-z
  134        "\u00c0"-"\u00d6",                         // Latin-1 letters; the gap skips U+00D7 (multiplication sign)
  135        "\u00d8"-"\u00f6",                         // Latin-1 letters; the gap skips U+00F7 (division sign)
  136        "\u00f8"-"\u00ff",
  137        "\u0100"-"\u1fff"                          // NOTE(review): this range also contains every non-ASCII DIGIT range below
  138       ]
  139   >
  140 | < #CJK:             // non-alphabets
  141       [
  142        "\u3040"-"\u318f",                         // Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo
  143        "\u3300"-"\u337f",                         // part of CJK Compatibility
  144        "\u3400"-"\u3d2d",                         // part of CJK Unified Ideographs Extension A
  145        "\u4e00"-"\u9fff",                         // CJK Unified Ideographs
  146        "\uf900"-"\ufaff"                          // CJK Compatibility Ideographs
  147       ]
  148   >
  149 | < #DIGIT:                                       // unicode digits
  150       [
  151        "\u0030"-"\u0039",                         // ASCII 0-9
  152        "\u0660"-"\u0669",                         // Arabic-Indic
  153        "\u06f0"-"\u06f9",                         // Extended Arabic-Indic
  154        "\u0966"-"\u096f",                         // Devanagari
  155        "\u09e6"-"\u09ef",                         // Bengali
  156        "\u0a66"-"\u0a6f",                         // Gurmukhi
  157        "\u0ae6"-"\u0aef",                         // Gujarati
  158        "\u0b66"-"\u0b6f",                         // Oriya
  159        "\u0be7"-"\u0bef",                         // Tamil (range starts at digit one)
  160        "\u0c66"-"\u0c6f",                         // Telugu
  161        "\u0ce6"-"\u0cef",                         // Kannada
  162        "\u0d66"-"\u0d6f",                         // Malayalam
  163        "\u0e50"-"\u0e59",                         // Thai
  164        "\u0ed0"-"\u0ed9",                         // Lao
  165        "\u1040"-"\u1049"                          // Myanmar
  166       ]
  167   >
  168 }
 169
  170 SKIP : {                                          // skip unrecognized chars
  171  <NOISE: ~[] >                                    // ~[] matches any single character; declared last, so it loses ties to the TOKEN patterns
  172 }
 173
  174 /** Returns the next token in the stream, or null at EOS.
  175  * <p>The returned token's type is set to an element of {@link
  176  * StandardTokenizerConstants#tokenImage}.
  177  */
  178 org.apache.lucene.analysis.Token next() throws IOException :
  179 {
  180   Token token = null;                   // unqualified Token: presumably the JavaCC-generated class, not the Lucene Token returned
  181 }
  182 {
  183   ( token = <ALPHANUM> |
  184     token = <APOSTROPHE> |
  185     token = <ACRONYM> |
  186     token = <COMPANY> |
  187     token = <EMAIL> |
  188     token = <HOST> |
  189     token = <NUM> |
  190     token = <SIGRAM> |
  191     token = <EOF>                       // end of input is matched like a token so it can be mapped to null below
  192    )
  193     {
  194       if (token.kind == EOF) {
  195         return null;
  196       } else {
  197         return
  198           new org.apache.lucene.analysis.Token(token.image,
  199                                         token.beginColumn,token.endColumn,  // NOTE(review): columns passed as token positions; presumably FastCharStream reports offsets here - confirm
  200                                         tokenImage[token.kind]);            // type string looked up by token kind
  201       }
  202     }
  203 }