beagled/Lucene.Net/Analysis/Standard/StandardTokenizer.jj

   1 /**
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 options {                         // JavaCC generator options for this grammar
  18   STATIC = false;                 // generate instance (non-static) members
  19 //IGNORE_CASE = true;
  20 //BUILD_PARSER = false;
  21   UNICODE_INPUT = true;           // NOTE(review): JavaCC docs describe this as ignored when USER_CHAR_STREAM is set -- confirm it is still needed
  22   USER_CHAR_STREAM = true;        // characters come from a caller-supplied char stream (the constructor passes a FastCharStream)
  23   OPTIMIZE_TOKEN_MANAGER = true;
  24 //DEBUG_TOKEN_MANAGER = true;
  25 }
  26 PARSER_BEGIN(StandardTokenizer)
  27
  28 package org.apache.lucene.analysis.standard;
  29
  30 import java.io.*;
  31
  32 /** A grammar-based tokenizer constructed with JavaCC.
  33  *
  34  * <p> This should be a good tokenizer for most European-language documents:
  35  *
  36  * <ul>
  37  *   <li>Splits words at punctuation characters, removing punctuation. However, a
  38  *     dot that's not followed by whitespace is considered part of a token.
  39  *   <li>Splits words at hyphens, unless there's a number in the token, in which case
  40  *     the whole token is interpreted as a product number and is not split.
  41  *   <li>Recognizes email addresses and internet hostnames as one token.
  42  * </ul>
  43  *
  44  * <p>Many applications have specific tokenizer needs.  If this tokenizer does
  45  * not suit your application, please consider copying this source code
  46  * directory to your project and maintaining your own grammar-based tokenizer.
  47  */
  48 public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {
  49
  50   /** Constructs a tokenizer for this Reader: the reader is wrapped in a FastCharStream and also stored in <code>input</code>. */
  51   public StandardTokenizer(Reader reader) {
  52     this(new FastCharStream(reader));   // delegates to a constructor not declared in this file (presumably JavaCC-generated -- confirm)
  53     this.input = reader;                // input is not declared in this class; presumably inherited from Tokenizer -- confirm
  54   }
  55 }
  56
  57 PARSER_END(StandardTokenizer)
  58
  59 TOKEN : {                                         // token patterns; names prefixed with # are private helpers, not tokens
  60
  61   // basic word: a sequence of digits, letters & Korean characters
  62   <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >
  63
  64   // internal apostrophes: O'Reilly, you're, O'Reilly's
  65   // use a post-filter to remove possessives
  66 | <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
  67
  68   // acronyms: U.S.A., I.B.M., etc.
  69   // use a post-filter to remove dots
  70 | <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
  71
  72   // company names like AT&T and Excite@Home.
  73 | <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
  74
  75   // email addresses
  76 | <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >
  77
  78   // hostname
  79 | <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
  80
  81   // floating point, serial, model numbers, ip addresses, etc.
  82   // every other segment must have at least one digit
  83 | <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
  84        | <HAS_DIGIT> <P> <ALPHANUM>
  85        | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
  86        | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
  87        | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
  88        | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
  89         )
  90   >
  91 | <#P: ("_"|"-"|"/"|"."|",") >                     // single separator character between NUM segments
  92 | <#HAS_DIGIT:                                    // at least one digit
  93     (<LETTER>|<DIGIT>)*
  94     <DIGIT>
  95     (<LETTER>|<DIGIT>)*
  96   >
  97
  98 | < #ALPHA: (<LETTER>)+>                          // one or more letters, no digits
  99 | < #LETTER:                                      // unicode letters
 100       [
 101        "\u0041"-"\u005a",
 102        "\u0061"-"\u007a",
 103        "\u00c0"-"\u00d6",
 104        "\u00d8"-"\u00f6",
 105        "\u00f8"-"\u00ff",
 106        "\u0100"-"\u1fff"
 107       ]
 108   >
 109 | < CJ:                                          // Chinese, Japanese; a single character per token
 110       [
 111        "\u3040"-"\u318f",
 112        "\u3300"-"\u337f",
 113        "\u3400"-"\u3d2d",
 114        "\u4e00"-"\u9fff",
 115        "\uf900"-"\ufaff"
 116       ]
 117   >
 118 | < KOREAN:                                          // Korean; also usable inside ALPHANUM
 119       [
 120        "\u1100"-"\u11f9",                            // this range also lies inside LETTER's "\u0100"-"\u1fff"
 121        "\uac00"-"\ud7af"
 122       ]
 123   >
 124 | < #DIGIT:                                       // unicode digits
 125       [
 126        "\u0030"-"\u0039",
 127        "\u0660"-"\u0669",
 128        "\u06f0"-"\u06f9",
 129        "\u0966"-"\u096f",
 130        "\u09e6"-"\u09ef",
 131        "\u0a66"-"\u0a6f",
 132        "\u0ae6"-"\u0aef",
 133        "\u0b66"-"\u0b6f",
 134        "\u0be7"-"\u0bef",
 135        "\u0c66"-"\u0c6f",
 136        "\u0ce6"-"\u0cef",
 137        "\u0d66"-"\u0d6f",
 138        "\u0e50"-"\u0e59",
 139        "\u0ed0"-"\u0ed9",
 140        "\u1040"-"\u1049"
 141       ]
 142   >
 143 }
 144
 145 SKIP : {                                          // skip unrecognized chars
 146  <NOISE: ~[] >                                    // ~[] matches any single character; declared after every TOKEN pattern
 147 }
 148
 149 /** Returns the next token in the stream, or null at EOS.
 150  * <p>The returned token's type is set to an element of {@link
 151  * StandardTokenizerConstants#tokenImage}.
 152  */
 153 org.apache.lucene.analysis.Token next() throws IOException :
 154 {
 155   Token token = null;                  // unqualified Token: the lexer's own token type (presumably JavaCC-generated), not the Lucene Token returned below
 156 }
 157 {
 158   ( token = <ALPHANUM> |               // accept exactly one token of any kind listed here
 159     token = <APOSTROPHE> |
 160     token = <ACRONYM> |
 161     token = <COMPANY> |
 162     token = <EMAIL> |
 163     token = <HOST> |
 164     token = <NUM> |
 165     token = <CJ> |                     // NOTE(review): KOREAN has no alternative here; Korean text is expected to arrive as ALPHANUM -- confirm
 166     token = <EOF>
 167    )
 168     {
 169       if (token.kind == EOF) {         // end of input is reported to the caller as null
 170         return null;
 171       } else {
 172         return
 173           new org.apache.lucene.analysis.Token(token.image,
 174                                         token.beginColumn,token.endColumn,   // NOTE(review): column values are passed as the token's start/end; presumably FastCharStream reports offsets as columns -- confirm
 175                                         tokenImage[token.kind]);             // type string looked up by the matched token's kind
 176       }
 177     }
 178 }