beagled/Lucene.Net/Analysis/CharTokenizer.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 using System;
  17 namespace Lucene.Net.Analysis
  18 {
  19
  20         /// <summary>An abstract base class for simple, character-oriented tokenizers.</summary>
  21         public abstract class CharTokenizer : Tokenizer
  22         {
  23                 public CharTokenizer(System.IO.TextReader input) : base(input)
  24                 {
  25                 }
  26
  27                 private int offset = 0, bufferIndex = 0, dataLen = 0;
  28                 private const int MAX_WORD_LEN = 255;
  29                 private const int IO_BUFFER_SIZE = 1024;
  30                 private char[] buffer = new char[MAX_WORD_LEN];
  31                 private char[] ioBuffer = new char[IO_BUFFER_SIZE];
  32
  33                 /// <summary>Returns true iff a character should be included in a token.  This
  34                 /// tokenizer generates as tokens adjacent sequences of characters which
  35                 /// satisfy this predicate.  Characters for which this is false are used to
  36                 /// define token boundaries and are not included in tokens.
  37                 /// </summary>
  38                 protected internal abstract bool IsTokenChar(char c);
  39
  40                 /// <summary>Called on each token character to normalize it before it is added to the
  41                 /// token.  The default implementation does nothing.  Subclasses may use this
  42                 /// to, e.g., lowercase tokens.
  43                 /// </summary>
  44                 protected internal virtual char Normalize(char c)
  45                 {
  46                         return c;
  47                 }
  48
  49                 /// <summary>Returns the next token in the stream, or null at EOS. </summary>
  50                 public override Token Next()
  51                 {
  52                         int length = 0;
  53                         int start = offset;
  54                         while (true)
  55                         {
  56                                 char c;
  57
  58                                 offset++;
  59                                 if (bufferIndex >= dataLen)
  60                                 {
  61                                         dataLen = input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
  62                                         bufferIndex = 0;
  63                                 }
  64                                 ;
  65                                 if (dataLen <= 0)
  66                                 {
  67                                         if (length > 0)
  68                                                 break;
  69                                         else
  70                                                 return null;
  71                                 }
  72                                 else
  73                                         c = ioBuffer[bufferIndex++];
  74
  75                                 if (IsTokenChar(c))
  76                                 {
  77                                         // if it's a token char
  78
  79                                         if (length == 0)
  80                                         // start of token
  81                                                 start = offset - 1;
  82
  83                                         buffer[length++] = Normalize(c); // buffer it, normalized
  84
  85                                         if (length == MAX_WORD_LEN)
  86                                         // buffer overflow!
  87                                                 break;
  88                                 }
  89                                 else if (length > 0)
  90                                 // at non-Letter w/ chars
  91                                         break; // return 'em
  92                         }
  93
  94                         return new Token(new System.String(buffer, 0, length), start, start + length);
  95                 }
  96         }
  97 }