beagled/Lucene.Net/Analysis/CharTokenizer.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 using System;
  18
  19 namespace Lucene.Net.Analysis
  20 {
  21
  22         /// <summary>An abstract base class for simple, character-oriented tokenizers.</summary>
  23         public abstract class CharTokenizer : Tokenizer
  24         {
  25                 public CharTokenizer(System.IO.TextReader input) : base(input)
  26                 {
  27                 }
  28
  29                 private int offset = 0, bufferIndex = 0, dataLen = 0;
  30                 private const int MAX_WORD_LEN = 255;
  31                 private const int IO_BUFFER_SIZE = 1024;
  32                 private char[] buffer = new char[MAX_WORD_LEN];
  33                 private char[] ioBuffer = new char[IO_BUFFER_SIZE];
  34
  35                 /// <summary>Returns true iff a character should be included in a token.  This
  36                 /// tokenizer generates as tokens adjacent sequences of characters which
  37                 /// satisfy this predicate.  Characters for which this is false are used to
  38                 /// define token boundaries and are not included in tokens.
  39                 /// </summary>
  40                 protected internal abstract bool IsTokenChar(char c);
  41
  42                 /// <summary>Called on each token character to normalize it before it is added to the
  43                 /// token.  The default implementation does nothing.  Subclasses may use this
  44                 /// to, e.g., lowercase tokens.
  45                 /// </summary>
  46                 protected internal virtual char Normalize(char c)
  47                 {
  48                         return c;
  49                 }
  50
  51                 /// <summary>Returns the next token in the stream, or null at EOS. </summary>
  52                 public override Token Next()
  53                 {
  54                         int length = 0;
  55                         int start = offset;
  56                         while (true)
  57                         {
  58                                 char c;
  59
  60                                 offset++;
  61                                 if (bufferIndex >= dataLen)
  62                                 {
  63                                         dataLen = input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
  64                                         bufferIndex = 0;
  65                                 }
  66                                 ;
  67                                 if (dataLen <= 0)
  68                                 {
  69                                         if (length > 0)
  70                                                 break;
  71                                         else
  72                                                 return null;
  73                                 }
  74                                 else
  75                                         c = ioBuffer[bufferIndex++];
  76
  77                                 if (IsTokenChar(c))
  78                                 {
  79                                         // if it's a token char
  80
  81                                         if (length == 0)
  82                                         // start of token
  83                                                 start = offset - 1;
  84
  85                                         buffer[length++] = Normalize(c); // buffer it, normalized
  86
  87                                         if (length == MAX_WORD_LEN)
  88                                         // buffer overflow!
  89                                                 break;
  90                                 }
  91                                 else if (length > 0)
  92                                 // at non-Letter w/ chars
  93                                         break; // return 'em
  94                         }
  95
  96                         return new Token(new System.String(buffer, 0, length), start, start + length);
  97                 }
  98         }
  99 }