beagled/Lucene.Net/Analysis/LetterTokenizer.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 using System;
  17 namespace Lucene.Net.Analysis
  18 {
  19
  20         /// <summary>A LetterTokenizer is a tokenizer that divides text at non-letters.  That's
  21         /// to say, it defines tokens as maximal strings of adjacent letters, as defined
  22         /// by java.lang.Character.isLetter() predicate.
  23         /// Note: this does a decent job for most European languages, but does a terrible
  24         /// job for some Asian languages, where words are not separated by spaces.
  25         /// </summary>
  26
  27         public class LetterTokenizer : CharTokenizer
  28         {
  29                 /// <summary>Construct a new LetterTokenizer. </summary>
  30                 public LetterTokenizer(System.IO.TextReader in_Renamed) : base(in_Renamed)
  31                 {
  32                 }
  33
  34                 /// <summary>Collects only characters which satisfy
  35                 /// {@link Character#isLetter(char)}.
  36                 /// </summary>
  37                 protected internal override bool IsTokenChar(char c)
  38                 {
  39                         return System.Char.IsLetter(c);
  40                 }
  41         }
  42 }