beagled/Lucene.Net/Analysis/LetterTokenizer.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 using System;
  18
  19 namespace Lucene.Net.Analysis
  20 {
  21
  22         /// <summary>A LetterTokenizer is a tokenizer that divides text at non-letters.  That's
  23         /// to say, it defines tokens as maximal strings of adjacent letters, as defined
  24         /// by java.lang.Character.isLetter() predicate.
  25         /// Note: this does a decent job for most European languages, but does a terrible
  26         /// job for some Asian languages, where words are not separated by spaces.
  27         /// </summary>
  28
  29         public class LetterTokenizer : CharTokenizer
  30         {
  31                 /// <summary>Construct a new LetterTokenizer. </summary>
  32                 public LetterTokenizer(System.IO.TextReader in_Renamed) : base(in_Renamed)
  33                 {
  34                 }
  35
  36                 /// <summary>Collects only characters which satisfy
  37                 /// {@link Character#isLetter(char)}.
  38                 /// </summary>
  39                 protected internal override bool IsTokenChar(char c)
  40                 {
  41                         return System.Char.IsLetter(c);
  42                 }
  43         }
  44 }