beagled/Lucene.Net/Analysis/Token.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 using System;
  18
  19 namespace Lucene.Net.Analysis
  20 {
  21
  22         /// <summary>A Token is an occurence of a term from the text of a field.  It consists of
  23         /// a term's text, the start and end offset of the term in the text of the field,
  24         /// and a type string.
  25         /// The start and end offsets permit applications to re-associate a token with
  26         /// its source text, e.g., to display highlighted query terms in a document
  27         /// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
  28         /// display, etc.
  29         /// The type is an interned string, assigned by a lexical analyzer
  30         /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
  31         /// belongs to.  For example an end of sentence marker token might be implemented
  32         /// with type "eos".  The default token type is "word".
  33         /// </summary>
  34
  35         public sealed class Token
  36         {
  37                 internal System.String termText; // the text of the term
  38                 internal int startOffset; // start in source text
  39                 internal int endOffset; // end in source text
  40                 internal System.String type = "word"; // lexical type
  41
  42                 private int positionIncrement = 1;
  43
  44                 /// <summary>Constructs a Token with the given term text, and start & end offsets.
  45                 /// The type defaults to "word."
  46                 /// </summary>
  47                 public Token(System.String text, int start, int end)
  48                 {
  49                         termText = text;
  50                         startOffset = start;
  51                         endOffset = end;
  52                 }
  53
  54                 /// <summary>Constructs a Token with the given text, start and end offsets, & type. </summary>
  55                 public Token(System.String text, int start, int end, System.String typ)
  56                 {
  57                         termText = text;
  58                         startOffset = start;
  59                         endOffset = end;
  60                         type = typ;
  61                 }
  62
  63                 /// <summary>Set the position increment.  This determines the position of this token
  64                 /// relative to the previous Token in a {@link TokenStream}, used in phrase
  65                 /// searching.
  66                 ///
  67                 /// <p>The default value is one.
  68                 ///
  69                 /// <p>Some common uses for this are:<ul>
  70                 ///
  71                 /// <li>Set it to zero to put multiple terms in the same position.  This is
  72                 /// useful if, e.g., a word has multiple stems.  Searches for phrases
  73                 /// including either stem will match.  In this case, all but the first stem's
  74                 /// increment should be set to zero: the increment of the first instance
  75                 /// should be one.  Repeating a token with an increment of zero can also be
  76                 /// used to boost the scores of matches on that token.
  77                 ///
  78                 /// <li>Set it to values greater than one to inhibit exact phrase matches.
  79                 /// If, for example, one does not want phrases to match across removed stop
  80                 /// words, then one could build a stop word filter that removes stop words and
  81                 /// also sets the increment to the number of stop words removed before each
  82                 /// non-stop word.  Then exact phrase queries will only match when the terms
  83                 /// occur with no intervening stop words.
  84                 ///
  85                 /// </ul>
  86                 /// </summary>
  87                 /// <seealso cref="Lucene.Net.index.TermPositions">
  88                 /// </seealso>
  89                 public void  SetPositionIncrement(int positionIncrement)
  90                 {
  91                         if (positionIncrement < 0)
  92                                 throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
  93                         this.positionIncrement = positionIncrement;
  94                 }
  95
  96                 /// <summary>Returns the position increment of this Token.</summary>
  97                 /// <seealso cref="setPositionIncrement">
  98                 /// </seealso>
  99                 public int GetPositionIncrement()
 100                 {
 101                         return positionIncrement;
 102                 }
 103
 104                 /// <summary>Returns the Token's term text. </summary>
 105                 public System.String TermText()
 106                 {
 107                         return termText;
 108                 }
 109
 110                 /// <summary>Returns this Token's starting offset, the position of the first character
 111                 /// corresponding to this token in the source text.
 112                 /// Note that the difference between endOffset() and startOffset() may not be
 113                 /// equal to termText.length(), as the term text may have been altered by a
 114                 /// stemmer or some other filter.
 115                 /// </summary>
 116                 public int StartOffset()
 117                 {
 118                         return startOffset;
 119                 }
 120
 121                 /// <summary>Returns this Token's ending offset, one greater than the position of the
 122                 /// last character corresponding to this token in the source text.
 123                 /// </summary>
 124                 public int EndOffset()
 125                 {
 126                         return endOffset;
 127                 }
 128
 129                 /// <summary>Returns this Token's lexical type.  Defaults to "word". </summary>
 130                 public System.String Type()
 131                 {
 132                         return type;
 133                 }
 134
 135                 public override System.String ToString()
 136                 {
 137                         System.Text.StringBuilder sb = new System.Text.StringBuilder();
 138                         sb.Append("(" + termText + "," + startOffset + "," + endOffset);
 139                         if (!type.Equals("word"))
 140                                 sb.Append(",type=" + type);
 141                         if (positionIncrement != 1)
 142                                 sb.Append(",posIncr=" + positionIncrement);
 143                         sb.Append(")");
 144                         return sb.ToString();
 145                 }
 146         }
 147 }