2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 namespace Lucene
.Net
.Analysis
/// <summary>A Token is an occurrence of a term from the text of a Field. It consists of
/// a term's text, the start and end offset of the term in the text of the Field,
/// and a type string.
/// The start and end offsets permit applications to re-associate a token with
/// its source text, e.g., to display highlighted query terms in a document
/// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
/// display.
/// The type is an interned string, assigned by a lexical analyzer
/// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
/// belongs to. For example, an end-of-sentence marker token might be implemented
/// with type "eos". The default token type is "word".
/// </summary>
public sealed class Token
{
    // These fields are deliberately internal (not private) and keep their
    // original names, since code outside this class may reference them.
    internal string termText; // the text of the term
    internal int startOffset; // start in source text
    internal int endOffset; // end in source text
    internal string type_Renamed_Field = "word"; // lexical type

    private int positionIncrement = 1;

    /// <summary>Constructs a Token with the given term text, and start &amp; end offsets.
    /// The type defaults to "word".
    /// </summary>
    /// <param name="text">The term text.</param>
    /// <param name="start">Start offset of the term in the source text.</param>
    /// <param name="end">End offset of the term in the source text.</param>
    public Token(string text, int start, int end)
    {
        termText = text;
        startOffset = start;
        endOffset = end;
    }

    /// <summary>Constructs a Token with the given text, start and end offsets, &amp; type.</summary>
    /// <param name="text">The term text.</param>
    /// <param name="start">Start offset of the term in the source text.</param>
    /// <param name="end">End offset of the term in the source text.</param>
    /// <param name="typ">The lexical type, stored as given in place of the default "word".</param>
    public Token(string text, int start, int end, string typ)
    {
        termText = text;
        startOffset = start;
        endOffset = end;
        type_Renamed_Field = typ;
    }

    /// <summary>Set the position increment. This determines the position of this token
    /// relative to the previous Token in a <see cref="TokenStream"/>, used in phrase
    /// searching.
    /// <para>The default value is one.</para>
    /// <para>Some common uses for this are:</para>
    /// <list type="bullet">
    /// <item><description>Set it to zero to put multiple terms in the same position. This is
    /// useful if, e.g., a word has multiple stems. Searches for phrases
    /// including either stem will match. In this case, all but the first stem's
    /// increment should be set to zero: the increment of the first instance
    /// should be one. Repeating a token with an increment of zero can also be
    /// used to boost the scores of matches on that token.</description></item>
    /// <item><description>Set it to values greater than one to inhibit exact phrase matches.
    /// If, for example, one does not want phrases to match across removed stop
    /// words, then one could build a stop word filter that removes stop words and
    /// also sets the increment to the number of stop words removed before each
    /// non-stop word. Then exact phrase queries will only match when the terms
    /// occur with no intervening stop words.</description></item>
    /// </list>
    /// </summary>
    /// <param name="positionIncrement">The new increment; must be zero or greater.</param>
    /// <exception cref="System.ArgumentException">If <paramref name="positionIncrement"/> is negative.
    /// The stored increment is left unchanged in that case.</exception>
    /// <seealso cref="Lucene.Net.Index.TermPositions"/>
    public void SetPositionIncrement(int positionIncrement)
    {
        if (positionIncrement < 0)
        {
            throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
        }
        this.positionIncrement = positionIncrement;
    }

    /// <summary>Returns the position increment of this Token.</summary>
    /// <seealso cref="SetPositionIncrement"/>
    public int GetPositionIncrement()
    {
        return positionIncrement;
    }

    /// <summary>Returns the Token's term text.</summary>
    public string TermText()
    {
        return termText;
    }

    /// <summary>Returns this Token's starting offset, the position of the first character
    /// corresponding to this token in the source text.
    /// Note that the difference between EndOffset() and StartOffset() may not be
    /// equal to the length of the term text, as the term text may have been altered by a
    /// stemmer or some other filter.
    /// </summary>
    public int StartOffset()
    {
        return startOffset;
    }

    /// <summary>Returns this Token's ending offset, one greater than the position of the
    /// last character corresponding to this token in the source text.
    /// </summary>
    public int EndOffset()
    {
        return endOffset;
    }

    /// <summary>Returns this Token's lexical type. Defaults to "word".</summary>
    public string Type()
    {
        return type_Renamed_Field;
    }
}