Change the GC.GetTotalMemory() threshold to 10%; otherwise there are just too many...
[beagle.git] / beagled / Lucene.Net / Analysis / Token.cs
blob4713ead1f7de6489802b034aa8d8a67b9e6a1287
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
17 using System;
namespace Lucene.Net.Analysis
{
    /// <summary>
    /// A Token is an occurrence of a term from the text of a field. It consists of
    /// a term's text, the start and end offset of the term in the text of the field,
    /// and a type string.
    /// <para>
    /// The start and end offsets permit applications to re-associate a token with
    /// its source text, e.g., to display highlighted query terms in a document
    /// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
    /// display, etc.
    /// </para>
    /// <para>
    /// The type is an interned string, assigned by a lexical analyzer
    /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
    /// belongs to. For example an end of sentence marker token might be implemented
    /// with type "eos". The default token type is "word".
    /// </para>
    /// </summary>
    public sealed class Token
    {
        internal System.String termText; // the text of the term
        internal int startOffset; // start in source text
        internal int endOffset; // end in source text
        internal System.String type = "word"; // lexical type

        // Position relative to the previous token; see SetPositionIncrement.
        private int positionIncrement = 1;

        /// <summary>
        /// Constructs a Token with the given term text, and start and end offsets.
        /// The type defaults to "word".
        /// </summary>
        /// <param name="text">The text of the term.</param>
        /// <param name="start">Start offset of the term in the source text.</param>
        /// <param name="end">End offset of the term in the source text.</param>
        public Token(System.String text, int start, int end)
            : this(text, start, end, "word")
        {
        }

        /// <summary>
        /// Constructs a Token with the given text, start and end offsets, and type.
        /// </summary>
        /// <param name="text">The text of the term.</param>
        /// <param name="start">Start offset of the term in the source text.</param>
        /// <param name="end">End offset of the term in the source text.</param>
        /// <param name="typ">Lexical type of the token; stored as given, without validation.</param>
        public Token(System.String text, int start, int end, System.String typ)
        {
            termText = text;
            startOffset = start;
            endOffset = end;
            type = typ;
        }

        /// <summary>
        /// Set the position increment. This determines the position of this token
        /// relative to the previous Token in a <see cref="TokenStream"/>, used in phrase
        /// searching.
        /// <para>The default value is one.</para>
        /// <para>Some common uses for this are:</para>
        /// <list type="bullet">
        /// <item><description>
        /// Set it to zero to put multiple terms in the same position. This is
        /// useful if, e.g., a word has multiple stems. Searches for phrases
        /// including either stem will match. In this case, all but the first stem's
        /// increment should be set to zero: the increment of the first instance
        /// should be one. Repeating a token with an increment of zero can also be
        /// used to boost the scores of matches on that token.
        /// </description></item>
        /// <item><description>
        /// Set it to values greater than one to inhibit exact phrase matches.
        /// If, for example, one does not want phrases to match across removed stop
        /// words, then one could build a stop word filter that removes stop words and
        /// also sets the increment to the number of stop words removed before each
        /// non-stop word. Then exact phrase queries will only match when the terms
        /// occur with no intervening stop words.
        /// </description></item>
        /// </list>
        /// </summary>
        /// <param name="positionIncrement">The new increment; must be zero or greater.</param>
        /// <exception cref="System.ArgumentException">
        /// Thrown when <paramref name="positionIncrement"/> is negative.
        /// </exception>
        /// <seealso cref="Lucene.Net.Index.TermPositions"/>
        public void SetPositionIncrement(int positionIncrement)
        {
            if (positionIncrement < 0)
            {
                throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
            }

            this.positionIncrement = positionIncrement;
        }

        /// <summary>Returns the position increment of this Token.</summary>
        /// <seealso cref="SetPositionIncrement"/>
        public int GetPositionIncrement()
        {
            return positionIncrement;
        }

        /// <summary>Returns the Token's term text.</summary>
        public System.String TermText()
        {
            return termText;
        }

        /// <summary>
        /// Returns this Token's starting offset, the position of the first character
        /// corresponding to this token in the source text.
        /// Note that the difference between <see cref="EndOffset"/> and
        /// <see cref="StartOffset"/> may not be equal to the length of the term text,
        /// as the term text may have been altered by a stemmer or some other filter.
        /// </summary>
        public int StartOffset()
        {
            return startOffset;
        }

        /// <summary>
        /// Returns this Token's ending offset, one greater than the position of the
        /// last character corresponding to this token in the source text.
        /// </summary>
        public int EndOffset()
        {
            return endOffset;
        }

        /// <summary>Returns this Token's lexical type. Defaults to "word".</summary>
        public System.String Type()
        {
            return type;
        }

        /// <summary>
        /// Formats the token as <c>(termText,startOffset,endOffset)</c>, followed by
        /// <c>,type=...</c> when the type is not "word" and <c>,posIncr=...</c> when
        /// the position increment is not one.
        /// </summary>
        /// <returns>A human-readable description of this token.</returns>
        public override System.String ToString()
        {
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            sb.Append('(').Append(termText);
            sb.Append(',').Append(startOffset);
            sb.Append(',').Append(endOffset);

            // Compare from the literal so a null type (allowed by the four-argument
            // constructor) is reported as a non-default type instead of throwing.
            if (!"word".Equals(type))
            {
                sb.Append(",type=").Append(type);
            }

            if (positionIncrement != 1)
            {
                sb.Append(",posIncr=").Append(positionIncrement);
            }

            sb.Append(')');
            return sb.ToString();
        }
    }
}