Add --enable-deletion option to buildindex. If used, buildindex will remove deleted...
[beagle.git] / beagled / Lucene.Net / Document / Field.cs
blob53351e10fce26f1f568521048caef8d4442cd11e
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 using System;
17 using IndexReader = Lucene.Net.Index.IndexReader;
18 using Hits = Lucene.Net.Search.Hits;
19 using Similarity = Lucene.Net.Search.Similarity;
20 using Parameter = Lucene.Net.Util.Parameter;
21 namespace Lucene.Net.Documents
24 /// <summary>A field is a section of a Document. Each field has two parts, a name and a
25 /// value. Values may be free text, provided as a String or as a Reader, or they
26 /// may be atomic keywords, which are not further processed. Such keywords may
27 /// be used to represent dates, urls, etc. Fields are optionally stored in the
28 /// index, so that they may be returned with hits on the document.
29 /// </summary>
31 [Serializable]
32 public sealed class Field
// Field name. Every constructor replaces this default with the interned
// name it was given.
private string name = "body";

// Single slot for the field value, whatever its kind. The typed accessors
// (StringValue, ReaderValue, BinaryValue) probe it with "as".
private object fieldsData = null;

// Term vector options, assigned together by SetStoreTermVector.
private bool storeTermVector = false;
private bool storeOffsetWithTermVector = false;
private bool storePositionWithTermVector = false;

// Storage / indexing options, assigned by the constructors.
private bool isStored = false;
private bool isIndexed = true;
private bool isTokenized = true;
private bool isBinary = false;
private bool isCompressed = false;

// Boost factor read and written through GetBoost / SetBoost.
private float boost = 1.0f;
/// <summary>Typed constants telling a <c>Field</c> constructor whether, and in
/// which form, the original field value is kept in the index.
/// </summary>
[Serializable]
public sealed class Store : Parameter
{
	// Only this assembly creates instances; callers use the constants below.
	internal Store(string name) : base(name)
	{
	}

	/// <summary>Keep the original field value in the index in compressed form.
	/// Useful for long documents and for binary valued fields.
	/// </summary>
	public static readonly Store COMPRESS = new Store("COMPRESS");

	/// <summary>Keep the original field value in the index. Useful for short
	/// texts, such as a document's title, that should be displayed with the
	/// results. The value is stored in its original form, i.e. no analyzer is
	/// applied before it is stored.
	/// </summary>
	public static readonly Store YES = new Store("YES");

	/// <summary>Do not keep the field value in the index.</summary>
	public static readonly Store NO = new Store("NO");
}
/// <summary>Typed constants telling a <c>Field</c> constructor whether, and
/// how, the field value is indexed.
/// </summary>
[Serializable]
public sealed class Index : Parameter
{
	// Only this assembly creates instances; callers use the constants below.
	internal Index(string name) : base(name)
	{
	}

	/// <summary>Do not index the field value. The field cannot be searched,
	/// but its contents remain accessible provided it is stored
	/// (see <see cref="Field.Store"/>).
	/// </summary>
	public static readonly Index NO = new Index("NO");

	/// <summary>Index the field's value so it can be searched. An Analyzer is
	/// used to tokenize, and possibly further normalize, the text before its
	/// terms are stored in the index. Useful for common text.
	/// </summary>
	public static readonly Index TOKENIZED = new Index("TOKENIZED");

	/// <summary>Index the field's value without an Analyzer, so it can be
	/// searched. Because no analyzer is used, the value is stored as a single
	/// term. Useful for unique ids such as product numbers.
	/// </summary>
	public static readonly Index UN_TOKENIZED = new Index("UN_TOKENIZED");
}
/// <summary>Typed constants telling a <c>Field</c> constructor whether term
/// vectors are stored, and with which extra information.
/// </summary>
[Serializable]
public sealed class TermVector : Parameter
{
	// Only this assembly creates instances; callers use the constants below.
	internal TermVector(string name) : base(name)
	{
	}

	/// <summary>Do not store term vectors.</summary>
	public static readonly TermVector NO = new TermVector("NO");

	/// <summary>Store the term vectors of each document. A term vector is a
	/// list of the document's terms and their number of occurrences in that
	/// document.
	/// </summary>
	public static readonly TermVector YES = new TermVector("YES");

	/// <summary>Store the term vector plus token position information.</summary>
	/// <seealso cref="YES"/>
	public static readonly TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS");

	/// <summary>Store the term vector plus token offset information.</summary>
	/// <seealso cref="YES"/>
	public static readonly TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS");

	/// <summary>Store the term vector plus both token position and offset
	/// information.
	/// </summary>
	/// <seealso cref="YES"/>
	/// <seealso cref="WITH_POSITIONS"/>
	/// <seealso cref="WITH_OFFSETS"/>
	public static readonly TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS");
}
/// <summary>Sets the boost factor for hits on this field. This value is
/// multiplied into the score of all hits on this field of this document.
///
/// <p>The boost is multiplied by <see cref="Document.GetBoost()"/> of the
/// document containing this field. If a document has multiple fields with the
/// same name, all such values are multiplied together. This product is then
/// multiplied by the value of <see cref="Similarity.LengthNorm(string, int)"/>
/// and rounded by <see cref="Similarity.EncodeNorm(float)"/> before it is
/// stored in the index. One should attempt to ensure that this product does
/// not overflow the range of that encoding.</p>
/// </summary>
/// <param name="boost">The new boost factor; stored as given, without validation.</param>
/// <seealso cref="Document.SetBoost(float)"/>
/// <seealso cref="Similarity.LengthNorm(string, int)"/>
/// <seealso cref="Similarity.EncodeNorm(float)"/>
public void SetBoost(float boost)
{
	this.boost = boost;
}
/// <summary>Returns the boost factor for hits on this field.
///
/// <p>The default value is 1.0.</p>
///
/// <p>Note: this value is not stored directly with the document in the
/// index. Documents returned from <see cref="IndexReader.Document(int)"/> and
/// <see cref="Hits.Doc(int)"/> may thus not have the same value present as
/// when this field was indexed.</p>
/// </summary>
/// <returns>The boost last passed to <see cref="SetBoost(float)"/>, or 1.0 if it was never called.</returns>
/// <seealso cref="SetBoost(float)"/>
public float GetBoost()
{
	return this.boost;
}
/// <summary>Builds a string-valued field that is stored and indexed but not
/// tokenized. Useful for non-text fields, e.g. a date or a url.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The field value.</param>
/// <returns>The newly created field.</returns>
/// <remarks>Deprecated: use
/// <c>new Field(name, value, Field.Store.YES, Field.Index.UN_TOKENIZED)</c> instead.
/// </remarks>
public static Field Keyword(System.String name, System.String value_Renamed)
{
	const bool stored = true;
	const bool indexed = true;
	const bool tokenized = false;

	return new Field(name, value_Renamed, stored, indexed, tokenized);
}
/// <summary>Builds a string-valued field that is stored in the index, for
/// return with hits, but is neither indexed nor tokenized.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The field value.</param>
/// <returns>The newly created field.</returns>
/// <remarks>Deprecated: use
/// <c>new Field(name, value, Field.Store.YES, Field.Index.NO)</c> instead.
/// </remarks>
public static Field UnIndexed(System.String name, System.String value_Renamed)
{
	const bool stored = true;
	const bool indexed = false;
	const bool tokenized = false;

	return new Field(name, value_Renamed, stored, indexed, tokenized);
}
/// <summary>Builds a string-valued field that is tokenized, indexed and
/// stored in the index, for return with hits. Useful for short text fields
/// like "title" or "subject". No term vector is stored for this field.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The field value.</param>
/// <returns>The newly created field.</returns>
/// <remarks>Deprecated: use
/// <c>new Field(name, value, Field.Store.YES, Field.Index.TOKENIZED)</c> instead.
/// </remarks>
public static Field Text(System.String name, System.String value_Renamed)
{
	// Same as the three-argument overload with term vectors switched off.
	const bool withTermVector = false;
	return Text(name, value_Renamed, withTermVector);
}
/// <summary>Builds a date-valued field that is stored and indexed but not
/// tokenized, for return with hits. The date is converted to its string
/// form with <c>DateField.DateToString</c> before the field is created.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The date to store.</param>
/// <returns>The newly created field.</returns>
/// <remarks>Deprecated: use
/// <c>new Field(name, value, Field.Store.YES, Field.Index.UN_TOKENIZED)</c> instead.
/// </remarks>
public static Field Keyword(System.String name, System.DateTime value_Renamed)
{
	System.String dateText = DateField.DateToString(value_Renamed);
	return new Field(name, dateText, true, true, false);
}
/// <summary>Builds a string-valued field that is tokenized, indexed and
/// stored in the index, for return with hits. Useful for short text fields
/// like "title" or "subject".
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The field value.</param>
/// <param name="storeTermVector">true if a term vector should be stored for the field.</param>
/// <returns>The newly created field.</returns>
/// <remarks>Deprecated: use
/// <c>new Field(name, value, Field.Store.YES, Field.Index.TOKENIZED, termVector)</c> instead.
/// </remarks>
public static Field Text(System.String name, System.String value_Renamed, bool storeTermVector)
{
	const bool stored = true;
	const bool indexed = true;
	const bool tokenized = true;

	return new Field(name, value_Renamed, stored, indexed, tokenized, storeTermVector);
}
/// <summary>Builds a string-valued field that is tokenized and indexed but
/// not stored in the index. No term vector is stored for this field.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The field value.</param>
/// <returns>The newly created field.</returns>
/// <remarks>Deprecated: use
/// <c>new Field(name, value, Field.Store.NO, Field.Index.TOKENIZED)</c> instead.
/// </remarks>
public static Field UnStored(System.String name, System.String value_Renamed)
{
	// Same as the three-argument overload with term vectors switched off.
	const bool withTermVector = false;
	return UnStored(name, value_Renamed, withTermVector);
}
/// <summary>Builds a string-valued field that is tokenized and indexed but
/// not stored in the index.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The field value.</param>
/// <param name="storeTermVector">true if a term vector should be stored for the field.</param>
/// <returns>The newly created field.</returns>
/// <remarks>Deprecated: use
/// <c>new Field(name, value, Field.Store.NO, Field.Index.TOKENIZED, termVector)</c> instead.
/// </remarks>
public static Field UnStored(System.String name, System.String value_Renamed, bool storeTermVector)
{
	const bool stored = false;
	const bool indexed = true;
	const bool tokenized = true;

	return new Field(name, value_Renamed, stored, indexed, tokenized, storeTermVector);
}
/// <summary>Builds a reader-valued field that is tokenized and indexed but
/// not stored in the index verbatim. Useful for longer text fields, like
/// "body". No term vector is stored for this field.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The reader supplying the content.</param>
/// <returns>The newly created field.</returns>
/// <remarks>Deprecated: use <c>new Field(name, value)</c> instead.</remarks>
public static Field Text(System.String name, System.IO.TextReader value_Renamed)
{
	// Same as the three-argument overload with term vectors switched off.
	const bool withTermVector = false;
	return Text(name, value_Renamed, withTermVector);
}
/// <summary>Builds a reader-valued field that is tokenized and indexed but
/// not stored in the index verbatim. Useful for longer text fields, like
/// "body".
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The reader supplying the content.</param>
/// <param name="storeTermVector">true if a term vector should be stored for the field.</param>
/// <returns>The newly created field.</returns>
/// <remarks>Deprecated: use <c>new Field(name, value, termVector)</c> instead.</remarks>
public static Field Text(System.String name, System.IO.TextReader value_Renamed, bool storeTermVector)
{
	Field readerField = new Field(name, value_Renamed);

	// The two-argument constructor leaves term vectors off; override only
	// the plain term-vector flag (positions/offsets stay off).
	readerField.storeTermVector = storeTermVector;

	return readerField;
}
/// <summary>Returns the name of the field, for example "date", "title" or
/// "body". The constructors intern the name before storing it.
/// </summary>
/// <returns>The field name.</returns>
public System.String Name()
{
	return this.name;
}
/// <summary>The value of the field as a string, or null when the field holds
/// a reader or a binary value instead. Exactly one of
/// <see cref="StringValue()"/>, <see cref="ReaderValue()"/> and
/// <see cref="BinaryValue()"/> is expected to be non-null.
/// </summary>
/// <returns>The string value, or null.</returns>
public System.String StringValue()
{
	if (fieldsData is System.String)
	{
		return (System.String) fieldsData;
	}
	return null;
}
/// <summary>The value of the field as a reader, or null when the field holds
/// a string or a binary value instead. Exactly one of
/// <see cref="StringValue()"/>, <see cref="ReaderValue()"/> and
/// <see cref="BinaryValue()"/> is expected to be non-null.
/// </summary>
/// <returns>The reader value, or null.</returns>
public System.IO.TextReader ReaderValue()
{
	if (fieldsData is System.IO.TextReader)
	{
		return (System.IO.TextReader) fieldsData;
	}
	return null;
}
/// <summary>The value of the field as a byte array, or null when the field
/// holds a reader or a string value instead. Exactly one of
/// <see cref="StringValue()"/>, <see cref="ReaderValue()"/> and
/// <see cref="BinaryValue()"/> is expected to be non-null.
/// </summary>
/// <returns>The binary value, or null.</returns>
public byte[] BinaryValue()
{
	if (fieldsData is byte[])
	{
		return (byte[]) fieldsData;
	}
	return null;
}
/// <summary>Creates a field from its name, its string value and the way it
/// is saved in the index. Term vectors are not stored.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The string to process.</param>
/// <param name="store">Whether the value should be stored in the index.</param>
/// <param name="index">Whether the field should be indexed and, if so,
/// whether it should be tokenized before indexing.</param>
/// <exception cref="System.NullReferenceException">name or value is null.</exception>
/// <exception cref="System.ArgumentException">the field is neither stored nor indexed.</exception>
public Field(System.String name, System.String value_Renamed, Store store, Index index)
	: this(name, value_Renamed, store, index, TermVector.NO)
{
}
/// <summary>Creates a field from its name, its string value and the way it
/// is saved in the index.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The string to process.</param>
/// <param name="store">Whether the value should be stored in the index.</param>
/// <param name="index">Whether the field should be indexed and, if so,
/// whether it should be tokenized before indexing.</param>
/// <param name="termVector">Whether a term vector should be stored.</param>
/// <exception cref="System.NullReferenceException">name or value is null.</exception>
/// <exception cref="System.ArgumentException">
/// Thrown in any of the following situations:
/// <ul>
/// <li>the field is neither stored nor indexed</li>
/// <li>the field is not indexed but termVector is not <c>TermVector.NO</c></li>
/// <li>store, index or termVector is not one of the predefined constants</li>
/// </ul>
/// </exception>
public Field(System.String name, System.String value_Renamed, Store store, Index index, TermVector termVector)
{
	if (name == null)
	{
		throw new System.NullReferenceException("name cannot be null");
	}
	if (value_Renamed == null)
	{
		throw new System.NullReferenceException("value cannot be null");
	}

	bool notIndexed = (index == Index.NO);
	if (notIndexed && store == Store.NO)
	{
		throw new System.ArgumentException("it doesn't make sense to have a field that " + "is neither indexed nor stored");
	}
	if (notIndexed && termVector != TermVector.NO)
	{
		throw new System.ArgumentException("cannot store term vector information " + "for a field that is not indexed");
	}

	this.name = String.Intern(name); // field names are interned
	this.fieldsData = value_Renamed;

	// Storage flags: only the three predefined Store constants are accepted.
	bool compressed = (store == Store.COMPRESS);
	if (store != Store.YES && !compressed && store != Store.NO)
	{
		throw new System.ArgumentException("unknown store parameter " + store);
	}
	this.isStored = (store != Store.NO);
	this.isCompressed = compressed;

	// Indexing flags: only the three predefined Index constants are accepted.
	bool tokenized = (index == Index.TOKENIZED);
	if (!notIndexed && !tokenized && index != Index.UN_TOKENIZED)
	{
		throw new System.ArgumentException("unknown index parameter " + index);
	}
	this.isIndexed = !notIndexed;
	this.isTokenized = tokenized;

	this.isBinary = false;

	SetStoreTermVector(termVector);
}
/// <summary>Creates a tokenized and indexed field that is not stored. Term
/// vectors are not stored.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="reader">The reader with the content.</param>
/// <exception cref="System.NullReferenceException">name or reader is null.</exception>
public Field(System.String name, System.IO.TextReader reader)
	: this(name, reader, TermVector.NO)
{
}
/// <summary>Creates a tokenized and indexed field that is not stored,
/// optionally storing term vectors.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="reader">The reader with the content.</param>
/// <param name="termVector">Whether a term vector should be stored.</param>
/// <exception cref="System.NullReferenceException">name or reader is null.</exception>
/// <exception cref="System.ArgumentException">termVector is not one of the predefined constants.</exception>
public Field(System.String name, System.IO.TextReader reader, TermVector termVector)
{
	if (name == null)
	{
		throw new System.NullReferenceException("name cannot be null");
	}
	if (reader == null)
	{
		throw new System.NullReferenceException("reader cannot be null");
	}

	this.name = String.Intern(name); // field names are interned
	this.fieldsData = reader;

	// Reader-valued fields: indexed and tokenized, never stored.
	this.isIndexed = true;
	this.isTokenized = true;
	this.isStored = false;
	this.isCompressed = false;
	this.isBinary = false;

	SetStoreTermVector(termVector);
}
/// <summary>Creates a field by specifying all flags except
/// <c>storeTermVector</c>, which is set to <c>false</c>.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="string_Renamed">The string to process.</param>
/// <param name="store">true if the field should store the string.</param>
/// <param name="index">true if the field should be indexed.</param>
/// <param name="token">true if the field should be tokenized.</param>
/// <exception cref="System.NullReferenceException">name or value is null.</exception>
/// <remarks>Deprecated: use the
/// <c>Field(String, String, Field.Store, Field.Index)</c> constructor instead.
/// </remarks>
public Field(System.String name, System.String string_Renamed, bool store, bool index, bool token)
	: this(name, string_Renamed, store, index, token, false)
{
}
/// <summary>Creates a stored field with a binary value. Optionally the value
/// may be compressed. The field is neither indexed nor tokenized and stores
/// no term vector.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The binary value.</param>
/// <param name="store">How the value should be stored (compressed or not).</param>
/// <exception cref="System.ArgumentException">name or value is null, store is
/// <c>Store.NO</c>, or store is not one of the predefined constants.</exception>
public Field(System.String name, byte[] value_Renamed, Store store)
{
	if (name == null)
	{
		throw new System.ArgumentException("name cannot be null");
	}
	if (value_Renamed == null)
	{
		throw new System.ArgumentException("value cannot be null");
	}

	this.name = String.Intern(name);
	this.fieldsData = value_Renamed;

	// A binary value must be stored one way or the other.
	if (store == Store.NO)
	{
		throw new System.ArgumentException("binary values can't be unstored");
	}
	bool compressed = (store == Store.COMPRESS);
	if (store != Store.YES && !compressed)
	{
		throw new System.ArgumentException("unknown store parameter " + store);
	}
	this.isStored = true;
	this.isCompressed = compressed;

	this.isIndexed = false;
	this.isTokenized = false;
	this.isBinary = true;

	SetStoreTermVector(TermVector.NO);
}
/// <summary>Creates a string-valued field from explicit boolean flags.</summary>
/// <param name="name">The name of the field.</param>
/// <param name="string_Renamed">The string to process.</param>
/// <param name="store">true if the field should store the string.</param>
/// <param name="index">true if the field should be indexed.</param>
/// <param name="token">true if the field should be tokenized.</param>
/// <param name="storeTermVector">true if the term vector info should be stored.</param>
/// <exception cref="System.NullReferenceException">name or value is null.</exception>
/// <exception cref="System.ArgumentException">a term vector is requested for a field that is not indexed.</exception>
/// <remarks>Deprecated: use the
/// <c>Field(String, String, Field.Store, Field.Index, Field.TermVector)</c>
/// constructor instead.
/// </remarks>
public Field(System.String name, System.String string_Renamed, bool store, bool index, bool token, bool storeTermVector)
{
	if (name == null)
	{
		throw new System.NullReferenceException("name cannot be null");
	}
	if (string_Renamed == null)
	{
		throw new System.NullReferenceException("value cannot be null");
	}

	bool vectorWithoutIndex = storeTermVector && !index;
	if (vectorWithoutIndex)
	{
		throw new System.ArgumentException("cannot store a term vector for fields that are not indexed");
	}

	this.name = String.Intern(name); // field names are interned
	this.fieldsData = string_Renamed;

	// The flags are taken over exactly as given.
	this.isStored = store;
	this.isIndexed = index;
	this.isTokenized = token;
	this.storeTermVector = storeTermVector;
}
/// <summary>Translates a <c>TermVector</c> constant into the three term
/// vector flags of this field.
/// </summary>
/// <param name="termVector">One of the predefined <c>TermVector</c> constants.</param>
/// <exception cref="System.ArgumentException">termVector is not one of the predefined constants.</exception>
private void SetStoreTermVector(TermVector termVector)
{
	bool both = (termVector == TermVector.WITH_POSITIONS_OFFSETS);
	bool withPositions = both || (termVector == TermVector.WITH_POSITIONS);
	bool withOffsets = both || (termVector == TermVector.WITH_OFFSETS);
	bool plain = (termVector == TermVector.YES);
	bool none = (termVector == TermVector.NO);

	// Anything other than the five predefined constants is rejected and
	// leaves the flags untouched.
	if (!none && !plain && !withPositions && !withOffsets)
	{
		throw new System.ArgumentException("unknown termVector parameter " + termVector);
	}

	this.storeTermVector = !none;
	this.storePositionWithTermVector = withPositions;
	this.storeOffsetWithTermVector = withOffsets;
}
/// <summary>True iff the value of the field is to be stored in the index for
/// return with search hits. It is an error for this to be true if a field is
/// reader-valued; the reader-valued constructors always leave it false.
/// </summary>
/// <returns>The stored flag.</returns>
public bool IsStored()
{
	return this.isStored;
}
/// <summary>True iff the value of the field is to be indexed, so that it may
/// be searched on.
/// </summary>
/// <returns>The indexed flag.</returns>
public bool IsIndexed()
{
	return this.isIndexed;
}
/// <summary>True iff the value of the field should be tokenized as text
/// prior to indexing. Un-tokenized fields are indexed as a single word and
/// may not be reader-valued.
/// </summary>
/// <returns>The tokenized flag.</returns>
public bool IsTokenized()
{
	return this.isTokenized;
}
/// <summary>True if the value of the field is stored and compressed within
/// the index.
/// </summary>
/// <returns>The compressed flag.</returns>
public bool IsCompressed()
{
	return this.isCompressed;
}
/// <summary>True iff the term or terms used to index this field are stored
/// as a term vector, available from
/// <see cref="IndexReader.GetTermFreqVector(int, string)"/>. Those methods do
/// not provide access to the original content of the field, only to the
/// terms used to index it. If the original content must be preserved, use
/// the <c>stored</c> attribute instead.
/// </summary>
/// <returns>The term vector flag.</returns>
/// <seealso cref="IndexReader.GetTermFreqVector(int, string)"/>
public bool IsTermVectorStored()
{
	return this.storeTermVector;
}
/// <summary>True iff terms are stored as a term vector together with their
/// offsets (start and end position in the source text).
/// </summary>
/// <returns>The term vector offsets flag.</returns>
public bool IsStoreOffsetWithTermVector()
{
	return this.storeOffsetWithTermVector;
}
/// <summary>True iff terms are stored as a term vector together with their
/// token positions.
/// </summary>
/// <returns>The term vector positions flag.</returns>
public bool IsStorePositionWithTermVector()
{
	return this.storePositionWithTermVector;
}
/// <summary>True iff the value of the field is stored as binary.</summary>
/// <returns>The binary flag.</returns>
public bool IsBinary()
{
	return this.isBinary;
}
/// <summary>Prints a field for human consumption: a comma separated list of
/// the flags that are set, followed by <c>&lt;name:value&gt;</c>. The value
/// part is left empty when the field holds no data.
/// </summary>
/// <returns>The textual description of this field.</returns>
public override System.String ToString()
{
	System.Text.StringBuilder text = new System.Text.StringBuilder();

	// The stored flag always comes first and carries its compression state.
	if (isStored)
	{
		text.Append("stored");
		text.Append(isCompressed ? "/compressed" : "/uncompressed");
	}

	// Remaining flags, in output order, each paired with its label.
	bool[] enabled = new bool[] { isIndexed, isTokenized, storeTermVector, storeOffsetWithTermVector, storePositionWithTermVector, isBinary };
	System.String[] labels = new System.String[] { "indexed", "tokenized", "termVector", "termVectorOffsets", "termVectorPosition", "binary" };

	for (int i = 0; i < labels.Length; i++)
	{
		if (!enabled[i])
		{
			continue;
		}
		// Separate from whatever has been written so far.
		if (text.Length > 0)
		{
			text.Append(",");
		}
		text.Append(labels[i]);
	}

	text.Append('<');
	text.Append(name);
	text.Append(':');

	if (fieldsData != null)
	{
		// Append(object) renders the value through its own ToString().
		text.Append(fieldsData);
	}

	text.Append('>');
	return text.ToString();
}