Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / Lucene.Net / Index / DocumentWriter.cs
blob 2b3691bc24db589901773cfe7736538f3978de77
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using Similarity = Lucene.Net.Search.Similarity;
using Directory = Lucene.Net.Store.Directory;
using OutputStream = Lucene.Net.Store.OutputStream;

namespace Lucene.Net.Index
{
    sealed public class DocumentWriter
    {
        private Analyzer analyzer;
        private Directory directory;
        private Similarity similarity;
        private FieldInfos fieldInfos;
        private int maxFieldLength;

        /// <summary> </summary>
        /// <param name="directory">The directory to write the document information to
        /// </param>
        /// <param name="analyzer">The analyzer to use for the document
        /// </param>
        /// <param name="similarity">The Similarity function
        /// </param>
        /// <param name="maxFieldLength">The maximum number of tokens a Field may have
        /// </param>
        public /*internal*/ DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
        {
            this.directory = directory;
            this.analyzer = analyzer;
            this.similarity = similarity;
            this.maxFieldLength = maxFieldLength;
        }
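
        /// <summary> Adds a single document to a new segment: writes the Field
        /// names (.fnm) and the stored Field values, inverts the document into
        /// postingTable, then writes the sorted postings and the per-field
        /// norms for the segment.
        /// </summary>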
        /*internal*/ public void AddDocument(System.String segment, Document doc)
        {
            // write Field names
            fieldInfos = new FieldInfos();
            fieldInfos.Add(doc);
            fieldInfos.Write(directory, segment + ".fnm");

            // write Field values
            FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
            try
            {
                fieldsWriter.AddDocument(doc);
            }
            finally
            {
                fieldsWriter.Close();
            }

            // invert doc into postingTable
            postingTable.Clear(); // clear postingTable
            fieldLengths = new int[fieldInfos.Size()]; // init fieldLengths
            fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions

            fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts
            float boost = doc.GetBoost();
            for (int i = 0; i < fieldBoosts.Length; i++)
            {
                fieldBoosts[i] = boost;
            }

            InvertDocument(doc);

            // sort postingTable into an array
            Posting[] postings = SortPostingTable();

            /* debug: dump the sorted postings
            for (int i = 0; i < postings.Length; i++)
            {
                Posting posting = postings[i];
                System.Console.Out.Write(posting.term);
                System.Console.Out.Write(" freq=" + posting.freq);
                System.Console.Out.Write(" pos=");
                System.Console.Out.Write(posting.positions[0]);
                for (int j = 1; j < posting.freq; j++)
                    System.Console.Out.Write("," + posting.positions[j]);
                System.Console.Out.WriteLine("");
            }
            */

            // write postings
            WritePostings(postings, segment);

            // write norms of indexed fields
            WriteNorms(doc, segment);
        }

        // Keys are Terms, values are Postings.
        // Used to buffer a document before it is written to the index.
        private System.Collections.Hashtable postingTable = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());
        private int[] fieldLengths;
        private int[] fieldPositions;
        private float[] fieldBoosts;

        // Tokenizes the fields of a document into Postings.
        private void InvertDocument(Document doc)
        {
            foreach (Field field in doc.Fields())
            {
                System.String fieldName = field.Name();
                int fieldNumber = fieldInfos.FieldNumber(fieldName);

                int length = fieldLengths[fieldNumber]; // length of Field
                int position = fieldPositions[fieldNumber]; // position in Field

                if (field.IsIndexed())
                {
                    if (!field.IsTokenized())
                    {
                        // un-tokenized Field
                        AddPosition(fieldName, field.StringValue(), position++);
                        length++;
                    }
                    else
                    {
                        System.IO.TextReader reader; // find or make Reader
                        if (field.ReaderValue() != null)
                            reader = field.ReaderValue();
                        else if (field.StringValue() != null)
                            reader = new System.IO.StringReader(field.StringValue());
                        else
                            throw new System.ArgumentException("Field must have either String or Reader value");

                        // Tokenize Field and add to postingTable
                        TokenStream stream = analyzer.TokenStream(fieldName, reader);
                        try
                        {
                            for (Token t = stream.Next(); t != null; t = stream.Next())
                            {
                                position += (t.GetPositionIncrement() - 1);
                                AddPosition(fieldName, t.TermText(), position++);
                                if (++length > maxFieldLength)
                                    break;
                            }
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    fieldLengths[fieldNumber] = length; // save Field length
                    fieldPositions[fieldNumber] = position; // save Field position
                    fieldBoosts[fieldNumber] *= field.GetBoost();
                }
            }
        }
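
        // AddPosition either extends an existing Posting for the (field, text)
        // Term, doubling its positions array when it fills up, or creates a
        // new Posting. termBuffer is reused for the hashtable lookup so the
        // common "term already seen" path allocates nothing.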
        private Term termBuffer = new Term("", ""); // avoid consing

        private void AddPosition(System.String field, System.String text, int position)
        {
            termBuffer.Set(field, text);
            Posting ti = (Posting) postingTable[termBuffer];
            if (ti != null)
            {
                // word seen before
                int freq = ti.freq;
                if (ti.positions.Length == freq)
                {
                    // positions array is full
                    int[] newPositions = new int[freq * 2]; // double size
                    int[] positions = ti.positions;
                    for (int i = 0; i < freq; i++)
                        newPositions[i] = positions[i]; // copy old positions to new
                    ti.positions = newPositions;
                }
                ti.positions[freq] = position; // add new position
                ti.freq = freq + 1; // update frequency
            }
            else
            {
                // word not seen before
                Term term = new Term(field, text, false);
                postingTable[term] = new Posting(term, position);
            }
        }

        private Posting[] SortPostingTable()
        {
            // copy postingTable into an array
            Posting[] array = new Posting[postingTable.Count];
            System.Collections.IEnumerator postings = postingTable.Values.GetEnumerator();
            for (int i = 0; postings.MoveNext(); i++)
            {
                array[i] = (Posting) postings.Current;
            }

            // sort the array
            QuickSort(array, 0, array.Length - 1);

            return array;
        }
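
        // Recursive quicksort of the postings by Term, using the median of the
        // first, middle and last elements as the pivot.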
        private static void QuickSort(Posting[] postings, int lo, int hi)
        {
            if (lo >= hi)
                return;

            int mid = (lo + hi) / 2;

            if (postings[lo].term.CompareTo(postings[mid].term) > 0)
            {
                Posting tmp = postings[lo];
                postings[lo] = postings[mid];
                postings[mid] = tmp;
            }

            if (postings[mid].term.CompareTo(postings[hi].term) > 0)
            {
                Posting tmp = postings[mid];
                postings[mid] = postings[hi];
                postings[hi] = tmp;

                if (postings[lo].term.CompareTo(postings[mid].term) > 0)
                {
                    Posting tmp2 = postings[lo];
                    postings[lo] = postings[mid];
                    postings[mid] = tmp2;
                }
            }

            int left = lo + 1;
            int right = hi - 1;

            if (left >= right)
                return;

            Term partition = postings[mid].term;

            for (; ; )
            {
                while (postings[right].term.CompareTo(partition) > 0)
                    --right;

                while (left < right && postings[left].term.CompareTo(partition) <= 0)
                    ++left;

                if (left < right)
                {
                    Posting tmp = postings[left];
                    postings[left] = postings[right];
                    postings[right] = tmp;
                    --right;
                }
                else
                {
                    break;
                }
            }

            QuickSort(postings, lo, left);
            QuickSort(postings, left + 1, hi);
        }
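
        // Writes the sorted postings for this document to the segment's term
        // dictionary (TermInfosWriter), frequency file (.frq) and position
        // file (.prx). Positions are delta-encoded, and term vectors are
        // written for any field whose FieldInfo requests them.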
        private void WritePostings(Posting[] postings, System.String segment)
        {
            OutputStream freq = null, prox = null;
            TermInfosWriter tis = null;
            TermVectorsWriter termVectorWriter = null;
            try
            {
                // open files for inverse index storage
                freq = directory.CreateFile(segment + ".frq");
                prox = directory.CreateFile(segment + ".prx");
                tis = new TermInfosWriter(directory, segment, fieldInfos);
                TermInfo ti = new TermInfo();
                System.String currentField = null;

                for (int i = 0; i < postings.Length; i++)
                {
                    Posting posting = postings[i];

                    // add an entry to the dictionary with pointers to prox and freq files
                    ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
                    tis.Add(posting.term, ti);

                    // add an entry to the freq file
                    int postingFreq = posting.freq;
                    if (postingFreq == 1)
                        freq.WriteVInt(1); // optimize freq=1: set low bit of doc num.
                    else
                    {
                        freq.WriteVInt(0); // the document number
                        freq.WriteVInt(postingFreq); // frequency in doc
                    }

                    int lastPosition = 0; // write positions
                    int[] positions = posting.positions;
                    for (int j = 0; j < postingFreq; j++)
                    {
                        // use delta-encoding
                        int position = positions[j];
                        prox.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }

                    // check to see if we switched to a new Field
                    System.String termField = posting.term.Field();
                    if ((System.Object) currentField != (System.Object) termField)
                    {
                        // changing Field - see if there is something to save
                        currentField = termField;
                        FieldInfo fi = fieldInfos.FieldInfo(currentField);
                        if (fi.storeTermVector)
                        {
                            if (termVectorWriter == null)
                            {
                                termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                                termVectorWriter.OpenDocument();
                            }
                            termVectorWriter.OpenField(currentField);
                        }
                        else if (termVectorWriter != null)
                        {
                            termVectorWriter.CloseField();
                        }
                    }
                    if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
                    {
                        termVectorWriter.AddTerm(posting.term.Text(), postingFreq);
                    }
                }
                if (termVectorWriter != null)
                    termVectorWriter.CloseDocument();
            }
            finally
            {
                // make an effort to close all streams we can but remember and re-throw
                // the first exception encountered in this process
                System.IO.IOException keep = null;
                if (freq != null)
                {
                    try { freq.Close(); }
                    catch (System.IO.IOException e) { if (keep == null) keep = e; }
                }
                if (prox != null)
                {
                    try { prox.Close(); }
                    catch (System.IO.IOException e) { if (keep == null) keep = e; }
                }
                if (tis != null)
                {
                    try { tis.Close(); }
                    catch (System.IO.IOException e) { if (keep == null) keep = e; }
                }
                if (termVectorWriter != null)
                {
                    try { termVectorWriter.Close(); }
                    catch (System.IO.IOException e) { if (keep == null) keep = e; }
                }
                if (keep != null)
                    throw new System.IO.IOException(keep.StackTrace);
            }
        }
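
        // Writes one norm byte per indexed field for this document: the field
        // boost multiplied by the Similarity length norm, encoded with
        // Similarity.EncodeNorm and stored in the segment's ".f<n>" file.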
        private void WriteNorms(Document doc, System.String segment)
        {
            for (int n = 0; n < fieldInfos.Size(); n++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(n);
                if (fi.isIndexed)
                {
                    float norm = fieldBoosts[n] * similarity.LengthNorm(fi.name, fieldLengths[n]);
                    OutputStream norms = directory.CreateFile(segment + ".f" + n);
                    try { norms.WriteByte(Lucene.Net.Search.Similarity.EncodeNorm(norm)); }
                    finally { norms.Close(); }
                }
            }
        }
    }

    sealed class Posting
    {
        // info about a Term in a doc
        internal Term term; // the Term
        internal int freq; // its frequency in doc
        internal int[] positions; // positions it occurs at

        internal Posting(Term t, int position)
        {
            term = t;
            freq = 1;
            positions = new int[1];
            positions[0] = position;
        }
    }
}
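
// Rough usage sketch (not part of this file): callers create a DocumentWriter
// per document and write it into a fresh single-document segment. The names
// "ramDirectory" and "NewSegmentName" below are illustrative assumptions, not
// APIs defined in this file.
//
//     DocumentWriter writer = new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
//     System.String segmentName = NewSegmentName();
//     writer.AddDocument(segmentName, doc);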