Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / NameIndex.cs
blobacd0c08196945756f908f766b10a104ed45aed6f
1 //
2 // NameIndex.cs
3 //
4 // Copyright (C) 2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // Remember when I said that LuceneDriver.cs should be the only source
29 // code that knew about Lucene internals? I lied.
32 using System;
33 using System.Collections;
34 using System.IO;
36 using Lucene.Net.Analysis;
37 using Lucene.Net.Analysis.Standard;
38 using Lucene.Net.Documents;
39 using Lucene.Net.Index;
40 using Lucene.Net.QueryParsers;
41 using LNS = Lucene.Net.Search;
43 using Beagle.Util;
45 namespace Beagle.Daemon {
47 public class NameIndex {
// When true, Add, Remove and Flush write trace messages through
// Logger.Log.Debug.  Off by default.
static public bool Debug = false;
// The analyzer used by this index: the token stream produced by
// StandardAnalyzer, wrapped in a Porter stemming filter.
// FIXME: This assumes everything being indexed is in English!
private class NameAnalyzer : StandardAnalyzer {

	// Return the base analyzer's token stream for the given field and
	// reader, passed through a PorterStemFilter.
	public override TokenStream TokenStream (String fieldName, TextReader reader)
	{
		return new PorterStemFilter (base.TokenStream (fieldName, reader));
	}
}

// Analyzer shared by all NameIndex instances; it is assigned in the
// static constructor.
static Analyzer analyzer;
// One queued change to the index, held in the pending table until the
// next Flush.  If Name is null, this is a removal, otherwise it is an add.
private class PendingOperation {

	// Identifier of the item the operation applies to.
	public Guid UniqueId;

	// Name to index for the item; null marks a removal.
	public string Name;
}
// Index format version.  The constructor writes it to the "version"
// file and compares it with the value found there on later runs.
private const int VERSION = 1;

// Lucene directory holding the index; opened by the constructor.
private Lucene.Net.Store.FSDirectory store;

// Operations queued by Add and Remove, keyed by Guid, with a
// PendingOperation as the value.  Emptied by Flush.
private Hashtable pending = new Hashtable ();

// Number of documents added since Flush last optimized the index, and
// the count that has to be exceeded before Flush optimizes again.
private int adds_since_last_optimize = 0;
private const int optimize_threshold = 5000;
// Type initializer: builds the single NameAnalyzer that is used both
// when writing documents and when tokenizing queries.
static NameIndex ()
{
	analyzer = new NameAnalyzer ();
}
// Open the name index kept under <directory>/NameIndex, rebuilding the
// on-disk layout from scratch when the existing one cannot be trusted.
//
// directory:   parent directory; all files live in its "NameIndex"
//              subdirectory ("Index", "Locks", "version", "fingerprint").
// fingerprint: value expected in the "fingerprint" file.  If null, the
//              fingerprint found on disk is accepted as-is.
//
// An existing index is kept only when its "segments" file exists, the
// "version" file (if there is one) holds VERSION, and the "fingerprint"
// file exists and matches.  Otherwise the whole NameIndex directory is
// deleted and an empty index is created in its place.
public NameIndex (string directory, string fingerprint)
{
	string top_dir = Path.Combine (directory, "NameIndex");
	string index_dir = Path.Combine (top_dir, "Index");
	string lock_dir = Path.Combine (top_dir, "Locks");
	string version_file = Path.Combine (top_dir, "version");
	string fingerprint_file = Path.Combine (top_dir, "fingerprint");
	string index_test_file = Path.Combine (index_dir, "segments");

	bool version_exists = File.Exists (version_file);
	bool fingerprint_exists = File.Exists (fingerprint_file);
	bool index_exists = File.Exists (index_test_file);

	// Check the index's version number.  If it is wrong,
	// declare the index non-existent.
	if (version_exists && index_exists) {
		string version_str;
		// 'using' closes the file even if the read throws.
		using (StreamReader sr = new StreamReader (version_file)) {
			version_str = sr.ReadLine ();
		}

		if (version_str != Convert.ToString (VERSION))
			index_exists = false;
	}

	// Check the fingerprint.  If it is wrong or the file is missing,
	// declare the index non-existent.
	if (index_exists && fingerprint_exists) {
		string fingerprint_from_file;
		using (StreamReader sr = new StreamReader (fingerprint_file)) {
			fingerprint_from_file = sr.ReadLine ();
		}

		if (fingerprint == null) {
			// The caller has no opinion: adopt what is on disk.
			fingerprint = fingerprint_from_file;
		} else if (fingerprint_from_file != fingerprint) {
			index_exists = false;
		}
	} else {
		index_exists = false;
	}

	// If our index doesn't exist, purge and rebuild the directory
	// structure.
	if (! index_exists) {

		if (Directory.Exists (top_dir)) {
			Logger.Log.Debug ("Purging {0}", top_dir);
			Directory.Delete (top_dir, true);
		}

		Directory.CreateDirectory (top_dir);
		Directory.CreateDirectory (index_dir);
		Directory.CreateDirectory (lock_dir);

		using (StreamWriter sw = new StreamWriter (fingerprint_file, false)) {
			sw.WriteLine (fingerprint);
		}

		using (StreamWriter sw = new StreamWriter (version_file, false)) {
			sw.WriteLine (VERSION);
		}
	}

	store = Lucene.Net.Store.FSDirectory.GetDirectory (index_dir, lock_dir, false);

	if (! index_exists) {
		// This creates the index if it doesn't exist
		IndexWriter writer = new IndexWriter (store, null, true);
		writer.Close ();
	}
}
// Build the Lucene document for a pending add (p.Name must not be null).
//
// The document carries a "Uid" keyword field with the short-string form
// of the Guid, plus up to three "Name" fields: the name itself, the name
// without its extension (only if that differs), and the fuzzy-split form
// of the extension-less name (only if it differs from both).
private Document ToLuceneDocument (PendingOperation p)
{
	Document result = new Document ();

	result.Add (Field.Keyword ("Uid", GuidFu.ToShortString (p.UniqueId)));
	result.Add (Field.Text ("Name", p.Name));

	string stem = Path.GetFileNameWithoutExtension (p.Name);
	if (stem != p.Name)
		result.Add (Field.UnStored ("Name", stem));

	string pieces = String.Join (" ", StringFu.FuzzySplit (stem));
	if (pieces != stem && pieces != p.Name)
		result.Add (Field.UnStored ("Name", pieces));

	return result;
}
// Queue an add (or, when name is null, a removal) for unique_id.
// Nothing touches the index until Flush is called.  A later operation
// for the same unique_id replaces an earlier one that is still pending.
//
// An empty Guid is rejected: the attempt is logged and ignored rather
// than raised as an exception.
public void Add (Guid unique_id, string name)
{
	if (unique_id == Guid.Empty) {
		Logger.Log.Debug (String.Format ("Attempt to add '{0}' to the NameIndex with unique_id=Guid.Empty", name));
		return;
	}

	if (Debug && name != null) {
		Logger.Log.Debug ("NameIndex.Add: {0} '{1}'",
				  GuidFu.ToShortString (unique_id), name);
	}

	PendingOperation op = new PendingOperation ();
	op.UniqueId = unique_id;
	op.Name = name;

	pending [unique_id] = op;
}
// Queue the removal of unique_id from the index; takes effect at the
// next Flush.  An empty Guid is logged and ignored.
public void Remove (Guid unique_id)
{
	if (unique_id == Guid.Empty) {
		Logger.Log.Debug ("Attempt to remove unique_id=Guid.Empty from the NameIndex");
		return;
	}

	if (Debug) {
		Logger.Log.Debug ("NameIndex.Remove: {0}",
				  GuidFu.ToShortString (unique_id));
	}

	// A pending operation whose name is null is a removal.
	Add (unique_id, null);
}
// Apply every pending operation to the on-disk index and empty the
// pending table.
//
// The work is done in two passes: first all documents whose Uid matches
// a pending operation are deleted, then a document is written for each
// pending add.  Once more than optimize_threshold documents have been
// added since the last optimization, the index is optimized as well.
//
// The reader and the writer are closed in 'finally' blocks so that a
// failure part-way through does not leave them open on the index.  In
// that case the pending table is left untouched.
public void Flush ()
{
	if (pending.Count == 0) {
		if (Debug)
			Logger.Log.Debug ("NameIndex.Flush: nothing to do");
		return;
	}

	if (Debug)
		Logger.Log.Debug ("NameIndex.Flush: starting");

	Stopwatch sw = new Stopwatch ();
	sw.Start ();

	// This code:
	// (1) Makes sure there is only one record per uid for things we are adding
	// (2) Gets rid of things we are removing
	IndexReader reader = IndexReader.Open (store);
	try {
		foreach (PendingOperation p in pending.Values) {
			Term term = new Term ("Uid", GuidFu.ToShortString (p.UniqueId));
			reader.Delete (term);
		}
	} finally {
		reader.Close ();
	}

	bool did_optimize = false;
	IndexWriter writer = new IndexWriter (store, analyzer, false);
	try {
		foreach (PendingOperation p in pending.Values) {

			// Removals were fully handled by the delete pass above.
			if (p.Name == null)
				continue;

			Document doc = ToLuceneDocument (p);
			writer.AddDocument (doc);

			++adds_since_last_optimize;
		}

		// FIXME: What should be the correct policy for optimizing this index?
		if (adds_since_last_optimize > optimize_threshold) {
			writer.Optimize ();
			adds_since_last_optimize = 0;
			did_optimize = true;
		}
	} finally {
		writer.Close ();
	}

	sw.Stop ();

	if (Debug)
		Logger.Log.Debug ("NameIndex.Flush: Add{0} of {1} took {2}",
				  did_optimize ? "+Optimize" : "",
				  pending.Count,
				  sw);

	pending.Clear ();
}
279 ///////////////////////////////////////////////////////////////////////////////////////////
// Turn free text into a Lucene query against a single field.
//
// The text is tokenized with the class's shared analyzer (the same one
// used when writing the index).  The result is null when no tokens come
// out, a TermQuery for exactly one token, and a PhraseQuery holding the
// tokens in order when there are several.
//
// The token-extraction loop follows Lucene's own query parser: an
// IOException while reading simply ends the token list, and one thrown
// while closing the stream is ignored.
static private LNS.Query NewTokenizedQuery (string field, string text)
{
	ArrayList term_texts = new ArrayList ();
	TokenStream stream = analyzer.TokenStream (field, new StringReader (text));

	for (;;) {
		Lucene.Net.Analysis.Token token;
		try {
			token = stream.Next ();
		} catch (IOException) {
			token = null;
		}

		if (token == null)
			break;

		term_texts.Add (token.TermText ());
	}

	try {
		stream.Close ();
	} catch (IOException) {
		// ignore
	}

	if (term_texts.Count == 0)
		return null;

	if (term_texts.Count == 1)
		return new LNS.TermQuery (new Term (field, (string) term_texts [0]));

	LNS.PhraseQuery phrase = new LNS.PhraseQuery ();
	foreach (string term_text in term_texts)
		phrase.Add (new Term (field, term_text));

	return phrase;
}
// Build a query that matches any document whose "Uid" is named by one
// of the given uid: Uris.  Returns null for a null or empty collection.
//
// Every Uri contributes one optional TermQuery on its LocalPath (for a
// uid: Uri that is the short-string form of the Guid).  Whenever the
// current BooleanQuery reaches the maximum clause count, it is wrapped
// as the first clause of a fresh BooleanQuery and filling continues
// there.
static public LNS.Query ToUidQuery (ICollection list_of_uris)
{
	if (list_of_uris == null || list_of_uris.Count == 0)
		return null;

	int limit = LNS.BooleanQuery.GetMaxClauseCount ();
	LNS.BooleanQuery current = new LNS.BooleanQuery ();
	int used = 0;

	foreach (Uri uri in list_of_uris) {
		current.Add (new LNS.TermQuery (new Term ("Uid", uri.LocalPath)), false, false);
		++used;

		if (used != limit)
			continue;

		// The current query is full: nest it inside a new one.
		LNS.BooleanQuery outer = new LNS.BooleanQuery ();
		outer.Add (current, false, false);
		current = outer;
		used = 1;
	}

	return current;
}
// Translate a Beagle query into a Lucene query over the "Name" field,
// optionally restricted to a set of uid: Uris.
//
// Returns null when the query has no text, or when none of its text
// parts produces any tokens.  Only QueryPart_Text parts are considered.
// If uris_to_search yields a uid query, the result requires both the
// name query and the uid query to match.
static private LNS.Query ToLuceneQuery (Query query, ICollection uris_to_search)
{
	if (query.Text.Count == 0)
		return null;

	LNS.BooleanQuery name_query = new LNS.BooleanQuery ();
	int parts_used = 0;

	foreach (QueryPart abstract_part in query.Parts) {
		QueryPart_Text text_part = abstract_part as QueryPart_Text;
		if (text_part == null)
			continue;

		LNS.Query part_query = NewTokenizedQuery ("Name", text_part.Text);
		if (part_query == null)
			continue;

		// FIXME: This is wrong.
		name_query.Add (part_query,
				text_part.Logic == QueryPartLogic.Required,
				text_part.Logic == QueryPartLogic.Prohibited);
		++parts_used;
	}

	if (parts_used == 0)
		return null;

	// If a list of Uris is specified, we must match one of them.
	LNS.Query uid_query = ToUidQuery (uris_to_search);
	if (uid_query == null)
		return name_query;

	LNS.BooleanQuery both = new LNS.BooleanQuery ();
	both.Add (name_query, true, false);
	both.Add (uid_query, true, false);
	return both;
}
// Run a query against the index and return a collection of uid: Uris,
// one per hit, in the order Lucene reports the hits.
//
// query:          the Beagle query; only its text parts are used.
// uris_to_search: optional collection of uid: Uris to restrict the
//                 search to; null or empty means no restriction.
//
// Returns an empty Uri array when the query cannot be translated into a
// Lucene query.  The searcher is closed even if the search throws.
public ICollection Search (Query query, ICollection uris_to_search)
{
	LNS.Query lucene_query = ToLuceneQuery (query, uris_to_search);
	if (lucene_query == null)
		return new Uri [0]; // same element type as the normal result

	LNS.Searcher searcher = new LNS.IndexSearcher (store);
	try {
		LNS.Hits hits = searcher.Search (lucene_query);

		int n_hits = hits.Length ();
		Uri [] uids = new Uri [n_hits];

		for (int i = 0; i < n_hits; ++i) {
			Document doc = hits.Doc (i);
			uids [i] = GuidFu.FromShortStringToUri (doc.Get ("Uid"));
		}

		return uids;
	} finally {
		searcher.Close ();
	}
}
413 //////////////////////////////////////////////////////////////////////////////////
415 // Pull data out of the NameIndex in bulk -- useful for sanity checks
416 // and debugging
// One entry read back from the index by GetManyByUniqueId.
public struct Record {

	// Guid rebuilt from the document's "Uid" field.
	public Guid UniqueId;

	// Value of the document's "Name" field.
	public string Name;
}
// Look up many unique ids at once and return a Record (Guid plus name)
// for every matching document.  Ids that are not in the index simply
// produce no record, so the result may be shorter than the input.
//
// A null or empty id array yields an empty result without touching the
// index.  The searcher is closed even if the search throws.
public Record [] GetManyByUniqueId (Guid [] unique_ids)
{
	if (unique_ids == null || unique_ids.Length == 0)
		return new Record [0];

	LNS.BooleanQuery query = new LNS.BooleanQuery ();
	int max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
	int clause_count = 0;

	foreach (Guid uid in unique_ids) {
		Term term = new Term ("Uid", GuidFu.ToShortString (uid));
		LNS.Query term_query = new LNS.TermQuery (term);
		query.Add (term_query, false, false);
		++clause_count;
		// If we have too many clauses, nest the queries
		if (clause_count == max_clauses) {
			LNS.BooleanQuery new_query = new LNS.BooleanQuery ();
			new_query.Add (query, false, false);
			query = new_query;
			clause_count = 1;
		}
	}

	LNS.Searcher searcher = new LNS.IndexSearcher (store);
	try {
		LNS.Hits hits = searcher.Search (query);
		int n_hits = hits.Length ();

		Record [] records = new Record [n_hits];
		for (int i = 0; i < n_hits; ++i) {
			Document doc = hits.Doc (i);
			records [i].UniqueId = GuidFu.FromShortString (doc.Get ("Uid"));
			records [i].Name = doc.Get ("Name");
		}

		return records;
	} finally {
		searcher.Close ();
	}
}
459 //////////////////////////////////////////////////////////////////////////////////
// Debugging aid: print the "Uid" of every non-deleted document in the
// index to the console, one per line.  The reader is closed even if
// reading a document or writing to the console throws.
public void SpewIndex ()
{
	IndexReader reader = IndexReader.Open (store);
	try {
		int N = reader.MaxDoc ();

		for (int i = 0; i < N; ++i) {
			if (! reader.IsDeleted (i)) {
				Document doc = reader.Document (i);
				Console.WriteLine (doc.Get ("Uid"));
			}
		}
	} finally {
		reader.Close ();
	}
}