4 // Copyright (C) 2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // Remember when I said that LuceneDriver.cs should be the only source
29 // code that knew about Lucene internals? I lied.
33 using System
.Collections
;
36 using Lucene
.Net
.Analysis
;
37 using Lucene
.Net
.Analysis
.Standard
;
38 using Lucene
.Net
.Documents
;
39 using Lucene
.Net
.Index
;
40 using Lucene
.Net
.QueryParsers
;
41 using LNS
= Lucene
.Net
.Search
;
45 namespace Beagle
.Daemon
{
47 public class NameIndex
{
// Verbose-logging switch: Add checks this flag before logging each
// non-null name it queues.
49 static public bool Debug
= false;
51 // This is just a standard analyzer combined with the Porter stemmer.
52 // FIXME: This assumes everything being indexed is in English!
53 private class NameAnalyzer
: StandardAnalyzer
{
// Builds the token stream for a field: takes StandardAnalyzer's stream
// for (fieldName, reader) and wraps it in a PorterStemFilter.
// NOTE(review): the statement returning outstream is not visible in
// this excerpt -- confirm.
54 public override TokenStream
TokenStream (String fieldName
, TextReader reader
)
56 TokenStream outstream
= base.TokenStream (fieldName
, reader
);
57 outstream
= new PorterStemFilter (outstream
);
// Analyzer shared by all instances: Flush hands it to the IndexWriter and
// NewTokenizedQuery uses it to tokenize query text.
62 static Analyzer analyzer
;
64 // If Name is null, this is a removal, otherwise it is an add.
// NOTE(review): the class body is not visible in this excerpt; other code
// in this file reads and writes its UniqueId and Name members.
65 private class PendingOperation
{
// On-disk format version. The constructor compares it with the contents of
// the "version" file and rebuilds the index on a mismatch.
70 const int VERSION
= 1;
// Lucene directory holding the index; assigned in the constructor.
72 Lucene
.Net
.Store
.FSDirectory store
;
// Queued PendingOperations keyed by UniqueId, so a newer operation for the
// same id replaces the older one.
74 Hashtable pending
= new Hashtable ();
// Number of documents added since the counter was last reset; it is reset
// once it exceeds optimize_threshold.
76 int adds_since_last_optimize
= 0;
77 const int optimize_threshold
= 5000;
// NOTE(review): the header of the member enclosing this assignment is not
// visible in this excerpt -- presumably the static constructor; confirm.
81 analyzer
= new NameAnalyzer ();
// Opens the name index stored under <directory>/NameIndex, purging and
// recreating it when the on-disk version or fingerprint does not match.
//
// directory:   parent directory; everything lives in its "NameIndex" child.
// fingerprint: expected fingerprint of the index. When null and a
//              fingerprint file exists, the stored fingerprint is adopted.
84 public NameIndex (string directory
, string fingerprint
)
// Layout: NameIndex/{Index, Locks, version, fingerprint}.
86 string top_dir
= Path
.Combine (directory
, "NameIndex");
87 string index_dir
= Path
.Combine (top_dir
, "Index");
88 string lock_dir
= Path
.Combine (top_dir
, "Locks");
89 string version_file
= Path
.Combine (top_dir
, "version");
90 string fingerprint_file
= Path
.Combine (top_dir
, "fingerprint");
// The presence of Index/segments is used as the test for an existing index.
91 string index_test_file
= Path
.Combine (index_dir
, "segments");
93 bool version_exists
= File
.Exists (version_file
);
94 bool fingerprint_exists
= File
.Exists (fingerprint_file
);
95 bool index_exists
= File
.Exists (index_test_file
);
97 // Check the index's version number. If it is wrong,
98 // declare the index non-existent.
99 if (version_exists
&& index_exists
) {
100 StreamReader sr
= new StreamReader (version_file
);
101 string version_str
= sr
.ReadLine ();
// Only the first line of the file is compared, as a string, with VERSION.
104 if (version_str
!= Convert
.ToString (VERSION
))
105 index_exists
= false;
108 // Check the fingerprint. If it is wrong, declare the
109 // index non-existent.
110 if (index_exists
&& fingerprint_exists
) {
111 StreamReader sr
= new StreamReader (fingerprint_file
);
112 string fingerprint_from_file
= sr
.ReadLine ();
// A null fingerprint means "accept whatever is stored".
114 if (fingerprint
== null) {
115 fingerprint
= fingerprint_from_file
;
117 } else if (fingerprint_from_file
!= fingerprint
)
118 index_exists
= false;
// NOTE(review): the branch keyword guarding the next assignment is not
// visible in this excerpt -- presumably the else of the
// (index_exists && fingerprint_exists) test above; confirm.
120 index_exists
= false;
123 // If our index doesn't exist, purge and rebuild the directory
125 if (! index_exists
) {
127 if (Directory
.Exists (top_dir
)) {
128 Logger
.Log
.Debug ("Purging {0}", top_dir
);
129 Directory
.Delete (top_dir
, true);
132 Directory
.CreateDirectory (top_dir
);
133 Directory
.CreateDirectory (index_dir
);
134 Directory
.CreateDirectory (lock_dir
);
// Record the fingerprint and version of the freshly created index.
136 StreamWriter sw
= new StreamWriter (fingerprint_file
, false);
137 sw
.WriteLine (fingerprint
);
140 sw
= new StreamWriter (version_file
, false);
141 sw
.WriteLine (VERSION
);
146 store
= Lucene
.Net
.Store
.FSDirectory
.GetDirectory (index_dir
, lock_dir
, false);
148 if (! index_exists
) {
149 // This creates the index if it doesn't exist
150 IndexWriter writer
= new IndexWriter (store
, null, true);
// Builds the Lucene Document for a pending operation. Visible fields:
//   "Uid"  - keyword field holding the short-string form of p.UniqueId
//   "Name" - text field holding p.Name
//   "Name" - unstored field for the name without its extension, only
//            when that differs from p.Name
//   "Name" - unstored field for the fuzzy-split form of the extension-less
//            name, only when it differs from both other variants
// NOTE(review): the declarations of doc/f, the calls that attach each
// field to doc, and the return statement are not visible in this excerpt.
155 private Document
ToLuceneDocument (PendingOperation p
)
160 doc
= new Document ();
162 f
= Field
.Keyword ("Uid", GuidFu
.ToShortString (p
.UniqueId
));
165 f
= Field
.Text ("Name", p
.Name
);
168 string name_noext
= Path
.GetFileNameWithoutExtension (p
.Name
);
169 if (name_noext
!= p
.Name
) {
170 f
= Field
.UnStored ("Name", name_noext
);
// Join the pieces returned by StringFu.FuzzySplit with single spaces.
174 string name_split
= String
.Join (" ", StringFu
.FuzzySplit (name_noext
));
175 if (name_split
!= name_noext
&& name_split
!= p
.Name
) {
176 f
= Field
.UnStored ("Name", name_split
);
// Queues an operation for unique_id in the pending table. Remove calls
// this with a null name, which marks the operation as a removal.
// The table is keyed by UniqueId, so a later call for the same id
// replaces the earlier queued operation.
184 public void Add (Guid unique_id
, string name
)
// Guid.Empty is reported through the debug log; the exception is
// deliberately commented out.
// NOTE(review): whether the method returns after logging is not visible
// in this excerpt -- confirm.
186 if (unique_id
== Guid
.Empty
) {
187 string msg
= String
.Format ("Attempt to add '{0}' to the NameIndex with unique_id=Guid.Empty", name
);
188 //throw new Exception (msg);
189 Logger
.Log
.Debug (msg
);
// Only real adds (non-null name) are logged here.
193 if (Debug
&& name
!= null)
194 Logger
.Log
.Debug ("NameIndex.Add: {0} '{1}'",
195 GuidFu
.ToShortString (unique_id
), name
);
// NOTE(review): the assignment of name to the operation is not visible
// in this excerpt.
197 PendingOperation p
= new PendingOperation ();
198 p
.UniqueId
= unique_id
;
200 pending
[p
.UniqueId
] = p
;
// Queues the removal of unique_id by delegating to Add with a null name.
204 public void Remove (Guid unique_id
)
// Guid.Empty is reported through the debug log; the exception is
// deliberately commented out.
// NOTE(review): whether the method returns after logging is not visible
// in this excerpt -- confirm.
206 if (unique_id
== Guid
.Empty
) {
207 string msg
= "Attempt to remove unique_id=Guid.Empty from the NameIndex";
208 //throw new Exception ("Attempt to remove unique_id=Guid.Empty from the NameIndex");
209 Logger
.Log
.Debug (msg
);
// NOTE(review): any condition guarding this log call (e.g. the Debug flag)
// is not visible in this excerpt.
214 Logger
.Log
.Debug ("NameIndex.Remove: {0}",
215 GuidFu
.ToShortString (unique_id
));
217 Add (unique_id
, null);
// NOTE(review): the method header is not visible in this excerpt; the log
// messages below identify this body as NameIndex.Flush.
// Applies the queued operations in two passes: first delete every
// document whose "Uid" matches a pending operation, then add documents
// through an IndexWriter opened on the existing index.
222 if (pending
.Count
== 0) {
224 Logger
.Log
.Debug ("NameIndex.Flush: nothing to do");
229 Logger
.Log
.Debug ("NameIndex.Flush: starting");
231 Stopwatch sw
= new Stopwatch ();
235 // (1) Makes sure there is only one record per uid for things we are adding
236 // (2) Gets rid of things we are removing
237 IndexReader reader
= IndexReader
.Open (store
);
238 foreach (PendingOperation p
in pending
.Values
) {
239 Term term
= new Term ("Uid", GuidFu
.ToShortString (p
.UniqueId
));
240 reader
.Delete (term
);
245 bool did_optimize
= false;
// The writer is opened with create=false so the existing index is kept.
246 IndexWriter writer
= new IndexWriter (store
, analyzer
, false);
// NOTE(review): how removals (operations with a null Name) are skipped
// in this loop is not visible in this excerpt -- confirm.
248 foreach (PendingOperation p
in pending
.Values
) {
253 Document doc
= ToLuceneDocument (p
);
254 writer
.AddDocument (doc
);
256 ++adds_since_last_optimize
;
259 // FIXME: What should be the correct policy for optimizing this index?
// NOTE(review): the optimize call itself is not visible in this excerpt;
// only the counter reset is.
260 if (adds_since_last_optimize
> optimize_threshold
) {
262 adds_since_last_optimize
= 0;
271 Logger
.Log
.Debug ("NameIndex.Flush: Add{0} of {1} took {2}",
272 did_optimize
? "+Optimize" : "",
279 ///////////////////////////////////////////////////////////////////////////////////////////
// Tokenizes text with the shared analyzer and builds a query on field
// from the tokens: a TermQuery for exactly one token, a PhraseQuery
// containing every token in order for more than one.
// NOTE(review): the declaration of q, the token-reading loop and the
// return statement are not visible in this excerpt. ToLuceneQuery
// null-checks the result, so q is presumably null when there are no
// tokens -- confirm.
281 static private LNS
.Query
NewTokenizedQuery (string field
, string text
)
283 ArrayList tokens
= new ArrayList ();
285 // Use the analyzer to extract the query's tokens.
286 // This code is taken from Lucene's query parser.
287 // We use the shared static analyzer (a NameAnalyzer: StandardAnalyzer plus Porter stemming).
288 TokenStream source
= analyzer
.TokenStream (field
, new StringReader (text
));
290 Lucene
.Net
.Analysis
.Token t
;
293 } catch (IOException
) {
298 tokens
.Add (t
.TermText ());
302 } catch (IOException
) {
307 if (tokens
.Count
== 1) {
308 Term t
= new Term (field
, (string) tokens
[0]);
309 q
= new LNS
.TermQuery (t
);
310 } else if (tokens
.Count
> 1) {
311 q
= new LNS
.PhraseQuery ();
312 foreach (string tokenStr
in tokens
) {
313 Term t
= new Term (field
, tokenStr
);
314 ((LNS
.PhraseQuery
) q
).Add (t
);
// Builds a BooleanQuery with one TermQuery on "Uid" per Uri in
// list_of_uris. Each clause is added with both flags false (presumably
// required=false, prohibited=false -- confirm against the Lucene API).
// NOTE(review): the result for a null or empty list, the clause_count
// updates and the return statement are not visible in this excerpt.
// ToLuceneQuery null-checks the result.
321 static public LNS
.Query
ToUidQuery (ICollection list_of_uris
)
323 if (list_of_uris
== null || list_of_uris
.Count
== 0)
326 LNS
.BooleanQuery query
= new LNS
.BooleanQuery ();
327 int max_clauses
= LNS
.BooleanQuery
.GetMaxClauseCount ();
328 int clause_count
= 0;
330 foreach (Uri uri
in list_of_uris
) {
332 // The localpath of a uid: Uri is the short-string version
334 Term term
= new Term ("Uid", uri
.LocalPath
);
336 LNS
.Query term_query
= new LNS
.TermQuery (term
);
337 query
.Add (term_query
, false, false);
339 // If we have too many clauses, nest the queries
// The query built so far becomes a single clause of a fresh BooleanQuery.
340 if (clause_count
== max_clauses
) {
341 LNS
.BooleanQuery new_query
= new LNS
.BooleanQuery ();
342 new_query
.Add (query
, false, false);
// Translates a Beagle Query into a Lucene query over the "Name" field,
// optionally restricted to the uids in uris_to_search.
// Each QueryPart_Text part is tokenized by NewTokenizedQuery and added
// with flags taken from its Logic. If ToUidQuery yields a query, the name
// query and the uid query are combined, each added with its first flag
// true.
// NOTE(review): the results for an empty query.Text and for the case
// where no part was used, and the final return, are not visible in this
// excerpt. Search null-checks the result.
352 static private LNS
.Query
ToLuceneQuery (Query query
, ICollection uris_to_search
)
354 if (query
.Text
.Count
== 0)
357 LNS
.BooleanQuery lucene_query
= new LNS
.BooleanQuery ();
// Set once at least one text part produced a non-null Lucene query.
358 bool used_any_part
= false;
360 foreach (QueryPart abstract_part
in query
.Parts
) {
361 if (abstract_part
is QueryPart_Text
) {
362 QueryPart_Text part
= (QueryPart_Text
) abstract_part
;
363 LNS
.Query part_query
;
364 part_query
= NewTokenizedQuery ("Name", part
.Text
);
365 if (part_query
!= null) {
366 lucene_query
.Add (part_query
,
367 // FIXME: This is wrong.
368 part
.Logic
== QueryPartLogic
.Required
,
369 part
.Logic
== QueryPartLogic
.Prohibited
);
370 used_any_part
= true;
378 // If a list of Uris is specified, we must match one of them.
379 LNS
.Query uid_query
= ToUidQuery (uris_to_search
);
380 if (uid_query
!= null) {
381 LNS
.BooleanQuery combined_query
= new LNS
.BooleanQuery ();
382 combined_query
.Add (lucene_query
, true, false);
383 combined_query
.Add (uid_query
, true, false);
384 lucene_query
= combined_query
;
390 // Return a collection of uid: Uris.
// Runs query (optionally restricted to uris_to_search) against the index
// and converts the "Uid" field of every hit into a Uri.
// When ToLuceneQuery yields null, an empty string array is returned
// (note: string[], not Uri[]).
// NOTE(review): the return of uids and any closing of the searcher are
// not visible in this excerpt.
391 public ICollection
Search (Query query
, ICollection uris_to_search
)
393 LNS
.Query lucene_query
= ToLuceneQuery (query
, uris_to_search
);
394 if (lucene_query
== null)
395 return new string [0];
397 LNS
.Searcher searcher
= new LNS
.IndexSearcher (store
);
398 LNS
.Hits hits
= searcher
.Search (lucene_query
);
400 int n_hits
= hits
.Length ();
401 Uri
[] uids
= new Uri
[n_hits
];
403 for (int i
= 0; i
< n_hits
; ++i
) {
404 Document doc
= hits
.Doc (i
);
405 uids
[i
] = GuidFu
.FromShortStringToUri (doc
.Get ("Uid"));
413 //////////////////////////////////////////////////////////////////////////////////
415 // Pull data out of the NameIndex in bulk -- useful for sanity checks
// One entry read back from the index by GetManyByUniqueId.
// NOTE(review): only UniqueId is visible in this excerpt;
// GetManyByUniqueId also assigns a Name member.
418 public struct Record
{
419 public Guid UniqueId
;
// Looks up the given Guids in the index. Builds a BooleanQuery with one
// TermQuery on "Uid" per id (each added with both flags false), runs it,
// and fills one Record per hit from the hit's "Uid" and "Name" fields.
// NOTE(review): the clause_count updates and the return of records are
// not visible in this excerpt.
423 public Record
[] GetManyByUniqueId (Guid
[] unique_ids
)
425 LNS
.BooleanQuery query
= new LNS
.BooleanQuery ();
426 int max_clauses
= LNS
.BooleanQuery
.GetMaxClauseCount ();
427 int clause_count
= 0;
429 foreach (Guid uid
in unique_ids
) {
430 Term term
= new Term ("Uid", GuidFu
.ToShortString (uid
));
431 LNS
.Query term_query
= new LNS
.TermQuery (term
);
432 query
.Add (term_query
, false, false);
434 // If we have too many clauses, nest the queries
// The query built so far becomes a single clause of a fresh BooleanQuery.
435 if (clause_count
== max_clauses
) {
436 LNS
.BooleanQuery new_query
= new LNS
.BooleanQuery ();
437 new_query
.Add (query
, false, false);
443 LNS
.Searcher searcher
= new LNS
.IndexSearcher (store
);
444 LNS
.Hits hits
= searcher
.Search (query
);
445 int n_hits
= hits
.Length ();
447 Record
[] records
= new Record
[n_hits
];
448 for (int i
= 0; i
< n_hits
; ++i
) {
449 Document doc
= hits
.Doc (i
);
450 records
[i
].UniqueId
= GuidFu
.FromShortString (doc
.Get ("Uid"));
451 records
[i
].Name
= doc
.Get ("Name");
459 //////////////////////////////////////////////////////////////////////////////////
// Debugging aid: walks every document slot in the index, skips deleted
// ones, and writes each remaining document's "Uid" to the console.
// NOTE(review): the rest of the method (any further output, closing the
// reader) is not visible in this excerpt.
461 public void SpewIndex ()
463 IndexReader reader
= IndexReader
.Open (store
);
// MaxDoc bounds the document numbers, including deleted slots, hence the
// IsDeleted check below.
464 int N
= reader
.MaxDoc ();
466 for (int i
= 0; i
< N
; ++i
) {
467 if (! reader
.IsDeleted (i
)) {
468 Document doc
= reader
.Document (i
);
469 Console
.WriteLine (doc
.Get ("Uid"));