Tokenize 001234 as 1234. Include a testing function in NoiseFilter to figure out...
[beagle.git] / beagled / DumpIndex.cs
blob6ab447e59541388c29f0cfd94640344121327220
1 //
2 // DumpIndex.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in all
16 // copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 // SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Net;
33 using Beagle;
34 using Beagle.Util;
35 using Beagle.Daemon;
36 using Hit = Beagle.Hit;
38 using Lucene.Net.Index;
39 using Lucene.Net.Search;
40 using Lucene.Net.Documents;
42 class DumpIndexTool {
// Orders Hit objects by the string form of their Uri, for use with
// ArrayList.Sort.
public class HitByUriComparer : IComparer {

	// Casts both arguments to Hit and compares Uri.ToString () of each
	// with String.Compare.  Anything that is not a Hit makes the cast throw.
	public int Compare (object a, object b)
	{
		// All of this mapping to and from strings is dreadful.
		string left_uri = ((Hit) a).Uri.ToString ();
		string right_uri = ((Hit) b).Uri.ToString ();

		return String.Compare (left_uri, right_uri);
	}
}
// Builds a path for a hit by joining the "beagle:ExactFilename" of the
// hit onto the path of its parent, recursively.  The parent is looked up
// in all_hits_by_uri under the hit's "_private:ParentDirUri" property;
// a hit without that property ends the recursion and contributes just
// its own name.
static string RemapUriToPath (Hashtable all_hits_by_uri, Hit hit)
{
	string own_name = hit.GetFirstProperty ("beagle:ExactFilename");
	string parent_key = hit.GetFirstProperty ("_private:ParentDirUri");

	// No parent recorded: this hit is the top of the chain.
	if (parent_key == null)
		return own_name;

	Hit parent_hit = (Hit) all_hits_by_uri [parent_key];
	string parent_path = RemapUriToPath (all_hits_by_uri, parent_hit);

	return Path.Combine (parent_path, own_name);
}
// Writes every hit of the named index to stdout, sorted by Uri, and
// returns how many hits there were.
//
// When only_dump_the_urls is true each hit is a single
// "<index>: <uri>" line.  Otherwise the index, Uri, optional parent
// Uri, mime type, type and all properties whose key does not start
// with "_private:" are printed, followed by a blank line.
static int DumpOneIndex_Metadata (string index_name, bool only_dump_the_urls)
{
	Console.WriteLine (); // a visual cue that something has changed

	LuceneQueryingDriver driver = new LuceneQueryingDriver (index_name, -1, true);
	Hashtable hits_by_uri = driver.GetAllHitsByUri ();
	ArrayList sorted_hits = new ArrayList (hits_by_uri.Values);

	// A hard-wired hack: for the file system index, replace each
	// hit's Uri with a file Uri rebuilt from its chain of parents.
	if (index_name == "FileSystemIndex") {
		foreach (Hit fs_hit in sorted_hits) {
			string remapped_path = RemapUriToPath (hits_by_uri, fs_hit);
			fs_hit.Uri = UriFu.PathToFileUri (remapped_path);
		}
	}

	sorted_hits.Sort (new HitByUriComparer ());

	foreach (Hit hit in sorted_hits) {

		if (only_dump_the_urls) {
			Console.WriteLine ("{0}: {1}", index_name, hit.Uri);
		} else {
			Console.WriteLine (" Index: {0}", index_name);
			Console.WriteLine (" Uri: {0}", hit.Uri);
			if (hit.ParentUri != null)
				Console.WriteLine ("Parent: {0}", hit.ParentUri);
			Console.WriteLine (" MimeT: {0}", hit.MimeType);
			Console.WriteLine (" Type: {0}", hit.Type);

			// Sort a copy so the properties come out in a stable order.
			ArrayList sorted_props = new ArrayList (hit.Properties);
			sorted_props.Sort ();
			foreach (Property prop in sorted_props) {
				if (prop.Key.StartsWith ("_private:"))
					continue;
				Console.WriteLine (" Prop: {0} = '{1}'", prop.Key, prop.Value);
			}

			Console.WriteLine ();
		}
	}

	return sorted_hits.Count;
}
// Term at which DumpOneIndex_TermFrequencies starts enumerating.
// DumpIndexInformation assigns it before dumping term frequencies.
static Term initial_enum_term;

// Dump the term frequencies: we do this via direct Lucene access.
//
// Opens the primary store of the named index and prints one
// "<index> <term text> <doc frequency>" line for each term of the
// "Text" field, starting at initial_enum_term, then a blank line.
// The reader and the term enumeration are closed even if the
// enumeration throws part-way through.
static void DumpOneIndex_TermFrequencies (string index_name)
{
	LuceneQueryingDriver driver = new LuceneQueryingDriver (index_name, -1, true);

	IndexReader reader = IndexReader.Open (driver.PrimaryStore);
	try {
		TermEnum term_enum = reader.Terms (initial_enum_term);
		try {
			// from LuceneFAQ
			// Terms are sorted first by field, then by text
			// so all terms with a given field are adjacent in enumerations.
			if (term_enum.Term () != null) {
				while (term_enum.Term ().Field () == "Text") {
					int freq = term_enum.DocFreq ();

					Console.WriteLine ("{0} {1} {2}", index_name, term_enum.Term ().Text (), freq);

					// FIXME: also report the number of distinct terms
					// and the summed frequencies as a count.

					if (! term_enum.Next ())
						break;
				}
			}
		} finally {
			term_enum.Close ();
		}
	} finally {
		reader.Close ();
	}

	Console.WriteLine ();
}
// Name of one index together with the number of hits counted in it.
// Instances sort by Name.
public class IndexInfo : IComparable {

	// Index name, as passed to the constructor.
	public string Name;

	// Hit count; stays at 0 until a caller assigns it.
	public int Count;

	public IndexInfo (string name)
	{
		this.Name = name;
	}

	// Compares this.Name with the Name of obj using String.Compare.
	// obj must be an IndexInfo; the cast throws otherwise.
	public int CompareTo (object obj)
	{
		string other_name = ((IndexInfo) obj).Name;
		return String.Compare (Name, other_name);
	}
}
// Dumps every index found as a subdirectory of PathFinder.IndexDir,
// in name order.
//
// Mode.Uris and Mode.Properties go through DumpOneIndex_Metadata and
// record a hit count per index; any other mode goes through
// DumpOneIndex_TermFrequencies.  If show_counts is set and at least one
// count was recorded, a "FINAL COUNTS" table is printed at the end.
static void DumpIndexInformation (Mode mode, bool show_counts)
{
	ArrayList infos = new ArrayList ();

	DirectoryInfo index_root = new DirectoryInfo (PathFinder.IndexDir);
	foreach (DirectoryInfo subdir in index_root.GetDirectories ())
		infos.Add (new IndexInfo (subdir.Name));

	infos.Sort ();

	// The term enumeration starts at the first term of the "Text" field.
	if (mode == Mode.TermFrequencies)
		initial_enum_term = new Term ("Text", "");

	bool dump_metadata = (mode == Mode.Uris || mode == Mode.Properties);
	bool uris_only = (mode == Mode.Uris);
	bool have_counts = false;

	foreach (IndexInfo info in infos) {
		if (dump_metadata) {
			info.Count = DumpOneIndex_Metadata (info.Name, uris_only);
			have_counts = true;
		} else {
			DumpOneIndex_TermFrequencies (info.Name);
		}
	}

	if (! show_counts || ! have_counts)
		return;

	Console.WriteLine ();
	Console.WriteLine ("FINAL COUNTS");

	foreach (IndexInfo info in infos) {
		string padded_count = info.Count.ToString ().PadLeft (7);
		Console.WriteLine ("{0} {1}", padded_count, info.Name);
	}
}
// An IQueryResult whose methods all do nothing: whatever is added or
// subtracted is ignored.
class DummyQueryResult : IQueryResult {

	// Intentionally empty.
	public void Add (ICollection hits)
	{
	}

	// Intentionally empty.
	public void Add (ICollection hits, int total_results)
	{
	}

	// Intentionally empty.
	public void Subtract (ICollection hits)
	{
	}
}
// Prints what the index stored in indexdir knows about one file or
// directory.
//
// path must exist on disk.  indexdir may be null, in which case the
// "FileSystemIndex" directory under PathFinder.IndexDir is used.  The
// file's attributes are read through a FileAttributesStore built from
// the index's "fingerprint" file; the matching documents are then
// looked up first by "uid:" Uri and, if that finds nothing, by file Uri.
// Problems are reported on stdout and the method simply returns.
static void DumpFileIndexInformation (string path, string indexdir)
{
	if ((! File.Exists (path)) && (! Directory.Exists (path))) {
		Console.WriteLine ("No such file or directory:" + path);
		return;
	}

	if (indexdir == null)
		// default is ~/.beagle/Indexes/FileSystemIndex
		indexdir = Path.Combine (PathFinder.IndexDir, "FileSystemIndex");
	if (! Directory.Exists (indexdir)) {
		Console.WriteLine ("Index:{0} doesnt exist.", indexdir);
		return;
	}

	// get fingerprint; the using block closes the file even if the
	// read throws
	string fingerprint;
	using (TextReader reader = new StreamReader (Path.Combine (indexdir, "fingerprint"))) {
		fingerprint = reader.ReadLine ();
	}

	// find out uid
	FileAttributesStore fa_store = new FileAttributesStore (new FileAttributesStore_Mixed (indexdir, fingerprint));
	Beagle.Daemon.FileAttributes attr = fa_store.Read (path);
	if (attr == null) {
		Console.WriteLine ("No information about this file in index. Ignoring.");
		return;
	}

	string uri_string = "uid:" + GuidFu.ToShortString (attr.UniqueId);
	Console.WriteLine ("Uri = " + uri_string);
	Console.WriteLine ("LastAttrTime:" + attr.LastAttrTime);
	Console.WriteLine ("LastWriteTime:" + attr.LastWriteTime);

	LuceneQueryingDriver driver;
	driver = new LuceneQueryingDriver (indexdir, -1, true);

	// first try for the Uri:"uid:xxxxxxxxxxxxxxx"
	Lucene.Net.Search.Query query = new TermQuery (new Term ("Uri", uri_string));
	if (DoQuery (driver, query))
		return;

	// else query by path - this is for static indexes
	path = UriFu.PathToFileUriString (path);
	Console.WriteLine ("Querying by:[" + path + "]");
	query = new TermQuery (new Term ("Uri", path));
	DoQuery (driver, query);
}
// Prints a banner followed by one line per field of doc, giving the
// field's name, string value, stored / tokenized flags and boost.
static void DumpDocumentFields (string banner, Document doc)
{
	Console.WriteLine (banner);

	foreach (Field f in doc.Fields ()) {

		String name = f.Name ();
		String val = f.StringValue ();
		bool stored = f.IsStored ();
		// A value starting with 's' is reported as searchable.  An empty
		// or missing value cannot carry that marker, so it is reported
		// as not searchable instead of indexing past the end of the string.
		bool searchable = (val != null && val.Length > 0 && val [0] == 's');
		bool tokenized = f.IsTokenized ();
		// For "prop:" fields the character after the prefix overrides
		// the tokenized flag reported by Lucene.
		// NOTE(review): presumably a property type code - confirm.
		if (name.Length >= 7 && name.StartsWith ("prop:"))
			tokenized = (name [5] != 't');
		float boost = f.GetBoost ();

		Console.WriteLine ("{0,-30} = [{1}] (stored? {2}, searchable? {3}, tokenized? {4}, boost={5})",
				   name, val, stored, searchable, tokenized, boost);
	}
}

// Runs query against the primary and secondary stores of driver,
// prints the number of hits from each, and dumps the fields of the
// matching document of each store that has exactly one hit.
//
// Returns true if either store produced at least one hit.  Both
// searchers are handed back to LuceneCommon even when searching or
// dumping throws.
static bool DoQuery (LuceneQueryingDriver driver, Lucene.Net.Search.Query query)
{
	IndexSearcher primary_searcher = LuceneCommon.GetSearcher (driver.PrimaryStore);
	IndexSearcher secondary_searcher = null;

	try {
		secondary_searcher = LuceneCommon.GetSearcher (driver.SecondaryStore);

		Hits primary_hits = primary_searcher.Search (query);
		Hits secondary_hits = secondary_searcher.Search (query);

		// Read the counts once, while the searchers are still held.
		int primary_count = primary_hits.Length ();
		int secondary_count = secondary_hits.Length ();
		Console.WriteLine ("{0} hits from primary store; {1} hits from secondary store", primary_count, secondary_count);

		// there should be exactly one primary hit and 0/1 secondary hit
		if (primary_count == 1)
			DumpDocumentFields (
				"-----------------------------------------[ Immutable data ]--------------------------------------",
				primary_hits.Doc (0));

		if (secondary_count == 1)
			DumpDocumentFields (
				"------------------------------------------[ Mutable data ]---------------------------------------",
				secondary_hits.Doc (0));

		return primary_count != 0 || secondary_count != 0;
	} finally {
		LuceneCommon.ReleaseSearcher (primary_searcher);
		if (secondary_searcher != null)
			LuceneCommon.ReleaseSearcher (secondary_searcher);
	}
}
// Selects what DumpIndexInformation prints for each index.
enum Mode {
	// One line per hit, giving only the index name and Uri.
	Uris = 0,
	// Full metadata for every hit, including its properties.
	Properties = 1,
	// Document frequency of each term in the "Text" field.
	TermFrequencies = 2
}
// Entry point.  Parses the command line, then either dumps every index
// (no file argument) or dumps what the index knows about one file.
//
// Any argument that is not a recognised option and does not start with
// "--indexdir=" is taken as the file to look up; if several are given
// the last one wins.
static void Main (string [] args)
{
	Mode mode = Mode.Uris;
	bool show_counts = true;
	string file = null;
	string indexdir = null;

	foreach (string arg in args) {

		switch (arg) {

		case "--help":
			Console.WriteLine (@"
beagle-dump-index [options] [ [--indexdir=dir] file]
--uris Dump all Uris (default)
--properties Dump all properties
--term-frequencies Dump term frequencies
--show-counts Show index count totals (default)
--hide-counts Hide index count totals
--indexdir=<index directory>
Absolute path of the directory storing the index
e.g. /home/user/.beagle/Indexes/FileSystemIndex
file Get information in index about this file or directory
--help What you just did");
			Environment.Exit (0);
			break;

		case "--uris":
			mode = Mode.Uris;
			break;

		case "--properties":
			mode = Mode.Properties;
			break;

		case "--term-frequencies":
			mode = Mode.TermFrequencies;
			break;

		case "--hide-counts":
			show_counts = false;
			break;

		case "--show-counts":
			// This used to assign false, which made --show-counts
			// behave exactly like --hide-counts.
			show_counts = true;
			break;

		default:
			if (arg.StartsWith ("--indexdir="))
				// Strip the 11-character "--indexdir=" prefix.
				indexdir = arg.Remove (0, 11);
			else
				file = arg;
			break;
		}
	}

	if (file == null)
		DumpIndexInformation (mode, show_counts);
	else
		DumpFileIndexInformation (file, indexdir);
}