Add --enable-deletion option to buildindex. If used, buildindex will remove deleted...
[beagle.git] / beagled / KonqHistoryQueryable / KonqQueryable.cs
blob76db5b4f0a016341fdf2f93b37f44c5dcd9c8a19
1 //
2 // KonqQueryable.cs
3 //
4 // Copyright (C) 2005 Debajyoti Bera
5 //
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a
8 // copy of this software and associated documentation files (the "Software"),
9 // to deal in the Software without restriction, including without limitation
10 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 // and/or sell copies of the Software, and to permit persons to whom the
12 // Software is furnished to do so, subject to the following conditions:
14 // The above copyright notice and this permission notice shall be included in
15 // all copies or substantial portions of the Software.
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 // DEALINGS IN THE SOFTWARE.
26 using System;
27 using System.IO;
28 using System.Collections;
29 using System.Threading;
30 using System.Text;
32 using Beagle.Daemon;
33 using Beagle.Util;
35 namespace Beagle.Daemon.KonqQueryable {
37 [QueryableFlavor (Name="KonquerorHistory", Domain=QueryDomain.Local, RequireInotify=false)]
38 public class KonqQueryable : LuceneFileQueryable, IIndexableGenerator {
// Logger obtained under the name "KonqQueryable".
private static Logger log = Logger.Get ("KonqQueryable");

// Location of the kio-http cache; assigned once in the constructor.
string konq_cache_dir;

// Enumerator over the DirectoryInfo entries that DirectoryWalker reports
// for konq_cache_dir; (re)created by Crawl ().
private IEnumerator directory_enumerator = null;

// Delay added to the polling task's trigger time by CrawlHook ().
private int polling_interval_in_seconds = 300; // 5 min

// ISO-Latin1 is 28591
private Encoding latin_encoding = Encoding.GetEncoding (28591);

// Passes "KonqHistoryIndex" to the LuceneFileQueryable constructor and
// records where the Konqueror http cache is expected to be.
public KonqQueryable () : base ("KonqHistoryIndex")
{
	// How to determine kio-http cache location ?
	// From KDE web-page it looks like /var/tmp/kdecache-$USERNAME/http
	string user_name = System.Environment.UserName;
	konq_cache_dir = "/var/tmp/kdecache-" + user_name + "/http";
}
57 /////////////////////////////////////////////////
// Runs the base class Start (), then hands StartWorker to
// ExceptionHandlingThread.Start so the backend setup happens there.
public override void Start ()
{
	base.Start ();

	ThreadStart worker = new ThreadStart (StartWorker);
	ExceptionHandlingThread.Start (worker);
}
// Sets up change detection for the cache directory and kicks off the
// initial crawl. Does nothing when the cache directory does not exist.
private void StartWorker ()
{
	// if the directory is not present, user is not running KDE
	// no need to periodically check
	//GLib.Timeout.Add (60000, new GLib.TimeoutHandler (CheckForExistence));
	if (!Directory.Exists (konq_cache_dir))
		return;

	if (!Inotify.Enabled) {
		// without inotify, register a scheduler hook (CrawlHook) instead
		Scheduler.TaskHook hook = new Scheduler.TaskHook (CrawlHook);
		Scheduler.Task poll_task = Scheduler.TaskFromHook (hook);
		poll_task.Tag = "Crawling konqueror webcache";
		poll_task.Source = this;
		ThisScheduler.Add (poll_task);
	} else {
		// watch konq_cache_dir for new directory creations
		Inotify.Subscribe (konq_cache_dir, OnInotifyEvent, Inotify.EventType.Create);
	}

	log.Info ("Starting Konq history backend ...");
	Crawl ();
}
// Resets the directory enumerator and queues this object (as an
// IIndexableGenerator) with the scheduler under crawler_tag.
private void Crawl ()
{
	State = QueryableState.Crawling;

	// start over from the first entry reported for the cache directory
	directory_enumerator = DirectoryWalker.GetDirectoryInfos (konq_cache_dir).GetEnumerator ();

	Scheduler.Task generator_task = NewAddTask (this);
	generator_task.Tag = crawler_tag;
	ThisScheduler.Add (generator_task);

	// NOTE(review): the state goes back to Idle as soon as the task is
	// queued, not when the queued task has finished - confirm intended.
	State = QueryableState.Idle;
}
// Tag given to the add-task that Crawl () queues.
private string crawler_tag = "Konqueror History Crawler";

// Scheduler hook registered by StartWorker () when inotify is not
// enabled. Starts a new crawl unless a task tagged crawler_tag is still
// in the scheduler, then asks to be run again after
// polling_interval_in_seconds.
private void CrawlHook (Scheduler.Task task)
{
	bool crawl_pending = ThisScheduler.ContainsByTag (crawler_tag);
	if (!crawl_pending)
		Crawl ();

	task.Reschedule = true;
	task.TriggerTime = DateTime.Now.AddSeconds (polling_interval_in_seconds);
}
// Calls Start () once the cache directory exists.
// Returns true while the directory is still missing and false after
// Start () has been called.
// NOTE(review): the only visible caller is the commented-out
// GLib.Timeout.Add in StartWorker (); the return value presumably follows
// the timeout-handler convention (true = call again) - confirm.
private bool CheckForExistence ()
{
	if (Directory.Exists (konq_cache_dir)) {
		this.Start ();
		return false;
	}

	return true;
}
120 /////////////////////////////////////////////////
122 // Modified/Created event using Inotify
// Inotify callback registered for konq_cache_dir and for the directories
// below it.
//   watch, srcpath - not used by this handler
//   path           - directory the event was reported for
//   subitem        - name of the entry inside path; events without one
//                    are ignored
//   type           - event flags; IsDirectory separates directories from
//                    files
private void OnInotifyEvent (Inotify.Watch watch,
			     string path,
			     string subitem,
			     string srcpath,
			     Inotify.EventType type)
{
	if (subitem == null || subitem == "")
		return;

	string full_path = Path.Combine (path, subitem);

	// Watch konq_cache_dir for new directory creation
	// Watch its subdirectories for new file creation
	// NOTE(review): a file created directly in konq_cache_dir would be a
	// Konq error and is meant to be ignored, but nothing here filters it.
	if ((type & Inotify.EventType.IsDirectory) == 0) {
		IndexSingleFile (full_path);
		return;
	}

	// A new directory appeared: watch that directory itself.
	// (This used to re-subscribe to konq_cache_dir, which left the new
	// subdirectory unwatched.)
	// NOTE(review): HasNextIndexable () subscribes crawled directories with
	// Create | MovedTo while this uses CloseWrite - confirm which mask is
	// intended and align the two.
	Inotify.Subscribe (full_path, OnInotifyEvent, Inotify.EventType.CloseWrite);
}
// Queues a single cache file for immediate indexing.
// Paths ending in ".new" are skipped (presumably Konqueror's name for an
// entry that is still being written - confirm), and so are files for
// which FileToIndexable () returns null.
void IndexSingleFile (string path)
{
	if (!path.EndsWith (".new")) {
		Indexable indexable = FileToIndexable (path);

		if (indexable != null) {
			Scheduler.Task add_task = NewAddTask (indexable);
			add_task.Priority = Scheduler.Priority.Immediate;
			add_task.Tag = path;
			add_task.SubPriority = 0;
			ThisScheduler.Add (add_task);
		}
	}
}
157 /////////////////////////////////////////////////
// Builds the Indexable for one Konqueror cache file.
//   path - full path of the cache file
// Returns null when the file has disappeared, when
// KonqHistoryUtil.ShouldIndex () rejects it, when no url was found, or
// when the url uses the https scheme.
private Indexable FileToIndexable (string path) {
	//Logger.Log.Debug ("KonqQ: Trying to index " + path);

	FileStream stream;
	try {
		stream = new FileStream (path, FileMode.Open, FileAccess.Read, FileShare.Read);
	} catch (FileNotFoundException) {
		// that was fast - lost the file
		return null;
	} catch (DirectoryNotFoundException) {
		// same situation, but the containing directory went away too
		return null;
	}

	string url = null;
	string creation_date = null;
	string mimetype = null;
	string charset = null;
	bool is_ok = false;

	// The stream is not used after ShouldIndex () returns (the indexable
	// refers to the file through ContentUri), so release the file handle
	// right away instead of leaving it to the garbage collector.
	using (stream)
	using (StreamReader reader = new StreamReader (stream, latin_encoding)) {
		is_ok = KonqHistoryUtil.ShouldIndex (reader,
						     out url,
						     out creation_date,
						     out mimetype,
						     out charset);
	}

	if (!is_ok || url == null || url == String.Empty) {
		//Logger.Log.Debug ("KonqQ: Skipping non-html file " + path + " of type=" + mimetype);
		// finding out if a cache file should be indexed is expensive
		// so, soon after we run the test, write lastwritetime attribute
		FileAttributesStore.AttachLastWriteTime (path, DateTime.UtcNow);
		return null; // we wont index bad files and non-html files
	}

	Logger.Log.Debug ("KonqQ: Indexing " + path + " with url=" + url);
	Uri uri = new Uri (url, true);
	if (uri.Scheme == Uri.UriSchemeHttps) {
		Logger.Log.Error ("Indexing secure https:// URIs is not secure!");
		return null;
	}

	Indexable indexable = new Indexable (uri);
	indexable.HitType = "WebHistory";
	indexable.MimeType = KonqHistoryUtil.KonqCacheMimeType;
	// store www.beaglewiki.org as www beagle org, till inpath: query is implemented
	indexable.AddProperty (Property.NewUnstored ("fixme:urltoken", StringFu.UrlFuzzyDivide (url)));
	// hint for the filter about the charset
	indexable.AddProperty (Property.NewUnsearched (StringFu.UnindexedNamespace + "charset", charset));

	// creation_date is parsed as a number of seconds counted from 1970-01-01
	DateTime date = new DateTime (1970, 1, 1);
	date = date.AddSeconds (Int64.Parse (creation_date));
	indexable.Timestamp = date;

	indexable.ContentUri = UriFu.PathToFileUri (path);
	return indexable;
}
// FIXME: Implement removefile - does removing files from the history really make sense?
214 // ---------------- IIndexableGenerator --------------------------
// File selected by the last HasNextIndexable () call; null once the
// crawl has run out of directories.
private FileInfo current_file;

// Enumerator over the files of the directory currently being crawled.
private IEnumerator file_enumerator = null;

// Returns the Indexable for current_file, or null when there is no
// current file (FileToIndexable () may itself return null as well).
public Indexable GetNextIndexable ()
{
	return (current_file == null) ? null : FileToIndexable (current_file.FullName);
}
// Advances current_file to the next cache file for which IsUpToDate ()
// is false. Returns true when such a file was found and false when all
// directories have been exhausted (current_file is then null).
public bool HasNextIndexable ()
{
	while (true) {
		// Step to the next file, opening further directories as needed.
		while (file_enumerator == null || ! file_enumerator.MoveNext ()) {
			if (! directory_enumerator.MoveNext ()) {
				Logger.Log.Debug ("KonqQ: Crawling done");
				file_enumerator = null;
				current_file = null;
				return false;
			}

			DirectoryInfo next_dir = (DirectoryInfo) directory_enumerator.Current;
			//Logger.Log.Debug ("Trying dir:" + next_dir.Name);

			// start watching for new files and get the list of current files
			// kind of race here - might get duplicate files
			if (Inotify.Enabled) {
				Inotify.Subscribe (next_dir.FullName, OnInotifyEvent,
						   Inotify.EventType.Create | Inotify.EventType.MovedTo);
			}

			file_enumerator = DirectoryWalker.GetFileInfos (next_dir).GetEnumerator ();
		}

		current_file = (FileInfo) file_enumerator.Current;

		// up-to-date files are skipped; anything else is the next candidate
		if (!IsUpToDate (current_file.FullName))
			return true;
	}
}
// Status text: names the file currently being indexed, or "Done" when
// there is no current file.
public string StatusName {
	get {
		string progress;
		if (current_file != null)
			progress = current_file.FullName;
		else
			progress = "Done";

		return String.Format ("KonquerorQueryable: Indexing {0}", progress);
	}
}
256 public void PostFlushHook ()