QueryResponses.cs, DumpIndex.cs, IQueryResult.cs, QueryExecutor.cs, QueryResult.cs...
[beagle.git] / beagled / AkregatorQueryable / AkregatorQueryable.cs
blob057fe229783450cd0b3260b48b66b9fd90441fde
1 //
2 // AkregatorQueryable.cs
3 //
4 // Copyright (C) 2005 Debajyoti Bera
5 //
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a
8 // copy of this software and associated documentation files (the "Software"),
9 // to deal in the Software without restriction, including without limitation
10 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 // and/or sell copies of the Software, and to permit persons to whom the
12 // Software is furnished to do so, subject to the following conditions:
14 // The above copyright notice and this permission notice shall be included in
15 // all copies or substantial portions of the Software.
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 // DEALINGS IN THE SOFTWARE.
26 using System;
27 using System.IO;
28 using System.Collections;
29 using System.Threading;
30 using System.Text;
31 using System.Xml;
32 using System.Xml.Serialization;
33 using System.Globalization;
35 using Beagle.Daemon;
36 using Beagle.Util;
38 namespace Beagle.Daemon.AkregatorQueryable {
/// <summary>
/// Queryable backend for Akregator feed archives. Feed files (*.xml) found
/// in the Akregator archive directory are turned into indexing tasks, one
/// FeedIndexableGenerator per file.
/// </summary>
[QueryableFlavor (Name="Akregator", Domain=QueryDomain.Local, RequireInotify=false)]
public class AkregatorQueryable : LuceneFileQueryable {

	private static Logger log = Logger.Get ("AkregatorQueryable");

	// Directory that is scanned and watched for feed files.
	string akregator_dir;

	// A single serializer for Item is created on first use and handed
	// to every indexable generator.
	private XmlSerializer serializer = null;
	public XmlSerializer Serializer {
		get {
			if (serializer != null)
				return serializer;

			serializer = new XmlSerializer (typeof (Item));
			return serializer;
		}
	}

	// File sizes keyed by file name. The generator uses them as a second
	// "has it really changed?" check, because akregator unnecessarily
	// saves files.
	private Hashtable file_sizes;

	/// <summary>
	/// Returns the size remembered for <paramref name="name"/>, or -1 when
	/// no size has been stored for it.
	/// </summary>
	public long GetFileSize (string name)
	{
		return file_sizes.Contains (name) ? (long) file_sizes [name] : -1;
	}

	/// <summary>
	/// Remembers <paramref name="size"/> as the size of <paramref name="name"/>.
	/// </summary>
	public void SetFileSize (string name, long size)
	{
		file_sizes [name] = size;
	}

	// Index versioning:
	// v1: change property names to DC names,
	//     store feed_file as ParentUri
	// v2: remove dc:date, use Timestamp property.
	private const int INDEX_VERSION = 2;

	public AkregatorQueryable () : base ("AkregatorIndex", INDEX_VERSION)
	{
		// <home>/.kde/share/apps/akregator/Archive
		string[] components = { ".kde", "share", "apps", "akregator", "Archive" };
		string dir = PathFinder.HomeDir;
		foreach (string component in components)
			dir = Path.Combine (dir, component);
		akregator_dir = dir;

		file_sizes = new Hashtable ();
	}

	/////////////////////////////////////////////////

	/// <summary>
	/// Starts the base queryable, then runs the initial scan on its own thread.
	/// </summary>
	public override void Start ()
	{
		base.Start ();

		ThreadStart worker = new ThreadStart (StartWorker);
		ExceptionHandlingThread.Start (worker);
	}

	// Sets up change notification for the archive directory and queues
	// every *.xml file found in it. If the directory does not exist yet,
	// a GLib timeout is installed that keeps checking for it instead.
	private void StartWorker ()
	{
		if (!Directory.Exists (akregator_dir)) {
			GLib.Timeout.Add (60000, new GLib.TimeoutHandler (CheckForExistence));
			return;
		}

		if (!Inotify.Enabled) {
			// No inotify: fall back to a FileSystemWatcher.
			FileSystemWatcher watcher = new FileSystemWatcher ();
			watcher.Path = akregator_dir;

			watcher.Changed += new FileSystemEventHandler (OnChanged);
			watcher.Created += new FileSystemEventHandler (OnChanged);

			watcher.EnableRaisingEvents = true;
		} else {
			Inotify.Subscribe (akregator_dir,
					   OnInotifyEvent,
					   Inotify.EventType.CloseWrite | Inotify.EventType.Delete);
		}

		log.Info ("Scanning Akregator feeds...");

		State = QueryableState.Crawling;
		Stopwatch timer = new Stopwatch ();
		timer.Start ();

		int feed_count = 0;
		foreach (FileInfo feed in DirectoryWalker.GetFileInfos (new DirectoryInfo (akregator_dir))) {
			if (feed.Extension != ".xml")
				continue;

			IndexSingleFeed (feed.FullName, true);
			feed_count ++;
		}

		State = QueryableState.Idle;
		timer.Stop ();
		log.Info ("{0} files will be parsed (scanned in {1})", feed_count, timer);
	}

	// Timeout callback: returns true while the archive directory is still
	// missing; once it exists, calls Start () and returns false.
	private bool CheckForExistence ()
	{
		if (Directory.Exists (akregator_dir)) {
			this.Start ();
			return false;
		}

		return true;
	}

	/////////////////////////////////////////////////

	// Modified/Created event using Inotify
	private void OnInotifyEvent (Inotify.Watch watch,
				     string path,
				     string subitem,
				     string srcpath,
				     Inotify.EventType type)
	{
		if (subitem == "" || !subitem.EndsWith (".xml"))
			return;

		bool written = (type & Inotify.EventType.CloseWrite) != 0;
		bool deleted = (type & Inotify.EventType.Delete) != 0;

		// A close-after-write takes precedence over a delete.
		if (written)
			IndexSingleFeed (Path.Combine (path, subitem), false);
		else if (deleted)
			RemoveFeedFile (Path.Combine (path, subitem));
	}

	// Modified/Created event using FSW
	private void OnChanged (object o, FileSystemEventArgs args)
	{
		IndexSingleFeed (args.FullPath, false);
	}

	/////////////////////////////////////////////////

	// Parse and index a single feed. Files that do not end in ".xml" are
	// ignored, and nothing is queued while a task tagged with the same
	// file name is already known to the scheduler.
	private void IndexSingleFeed (string filename, bool initial_scan)
	{
		if (! filename.EndsWith (".xml"))
			return;

		if (ThisScheduler.ContainsByTag (filename)) {
			Logger.Log.Debug ("Not adding task for already running task: {0}", filename);
			return;
		}

		Scheduler.Task task = NewAddTask (new FeedIndexableGenerator (this, filename, initial_scan));
		task.Tag = filename;
		ThisScheduler.Add (task);
	}

	// Queues an immediate-priority removal task for a feed file.
	private void RemoveFeedFile (string file)
	{
		Logger.Log.Debug ("Removing Akregator feedfile:" + file);

		Scheduler.Task removal = NewRemoveTask (UriFu.PathToFileUri (file));
		removal.Priority = Scheduler.Priority.Immediate;
		removal.SubPriority = 0;
		ThisScheduler.Add (removal);
	}
}
/**
 * Indexable generator for Akregator Feeds
 *
 * Reads one feed file: the constructor consumes the channel header, after
 * which every HasNextIndexable () / GetNextIndexable () pair handles one
 * element following it (expected to be an <item>).
 */
public class FeedIndexableGenerator : IIndexableGenerator {
	private string feed_file;
	private AkregatorQueryable queryable;

	private XmlTextReader reader;
	// False once the file is up to date, exhausted, or failed to parse.
	private bool is_valid_file = true;
	private bool initial_scan = false;

	private string channel_title;
	private string channel_link;
	private string channel_description;

	// Item read by the last HasNextIndexable () call, or null.
	private Item current_item;
	private XmlSerializer serializer;

	/// <summary>
	/// Creates a generator for <paramref name="feed_file"/> and immediately
	/// reads the feed header (unless the file is considered up to date).
	/// </summary>
	/// <param name="queryable">Owner; supplies the serializer, the file attribute store and the file size cache.</param>
	/// <param name="feed_file">Path of the feed file to read.</param>
	/// <param name="initial_scan">True when called from the start-up scan.</param>
	public FeedIndexableGenerator (AkregatorQueryable queryable, string feed_file, bool initial_scan)
	{
		this.queryable = queryable;
		this.feed_file = feed_file;
		this.serializer = queryable.Serializer;
		this.initial_scan = initial_scan;
		ReadFeedHeader ();
	}

	public void PostFlushHook ()
	{
	}

	public string StatusName {
		get { return feed_file; }
	}

	// Decides whether the file can be skipped.
	private bool IsUpToDate (string path)
	{
		// first check the file date
		if (queryable.FileAttributesStore.IsUpToDate (path))
			return true;

		// if not up to date and initial scan, then we should index
		if (initial_scan)
			return false;

		// next check the size - it is really unlucky if the file is changed
		// and yet the size is the same
		// FIXME: Maybe store the md5-hash of the file - that is less expensive
		// than indexing all the feeds in the file!
		FileInfo file = new FileInfo (path);
		return queryable.GetFileSize (path) == file.Length;
	}

	// Opens the feed file and reads <rss><channel> up to the first <item>,
	// remembering the channel's title, link and description on the way.
	// On an XmlException the file is marked invalid and the reader closed.
	private void ReadFeedHeader ()
	{
		if (IsUpToDate (feed_file)) {
			is_valid_file = false;
			return;
		}

		try {
			Logger.Log.Debug ("Opening feed file: {0}", feed_file);
			reader = new XmlTextReader (feed_file);
			reader.WhitespaceHandling = WhitespaceHandling.None;

			is_valid_file = true;

			// move to beginning of document
			reader.MoveToContent ();
			// move to <rss ...> node
			reader.ReadStartElement ("rss");
			// move to <channel> node
			reader.ReadStartElement ("channel");

			// read the channel's child elements until the first <item>
			do {
				string elementName = reader.Name;
				if (elementName == "item")
					break;

				switch (elementName) {
				case "title":
					reader.ReadStartElement ("title");
					channel_title = reader.ReadString ();
					reader.ReadEndElement ();
					break;

				case "link":
					reader.ReadStartElement ("link");
					channel_link = reader.ReadString ();
					reader.ReadEndElement ();
					break;

				case "description":
					reader.ReadStartElement ("description");
					channel_description = reader.ReadString ();
					reader.ReadEndElement ();
					break;

				// ignore other elements
				default:
					reader.ReadOuterXml ();
					break;
				}
			} while (!reader.EOF && reader.NodeType == XmlNodeType.Element);
		} catch (XmlException ex) {
			Logger.Log.Warn (ex, "Caught exception parsing feed file:");
			is_valid_file = false;
			// Guard against a failure before the reader was assigned.
			if (reader != null)
				reader.Close ();
		}
	}

	/// <summary>
	/// Tries to read the next element from the feed into current_item.
	/// </summary>
	/// <returns>
	/// True when an item was read. False when the file is invalid or
	/// exhausted; in the latter case the reader is closed and the file
	/// size is cached.
	/// </returns>
	public bool HasNextIndexable ()
	{
		current_item = null;
		if (!is_valid_file || reader == null)
			return false;

		try {
			// check if the reader is at the startnode
			if (reader.NodeType == XmlNodeType.Element) {
				string itemString = reader.ReadOuterXml ();
				// form node object from the <node>...</node> string
				// FIXME Deserialize is expensive - remove it altogether
				current_item = (Item) serializer.Deserialize (new StringReader (itemString));
			}
		} catch (XmlException) {
			// probably no more <item>
		}

		if (current_item == null) {
			is_valid_file = false;
			reader.Close ();
		}

		if (! is_valid_file)
			StoreFileSize ();
		return is_valid_file;
	}

	private void StoreFileSize ()
	{
		// cache the file size
		FileInfo file = new FileInfo (feed_file);
		queryable.SetFileSize (feed_file, file.Length);
	}

	/// <summary>
	/// Converts the item read by the last HasNextIndexable () call.
	/// </summary>
	/// <returns>
	/// The indexable, or null when there is no current item or the item
	/// reports IsDeleted.
	/// </returns>
	public Indexable GetNextIndexable ()
	{
		// Both conditions must hold. The previous "||" dereferenced a null
		// item and indexed items that were flagged as deleted.
		if (current_item != null && !current_item.IsDeleted)
			return current_itemToIndexable ();

		return null;
	}

	// Builds the Indexable for current_item.
	private Indexable current_itemToIndexable ()
	{
		// sanity check
		if (current_item == null)
			return null;

		//Logger.Log.Debug ("Indexing " + channel_link + ":" + current_item.Link);
		Indexable indexable = new Indexable (new Uri (String.Format ("feed:{0};item={1}", channel_link, current_item.Link)));
		indexable.ParentUri = UriFu.PathToFileUri (feed_file);
		indexable.MimeType = "text/html";
		indexable.HitType = "FeedItem";

		// NOTE(review): ParseExact throws when pubDate is missing or does
		// not match this exact pattern - confirm the archive always
		// writes it in this form.
		string RFC822 = "ddd, dd MMM yyyy HH:mm:ss zzz";
		DateTime date = DateTime.ParseExact (current_item.PubDate, RFC822, DateTimeFormatInfo.InvariantInfo, DateTimeStyles.AdjustToUniversal);
		indexable.Timestamp = date;

		// replace property names with Dublin Core names
		indexable.AddProperty (Property.New ("dc:title", current_item.Title));
		indexable.AddProperty (Property.NewKeyword ("dc:identifier", current_item.Link));
		indexable.AddProperty (Property.NewKeyword ("dc:source", channel_link));
		indexable.AddProperty (Property.New ("dc:publisher", channel_title));

		// The item description becomes the indexed text.
		StringReader description = new StringReader (current_item.Description);
		indexable.SetTextReader (description);

		return indexable;
	}
}
/// <summary>
/// A "meta" child element of a feed item: its text content together with
/// its "type" attribute.
/// </summary>
public class MetaInfo {
	// Text content of the element.
	[XmlText]
	public string value = "";

	// Value of the element's "type" attribute.
	[XmlAttribute ("type")]
	public string Type = "";
}
/// <summary>
/// One feed "item" element as stored in the feed file.
/// </summary>
// XML fragments are deserialized, so there won't be any <? xml ... ?>
[System.Xml.Serialization.XmlRoot("item", Namespace="", IsNullable=false)]
[System.Xml.Serialization.XmlType("item", Namespace="")]
public class Item {
	[XmlElement ("pubDate")]
	public string PubDate;

	[XmlElement ("title")]
	public string Title = "";

	[XmlElement ("description")]
	public string Description = "";

	[XmlElement ("link")]
	public string Link = "";

	// Backing store for MetaList; holds MetaInfo objects.
	private ArrayList metaList = new ArrayList ();

	[XmlElement ("meta", typeof (MetaInfo), Namespace="http://foobar")]
	public ArrayList MetaList {
		get { return metaList; }
		set { metaList = value; }
	}

	/// <summary>
	/// True when any meta entry has Type "deleted" and value "true".
	/// </summary>
	public bool IsDeleted {
		get {
			foreach (MetaInfo meta in metaList) {
				if (meta.Type == "deleted" && meta.value == "true")
					return true;
			}

			return false;
		}
	}
}