cvsimport
[beagle.git] / beagled / AkregatorQueryable / AkregatorQueryable.cs
blobab58c1dddd19ca2fb26c4a4029baa7aa564031f9
1 //
2 // AkregatorQueryable.cs
3 //
4 // Copyright (C) 2005 Debajyoti Bera
5 //
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a
8 // copy of this software and associated documentation files (the "Software"),
9 // to deal in the Software without restriction, including without limitation
10 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 // and/or sell copies of the Software, and to permit persons to whom the
12 // Software is furnished to do so, subject to the following conditions:
14 // The above copyright notice and this permission notice shall be included in
15 // all copies or substantial portions of the Software.
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 // DEALINGS IN THE SOFTWARE.
26 using System;
27 using System.IO;
28 using System.Collections;
29 using System.Threading;
30 using System.Text;
31 using System.Xml;
32 using System.Xml.Serialization;
33 using System.Globalization;
35 using Beagle.Daemon;
36 using Beagle.Util;
38 namespace Beagle.Daemon.AkregatorQueryable {
// Queryable backend for Akregator: watches the Akregator archive directory
// and schedules one indexing task per feed file (*.xml) found there.
[QueryableFlavor (Name="Akregator", Domain=QueryDomain.Local, RequireInotify=false)]
public class AkregatorQueryable : LuceneFileQueryable {

	// Index version, handed to the base constructor.
	// v1: change property names to DC names,
	//     store feed_file as ParentUri
	// v2: remove dc:date, use Timestamp property.
	private const int INDEX_VERSION = 2;

	// <home>/.kde/share/apps/akregator/Archive
	string akregator_dir;

	// Lazily created serializer, shared with every FeedIndexableGenerator.
	private XmlSerializer serializer = null;

	// Last recorded size of each feed file, keyed by file name.
	// Akregator unnecessarily saves files, so the size is used as a
	// cheap "did it really change" check.
	private Hashtable file_sizes;

	public AkregatorQueryable () : base ("AkregatorIndex", INDEX_VERSION)
	{
		string archive_dir = PathFinder.HomeDir;
		string [] components = new string [] { ".kde", "share", "apps", "akregator", "Archive" };
		foreach (string component in components)
			archive_dir = Path.Combine (archive_dir, component);
		akregator_dir = archive_dir;

		file_sizes = new Hashtable ();
	}

	// The serializer for Item, constructed on first access and reused afterwards.
	public XmlSerializer Serializer {
		get {
			if (serializer != null)
				return serializer;

			serializer = new XmlSerializer (typeof (Item));
			return serializer;
		}
	}

	// Returns the recorded size for the given file name, or -1 if none was recorded.
	public long GetFileSize (string name)
	{
		return file_sizes.Contains (name) ? (long) file_sizes [name] : -1;
	}

	// Records (or overwrites) the size for the given file name.
	public void SetFileSize (string name, long size)
	{
		file_sizes [name] = size;
	}

	/////////////////////////////////////////////////

	// Starts the base queryable, then hands StartWorker to ExceptionHandlingThread.
	public override void Start ()
	{
		base.Start ();

		ThreadStart worker = new ThreadStart (StartWorker);
		ExceptionHandlingThread.Start (worker);
	}

	// Sets up change notification for the archive directory and queues every
	// existing feed file. If the directory is missing, a 60000 timeout is
	// registered with CheckForExistence instead and nothing else happens.
	private void StartWorker ()
	{
		bool archive_present = Directory.Exists (akregator_dir);
		if (!archive_present) {
			GLib.Timeout.Add (60000, new GLib.TimeoutHandler (CheckForExistence));
			return;
		}

		if (!Inotify.Enabled) {
			// No inotify: fall back to FileSystemWatcher.
			FileSystemWatcher watcher = new FileSystemWatcher ();
			watcher.Path = akregator_dir;

			watcher.Changed += new FileSystemEventHandler (OnChanged);
			watcher.Created += new FileSystemEventHandler (OnChanged);

			watcher.EnableRaisingEvents = true;
		} else {
			Inotify.Subscribe (akregator_dir,
					   OnInotifyEvent,
					   Inotify.EventType.CloseWrite | Inotify.EventType.Delete);
		}

		Log.Info ("Scanning Akregator feeds...");

		Stopwatch timer = new Stopwatch ();
		timer.Start ();

		int queued = 0;
		foreach (FileInfo feed in DirectoryWalker.GetFileInfos (new DirectoryInfo (akregator_dir))) {
			if (feed.Extension != ".xml")
				continue;

			IndexSingleFeed (feed.FullName, true);
			queued ++;
		}

		timer.Stop ();
		Log.Info ("{0} files will be parsed (scanned in {1})", queued, timer);
	}

	// Timeout callback: returns true while the archive directory is still
	// missing; once it exists, calls Start () and returns false.
	private bool CheckForExistence ()
	{
		if (Directory.Exists (akregator_dir)) {
			this.Start ();
			return false;
		}

		return true;
	}

	/////////////////////////////////////////////////

	// Modified/Created/Deleted event using Inotify.
	// Only *.xml entries are considered; CloseWrite takes precedence over Delete.
	private void OnInotifyEvent (Inotify.Watch watch,
				     string path,
				     string subitem,
				     string srcpath,
				     Inotify.EventType type)
	{
		if (subitem == "" || !subitem.EndsWith (".xml"))
			return;

		bool written = (type & Inotify.EventType.CloseWrite) != 0;
		bool deleted = (type & Inotify.EventType.Delete) != 0;
		if (!written && !deleted)
			return;

		string feed_path = Path.Combine (path, subitem);
		if (written)
			IndexSingleFeed (feed_path, false);
		else
			RemoveFeedFile (feed_path);
	}

	// Modified/Created event using FSW
	private void OnChanged (object o, FileSystemEventArgs args)
	{
		IndexSingleFeed (args.FullPath, false);
	}

	/////////////////////////////////////////////////

	// Queues an add-task that parses and indexes a single feed file.
	// Non-xml files are ignored, and so are files for which a task with
	// the same tag is already known to the scheduler.
	private void IndexSingleFeed (string filename, bool initial_scan)
	{
		if (! filename.EndsWith (".xml"))
			return;

		if (ThisScheduler.ContainsByTag (filename)) {
			Log.Debug ("Not adding task for already running task: {0}", filename);
			return;
		}

		FeedIndexableGenerator feed_generator = new FeedIndexableGenerator (this, filename, initial_scan);
		Scheduler.Task add_task = NewAddTask (feed_generator);
		add_task.Tag = filename;
		ThisScheduler.Add (add_task);
	}

	// Queues an immediate-priority remove-task for the given feed file.
	private void RemoveFeedFile (string file)
	{
		Log.Debug ("Removing Akregator feedfile:" + file);

		Scheduler.Task remove_task = NewRemoveTask (UriFu.PathToFileUri (file));
		remove_task.Priority = Scheduler.Priority.Immediate;
		remove_task.SubPriority = 0;
		ThisScheduler.Add (remove_task);
	}
}
/**
 * Indexable generator for Akregator Feeds.
 *
 * Reads the channel header of one feed file in the constructor, then
 * hands out one Indexable per <item> element through the
 * HasNextIndexable () / GetNextIndexable () pair.
 */
public class FeedIndexableGenerator : IIndexableGenerator {
	private string feed_file;
	private AkregatorQueryable queryable;

	// Stays null when the file was found up to date and never opened.
	private XmlTextReader reader;
	// False once the file is known to be up to date, unparseable or exhausted.
	private bool is_valid_file = true;
	private bool initial_scan = false;

	private string channel_title;
	private string channel_link;
	private string channel_description;

	// Item produced by the last successful HasNextIndexable () call.
	private Item current_item;
	private XmlSerializer serializer;

	public FeedIndexableGenerator (AkregatorQueryable queryable, string feed_file, bool initial_scan)
	{
		this.queryable = queryable;
		this.feed_file = feed_file;
		this.serializer = queryable.Serializer;
		this.initial_scan = initial_scan;
		ReadFeedHeader ();
	}

	public void PostFlushHook ()
	{
	}

	public string StatusName {
		get { return feed_file; }
	}

	// Decides whether the feed file can be skipped.
	// Returns true when the attributes store says the file is up to date,
	// or (outside the initial scan) when its size equals the cached size.
	private bool IsUpToDate (string path)
	{
		// first check the file date
		if (queryable.FileAttributesStore.IsUpToDate (path))
			return true;

		// if not up to date and initial scan, then we should index
		if (initial_scan)
			return false;

		// next check the size - its really unlucky if the file is changed
		// and yet the size is same
		// FIXME: Maybe store the md5-hash of the file - that is less expensive
		// than indexing all the feeds in the file!
		FileInfo file = new FileInfo (path);
		return queryable.GetFileSize (path) == file.Length;
	}

	// Reads <name>text</name> at the current reader position and returns the text.
	private string ReadChannelText (string name)
	{
		reader.ReadStartElement (name);
		string text = reader.ReadString ();
		reader.ReadEndElement ();
		return text;
	}

	// Opens the feed file and consumes <rss><channel> plus the channel-level
	// elements, stopping at the first <item> (or the first non-element node).
	// On an XmlException the file is marked invalid and the reader is closed.
	private void ReadFeedHeader ()
	{
		if (IsUpToDate (feed_file)) {
			is_valid_file = false;
			return;
		}

		try {
			Log.Debug ("Opening feed file: {0}", feed_file);
			reader = new XmlTextReader (feed_file);
			reader.WhitespaceHandling = WhitespaceHandling.None;

			is_valid_file = true;

			// move to beginning of document
			reader.MoveToContent ();
			// move to <rss ...> node
			reader.ReadStartElement ("rss");
			// move to <channel> node
			reader.ReadStartElement ("channel");

			do {
				string elementName = reader.Name;
				if (elementName == "item")
					break;

				switch (elementName) {
				case "title":
					channel_title = ReadChannelText ("title");
					break;

				case "link":
					channel_link = ReadChannelText ("link");
					break;

				case "description":
					channel_description = ReadChannelText ("description");
					break;

				// ignore other elements
				default:
					reader.ReadOuterXml ();
					break;
				}
			} while (!reader.EOF && reader.NodeType == XmlNodeType.Element);
		} catch (XmlException ex) {
			Log.Warn (ex, "Caught exception parsing feed file:");
			is_valid_file = false;
			// Guard: the reader is only non-null once its constructor returned.
			if (reader != null)
				reader.Close ();
		}
	}

	// Tries to read the next element from the feed and deserialize it into
	// current_item. Returns true when an item is available; otherwise the
	// reader is closed, the file size is cached and false is returned.
	public bool HasNextIndexable ()
	{
		current_item = null;
		if (!is_valid_file || reader == null)
			return false;

		try {
			// check if the reader is at the startnode
			if (reader.NodeType == XmlNodeType.Element) {
				string item_xml = reader.ReadOuterXml ();
				// form node object from the <node>...</node> string
				// FIXME Deserialize is expensive - remove it altogether
				current_item = (Item) serializer.Deserialize (new StringReader (item_xml));
			}
		} catch (XmlException) {
			// probably no more <item>
		} catch (InvalidOperationException) {
			// XmlSerializer.Deserialize reports deserialization failures
			// (malformed or unexpected element) as InvalidOperationException;
			// treat it like the XmlException case and stop reading this feed.
			Log.Debug ("Could not deserialize an item in feed file: {0}", feed_file);
		}

		if (current_item == null) {
			//Log.Debug ("AkregatorQ: Probably no more feeds left in " + feed_file);
			is_valid_file = false;
			reader.Close ();
		}

		if (! is_valid_file)
			StoreFileSize ();

		return is_valid_file;
	}

	private void StoreFileSize ()
	{
		// cache the file size
		FileInfo file = new FileInfo (feed_file);
		queryable.SetFileSize (feed_file, file.Length);
	}

	// Returns the Indexable for the current item, or null when there is no
	// current item or the item is flagged as deleted.
	public Indexable GetNextIndexable ()
	{
		// Both conditions must hold; the null test also protects the
		// IsDeleted access.
		if (current_item != null && !current_item.IsDeleted)
			return current_itemToIndexable ();

		return null;
	}

	// Builds the Indexable (uri, parent uri, timestamp, DC properties, text)
	// for current_item. Returns null if there is no current item.
	private Indexable current_itemToIndexable ()
	{
		// sanity check
		if (current_item == null)
			return null;

		//Log.Debug ("Indexing " + channel_link + ":" + current_item.Link);
		Indexable indexable = new Indexable (new Uri (String.Format ("feed:{0};item={1}", channel_link, current_item.Link)));
		indexable.ParentUri = UriFu.PathToFileUri (feed_file);
		indexable.MimeType = "text/html";
		indexable.HitType = "FeedItem";

		// PubDate has no default, so it is null when the item carries no
		// <pubDate>; a missing or malformed date must not abort the feed.
		// In that case the Timestamp is simply left unset.
		string pub_date = current_item.PubDate;
		if (pub_date != null && pub_date != "") {
			string RFC822 = "ddd, dd MMM yyyy HH:mm:ss zzz";
			try {
				indexable.Timestamp = DateTime.ParseExact (pub_date, RFC822, DateTimeFormatInfo.InvariantInfo, DateTimeStyles.AdjustToUniversal);
			} catch (FormatException) {
				Log.Debug ("Ignoring unparseable pubDate in feed file: {0}", feed_file);
			}
		}

		// replace property names with Dublin Core names
		indexable.AddProperty (Property.New ("dc:title", current_item.Title));
		indexable.AddProperty (Property.NewKeyword ("dc:identifier", current_item.Link));
		indexable.AddProperty (Property.NewKeyword ("dc:source", channel_link));
		indexable.AddProperty (Property.New ("dc:publisher", channel_title));

		StringReader description_reader = new StringReader (current_item.Description);
		indexable.SetTextReader (description_reader);

		return indexable;
	}
}
// One <meta> child of a feed <item>: its "type" attribute and its text content.
public class MetaInfo {
	// Text content of the element.
	[XmlText]
	public string value = String.Empty;

	// Value of the "type" attribute.
	[XmlAttribute ("type")]
	public string Type = String.Empty;
}
// One <item> of a feed. Instances are deserialized from XML fragments,
// so there wont be any <? xml ... ?> declaration in the input.
[XmlRoot ("item", Namespace="", IsNullable=false)]
[XmlType ("item", Namespace="")]
public class Item {
	// Backing store for MetaList.
	private ArrayList metaList = new ArrayList ();

	// Raw <pubDate> text; has no default, so it stays null when absent.
	[XmlElement ("pubDate")]
	public string PubDate;

	[XmlElement ("title")]
	public string Title = "";

	[XmlElement ("description")]
	public string Description = "";

	[XmlElement ("link")]
	public string Link = "";

	// The <meta> children of the item, as MetaInfo objects.
	[XmlElement ("meta", typeof (MetaInfo), Namespace="http://foobar")]
	public ArrayList MetaList {
		get { return metaList; }
		set { metaList = value; }
	}

	// True when any meta entry has Type "deleted" and value "true".
	public bool IsDeleted {
		get {
			foreach (MetaInfo entry in metaList) {
				bool flags_deletion = entry.Type == "deleted" && entry.value == "true";
				if (flags_deletion)
					return true;
			}

			return false;
		}
	}
}