* Filters/FilterPackage.cs, Filters/FilterRPM.cs,
[beagle.git] / beagled / AkregatorQueryable / AkregatorQueryable.cs
blobb4697904549ea99c533f2f6b2510f5fe5f212a67
1 //
2 // AkregatorQueryable.cs
3 //
4 // Copyright (C) 2005 Debajyoti Bera
5 //
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a
8 // copy of this software and associated documentation files (the "Software"),
9 // to deal in the Software without restriction, including without limitation
10 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 // and/or sell copies of the Software, and to permit persons to whom the
12 // Software is furnished to do so, subject to the following conditions:
14 // The above copyright notice and this permission notice shall be included in
15 // all copies or substantial portions of the Software.
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 // DEALINGS IN THE SOFTWARE.
26 using System;
27 using System.IO;
28 using System.Collections;
29 using System.Threading;
30 using System.Text;
31 using System.Xml;
32 using System.Xml.Serialization;
33 using System.Globalization;
35 using Beagle.Daemon;
36 using Beagle.Util;
38 namespace Beagle.Daemon.AkregatorQueryable {
40 [QueryableFlavor (Name="Akregator", Domain=QueryDomain.Local, RequireInotify=false)]
41 public class AkregatorQueryable : LuceneFileQueryable {
43 private static Logger log = Logger.Get ("AkregatorQueryable");
45 string akregator_dir;
47 // construct a serializer and keep it handy for indexablegenerator to use
48 private XmlSerializer serializer = null;
// Serializer for Item fragments. It is created on first access and then
// reused; FeedIndexableGenerator picks it up from here in its constructor.
public XmlSerializer Serializer {
	get {
		if (serializer != null)
			return serializer;

		serializer = new XmlSerializer (typeof (Item));
		return serializer;
	}
}
57 // store the file size indexed by the filenames
58 // akregator unnecessarily saves files
59 private Hashtable file_sizes;
// Returns the size previously recorded for `name` through SetFileSize,
// or -1 when no size has been recorded for that name.
public long GetFileSize (string name)
{
	if (file_sizes.Contains (name))
		return (long) file_sizes [name];

	return -1;
}
// Records `size` as the remembered size for `name`, replacing any
// earlier value stored under the same name.
public void SetFileSize (string name, long size)
{
	file_sizes [name] = size;
}
71 // add versioning of index
72 // v1: change property names to DC names,
73 // store feed_file as ParentUri
74 private const int INDEX_VERSION = 1;
// Creates the queryable on top of the "AkregatorIndex" index (versioned by
// INDEX_VERSION) and computes the archive directory that is watched:
// <home>/.kde/share/apps/akregator/Archive
public AkregatorQueryable () : base ("AkregatorIndex", INDEX_VERSION)
{
	file_sizes = new Hashtable ();

	string [] segments = { ".kde", "share", "apps", "akregator", "Archive" };
	string dir = PathFinder.HomeDir;
	foreach (string segment in segments)
		dir = Path.Combine (dir, segment);

	akregator_dir = dir;
}
87 /////////////////////////////////////////////////
// Starts the base queryable, then hands StartWorker to
// ExceptionHandlingThread.Start so the directory setup and initial crawl
// are run from there.
public override void Start ()
{
	base.Start ();

	ThreadStart worker = new ThreadStart (StartWorker);
	ExceptionHandlingThread.Start (worker);
}
// Sets up change notification for the Akregator archive directory and
// queues every *.xml feed file found there for indexing.
//
// If the directory does not exist yet, a GLib timeout (interval 60000) is
// registered with CheckForExistence as its handler and nothing else is done.
private void StartWorker ()
{
	if (!Directory.Exists (akregator_dir)) {
		GLib.TimeoutHandler retry = new GLib.TimeoutHandler (CheckForExistence);
		GLib.Timeout.Add (60000, retry);
		return;
	}

	if (!Inotify.Enabled) {
		// No inotify: fall back to a FileSystemWatcher on the directory.
		FileSystemWatcher watcher = new FileSystemWatcher ();
		watcher.Path = akregator_dir;

		watcher.Changed += new FileSystemEventHandler (OnChanged);
		watcher.Created += new FileSystemEventHandler (OnChanged);

		watcher.EnableRaisingEvents = true;
	} else {
		Inotify.EventType events = Inotify.EventType.CloseWrite | Inotify.EventType.Delete;
		Inotify.Subscribe (akregator_dir, OnInotifyEvent, events);
	}

	log.Info ("Scanning Akregator feeds...");

	State = QueryableState.Crawling;
	Stopwatch timer = new Stopwatch ();
	timer.Start ();

	// Initial crawl: every .xml file is handed over as an initial-scan feed.
	DirectoryInfo archive = new DirectoryInfo (akregator_dir);
	int feed_count = 0;
	foreach (FileInfo feed in DirectoryWalker.GetFileInfos (archive)) {
		if (feed.Extension != ".xml")
			continue;

		IndexSingleFeed (feed.FullName, true);
		feed_count ++;
	}

	State = QueryableState.Idle;
	timer.Stop ();
	log.Info ("{0} files will be parsed (scanned in {1})", feed_count, timer);
}
// Timeout handler registered by StartWorker while the archive directory is
// missing. Returns true as long as the directory still does not exist;
// once it exists, calls Start and returns false.
// NOTE(review): presumably the return value tells GLib whether to keep the
// timeout installed - confirm against the GLib.Timeout documentation.
private bool CheckForExistence ()
{
	if (Directory.Exists (akregator_dir)) {
		this.Start ();
		return false;
	}

	return true;
}
148 /////////////////////////////////////////////////
// Inotify callback for the archive directory.
// Events for anything other than a "*.xml" entry are ignored. A CloseWrite
// event queues the file for (re)indexing; otherwise a Delete event queues
// removal of the feed file from the index.
private void OnInotifyEvent (Inotify.Watch watch,
			     string path,
			     string subitem,
			     string srcpath,
			     Inotify.EventType type)
{
	bool is_feed = subitem != "" && subitem.EndsWith (".xml");
	if (!is_feed)
		return;

	if ((type & Inotify.EventType.CloseWrite) != 0) {
		IndexSingleFeed (Path.Combine (path, subitem), false);
		return;
	}

	if ((type & Inotify.EventType.Delete) != 0)
		RemoveFeedFile (Path.Combine (path, subitem));
}
// FileSystemWatcher callback (hooked to both Changed and Created in
// StartWorker): passes the affected path on for indexing as a
// non-initial scan.
private void OnChanged (object o, FileSystemEventArgs args)
{
	string changed_path = args.FullPath;
	IndexSingleFeed (changed_path, false);
}
174 /////////////////////////////////////////////////
// Queues one feed file for parsing and indexing.
// Files not ending in ".xml" are ignored, and nothing is queued when the
// scheduler already holds a task tagged with this filename.
private void IndexSingleFeed (string filename, bool initial_scan)
{
	if (!filename.EndsWith (".xml"))
		return;

	if (ThisScheduler.ContainsByTag (filename)) {
		Logger.Log.Debug ("Not adding task for already running task: {0}", filename);
		return;
	}

	// The filename doubles as the task tag, which is what the check above
	// looks for.
	Scheduler.Task add_task = NewAddTask (new FeedIndexableGenerator (this, filename, initial_scan));
	add_task.Tag = filename;
	ThisScheduler.Add (add_task);
}
// Schedules removal of a deleted feed file from the index, as an
// Immediate-priority task keyed by the file's URI.
private void RemoveFeedFile (string file)
{
	Logger.Log.Debug ("Removing Akregator feedfile:" + file);

	Scheduler.Task removal = NewRemoveTask (UriFu.PathToFileUri (file));
	removal.Priority = Scheduler.Priority.Immediate;
	removal.SubPriority = 0;
	ThisScheduler.Add (removal);
}
205 * Indexable generator for Akregator Feeds
207 public class FeedIndexableGenerator : IIndexableGenerator {
208 private string feed_file;
209 private AkregatorQueryable queryable;
211 private XmlTextReader reader;
212 private bool is_valid_file = true;
213 private bool initial_scan = false;
215 private string channel_title;
216 private string channel_link;
217 private string channel_description;
219 private Item current_item;
220 private XmlSerializer serializer;
// Remembers the owning queryable, the feed file and whether this is part
// of the initial crawl, borrows the queryable's shared serializer, and
// immediately reads the channel header of the feed.
public FeedIndexableGenerator (AkregatorQueryable queryable, string feed_file, bool initial_scan)
{
	this.feed_file = feed_file;
	this.initial_scan = initial_scan;
	this.queryable = queryable;
	this.serializer = queryable.Serializer;

	ReadFeedHeader ();
}
// Intentionally empty: this generator does no work in the hook.
public void PostFlushHook ()
{
}
// Status label of this generator: the path of the feed file it reads.
public string StatusName {
	get {
		return feed_file;
	}
}
// Decides whether the feed file can be skipped.
//   - up to date according to the file attributes store -> true
//   - otherwise, during the initial scan                -> false
//   - otherwise compare the current size with the size cached by the
//     queryable; an unchanged size is treated as "not modified".
// FIXME: Maybe store the md5-hash of the file - that is less expensive
// than indexing all the feeds in the file!
private bool IsUpToDate (string path)
{
	if (queryable.FileAttributesStore.IsUpToDate (path))
		return true;

	if (initial_scan)
		return false;

	// A size change is the only signal used here; a modified file with an
	// identical size is (unluckily) considered up to date.
	FileInfo info = new FileInfo (path);
	long cached_size = queryable.GetFileSize (path);
	long current_size = info.Length;
	return cached_size == current_size;
}
// Opens the feed file and reads the channel header (<title>, <link>,
// <description>) that precedes the first <item>.
//
// When the file is already up to date it is marked invalid so that no
// indexables are produced and the reader is never opened. An XmlException
// while reading also marks the file invalid and closes the reader.
private void ReadFeedHeader ()
{
	if (IsUpToDate (feed_file)) {
		is_valid_file = false;
		return;
	}

	try {
		Logger.Log.Debug ("Opening feed file: {0}", feed_file);
		reader = new XmlTextReader (feed_file);
		reader.WhitespaceHandling = WhitespaceHandling.None;

		is_valid_file = true;

		// move to beginning of document
		reader.MoveToContent ();
		// move to <rss ...> node
		reader.ReadStartElement ("rss");
		// move to <channel> node
		reader.ReadStartElement ("channel");

		// Walk the channel children until the first <item> (or until the
		// reader is no longer positioned on an element).
		do {
			string elementName = reader.Name;
			if (elementName == "item")
				break;

			switch (elementName) {
			// ReadElementString also copes with empty elements such as
			// <title/>. The previous ReadStartElement/ReadString/
			// ReadEndElement sequence skipped past an empty element and
			// then read the text of the *following* element instead.
			case "title":
				channel_title = reader.ReadElementString ("title");
				break;

			case "link":
				channel_link = reader.ReadElementString ("link");
				break;

			case "description":
				channel_description = reader.ReadElementString ("description");
				break;

			// ignore other elements
			default:
				reader.ReadOuterXml ();
				break;
			}
		} while (!reader.EOF && reader.NodeType == XmlNodeType.Element);
	} catch (XmlException ex) {
		Logger.Log.Debug ("Invalid feed file: " + ex.Message);
		is_valid_file = false;
		if (reader != null)
			reader.Close ();
	}
}
// Tries to read the next <item> from the feed into current_item.
// Returns true when an item was read. Returns false when the file is
// invalid or not open, or when no further item could be read; in that last
// case the reader is closed and the file size is cached via StoreFileSize.
public bool HasNextIndexable ()
{
	current_item = null;
	if (!is_valid_file || reader == null)
		return false;

	try {
		// Only attempt a read while the reader sits on a start element.
		if (reader.NodeType == XmlNodeType.Element) {
			string itemString = reader.ReadOuterXml ();
			// form node object from the <node>...</node> string
			// FIXME Deserialize is expensive - remove it altogether
			current_item = (Item) serializer.Deserialize (new StringReader (itemString));
		}
	} catch (XmlException) {
		// probably no more <item>
	} catch (InvalidOperationException) {
		// XmlSerializer.Deserialize reports malformed or unexpected XML
		// (e.g. an element that is not <item>) this way; treat it like
		// the end of the items instead of letting it escape.
	}

	if (current_item == null) {
		is_valid_file = false;
		reader.Close ();
	}

	if (!is_valid_file)
		StoreFileSize ();

	return is_valid_file;
}
// Caches the feed file's current length in the queryable; IsUpToDate
// compares against this value on later (non-initial) scans.
private void StoreFileSize ()
{
	long size = new FileInfo (feed_file).Length;
	queryable.SetFileSize (feed_file, size);
}
// Returns the indexable for the item read by the last HasNextIndexable
// call, or null when there is no current item or the item is flagged as
// deleted.
//
// The original condition used `||`, which dereferenced current_item when
// it was null and never filtered out deleted items.
public Indexable GetNextIndexable ()
{
	if (current_item == null || current_item.IsDeleted)
		return null;

	return current_itemToIndexable ();
}
// Builds the Indexable for current_item (null when there is none).
// The item gets a "feed:<channel link>;item=<item link>" URI, the feed file
// as parent URI, Dublin Core style properties, and its description as the
// text to index.
// NOTE(review): Item.PubDate has no default value and is parsed with
// ParseExact below, so an item without a pubDate (or with a date in
// another format) will make this method throw - confirm this is intended.
private Indexable current_itemToIndexable ()
{
	// sanity check
	if (current_item == null)
		return null;

	Logger.Log.Debug ("Indexing " + channel_link + ":" + current_item.Link);

	string item_uri = String.Format ("feed:{0};item={1}", channel_link, current_item.Link);
	Indexable indexable = new Indexable (new Uri (item_uri));
	indexable.ParentUri = UriFu.PathToFileUri (feed_file);
	indexable.MimeType = "text/html";
	indexable.HitType = "FeedItem";

	const string RFC822 = "ddd, dd MMM yyyy HH:mm:ss zzz";
	DateTime published = DateTime.ParseExact (current_item.PubDate,
						  RFC822,
						  DateTimeFormatInfo.InvariantInfo,
						  DateTimeStyles.AdjustToUniversal);
	indexable.Timestamp = published;

	// replace property names with Dublin Core names
	indexable.AddProperty (Property.New ("dc:title", current_item.Title));
	indexable.AddProperty (Property.NewDate ("dc:date", published));
	indexable.AddProperty (Property.NewKeyword ("dc:identifier", current_item.Link));
	indexable.AddProperty (Property.NewKeyword ("dc:source", channel_link));
	indexable.AddProperty (Property.New ("dc:publisher", channel_title));

	indexable.SetTextReader (new StringReader (current_item.Description));

	return indexable;
}
// One <meta> child of a feed item: the element text plus its "type"
// attribute. Both default to the empty string.
public class MetaInfo {
	// Text content of the element.
	[XmlText]
	public string value = "";

	// Value of the "type" attribute.
	[XmlAttribute ("type")]
	public string Type = "";
}
// A single feed <item>. Instances are deserialized from bare
// <item>...</item> fragments, so no <?xml ... ?> declaration is present.
[System.Xml.Serialization.XmlRoot("item", Namespace="", IsNullable=false)]
[System.Xml.Serialization.XmlType("item", Namespace="")]
public class Item {
	[XmlElement ("pubDate")]
	public string PubDate;

	[XmlElement ("title")]
	public string Title = "";

	[XmlElement ("description")]
	public string Description = "";

	[XmlElement ("link")]
	public string Link = "";

	// The <meta> children (namespace "http://foobar"), as MetaInfo objects.
	[XmlElement ("meta", typeof (MetaInfo), Namespace="http://foobar")]
	public ArrayList MetaList {
		get { return metaList; }
		set { metaList = value; }
	}
	private ArrayList metaList = new ArrayList ();

	// True when any meta entry has type "deleted" and text "true".
	public bool IsDeleted {
		get {
			foreach (MetaInfo meta in metaList) {
				if (meta.Type != "deleted")
					continue;
				if (meta.value == "true")
					return true;
			}

			return false;
		}
	}
}