Compute lucene-style scores for our hits.
[beagle.git] / Filters / FilterPdf.cs
blob96dbafed5d9e628f0594c0a813a57d99b4db0e6c
1 //
2 // FilterPdf.cs: Very simplistic PDF filter
3 //
4 // Author:
5 // Christopher Orr <dashboard@protactin.co.uk>
6 //
7 // Copyright 2004 by Christopher Orr
8 //
10 using System;
11 using System.IO;
12 using System.Diagnostics;
14 using Beagle.Util;
15 using Beagle.Daemon;
namespace Beagle.Filters {

	// Filter for "application/pdf" files.
	//
	// Metadata is read from the output of the external "pdfinfo" program and
	// the body text from "pdftotext".  If either program cannot be started a
	// warning is logged and Finished () is called without extracting anything.
	public class FilterPdf : Beagle.Daemon.Filter {

		public FilterPdf ()
		{
			AddSupportedFlavor (FilterFlavor.NewFromMimeType ("application/pdf"));
			SnippetMode = true;
		}

		// Starts "tool" with the given argument string, with standard output
		// redirected so the caller can read it and without going through
		// the shell.
		//
		// Returns the started process, or null if Process.Start () raised a
		// Win32Exception (which the callers report as the tool not being
		// in the path).  The caller owns the returned process and must
		// Close () it.
		private static Process StartPdfTool (string tool, string arguments)
		{
			Process pc = new Process ();
			pc.StartInfo.FileName = tool;
			pc.StartInfo.Arguments = arguments;
			pc.StartInfo.RedirectStandardInput = false;
			pc.StartInfo.RedirectStandardOutput = true;
			pc.StartInfo.UseShellExecute = false;

			try {
				pc.Start ();
			} catch (System.ComponentModel.Win32Exception) {
				// Nothing was started; drop the half-initialized object.
				pc.Close ();
				return null;
			}

			return pc;
		}

		// Runs pdfinfo on the file and turns the "Key: value" lines it
		// prints into properties.  Only Title, Author, Pages, Creator and
		// Producer are mapped; every other line is ignored.
		protected override void DoPullProperties ()
		{
			// FIXME: We probably need to quote special chars in the path
			Process pc = StartPdfTool ("pdfinfo",
						   String.Format (" \"{0}\"", FileInfo.FullName));
			if (pc == null) {
				Logger.Log.Warn ("Unable to find pdfinfo in path; PDF file not indexed.");
				Finished ();
				return;
			}

			// try/finally so the process handle is released even if reading
			// the output or adding a property throws.
			try {
				StreamReader pout = pc.StandardOutput;
				string line;

				while ((line = pout.ReadLine ()) != null) {
					// Split at the FIRST colon only: the value itself
					// may contain colons ("Title: Foo: Bar") and must
					// not be truncated.
					int colon = line.IndexOf (':');
					if (colon < 0)
						continue;

					string metaTag = null;
					bool isKeyword = false;

					switch (line.Substring (0, colon)) {
					case "Title":
						metaTag = "dc:title";
						break;
					case "Author":
						metaTag = "dc:author";
						break;
					case "Pages":
						metaTag = "fixme:page-count";
						isKeyword = true;
						break;
					case "Creator":
						metaTag = "dc:creator";
						break;
					case "Producer":
						metaTag = "dc:appname";
						break;
					}

					if (metaTag == null)
						continue;

					string text = line.Substring (colon + 1).Trim ();
					if (isKeyword)
						AddProperty (Beagle.Property.NewKeyword (metaTag, text));
					else
						AddProperty (Beagle.Property.New (metaTag, text));
				}

				pout.Close ();
				pc.WaitForExit ();
			} finally {
				pc.Close ();
			}
		}

		// Runs pdftotext on the file (UTF-8 output, no page-break
		// characters, text written to stdout) and appends every output line
		// followed by a structural break.  Calls Finished () once all the
		// output has been consumed.
		protected override void DoPull ()
		{
			// FIXME: We probably need to quote special chars in the path
			Process pc = StartPdfTool ("pdftotext",
						   String.Format ("-nopgbrk -enc UTF-8 \"{0}\" -", FileInfo.FullName));
			if (pc == null) {
				Logger.Log.Warn ("Unable to find pdftotext in path; PDF file not indexed.");
				Finished ();
				return;
			}

			// try/finally so the process handle is released even if reading
			// or appending the text throws.
			try {
				// Nice the process so we don't monopolize the CPU as much;
				// we might want to even set this to idle.  This is only
				// best-effort: the setter throws if pdftotext has already
				// exited or the priority cannot be changed, and neither
				// case should stop us from reading whatever it produced.
				try {
					pc.PriorityClass = ProcessPriorityClass.BelowNormal;
				} catch (InvalidOperationException) {
					// The process has already exited.
				} catch (System.ComponentModel.Win32Exception) {
					// The priority could not be changed.
				}

				StreamReader pout = pc.StandardOutput;

				// FIXME: I don't think this is really required
				// Line by line parsing, however, we have to make
				// sure, that "pdftotext" doesn't output any "New-lines".
				string line;
				while ((line = pout.ReadLine ()) != null) {
					AppendText (line);
					AppendStructuralBreak ();
				}

				pout.Close ();
				pc.WaitForExit ();
			} finally {
				pc.Close ();
			}

			Finished ();
		}
	}
}