(Back)port some changes from beagle-lucene-1-9-lockfile-branch: allow ext: queries...
[beagle.git] / Filters / FilterPdf.cs
blob7066208b0ad306c2422c579712392e5d5900788a
1 //
2 // FilterPdf.cs: Very simplistic PDF filter
3 //
4 // Author:
5 // Christopher Orr <dashboard@protactin.co.uk>
6 //
7 // Copyright 2004 by Christopher Orr
8 //
10 using System;
11 using System.IO;
12 using System.Diagnostics;
14 using Beagle.Util;
15 using Beagle.Daemon;
17 namespace Beagle.Filters {
19 public class FilterPdf : Beagle.Daemon.Filter {
/// <summary>
/// Creates the PDF filter: registers "application/pdf" as a supported
/// flavor and switches snippet mode on.
/// </summary>
public FilterPdf ()
{
	FilterFlavor pdfFlavor = FilterFlavor.NewFromMimeType ("application/pdf");
	AddSupportedFlavor (pdfFlavor);

	SnippetMode = true;
}
27 // FIXME: we should have a reasonable failure mode if pdftotext is
28 // not installed.
/// <summary>
/// Runs "pdfinfo" on the file being filtered and turns selected
/// "Key: value" lines of its standard output into properties.
/// </summary>
/// <remarks>
/// Recognized keys are Title, Author, Pages, Creator and Producer.
/// "Pages" is stored with Property.NewUnsearched, the others with
/// Property.New.  Everything pdfinfo writes to standard error is logged
/// as a warning.  If the process cannot be started, the failure is
/// logged and Error () is called.
/// </remarks>
protected override void DoPullProperties ()
{
	// create new external process
	SafeProcess pc = new SafeProcess ();
	pc.Arguments = new string [] { "pdfinfo", FileInfo.FullName };
	pc.RedirectStandardOutput = true;
	pc.RedirectStandardError = true;

	try {
		pc.Start ();
	} catch (SafeProcessException e) {
		Log.Warn (e.Message);
		Error ();
		return;
	}

	// From here on the process is running; make sure it and the
	// readers get closed even if reading or AddProperty throws.
	try {
		string str;

		// add pdfinfo's output to pool
		StreamReader pout = new StreamReader (pc.StandardOutput);
		try {
			while ((str = pout.ReadLine ()) != null) {
				// Split at the FIRST colon only.  Values such as
				// "Title: Foo: Bar" contain colons themselves and
				// must not be truncated at the second one.
				int colon = str.IndexOf (':');
				if (colon < 0)
					continue;

				string strMetaTag = null;
				bool bKeyword = false;

				switch (str.Substring (0, colon)) {
				case "Title":
					strMetaTag = "dc:title";
					break;
				case "Author":
					strMetaTag = "dc:author";
					break;
				case "Pages":
					strMetaTag = "fixme:page-count";
					bKeyword = true;
					break;
				case "Creator":
					strMetaTag = "dc:creator";
					break;
				case "Producer":
					strMetaTag = "dc:appname";
					break;
				}

				if (strMetaTag == null)
					continue;

				string strValue = str.Substring (colon + 1).Trim ();
				if (bKeyword)
					AddProperty (Beagle.Property.NewUnsearched (strMetaTag, strValue));
				else
					AddProperty (Beagle.Property.New (strMetaTag, strValue));
			}
		} finally {
			pout.Close ();
		}

		// Log any errors or warnings from stderr
		StreamReader perr = new StreamReader (pc.StandardError);
		try {
			while ((str = perr.ReadLine ()) != null)
				Log.Warn ("pdfinfo [{0}]: {1}", Uri, str);
		} finally {
			perr.Close ();
		}
	} finally {
		pc.Close ();
	}
}
/// <summary>
/// Runs "pdftotext" on the file being filtered and appends the text it
/// writes to standard output, one line at a time.
/// </summary>
/// <remarks>
/// A structural break is appended after every line, and reading stops
/// as soon as AllowMoreWords () returns false.  Everything pdftotext
/// writes to standard error is logged as a warning.  If the process
/// cannot be started, the failure is logged, Error () is called and
/// Finished () is not.
/// </remarks>
protected override void DoPull ()
{
	// create new external process
	SafeProcess pc = new SafeProcess ();
	pc.Arguments = new string [] { "pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", FileInfo.FullName, "-" };
	pc.RedirectStandardOutput = true;
	pc.RedirectStandardError = true;

	try {
		pc.Start ();
	} catch (SafeProcessException e) {
		Log.Warn (e.Message);
		Error ();
		return;
	}

	// From here on the process is running; make sure it and the
	// readers get closed even if reading or AppendText throws.
	try {
		string str;

		// add pdftotext's output to pool
		StreamReader pout = new StreamReader (pc.StandardOutput);
		try {
			// FIXME: line-by-line parsing is probably not really
			// required; it relies on pdftotext emitting a newline
			// between the pieces of text we want to separate.
			while ((str = pout.ReadLine ()) != null) {
				AppendText (str);
				AppendStructuralBreak ();
				if (! AllowMoreWords ())
					break;
			}
		} finally {
			pout.Close ();
		}

		// Log any errors or warnings from stderr
		StreamReader perr = new StreamReader (pc.StandardError);
		try {
			while ((str = perr.ReadLine ()) != null)
				Log.Warn ("pdftotext [{0}]: {1}", Uri, str);
		} finally {
			perr.Close ();
		}
	} finally {
		pc.Close ();
	}

	// Only reached when extraction completed without an exception.
	Finished ();
}