2 // FilterPdf.cs: Very simplistic PDF filter
5 // Christopher Orr <dashboard@protactin.co.uk>
7 // Copyright 2004 by Christopher Orr
12 using System
.Diagnostics
;
17 namespace Beagle
.Filters
{
19 public class FilterPdf
: Beagle
.Daemon
.Filter
{
23 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/pdf"));
27 // FIXME: we should have a reasonable failure mode if pdftotext is not installed.
30 protected override void DoPullProperties ()
32 // create new external process
33 SafeProcess pc
= new SafeProcess ();
34 pc
.Arguments
= new string [] { "pdfinfo", FileInfo.FullName }
;
35 pc
.RedirectStandardOutput
= true;
36 pc
.RedirectStandardError
= true;
40 } catch (SafeProcessException e
) {
46 // add pdfinfo's output to pool
47 StreamReader pout
= new StreamReader (pc
.StandardOutput
);
49 string[] tokens
= null;
50 string strMetaTag
= null;
51 bool bKeyword
= false;
53 while ((str
= pout
.ReadLine ()) != null) {
56 tokens
= str
.Split (':');
57 if (tokens
.Length
> 1) {
60 strMetaTag
= "dc:title";
63 strMetaTag
= "dc:author";
66 strMetaTag
= "fixme:page-count";
70 strMetaTag
= "dc:creator";
73 strMetaTag
= "dc:appname";
76 if (strMetaTag
!= null) {
78 AddProperty (Beagle
.Property
.NewUnsearched (strMetaTag
,
81 AddProperty (Beagle
.Property
.New (strMetaTag
,
89 // Log any errors or warnings from stderr
90 pout
= new StreamReader (pc
.StandardError
);
91 while ((str
= pout
.ReadLine ()) != null)
92 Log
.Warn ("pdfinfo [{0}]: {1}", Uri
, str
);
98 protected override void DoPull ()
100 // create new external process
101 SafeProcess pc
= new SafeProcess ();
102 pc
.Arguments
= new string [] { "pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", FileInfo.FullName, "-" }
;
103 pc
.RedirectStandardOutput
= true;
104 pc
.RedirectStandardError
= true;
108 } catch (SafeProcessException e
) {
109 Log
.Warn (e
.Message
);
114 // add pdftotext's output to pool
115 StreamReader pout
= new StreamReader (pc
.StandardOutput
);
117 // FIXME: I don't think this line-by-line parsing is really
118 // required; however, we have to make sure that "pdftotext"
119 // doesn't output any newlines.
121 while ((str
= pout
.ReadLine()) != null) {
123 AppendStructuralBreak ();
124 if (! AllowMoreWords ())
129 pout
= new StreamReader (pc
.StandardError
);
130 while ((str
= pout
.ReadLine ()) != null)
131 Log
.Warn ("pdftotext [{0}]: {1}", Uri
, str
);