2 // FilterPdf.cs: Very simplistic PDF filter
5 // Christopher Orr <dashboard@protactin.co.uk>
7 // Copyright 2004 by Christopher Orr
12 using System
.Diagnostics
;
17 namespace Beagle
.Filters
{
19 public class FilterPdf
: Beagle
.Daemon
.Filter
{
23 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/pdf"));
27 // FIXME: we should have a reasonable failure mode if pdftotext is
30 protected override void DoPullProperties ()
32 // create new external process
33 SafeProcess pc
= new SafeProcess ();
34 pc
.Arguments
= new string [] { "pdfinfo", FileInfo.FullName }
;
35 pc
.RedirectStandardOutput
= true;
39 } catch (SafeProcessException e
) {
45 // add pdfinfo's output to pool
46 StreamReader pout
= new StreamReader (pc
.StandardOutput
);
48 string[] tokens
= null;
49 string strMetaTag
= null;
50 bool bKeyword
= false;
52 while ((str
= pout
.ReadLine ()) != null) {
55 tokens
= str
.Split (':');
56 if (tokens
.Length
> 1) {
59 strMetaTag
= "dc:title";
62 strMetaTag
= "dc:author";
65 strMetaTag
= "fixme:page-count";
69 strMetaTag
= "dc:creator";
72 strMetaTag
= "dc:appname";
75 if (strMetaTag
!= null) {
77 AddProperty (Beagle
.Property
.NewUnsearched (strMetaTag
,
80 AddProperty (Beagle
.Property
.New (strMetaTag
,
90 protected override void DoPull ()
92 // create new external process
93 SafeProcess pc
= new SafeProcess ();
94 pc
.Arguments
= new string [] { "pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", FileInfo.FullName, "-" }
;
95 pc
.RedirectStandardOutput
= true;
99 } catch (SafeProcessException e
) {
100 Log
.Warn (e
.Message
);
105 // add pdftotext's output to pool
106 StreamReader pout
= new StreamReader (pc
.StandardOutput
);
108 // FIXME: I don't think line-by-line parsing is really required here.
109 // However, before dropping it, we have to make sure that "pdftotext"
110 // doesn't output any newlines.
112 while ((str
= pout
.ReadLine()) != null) {
114 AppendStructuralBreak ();
115 if (! AllowMoreWords ())