2 // FilterPdf.cs: Very simplistic PDF filter
5 // Christopher Orr <dashboard@protactin.co.uk>
7 // Copyright 2004 by Christopher Orr
12 using System
.Diagnostics
;
17 namespace Beagle
.Filters
{
19 public class FilterPdf
: Beagle
.Daemon
.Filter
{
23 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/pdf"));
27 // FIXME: we should have a reasonable failure mode if pdftotext is not installed (i.e. cannot be found in the path).
// Extracts document metadata by running the external "pdfinfo" tool on
// FileInfo.FullName, reading its standard output line by line and turning
// recognised lines into properties via AddProperty.
30 protected override void DoPullProperties ()
32 // Create the external process that will run pdfinfo.
33 Process pc
= new Process ();
34 pc
.StartInfo
.FileName
= "pdfinfo";
35 // FIXME: We probably need to quote special chars in the path
// The full path is only wrapped in double quotes; nothing here escapes
// quote characters that may occur inside the path itself.
36 pc
.StartInfo
.Arguments
= String
.Format (" \"{0}\"", FileInfo
.FullName
);
// Only standard output is captured. Shell execution is disabled, which
// .NET requires for stream redirection.
37 pc
.StartInfo
.RedirectStandardInput
= false;
38 pc
.StartInfo
.RedirectStandardOutput
= true;
39 pc
.StartInfo
.UseShellExecute
= false;
// A Win32Exception is reported as "pdfinfo not found in path" and logged
// as a warning.
42 } catch (System
.ComponentModel
.Win32Exception
) {
43 Logger
.Log
.Warn ("Unable to find pdfinfo in path; PDF file not indexed.");
48 // Consume pdfinfo's output through the process's redirected stdout.
49 StreamReader pout
= pc
.StandardOutput
;
// Per-line scratch state for the parsing loop below.
51 string[] tokens
= null;
52 string strMetaTag
= null;
53 bool bKeyword
= false;
// Read until ReadLine returns null (end of the output stream).
55 while ((str
= pout
.ReadLine ()) != null) {
// NOTE(review): Split (':') breaks the line at every colon, so a value
// that itself contains ':' ends up in several tokens — confirm that the
// value is reassembled before it is stored.
58 tokens
= str
.Split (':');
// Lines without a ':' separator yield a single token and are ignored.
59 if (tokens
.Length
> 1) {
// Metadata tag names used for the recognised pdfinfo fields.
// NOTE(review): the condition guarding each assignment is not shown
// here — presumably keyed on tokens[0]; confirm.
62 strMetaTag
= "dc:title";
65 strMetaTag
= "dc:author";
68 strMetaTag
= "fixme:page-count";
72 strMetaTag
= "dc:creator";
75 strMetaTag
= "dc:appname";
// Only emit a property when a tag was selected for this line.
78 if (strMetaTag
!= null) {
// Two property flavours are used: NewKeyword and New.
// presumably bKeyword chooses between them — TODO confirm.
80 AddProperty (Beagle
.Property
.NewKeyword (strMetaTag
,
83 AddProperty (Beagle
.Property
.New (strMetaTag
,
// Extracts the document text by running the external "pdftotext" tool on
// FileInfo.FullName and reading its standard output line by line.
94 protected override void DoPull ()
96 // Create the external process that will run pdftotext.
97 Process pc
= new Process ();
98 pc
.StartInfo
.FileName
= "pdftotext";
99 // FIXME: We probably need to quote special chars in the path
// Per the pdftotext man page: "-nopgbrk" suppresses page-break characters,
// "-enc UTF-8" selects the output encoding, and the trailing "-" sends the
// text to stdout instead of a file. The path is only wrapped in double
// quotes; quote characters inside the path are not escaped here.
100 pc
.StartInfo
.Arguments
= String
.Format ("-nopgbrk -enc UTF-8 \"{0}\" -", FileInfo
.FullName
);
// Only standard output is captured. Shell execution is disabled, which
// .NET requires for stream redirection.
101 pc
.StartInfo
.RedirectStandardInput
= false;
102 pc
.StartInfo
.RedirectStandardOutput
= true;
103 pc
.StartInfo
.UseShellExecute
= false;
// A Win32Exception is reported as "pdftotext not found in path" and logged
// as a warning.
106 } catch (System
.ComponentModel
.Win32Exception
) {
107 Logger
.Log
.Warn ("Unable to find pdftotext in path; PDF file not indexed.");
112 // Lower the process priority so we don't monopolize the CPU as much; we might even want to set this to idle
113 pc
.PriorityClass
= ProcessPriorityClass
.BelowNormal
;
115 // Consume pdftotext's output through the process's redirected stdout.
116 StreamReader pout
= pc
.StandardOutput
;
118 // FIXME: I don't think this is really required.
119 // We parse the output line by line; however, we have to make
120 // sure that "pdftotext" doesn't output any stray "New-lines".
// Read until ReadLine returns null (end of the output stream).
122 while ((str
= pout
.ReadLine()) != null) {
// Appears to run once per line read, so each output line ends with a
// structural break — NOTE(review): confirm against the full loop body.
124 AppendStructuralBreak ();