Don't throw EncodingFoundException unless asked to. Should remove the occasional...
[beagle.git] / Filters / FilterPdf.cs
blob782e738a14bbe235e9712fc1cf056892f722ca68
1 //
2 // FilterPdf.cs: Very simplistic PDF filter
3 //
4 // Author:
5 // Christopher Orr <dashboard@protactin.co.uk>
6 //
7 // Copyright 2004 by Christopher Orr
8 //
10 using System;
11 using System.IO;
12 using System.Diagnostics;
14 using Beagle.Util;
15 using Beagle.Daemon;
17 namespace Beagle.Filters {
19 public class FilterPdf : Beagle.Daemon.Filter {
// Constructor: registers the "application/pdf" MIME type as a supported
// flavor and then sets SnippetMode to true.
public FilterPdf ()
{
	FilterFlavor pdfFlavor = FilterFlavor.NewFromMimeType ("application/pdf");
	AddSupportedFlavor (pdfFlavor);

	this.SnippetMode = true;
}
27 // FIXME: we should have a reasonable failure mode if pdftotext is
28 // not installed.
// Extracts document metadata by running the external "pdfinfo" tool on
// the file and turning selected "Key: value" output lines into properties:
//
//   Title    -> dc:title
//   Author   -> dc:author
//   Pages    -> fixme:page-count (added with Property.NewUnsearched)
//   Creator  -> dc:creator
//   Producer -> dc:appname
//
// Lines without a ':' and lines with any other key are ignored.
// If the process cannot be started, the SafeProcessException message is
// logged as a warning, Error () is called and the method returns.
protected override void DoPullProperties ()
{
	// create new external process
	SafeProcess pc = new SafeProcess ();
	pc.Arguments = new string [] { "pdfinfo", FileInfo.FullName };
	pc.RedirectStandardOutput = true;

	try {
		pc.Start ();
	} catch (SafeProcessException e) {
		Log.Warn (e.Message);
		Error ();
		return;
	}

	// add pdfinfo's output to pool
	StreamReader pout = null;

	try {
		pout = new StreamReader (pc.StandardOutput);

		string str;
		while ((str = pout.ReadLine ()) != null) {
			// Split on the FIRST ':' only.  The value part may itself
			// contain colons (e.g. a title such as "Foo: Bar"); splitting
			// on every colon would silently truncate it to "Foo".
			int sep = str.IndexOf (':');
			if (sep < 0)
				continue;

			string strMetaTag = null;
			bool bKeyword = false;

			switch (str.Substring (0, sep)) {
			case "Title":
				strMetaTag = "dc:title";
				break;
			case "Author":
				strMetaTag = "dc:author";
				break;
			case "Pages":
				strMetaTag = "fixme:page-count";
				bKeyword = true;
				break;
			case "Creator":
				strMetaTag = "dc:creator";
				break;
			case "Producer":
				strMetaTag = "dc:appname";
				break;
			}

			if (strMetaTag == null)
				continue;

			// pdfinfo pads values with spaces after the colon.
			string strValue = str.Substring (sep + 1).Trim ();

			if (bKeyword)
				AddProperty (Beagle.Property.NewUnsearched (strMetaTag, strValue));
			else
				AddProperty (Beagle.Property.New (strMetaTag, strValue));
		}
	} finally {
		// Release the reader and the child process even if reading or
		// AddProperty throws; same close order as before.
		if (pout != null)
			pout.Close ();
		pc.Close ();
	}
}
// Extracts the document text by running the external "pdftotext" tool
// (quiet, no page breaks, UTF-8 output written to stdout) and feeding its
// output to the filter one line at a time, with a structural break after
// each line.  Reading stops early once AllowMoreWords () returns false.
// Finished () is called after the output has been consumed.
//
// If the process cannot be started, the SafeProcessException message is
// logged as a warning, Error () is called and the method returns without
// calling Finished ().
protected override void DoPull ()
{
	// create new external process
	SafeProcess pc = new SafeProcess ();
	pc.Arguments = new string [] { "pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", FileInfo.FullName, "-" };
	pc.RedirectStandardOutput = true;

	try {
		pc.Start ();
	} catch (SafeProcessException e) {
		Log.Warn (e.Message);
		Error ();
		return;
	}

	// add pdftotext's output to pool
	StreamReader pout = null;

	try {
		pout = new StreamReader (pc.StandardOutput);

		// FIXME: I don't think this is really required.
		// Line by line parsing; however, we have to make
		// sure that "pdftotext" doesn't output any "New-lines".
		string str;
		while ((str = pout.ReadLine ()) != null) {
			AppendText (str);
			AppendStructuralBreak ();
			if (! AllowMoreWords ())
				break;
		}
	} finally {
		// Release the reader and the child process even if reading or
		// appending throws; same close order as before.
		if (pout != null)
			pout.Close ();
		pc.Close ();
	}

	// Reached only on normal completion, exactly as before.
	Finished ();
}