openexr: make architecture independent
[oi-userland.git] / components / library / indri / patches / remove_xpdf.patch
blob 229eafc5265a2c3002841811b34b10b1fbb0e6b8
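1 Remove xpdf support from the build: drop xpdf from DEPENDENCIES in MakeDefns.in, delete PDFDocumentExtractor (.cpp and .hpp), and remove the "pdf" type from DocumentIteratorFactory and FileClassEnvironmentFactory.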
2 --- indri-5.4/MakeDefns.in čt črc 4 15:01:17 2013
3 +++ indri-5.4/MakeDefns.in čt črc 4 15:00:40 2013
4 @@ -48,7 +48,7 @@
5 PHPINCLUDE = @PHPINCLUDE@
6 MCS=@MCS@
8 -DEPENDENCIES = lemur xpdf
9 +DEPENDENCIES = lemur
10 ifeq ($(NEED_ANTLR), 1)
11 DEPENDENCIES += antlr
12 endif
13 --- indri-5.4/src/PDFDocumentExtractor.cpp čt črc 4 15:08:46 2013
14 +++ indri-5.4/src/PDFDocumentExtractor.cpp čt črc 4 15:08:28 2013
15 @@ -1,214 +1,0 @@
16 -/*==========================================================================
17 - * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved.
18 - *
19 - * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
20 - * is subject to the terms of the software license set forth in the LICENSE
21 - * file included with this software, and also available at
22 - * http://www.lemurproject.org/license.html
23 - *
24 - *==========================================================================
25 -*/
28 -//
29 -// PDFDocumentExtractor
30 -//
31 -// 25 June 2004 -- tds
32 -//
34 -#include "indri/PDFDocumentExtractor.hpp"
35 -#include "indri/Buffer.hpp"
37 -#include "GString.h"
38 -#include "TextOutputDev.h"
39 -#include "PDFDoc.h"
41 -#include "Object.h"
42 -#include "Stream.h"
43 -#include "Array.h"
44 -#include "Dict.h"
45 -#include "XRef.h"
46 -#include "Page.h"
47 -#include "CharTypes.h"
48 -#include "GlobalParams.h"
49 -#include "lemur/Exception.hpp"
51 -static void buffer_write( void* stream, char* text, int len ) {
52 - indri::utility::Buffer* buffer = (indri::utility::Buffer*) stream;
54 - if( buffer->position() ) {
55 - buffer->unwrite(1);
56 - }
58 - memcpy( buffer->write(len), text, len );
59 - if( text[len-1] != 0 )
60 - *buffer->write(1) = 0;
63 -indri::parse::PDFDocumentExtractor::PDFDocumentExtractor() {
64 - globalParams = new GlobalParams(0);
65 - _title="";
66 - _author="";
69 -indri::parse::PDFDocumentExtractor::~PDFDocumentExtractor() {
70 - delete globalParams;
71 - globalParams = 0;
75 -void indri::parse::PDFDocumentExtractor::seekValue(indri::xml::XMLNode* node, std::string &metaTag) {
76 - if (node == NULL) {
77 - return;
78 - }
80 - const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
81 - for( size_t i=0; i<children.size(); i++ ) {
82 - indri::xml::XMLNode* child = children[i];
83 - metaTag = child->getValue();
84 - if(metaTag.length()==0)
85 - seekValue(child,metaTag);
86 - else
87 - return;
88 - }
92 -void indri::parse::PDFDocumentExtractor::appendPdfMetaData(indri::xml::XMLNode* node) {
93 - indri::xml::XMLNode* current = 0;
95 - if (node == NULL) {
96 - return;
97 - }
99 - const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
101 - for( size_t i=0; i<children.size(); i++ ) {
102 - indri::xml::XMLNode* child = children[i];
103 - std::string name = child->getName();
104 - if(name=="dccreator")
106 - seekValue(child,_author);
108 - if(name=="dctitle")
110 - seekValue(child,_title);
112 - appendPdfMetaData(child);
120 -void indri::parse::PDFDocumentExtractor::open( const std::string& filename ) {
121 - _documentTextBuffer.clear();
122 - _documentPath = filename;
125 -void indri::parse::PDFDocumentExtractor::close() {
126 - _documentPath = "";
129 -indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
130 - if( !_documentPath.length() )
131 - return 0;
133 - PDFDoc* doc = 0;
134 - TextOutputDev* textOut = 0;
135 - GString* gfilename = new GString(_documentPath.c_str());
136 - doc = new PDFDoc( gfilename );
137 - // if the doc is not ok, or ok to copy, it
138 - // will be a document of length 0.
139 - if( doc->isOk() && doc->okToCopy() ) {
140 - void* stream = &_documentTextBuffer;
141 - textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
142 - if ( textOut->isOk() ) {
143 - int firstPage = 1;
144 - int lastPage = doc->getNumPages();
145 - double hDPI=72.0;
146 - double vDPI=72.0;
147 - int rotate=0;
148 - GBool useMediaBox=gFalse;
149 - GBool crop=gTrue;
150 - GBool printing=gFalse;
151 - if(doc->readMetadata()!=NULL)
153 - GString rawMetaData = doc->readMetadata();
154 - GString preparedMetaData="";
156 - //zoek <rdf:RDF en eindig bij </rdf:RDF>!!
157 - for(int x=0; x<rawMetaData.getLength(); x++) {
158 - if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
159 - //skip characters which the XMLReader doesn't understand
160 - preparedMetaData.append(rawMetaData.getChar(x));
163 - std::string metaData(preparedMetaData.getCString());
164 - int startbegin = metaData.find("<rdf");
165 - int stopend = metaData.find(">", metaData.rfind("</rdf") );
166 - metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
169 - indri::xml::XMLReader reader;
171 - try {
172 - std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
173 - appendPdfMetaData( result.get() );
174 - } catch( lemur::api::Exception& e ) {
175 - LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
176 - }
177 - if( _author.length()>0 || _title.length()>0 )
179 - std::string createdPdfHeader;
180 - createdPdfHeader="<head>\n";
181 - if(_title.length()>0) {
182 - createdPdfHeader+="<title>";
183 - createdPdfHeader+=_title;
184 - createdPdfHeader+="</title>\n";
186 - if(_author.length()>0) {
187 - createdPdfHeader+="<author>";
188 - createdPdfHeader+=_author;
189 - createdPdfHeader+="</author>\n";
191 - createdPdfHeader+="</head>\n";
192 - char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
193 - strcpy(metastream, createdPdfHeader.c_str());
196 - doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
201 - delete textOut;
202 - delete doc;
204 - _unparsedDocument.textLength = _documentTextBuffer.position();
205 - _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
206 - char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
207 - strcpy( docnoPoint, _documentPath.c_str() );
208 - _unparsedDocument.text = _documentTextBuffer.front();
209 - _unparsedDocument.content = _documentTextBuffer.front();
210 - _unparsedDocument.metadata.clear();
212 - indri::parse::MetadataPair pair;
214 - pair.key = "path";
215 - pair.value = docnoPoint;
216 - pair.valueLength = _documentPath.length()+1;
217 - _unparsedDocument.metadata.push_back( pair );
219 - _docnostring.assign(_documentPath.c_str() );
220 - cleanDocno();
221 - pair.value = _docnostring.c_str();
222 - pair.valueLength = _docnostring.length()+1;
223 - pair.key = "docno";
224 - _unparsedDocument.metadata.push_back( pair );
226 - _documentPath = "";
228 - return &_unparsedDocument;
230 --- indri-5.4/include/indri/PDFDocumentExtractor.hpp čt črc 4 15:16:04 2013
231 +++ indri-5.4/include/indri/PDFDocumentExtractor.hpp čt črc 4 15:15:00 2013
232 @@ -1,57 +1,0 @@
233 -/*==========================================================================
234 - * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved.
236 - * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
237 - * is subject to the terms of the software license set forth in the LICENSE
238 - * file included with this software, and also available at
239 - * http://www.lemurproject.org/license.html
241 - *==========================================================================
242 - */
246 -// PDFDocumentExtractor
248 -// 25 June 2004 -- tds
251 -#ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP
252 -#define INDRI_PDFDOCUMENTEXTRACTOR_HPP
254 -#include "lemur/lemur-compat.hpp"
255 -#include "indri/Buffer.hpp"
256 -#include "indri/UnparsedDocument.hpp"
257 -#include "indri/DocumentIterator.hpp"
258 -#include "indri/XMLReader.hpp"
259 -#include "indri/XMLNode.hpp"
260 -#include "indri/XMLWriter.hpp"
261 -#include <string>
262 -namespace indri
264 - namespace parse
267 - class PDFDocumentExtractor : public DocumentIterator {
268 - indri::utility::Buffer _documentTextBuffer;
269 - UnparsedDocument _unparsedDocument;
270 - std::string _documentPath;
272 - public:
273 - PDFDocumentExtractor();
274 - ~PDFDocumentExtractor();
276 - void open( const std::string& filename );
277 - UnparsedDocument* nextDocument();
278 - void appendPdfMetaData(indri::xml::XMLNode* node);
279 - void seekValue(indri::xml::XMLNode* node, std::string &metaTag);
280 - void close();
281 - private:
282 - std::string _title;
283 - std::string _author;
285 - };
289 -#endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP
290 --- indri-5.4/src/DocumentIteratorFactory.cpp čt črc 4 15:24:24 2013
291 +++ indri-5.4/src/DocumentIteratorFactory.cpp čt črc 4 15:23:27 2013
292 @@ -18,7 +18,6 @@
294 #include "indri/DocumentIteratorFactory.hpp"
296 -#include "indri/PDFDocumentExtractor.hpp"
297 #include "indri/TaggedDocumentIterator.hpp"
298 #include "indri/WARCDocumentIterator.hpp"
299 #include "indri/TextDocumentExtractor.hpp"
300 @@ -36,7 +35,6 @@
302 #define TYPE_TAGGED ( "Tagged Document Collection" )
303 #define TYPE_WARC ( "WARC Document Collection" )
304 -#define TYPE_PDF ( "Adobe PDF" )
305 #define TYPE_WORD ( "Microsoft Word" )
306 #define TYPE_PPT ( "Microsoft PowerPoint" )
307 #define TYPE_MBOX ( "Mailbox" )
308 @@ -53,8 +51,6 @@
309 result = iter;
310 } else if( preferred == TYPE_WARC ) {
311 result = new indri::parse::WARCDocumentIterator();
312 - } else if( preferred == TYPE_PDF ) {
313 - result = new indri::parse::PDFDocumentExtractor();
314 } else if( preferred == TYPE_TEXT ) {
315 result = new indri::parse::TextDocumentExtractor();
316 } else if( preferred == TYPE_MBOX ) {
317 @@ -83,8 +79,6 @@
318 return TYPE_TAGGED;
319 } else if( type == "warc" || type == TYPE_WARC ) {
320 return TYPE_WARC;
321 - } else if( type == "pdf" || type == "adobe pdf" || type == TYPE_PDF ) {
322 - return TYPE_PDF;
323 } else if( type == "doc" || type == "msword" || type == "word" || type == "microsoft word" || type == TYPE_WORD ) {
324 return TYPE_WORD;
325 } else if( type == "ppt" || type == "powerpoint" || type == "msppt" || type == "microsoft powerpoint" || type == TYPE_PPT ) {
326 --- indri-5.4/src/FileClassEnvironmentFactory.cpp čt črc 4 15:33:56 2013
327 +++ indri-5.4/src/FileClassEnvironmentFactory.cpp čt črc 4 15:33:20 2013
328 @@ -55,8 +55,6 @@
329 // case. Values specified here can be in mixed case, since values are
330 // matched in a case-sensitive manner.
332 -static const char* pdf_index_tags[] = { "title", "author", 0 };
333 -static const char* pdf_metadata_tags[] = { "title", "author", 0 };
334 static const char* html_index_tags[] = { "title", "author", "h1", "h2", "h3", "h4", 0 };
335 static const char* html_metadata_tags[] = { "title", "author", 0 };
336 //static const char* html_conflations[] = { "h1", NULL, NULL, "heading", "h2", NULL, NULL, "heading", "h3", NULL, NULL, "heading", "h4", NULL, NULL, "heading", "bloghpno", NULL, NULL, "docno", 0, 0, 0, 0 };
337 @@ -279,21 +277,6 @@
338 #endif
341 - "pdf", // name
342 - "html", // parser
343 - "word", // tokenizer
344 - "pdf", // iterator
345 - NULL, // startDocTag
346 - NULL, // endDocTag
347 - NULL, // endMetadataTag
348 - NULL, // includeTags
349 - NULL, // excludeTags
350 - pdf_index_tags, // indexTags
351 - pdf_metadata_tags, // metadataTags
352 - NULL // conflations
353 - },
356 "txt", // name
357 "text", // parser
358 "word", // tokenizer