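1 Remove xpdf support from the build: drop xpdf from DEPENDENCIES in MakeDefns.in, delete PDFDocumentExtractor (.cpp/.hpp), and remove the "Adobe PDF" type from DocumentIteratorFactory and FileClassEnvironmentFactory.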
2 --- indri-5.4/MakeDefns.in Ät Ärc 4 15:01:17 2013
3 +++ indri-5.4/MakeDefns.in Ät Ärc 4 15:00:40 2013
5 PHPINCLUDE = @PHPINCLUDE@
8 -DEPENDENCIES = lemur xpdf
10 ifeq ($(NEED_ANTLR), 1)
13 --- indri-5.4/src/PDFDocumentExtractor.cpp Ät Ärc 4 15:08:46 2013
14 +++ indri-5.4/src/PDFDocumentExtractor.cpp Ät Ärc 4 15:08:28 2013
16 -/*==========================================================================
17 - * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved.
19 - * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
20 - * is subject to the terms of the software license set forth in the LICENSE
21 - * file included with this software, and also available at
22 - * http://www.lemurproject.org/license.html
24 - *==========================================================================
29 -// PDFDocumentExtractor
31 -// 25 June 2004 -- tds
34 -#include "indri/PDFDocumentExtractor.hpp"
35 -#include "indri/Buffer.hpp"
38 -#include "TextOutputDev.h"
47 -#include "CharTypes.h"
48 -#include "GlobalParams.h"
49 -#include "lemur/Exception.hpp"
51 -static void buffer_write( void* stream, char* text, int len ) {
52 - indri::utility::Buffer* buffer = (indri::utility::Buffer*) stream;
54 - if( buffer->position() ) {
58 - memcpy( buffer->write(len), text, len );
59 - if( text[len-1] != 0 )
60 - *buffer->write(1) = 0;
63 -indri::parse::PDFDocumentExtractor::PDFDocumentExtractor() {
64 - globalParams = new GlobalParams(0);
69 -indri::parse::PDFDocumentExtractor::~PDFDocumentExtractor() {
70 - delete globalParams;
75 -void indri::parse::PDFDocumentExtractor::seekValue(indri::xml::XMLNode* node, std::string &metaTag) {
80 - const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
81 - for( size_t i=0; i<children.size(); i++ ) {
82 - indri::xml::XMLNode* child = children[i];
83 - metaTag = child->getValue();
84 - if(metaTag.length()==0)
85 - seekValue(child,metaTag);
92 -void indri::parse::PDFDocumentExtractor::appendPdfMetaData(indri::xml::XMLNode* node) {
93 - indri::xml::XMLNode* current = 0;
99 - const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
101 - for( size_t i=0; i<children.size(); i++ ) {
102 - indri::xml::XMLNode* child = children[i];
103 - std::string name = child->getName();
104 - if(name=="dccreator")
106 - seekValue(child,_author);
108 - if(name=="dctitle")
110 - seekValue(child,_title);
112 - appendPdfMetaData(child);
120 -void indri::parse::PDFDocumentExtractor::open( const std::string& filename ) {
121 - _documentTextBuffer.clear();
122 - _documentPath = filename;
125 -void indri::parse::PDFDocumentExtractor::close() {
126 - _documentPath = "";
129 -indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
130 - if( !_documentPath.length() )
134 - TextOutputDev* textOut = 0;
135 - GString* gfilename = new GString(_documentPath.c_str());
136 - doc = new PDFDoc( gfilename );
137 - // if the doc is not ok, or ok to copy, it
138 - // will be a document of length 0.
139 - if( doc->isOk() && doc->okToCopy() ) {
140 - void* stream = &_documentTextBuffer;
141 - textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
142 - if ( textOut->isOk() ) {
144 - int lastPage = doc->getNumPages();
148 - GBool useMediaBox=gFalse;
150 - GBool printing=gFalse;
151 - if(doc->readMetadata()!=NULL)
153 - GString rawMetaData = doc->readMetadata();
154 - GString preparedMetaData="";
156 - //zoek <rdf:RDF en eindig bij </rdf:RDF>!!
157 - for(int x=0; x<rawMetaData.getLength(); x++) {
158 - if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
159 - //skip characters which the XMLReader doesn't understand
160 - preparedMetaData.append(rawMetaData.getChar(x));
163 - std::string metaData(preparedMetaData.getCString());
164 - int startbegin = metaData.find("<rdf");
165 - int stopend = metaData.find(">", metaData.rfind("</rdf") );
166 - metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
169 - indri::xml::XMLReader reader;
172 - std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
173 - appendPdfMetaData( result.get() );
174 - } catch( lemur::api::Exception& e ) {
175 - LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
177 - if( _author.length()>0 || _title.length()>0 )
179 - std::string createdPdfHeader;
180 - createdPdfHeader="<head>\n";
181 - if(_title.length()>0) {
182 - createdPdfHeader+="<title>";
183 - createdPdfHeader+=_title;
184 - createdPdfHeader+="</title>\n";
186 - if(_author.length()>0) {
187 - createdPdfHeader+="<author>";
188 - createdPdfHeader+=_author;
189 - createdPdfHeader+="</author>\n";
191 - createdPdfHeader+="</head>\n";
192 - char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
193 - strcpy(metastream, createdPdfHeader.c_str());
196 - doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
204 - _unparsedDocument.textLength = _documentTextBuffer.position();
205 - _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
206 - char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
207 - strcpy( docnoPoint, _documentPath.c_str() );
208 - _unparsedDocument.text = _documentTextBuffer.front();
209 - _unparsedDocument.content = _documentTextBuffer.front();
210 - _unparsedDocument.metadata.clear();
212 - indri::parse::MetadataPair pair;
215 - pair.value = docnoPoint;
216 - pair.valueLength = _documentPath.length()+1;
217 - _unparsedDocument.metadata.push_back( pair );
219 - _docnostring.assign(_documentPath.c_str() );
221 - pair.value = _docnostring.c_str();
222 - pair.valueLength = _docnostring.length()+1;
223 - pair.key = "docno";
224 - _unparsedDocument.metadata.push_back( pair );
226 - _documentPath = "";
228 - return &_unparsedDocument;
230 --- indri-5.4/include/indri/PDFDocumentExtractor.hpp Ät Ärc 4 15:16:04 2013
231 +++ indri-5.4/include/indri/PDFDocumentExtractor.hpp Ät Ärc 4 15:15:00 2013
233 -/*==========================================================================
234 - * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved.
236 - * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
237 - * is subject to the terms of the software license set forth in the LICENSE
238 - * file included with this software, and also available at
239 - * http://www.lemurproject.org/license.html
241 - *==========================================================================
246 -// PDFDocumentExtractor
248 -// 25 June 2004 -- tds
251 -#ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP
252 -#define INDRI_PDFDOCUMENTEXTRACTOR_HPP
254 -#include "lemur/lemur-compat.hpp"
255 -#include "indri/Buffer.hpp"
256 -#include "indri/UnparsedDocument.hpp"
257 -#include "indri/DocumentIterator.hpp"
258 -#include "indri/XMLReader.hpp"
259 -#include "indri/XMLNode.hpp"
260 -#include "indri/XMLWriter.hpp"
267 - class PDFDocumentExtractor : public DocumentIterator {
268 - indri::utility::Buffer _documentTextBuffer;
269 - UnparsedDocument _unparsedDocument;
270 - std::string _documentPath;
273 - PDFDocumentExtractor();
274 - ~PDFDocumentExtractor();
276 - void open( const std::string& filename );
277 - UnparsedDocument* nextDocument();
278 - void appendPdfMetaData(indri::xml::XMLNode* node);
279 - void seekValue(indri::xml::XMLNode* node, std::string &metaTag);
282 - std::string _title;
283 - std::string _author;
289 -#endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP
290 --- indri-5.4/src/DocumentIteratorFactory.cpp Ät Ärc 4 15:24:24 2013
291 +++ indri-5.4/src/DocumentIteratorFactory.cpp Ät Ärc 4 15:23:27 2013
294 #include "indri/DocumentIteratorFactory.hpp"
296 -#include "indri/PDFDocumentExtractor.hpp"
297 #include "indri/TaggedDocumentIterator.hpp"
298 #include "indri/WARCDocumentIterator.hpp"
299 #include "indri/TextDocumentExtractor.hpp"
302 #define TYPE_TAGGED ( "Tagged Document Collection" )
303 #define TYPE_WARC ( "WARC Document Collection" )
304 -#define TYPE_PDF ( "Adobe PDF" )
305 #define TYPE_WORD ( "Microsoft Word" )
306 #define TYPE_PPT ( "Microsoft PowerPoint" )
307 #define TYPE_MBOX ( "Mailbox" )
310 } else if( preferred == TYPE_WARC ) {
311 result = new indri::parse::WARCDocumentIterator();
312 - } else if( preferred == TYPE_PDF ) {
313 - result = new indri::parse::PDFDocumentExtractor();
314 } else if( preferred == TYPE_TEXT ) {
315 result = new indri::parse::TextDocumentExtractor();
316 } else if( preferred == TYPE_MBOX ) {
319 } else if( type == "warc" || type == TYPE_WARC ) {
321 - } else if( type == "pdf" || type == "adobe pdf" || type == TYPE_PDF ) {
323 } else if( type == "doc" || type == "msword" || type == "word" || type == "microsoft word" || type == TYPE_WORD ) {
325 } else if( type == "ppt" || type == "powerpoint" || type == "msppt" || type == "microsoft powerpoint" || type == TYPE_PPT ) {
326 --- indri-5.4/src/FileClassEnvironmentFactory.cpp Ät Ärc 4 15:33:56 2013
327 +++ indri-5.4/src/FileClassEnvironmentFactory.cpp Ät Ärc 4 15:33:20 2013
329 // case. Values specified here can be in mixed case, since values are
330 // matched in a case-sensitive manner.
332 -static const char* pdf_index_tags[] = { "title", "author", 0 };
333 -static const char* pdf_metadata_tags[] = { "title", "author", 0 };
334 static const char* html_index_tags[] = { "title", "author", "h1", "h2", "h3", "h4", 0 };
335 static const char* html_metadata_tags[] = { "title", "author", 0 };
336 //static const char* html_conflations[] = { "h1", NULL, NULL, "heading", "h2", NULL, NULL, "heading", "h3", NULL, NULL, "heading", "h4", NULL, NULL, "heading", "bloghpno", NULL, NULL, "docno", 0, 0, 0, 0 };
343 - "word", // tokenizer
345 - NULL, // startDocTag
347 - NULL, // endMetadataTag
348 - NULL, // includeTags
349 - NULL, // excludeTags
350 - pdf_index_tags, // indexTags
351 - pdf_metadata_tags, // metadataTags
352 - NULL // conflations