components/library/indri/patches/remove_xpdf.patch

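   1 Remove xpdf support from the build: drop the xpdf dependency, delete the PDFDocumentExtractor sources, and unregister the PDF document type.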
   2 --- indri-5.4/MakeDefns.in      Čt črc  4 15:01:17 2013
   3 +++ indri-5.4/MakeDefns.in      Čt črc  4 15:00:40 2013
   4 @@ -48,7 +48,7 @@
   5  PHPINCLUDE = @PHPINCLUDE@
   6  MCS=@MCS@
   7
   8 -DEPENDENCIES = lemur xpdf
   9 +DEPENDENCIES = lemur
  10  ifeq ($(NEED_ANTLR), 1)
  11    DEPENDENCIES += antlr
  12  endif
  13 --- indri-5.4/src/PDFDocumentExtractor.cpp      Čt črc  4 15:08:46 2013
  14 +++ indri-5.4/src/PDFDocumentExtractor.cpp      Čt črc  4 15:08:28 2013
  15 @@ -1,214 +1,0 @@
  16 -/*==========================================================================
  17 - * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
  18 - *
  19 - * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
  20 - * is subject to the terms of the software license set forth in the LICENSE
  21 - * file included with this software, and also available at
  22 - * http://www.lemurproject.org/license.html
  23 - *
  24 - *==========================================================================
  25 -*/
  26 -
  27 -
  28 -//
  29 -// PDFDocumentExtractor
  30 -//
  31 -// 25 June 2004 -- tds
  32 -//
  33 -
  34 -#include "indri/PDFDocumentExtractor.hpp"
  35 -#include "indri/Buffer.hpp"
  36 -
  37 -#include "GString.h"
  38 -#include "TextOutputDev.h"
  39 -#include "PDFDoc.h"
  40 -
  41 -#include "Object.h"
  42 -#include "Stream.h"
  43 -#include "Array.h"
  44 -#include "Dict.h"
  45 -#include "XRef.h"
  46 -#include "Page.h"
  47 -#include "CharTypes.h"
  48 -#include "GlobalParams.h"
  49 -#include "lemur/Exception.hpp"
  50 -
  51 -static void buffer_write( void* stream, char* text, int len ) {
  52 -  indri::utility::Buffer* buffer = (indri::utility::Buffer*) stream;
  53 -
  54 -  if( buffer->position() ) {
  55 -    buffer->unwrite(1);
  56 -  }
  57 -
  58 -  memcpy( buffer->write(len), text, len );
  59 -  if( text[len-1] != 0 )
  60 -    *buffer->write(1) = 0;
  61 -}
  62 -
  63 -indri::parse::PDFDocumentExtractor::PDFDocumentExtractor() {
  64 -  globalParams = new GlobalParams(0);
  65 -  _title="";
  66 -  _author="";
  67 -}
  68 -
  69 -indri::parse::PDFDocumentExtractor::~PDFDocumentExtractor() {
  70 -  delete globalParams;
  71 -  globalParams = 0;
  72 -}
  73 -
  74 -
  75 -void indri::parse::PDFDocumentExtractor::seekValue(indri::xml::XMLNode* node, std::string &metaTag) {
  76 -  if (node == NULL) {
  77 -    return;
  78 -  }
  79 -
  80 -  const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
  81 -  for( size_t i=0; i<children.size(); i++ ) {
  82 -    indri::xml::XMLNode* child = children[i];
  83 -    metaTag = child->getValue();
  84 -       if(metaTag.length()==0)
  85 -               seekValue(child,metaTag);
  86 -       else
  87 -               return;
  88 -  }
  89 -
  90 -}
  91 -
  92 -void indri::parse::PDFDocumentExtractor::appendPdfMetaData(indri::xml::XMLNode* node) {
  93 -  indri::xml::XMLNode* current = 0;
  94 -
  95 -  if (node == NULL) {
  96 -    return;
  97 -  }
  98 -
  99 -  const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
 100 -
 101 -  for( size_t i=0; i<children.size(); i++ ) {
 102 -    indri::xml::XMLNode* child = children[i];
 103 -    std::string name = child->getName();
 104 -       if(name=="dccreator")
 105 -       {
 106 -               seekValue(child,_author);
 107 -       }
 108 -       if(name=="dctitle")
 109 -       {
 110 -               seekValue(child,_title);
 111 -       }
 112 -       appendPdfMetaData(child);
 113 -
 114 -  }
 115 -
 116 -
 117 -
 118 -}
 119 -
 120 -void indri::parse::PDFDocumentExtractor::open( const std::string& filename ) {
 121 -  _documentTextBuffer.clear();
 122 -  _documentPath = filename;
 123 -}
 124 -
 125 -void indri::parse::PDFDocumentExtractor::close() {
 126 -  _documentPath = "";
 127 -}
 128 -
 129 -indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
 130 -  if( !_documentPath.length() )
 131 -    return 0;
 132 -
 133 -  PDFDoc* doc = 0;
 134 -  TextOutputDev* textOut = 0;
 135 -  GString* gfilename = new GString(_documentPath.c_str());
 136 -  doc = new PDFDoc( gfilename );
 137 -  // if the doc is not ok, or ok to copy, it
 138 -  // will be a document of length 0.
 139 -  if( doc->isOk() && doc->okToCopy() ) {
 140 -    void* stream = &_documentTextBuffer;
 141 -    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
 142 -    if ( textOut->isOk() ) {
 143 -      int firstPage = 1;
 144 -      int lastPage = doc->getNumPages();
 145 -         double hDPI=72.0;
 146 -         double vDPI=72.0;
 147 -         int rotate=0;
 148 -         GBool useMediaBox=gFalse;
 149 -         GBool crop=gTrue;
 150 -         GBool printing=gFalse;
 151 -         if(doc->readMetadata()!=NULL)
 152 -         {
 153 -                 GString rawMetaData = doc->readMetadata();
 154 -                 GString preparedMetaData="";
 155 -
 156 -                 //zoek <rdf:RDF  en eindig bij </rdf:RDF>!!
 157 -                 for(int x=0; x<rawMetaData.getLength(); x++) {
 158 -                         if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
 159 -                                 //skip characters which the XMLReader doesn't understand
 160 -                                 preparedMetaData.append(rawMetaData.getChar(x));
 161 -                         }
 162 -                 }
 163 -                 std::string metaData(preparedMetaData.getCString());
 164 -                 int startbegin = metaData.find("<rdf");
 165 -                 int stopend = metaData.find(">", metaData.rfind("</rdf") );
 166 -                 metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
 167 -
 168 -
 169 -         indri::xml::XMLReader reader;
 170 -
 171 -                 try {
 172 -                         std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
 173 -                         appendPdfMetaData( result.get() );
 174 -                 } catch( lemur::api::Exception& e ) {
 175 -                       LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
 176 -                 }
 177 -                 if( _author.length()>0 || _title.length()>0 )
 178 -                 {
 179 -                       std::string createdPdfHeader;
 180 -                       createdPdfHeader="<head>\n";
 181 -                       if(_title.length()>0) {
 182 -                               createdPdfHeader+="<title>";
 183 -                               createdPdfHeader+=_title;
 184 -                               createdPdfHeader+="</title>\n";
 185 -                       }
 186 -                       if(_author.length()>0) {
 187 -                               createdPdfHeader+="<author>";
 188 -                               createdPdfHeader+=_author;
 189 -                               createdPdfHeader+="</author>\n";
 190 -                       }
 191 -                       createdPdfHeader+="</head>\n";
 192 -                       char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
 193 -                       strcpy(metastream, createdPdfHeader.c_str());
 194 -                 }
 195 -         }
 196 -      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
 197 -    }
 198 -  }
 199 -
 200 -
 201 -  delete textOut;
 202 -  delete doc;
 203 -
 204 -  _unparsedDocument.textLength = _documentTextBuffer.position();
 205 -  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
 206 -  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
 207 -  strcpy( docnoPoint, _documentPath.c_str() );
 208 -  _unparsedDocument.text = _documentTextBuffer.front();
 209 -  _unparsedDocument.content = _documentTextBuffer.front();
 210 -  _unparsedDocument.metadata.clear();
 211 -
 212 -  indri::parse::MetadataPair pair;
 213 -
 214 -  pair.key = "path";
 215 -  pair.value = docnoPoint;
 216 -  pair.valueLength = _documentPath.length()+1;
 217 -  _unparsedDocument.metadata.push_back( pair );
 218 -
 219 -  _docnostring.assign(_documentPath.c_str() );
 220 -  cleanDocno();
 221 -  pair.value = _docnostring.c_str();
 222 -  pair.valueLength = _docnostring.length()+1;
 223 -  pair.key = "docno";
 224 -  _unparsedDocument.metadata.push_back( pair );
 225 -
 226 -  _documentPath = "";
 227 -
 228 -  return &_unparsedDocument;
 229 -}
 230 --- indri-5.4/include/indri/PDFDocumentExtractor.hpp    Čt črc  4 15:16:04 2013
 231 +++ indri-5.4/include/indri/PDFDocumentExtractor.hpp    Čt črc  4 15:15:00 2013
 232 @@ -1,57 +1,0 @@
 233 -/*==========================================================================
 234 - * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
 235 - *
 236 - * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 237 - * is subject to the terms of the software license set forth in the LICENSE
 238 - * file included with this software, and also available at
 239 - * http://www.lemurproject.org/license.html
 240 - *
 241 - *==========================================================================
 242 - */
 243 -
 244 -
 245 -//
 246 -// PDFDocumentExtractor
 247 -//
 248 -// 25 June 2004 -- tds
 249 -//
 250 -
 251 -#ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP
 252 -#define INDRI_PDFDOCUMENTEXTRACTOR_HPP
 253 -
 254 -#include "lemur/lemur-compat.hpp"
 255 -#include "indri/Buffer.hpp"
 256 -#include "indri/UnparsedDocument.hpp"
 257 -#include "indri/DocumentIterator.hpp"
 258 -#include "indri/XMLReader.hpp"
 259 -#include "indri/XMLNode.hpp"
 260 -#include "indri/XMLWriter.hpp"
 261 -#include <string>
 262 -namespace indri
 263 -{
 264 -  namespace parse
 265 -  {
 266 -
 267 -    class PDFDocumentExtractor : public DocumentIterator {
 268 -      indri::utility::Buffer _documentTextBuffer;
 269 -      UnparsedDocument _unparsedDocument;
 270 -      std::string _documentPath;
 271 -
 272 -    public:
 273 -      PDFDocumentExtractor();
 274 -      ~PDFDocumentExtractor();
 275 -
 276 -      void open( const std::string& filename );
 277 -      UnparsedDocument* nextDocument();
 278 -         void appendPdfMetaData(indri::xml::XMLNode* node);
 279 -         void seekValue(indri::xml::XMLNode* node, std::string &metaTag);
 280 -      void close();
 281 -       private:
 282 -         std::string _title;
 283 -         std::string _author;
 284 -
 285 -    };
 286 -  }
 287 -}
 288 -
 289 -#endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP
 290 --- indri-5.4/src/DocumentIteratorFactory.cpp   Čt črc  4 15:24:24 2013
 291 +++ indri-5.4/src/DocumentIteratorFactory.cpp   Čt črc  4 15:23:27 2013
 292 @@ -18,7 +18,6 @@
 293
 294  #include "indri/DocumentIteratorFactory.hpp"
 295
 296 -#include "indri/PDFDocumentExtractor.hpp"
 297  #include "indri/TaggedDocumentIterator.hpp"
 298  #include "indri/WARCDocumentIterator.hpp"
 299  #include "indri/TextDocumentExtractor.hpp"
 300 @@ -36,7 +35,6 @@
 301
 302  #define TYPE_TAGGED   ( "Tagged Document Collection" )
 303  #define TYPE_WARC     ( "WARC Document Collection" )
 304 -#define TYPE_PDF      ( "Adobe PDF" )
 305  #define TYPE_WORD     ( "Microsoft Word" )
 306  #define TYPE_PPT      ( "Microsoft PowerPoint" )
 307  #define TYPE_MBOX     ( "Mailbox" )
 308 @@ -53,8 +51,6 @@
 309      result = iter;
 310    } else if( preferred == TYPE_WARC ) {
 311      result = new indri::parse::WARCDocumentIterator();
 312 -  } else if( preferred == TYPE_PDF ) {
 313 -    result = new indri::parse::PDFDocumentExtractor();
 314    } else if( preferred == TYPE_TEXT ) {
 315      result = new indri::parse::TextDocumentExtractor();
 316    } else if( preferred == TYPE_MBOX ) {
 317 @@ -83,8 +79,6 @@
 318      return TYPE_TAGGED;
 319    } else if( type == "warc" || type == TYPE_WARC ) {
 320      return TYPE_WARC;
 321 -  } else if( type == "pdf" || type == "adobe pdf" || type == TYPE_PDF ) {
 322 -    return TYPE_PDF;
 323    } else if( type == "doc" || type == "msword" || type == "word" || type == "microsoft word" || type == TYPE_WORD ) {
 324      return TYPE_WORD;
 325    } else if( type == "ppt" || type == "powerpoint" || type == "msppt" || type == "microsoft powerpoint" || type == TYPE_PPT ) {
 326 --- indri-5.4/src/FileClassEnvironmentFactory.cpp       Čt črc  4 15:33:56 2013
 327 +++ indri-5.4/src/FileClassEnvironmentFactory.cpp       Čt črc  4 15:33:20 2013
 328 @@ -55,8 +55,6 @@
 329  // case.  Values specified here can be in mixed case, since values are
 330  // matched in a case-sensitive manner.
 331
 332 -static const char* pdf_index_tags[] = { "title", "author", 0 };
 333 -static const char* pdf_metadata_tags[] = { "title", "author", 0 };
 334  static const char* html_index_tags[] = { "title", "author", "h1", "h2", "h3", "h4", 0 };
 335  static const char* html_metadata_tags[] = { "title", "author", 0 };
 336  //static const char* html_conflations[] = { "h1", NULL, NULL, "heading", "h2", NULL, NULL, "heading", "h3", NULL, NULL, "heading", "h4", NULL, NULL, "heading", "bloghpno", NULL, NULL, "docno", 0, 0, 0, 0 };
 337 @@ -279,21 +277,6 @@
 338  #endif
 339
 340    {
 341 -    "pdf",                // name
 342 -    "html",               // parser
 343 -    "word",               // tokenizer
 344 -    "pdf",                // iterator
 345 -    NULL,                 // startDocTag
 346 -    NULL,                 // endDocTag
 347 -    NULL,                 // endMetadataTag
 348 -    NULL,                 // includeTags
 349 -    NULL,                 // excludeTags
 350 -    pdf_index_tags,       // indexTags
 351 -    pdf_metadata_tags,    // metadataTags
 352 -    NULL                  // conflations
 353 -  },
 354 -
 355 -  {
 356      "txt",                // name
 357      "text",               // parser
 358      "word",               // tokenizer