xapian-applications/omega/handler_tesseract.cc

   1 /** @file
   2  * @brief Extract text from Images using tesseract.
   3  */
   4 /* Copyright (C) 2019 Bruno Baruffaldi
   5  * Copyright (C) 2022,2023 Olly Betts
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  20  * USA
  21  */
  22 #include <config.h>
  23 #include "handler.h"
  24
  25 // Workaround stupidity in tesseract headers.
  26 #undef HAVE_CONFIG_H
  27 #include <tesseract/baseapi.h>
  28 #include <leptonica/allheaders.h>
  29
  30 #include "safesysexits.h"
  31
  32 using namespace std;
  33 using namespace tesseract;
  34
  35 static TessBaseAPI* ocr;
  36
  37 bool
  38 initialise()
  39 {
  40     ocr = new TessBaseAPI();
  41     ocr->SetPageSegMode(PSM_AUTO_OSD);
  42     return true;
  43 }
  44
  45 void
  46 extract(const string& filename, const string&)
  47 {
  48     // Call Init() for each document so any adaptive state is reset as
  49     // we don't want the order of indexing documents to affect the text
  50     // indexed for each document.
  51     //
  52     // Tesseract documents that passing nullptr for the second parameter
  53     // here is the same as "eng", but that fails to work on macos (tested
  54     // with the homebrew tesseract v5.1.0).
  55     //
  56     // FIXME: We ought to provide a way to allow the language to use here
  57     // to be specified.
  58     if (ocr->Init(nullptr, "eng"))
  59         _Exit(EX_UNAVAILABLE);
  60
  61     // Open Image
  62     Pix* image = pixRead(filename.c_str());
  63     if (!image) {
  64         send_field(FIELD_ERROR, "pixRead() failed to load image");
  65         return;
  66     }
  67
  68     ocr->SetImage(image);
  69
  70     // Get OCR result
  71     const char* text = ocr->GetUTF8Text();
  72     send_field(FIELD_BODY, text);
  73     delete[] text;
  74
  75     // Release memory.
  76     ocr->Clear();
  77     pixDestroy(&image);
  78 }