Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / handler_tesseract.cc
blob89fe9e8686ced8cda204e37eef5fec09f21df3fa
1 /** @file
2 * @brief Extract text from Images using tesseract.
3 */
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2022,2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
22 #include <config.h>
23 #include "handler.h"
25 // Workaround stupidity in tesseract headers.
26 #undef HAVE_CONFIG_H
27 #include <tesseract/baseapi.h>
28 #include <leptonica/allheaders.h>
30 #include "safesysexits.h"
32 using namespace std;
33 using namespace tesseract;
35 static TessBaseAPI* ocr;
37 bool
38 initialise()
40 ocr = new TessBaseAPI();
41 ocr->SetPageSegMode(PSM_AUTO_OSD);
42 return true;
45 void
46 extract(const string& filename, const string&)
48 // Call Init() for each document so any adaptive state is reset as
49 // we don't want the order of indexing documents to affect the text
50 // indexed for each document.
52 // Tesseract documents that passing nullptr for the second parameter
53 // here is the same as "eng", but that fails to work on macos (tested
54 // with the homebrew tesseract v5.1.0).
56 // FIXME: We ought to provide a way to allow the language to use here
57 // to be specified.
58 if (ocr->Init(nullptr, "eng"))
59 _Exit(EX_UNAVAILABLE);
61 // Open Image
62 Pix* image = pixRead(filename.c_str());
63 if (!image) {
64 send_field(FIELD_ERROR, "pixRead() failed to load image");
65 return;
68 ocr->SetImage(image);
70 // Get OCR result
71 const char* text = ocr->GetUTF8Text();
72 send_field(FIELD_BODY, text);
73 delete[] text;
75 // Release memory.
76 ocr->Clear();
77 pixDestroy(&image);