Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / opendocparser.cc
blob8cf9d1fe59c3f7dc81e7837f6539fe535958f910
1 /** @file
2 * @brief Extract text from OpenDocument XML.
3 */
4 /* Copyright (C) 2012-2022 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "opendocparser.h"
25 #include <cstring>
27 #include "stringutils.h"
29 using namespace std;
31 bool
32 OpenDocParser::opening_tag(const string& tag)
34 if (startswith(tag, "text:")) {
35 const char* tail = tag.c_str() + 5;
36 if (strcmp(tail, "p") == 0 ||
37 strcmp(tail, "h") == 0 ||
38 strcmp(tail, "line-break") == 0 ||
39 strcmp(tail, "tab") == 0) {
40 pending_space = true;
42 } else if (tag == "office:body") {
43 indexing = true;
44 } else if (tag == "style:style") {
45 (void)get_attribute("style:master-page-name", master_page_name);
46 } else if (tag == "style:master-page") {
47 string n;
48 if (get_attribute("style:name", n) && n == master_page_name)
49 indexing = true;
51 return true;
54 bool
55 OpenDocParser::closing_tag(const string& tag)
57 if (!indexing)
58 return true;
60 if (tag == "text:p" || tag == "text:h") {
61 pending_space = true;
62 } else if (tag == "office:body" || tag == "style:style") {
63 indexing = false;
65 return true;
68 void
69 OpenDocParser::process_content(const string& content)
71 if (indexing && !content.empty()) {
72 if (pending_space) {
73 pending_space = false;
74 if (!content.empty()) dump += ' ';
76 dump += content;