Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / opendocmetaparser.cc
blob3e956e0a9bd3ce1f6881d59fc6c58673bb75ff93
1 /** @file
2 * @brief Parser for OpenDocument's meta.xml.
4 * Also used for MSXML's docProps/core.xml.
5 */
6 /* Copyright (C) 2006,2009,2010,2011,2013,2015,2020,2022 Olly Betts
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <config.h>
25 #include "opendocmetaparser.h"
27 #include "datetime.h"
28 #include "parseint.h"
30 using namespace std;
32 void
33 OpenDocMetaParser::process_content(const string& content)
35 switch (field) {
36 case KEYWORDS:
37 if (!keywords.empty()) keywords += ' ';
38 keywords += content;
39 break;
40 case TITLE:
41 if (!title.empty()) title += ' ';
42 title += content;
43 break;
44 case SAMPLE:
45 if (!sample.empty()) sample += ' ';
46 sample += content;
47 break;
48 case AUTHOR:
49 if (!author.empty()) author += ' ';
50 author += content;
51 break;
52 case CREATED: {
53 // E.g. 2013-03-04T22:57:00
54 created = parse_datetime(content);
55 break;
57 case NONE:
58 // Ignore other fields.
59 break;
63 bool
64 OpenDocMetaParser::opening_tag(const string& tag)
66 if (tag.size() < 8) return true;
67 if (tag[0] == 'd' && tag[1] == 'c') {
68 if (tag == "dc:subject") {
69 // OpenDocument, MSXML.
71 // dc:subject is "Subject and Keywords":
72 // "Typically, Subject will be expressed as keywords, key phrases
73 // or classification codes that describe a topic of the resource."
74 // OpenOffice uses meta:keywords for keywords - dc:subject
75 // comes from a text field labelled "Subject". Let's just treat
76 // it as more keywords.
77 field = KEYWORDS;
78 } else if (tag == "dc:title") {
79 // OpenDocument, MSXML.
80 field = TITLE;
81 } else if (tag == "dc:description") {
82 // OpenDocument, MSXML.
83 field = SAMPLE;
84 } else if (tag == "dc:creator") {
85 // OpenDocument, MSXML.
86 field = AUTHOR;
87 } else if (tag == "dcterms:created") {
88 // MSXML.
89 field = CREATED;
91 } else if (tag[0] == 'm') {
92 if (tag == "meta:keyword") {
93 // OpenDocument.
95 // e.g.:
96 // <meta:keywords>
97 // <meta:keyword>information retrieval</meta:keyword>
98 // </meta:keywords>
99 field = KEYWORDS;
100 } else if (tag == "meta:creation-date") {
101 // OpenDocument.
102 field = CREATED;
103 } else if (tag == "meta:document-statistic") {
104 // OpenDocument:
106 // The values we want for the page count are to be found as
107 // attributes of the meta:document-statistic tag (which occurs
108 // inside <office:meta> but we don't bother to check that).
110 // For text documents, we want the meta:page-count attribute.
112 // For spreadsheets, meta:table-count seems to give the sheet count
113 // (text documents also have meta:table-count so we check for this
114 // after meta:page-count).
115 string value;
116 if (get_attribute("meta:page-count", value) ||
117 get_attribute("meta:table-count", value)) {
118 unsigned u_pages;
119 if (parse_unsigned(value.c_str(), u_pages))
120 pages = int(u_pages);
123 } else if (tag[0] == 'c' && tag[1] == 'p') {
124 if (tag == "cp:keywords") {
125 // MSXML.
126 field = KEYWORDS;
129 return true;
132 bool
133 OpenDocMetaParser::closing_tag(const string&)
135 field = NONE;
136 return true;