Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / xlsxparser.cc
blob360e540c20798f27afd5d72e272e1a5d0454a98e
1 /** @file
2 * @brief Extract fields from XLSX sheet*.xml.
3 */
4 /* Copyright (C) 2012,2013,2021 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "xlsxparser.h"
25 #include <cstdlib>
26 #include <cstring>
27 #include <ctime>
29 using namespace std;
31 bool
32 XlsxParser::opening_tag(const string &tag)
34 if (tag == "c") {
35 // We need to distinguish <v> tags which are inside <c t="s">, as these
36 // are numeric references to shared strings.
37 string type;
38 if (get_attribute("t", type) && type == "s") {
39 mode = MODE_C_STRING;
40 } else {
41 mode = MODE_C_LITERAL;
42 if (get_attribute("s", type)) {
43 unsigned long style_id = strtoul(type.c_str(), NULL, 10);
44 if (date_style.find(style_id) != date_style.end()) {
45 mode = MODE_C_DATE;
49 } else if (tag == "v") {
50 if (mode == MODE_C_LITERAL) {
51 mode = MODE_V_LITERAL;
52 } else if (mode == MODE_C_STRING) {
53 mode = MODE_V_STRING;
54 } else if (mode == MODE_C_DATE) {
55 mode = MODE_V_DATE;
57 } else if (tag == "si") {
58 mode = MODE_SI;
59 } else if (tag == "sst") {
60 string unique_count;
61 if (get_attribute("uniqueCount", unique_count)) {
62 unsigned long c = strtoul(unique_count.c_str(), NULL, 10);
63 // This reserving is just a performance tweak, so don't go
64 // reserving ludicrous amounts of space just because an XML
65 // attribute told us to.
66 sst.reserve(std::min(c, 1000000ul));
68 } else if (tag == "workbookPr") {
69 string v;
70 if (get_attribute("date1904", v)) {
71 date1904 = (v == "true" || v == "1");
73 } else if (tag == "numFmt") {
74 string formatcode;
75 if (get_attribute("formatCode", formatcode)) {
76 // Heuristic for "date format" (FIXME: implement properly)
77 if (strchr(formatcode.c_str(), 'd') &&
78 strchr(formatcode.c_str(), 'm') &&
79 strchr(formatcode.c_str(), 'y')) {
80 string v;
81 if (get_attribute("numFmtId", v)) {
82 unsigned long id = strtoul(v.c_str(), NULL, 10);
83 date_format.insert(id);
87 } else if (tag == "cellXfs") {
88 mode = MODE_CELLXFS;
89 } else if (tag == "xf") {
90 if (mode == MODE_CELLXFS) {
91 string v;
92 if (get_attribute("numFmtId", v)) {
93 unsigned long id = strtoul(v.c_str(), NULL, 10);
94 if ((id >= 14 && id <= 17) ||
95 date_format.find(id) != date_format.end()) {
96 date_style.insert(style_index);
99 ++style_index;
102 return true;
105 void
106 XlsxParser::process_content(const string& content)
108 switch (mode) {
109 case MODE_V_DATE: {
110 // Date field.
111 unsigned long c = strtoul(content.c_str(), NULL, 10);
112 if (date1904) {
113 c -= 24107;
114 } else {
115 // The spec insists we treat 1900 as a leap year!
116 if (c > 60) --c;
117 c -= 25568;
119 time_t t = c * 86400 + 43200;
120 struct tm * tm = gmtime(&t);
121 if (tm) {
122 char buf[32];
123 size_t res = strftime(buf, sizeof(buf), "%Y-%m-%d", tm);
124 if (res)
125 append_field(string(buf, res));
127 mode = MODE_NONE;
128 return;
130 case MODE_V_STRING: {
131 // Shared string use.
132 unsigned long c = strtoul(content.c_str(), NULL, 10);
133 if (c < sst.size()) {
134 append_field(sst[c]);
136 mode = MODE_NONE;
137 return;
139 case MODE_V_LITERAL:
140 // Literal (possibly calculated) field value.
141 append_field(content);
142 mode = MODE_NONE;
143 return;
144 case MODE_SI:
145 // Shared string definition.
146 sst.push_back(content);
147 mode = MODE_NONE;
148 return;
149 default:
150 return;