omegatest: Use test_scriptindex more
[xapian.git] / xapian-applications / omega / utf8convert.cc
blob9e59de60a37b1edc97c02c7392167ac01b775811
/** @file
 * @brief Convert a string to UTF-8 encoding.
 */
/* Copyright (C) 2006,2007,2008,2010,2013,2017,2019,2021 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
21 #include <config.h>
23 #include "utf8convert.h"
25 #include <algorithm>
26 #include <cerrno>
27 #include <string>
29 #ifdef HAVE_ICONV
30 # include <iconv.h>
31 #endif
32 #include <xapian.h>
33 #include "strcasecmp.h"
34 #include "stringutils.h"
36 using namespace std;
38 bool
39 convert_to_utf8_(const string& text, const string& charset, string& output)
41 // Shortcut if it's already in utf8!
42 if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0)
43 return false;
44 if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0)
45 return false;
46 if (charset.size() == 8 && strcasecmp(charset.c_str(), "us-ascii") == 0)
47 return false;
49 // Nobody has told us what charset it's in, so do as little work as
50 // possible!
51 if (charset.empty())
52 return false;
54 char buf[1024];
55 string tmp;
57 /* Handle iso-8859-1/iso-8859-15//windows-1252/cp-1252, utf-16/ucs-2,
58 * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
59 const char * p = charset.c_str();
61 bool utf16 = false;
62 if (strncasecmp(p, "utf", 3) == 0) {
63 p += 3;
64 if (*p == '-' || *p == '_' || *p == ' ') ++p;
65 if (*p != '1' || p[1] != '6') goto try_iconv;
66 p += 2;
67 utf16 = true;
68 } else if (strncasecmp(p, "ucs", 3) == 0) {
69 p += 3;
70 if (*p == '-' || *p == '_' || *p == ' ') ++p;
71 if (*p != '2') goto try_iconv;
72 ++p;
73 utf16 = true;
76 if (utf16) {
77 if (text.size() < 2) return false;
79 bool big_endian = true;
80 string::const_iterator i = text.begin();
81 if (*p == '\0') {
82 // GNU iconv doesn't seem to handle BOMs.
83 if (startswith(text, "\xfe\xff")) {
84 i += 2;
85 } else if (startswith(text, "\xff\xfe")) {
86 big_endian = false;
87 i += 2;
89 // UTF-16 with no BOM is meant to be assumed to be BE. Strictly
90 // speaking, we're not meant to assume anything for UCS-2 with
91 // no BOM, but we've got to do something, so we might as well
92 // assume it's UTF-16 mislabelled, which is easy and sane.
93 } else if (strcasecmp(p, "LE") == 0) {
94 big_endian = false;
95 } else if (!(strcasecmp(p, "BE") == 0)) {
96 goto try_iconv;
99 tmp.reserve(text.size() / 2);
101 size_t start = 0;
102 auto text_end = text.end();
103 if (text.size() & 1) {
104 // If there's a half-character at the end, nuke it now to make the
105 // conversion loop below simpler.
106 --text_end;
109 while (i != text_end) {
110 unsigned ch = static_cast<unsigned char>(*i++);
111 unsigned ch2 = static_cast<unsigned char>(*i++);
112 if (big_endian) {
113 ch = (ch << 8) | ch2;
114 } else {
115 ch = (ch2 << 8) | ch;
117 if (ch >> 10 == 0xd800 >> 10) {
118 // Surrogate pair.
119 if (i == text_end) break;
120 unsigned hi = (ch & 0x3ff);
121 ch = static_cast<unsigned char>(*i++);
122 ch2 = static_cast<unsigned char>(*i++);
123 if (big_endian) {
124 ch = (ch << 8) | ch2;
125 } else {
126 ch = (ch2 << 8) | ch;
128 if (ch >> 10 == 0xdc00 >> 10) {
129 ch &= 0x3ff;
130 ch |= (hi << 10);
131 ch += 0x10000;
134 start += Xapian::Unicode::to_utf8(ch, buf + start);
135 if (start >= sizeof(buf) - 4) {
136 tmp.append(buf, start);
137 start = 0;
140 if (start) tmp.append(buf, start);
141 } else {
142 // Assume windows-1252 if iso-8859-1 is specified. The only
143 // differences are in the range 128-159 which are control characters in
144 // iso-8859-1, and a lot of content is mislabelled. We use our own
145 // conversion code for this case, as GNU iconv fails if it sees one of
146 // the unassigned code points in windows-1252, whereas it would accept
147 // the same input as iso-8859-1, and it seems undesirable to be
148 // rejecting input due to this behind-the-scenes character set
149 // shenanigans.
150 const char * q = NULL;
151 if (strncasecmp(p, "windows", 7) == 0) {
152 q = p + 7;
153 } else if (strncasecmp(p, "cp", 2) == 0) {
154 q = p + 2;
156 if (q) {
157 if (*q == '-' || *q == '_' || *q == ' ') ++q;
158 if (strcmp(q, "1252") != 0)
159 goto try_iconv;
160 } else {
161 if (strncasecmp(p, "iso", 3) == 0) {
162 p += 3;
163 if (*p == '-' || *p == '_' || *p == ' ') ++p;
165 if (strncmp(p, "8859", 4) != 0) goto try_iconv;
166 p += 4;
167 if (*p == '-' || *p == '_' || *p == ' ') ++p;
168 if (*p != '1') goto try_iconv;
169 if (strcmp(p + 1, "5") == 0) goto iso8859_15;
170 if (p[1] != '\0') goto try_iconv;
173 // FIXME: pull this out as a standard "normalise utf-8" function?
174 tmp.reserve(text.size());
176 size_t start = 0;
177 for (string::const_iterator i = text.begin(); i != text.end(); ++i) {
178 static const unsigned cp1252_to_unicode[32] = {
179 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
180 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
181 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
182 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
184 const size_t CP1252_TO_UNICODE_ENTRIES =
185 sizeof(cp1252_to_unicode) / sizeof(*cp1252_to_unicode);
186 unsigned ch = static_cast<unsigned char>(*i);
187 if (ch - 128 < CP1252_TO_UNICODE_ENTRIES)
188 ch = cp1252_to_unicode[ch - 128];
189 start += Xapian::Unicode::to_utf8(ch, buf + start);
190 if (start >= sizeof(buf) - 4) {
191 tmp.append(buf, start);
192 start = 0;
195 if (start) tmp.append(buf, start);
198 if (false) {
199 try_iconv:
200 #ifdef HAVE_ICONV
201 iconv_t conv = iconv_open("UTF-8", charset.c_str());
202 if (conv == reinterpret_cast<iconv_t>(-1))
203 return false;
204 ICONV_CONST char* in = const_cast<char *>(text.c_str());
205 size_t in_len = text.size();
206 while (in_len) {
207 char * out = buf;
208 size_t out_len = sizeof(buf);
209 if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) &&
210 errno != E2BIG) {
211 // FIXME: how to handle this?
212 break;
214 tmp.append(buf, out - buf);
217 (void)iconv_close(conv);
218 #else
219 return false;
220 #endif
223 if (false) {
224 iso8859_15:
225 tmp.reserve(text.size());
227 size_t start = 0;
228 for (string::const_iterator i = text.begin(); i != text.end(); ++i) {
229 static const unsigned iso8859_15_to_unicode[] = {
230 0x20ac, 0x00a5, 0x0160, 0x00a7, 0x0161, 0x00a9, 0x00aa, 0x00ab,
231 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
232 0x017d, 0x00b5, 0x00b6, 0x00b7, 0x017e, 0x00b9, 0x00ba, 0x00bb,
233 0x0152, 0x0153, 0x0178
235 const size_t ISO8859_15_TO_UNICODE_ENTRIES =
236 sizeof(iso8859_15_to_unicode) / sizeof(*iso8859_15_to_unicode);
237 unsigned ch = static_cast<unsigned char>(*i);
238 if (ch - 164 < ISO8859_15_TO_UNICODE_ENTRIES)
239 ch = iso8859_15_to_unicode[ch - 164];
240 start += Xapian::Unicode::to_utf8(ch, buf + start);
241 if (start >= sizeof(buf) - 4) {
242 tmp.append(buf, start);
243 start = 0;
246 if (start) tmp.append(buf, start);
249 // `output` may be a reference to the same string object as `text` so we
250 // only switch after we've done converting.
251 output = std::move(tmp);
252 return true;