Fix bug in PL2+ implementation
[xapian.git] / xapian-applications / omega / utf8convert.cc
blob377b7463567a1a0666b84468ebcf39433acf0dcc
/** @file
 * @brief convert a string to UTF-8 encoding.
 */
/* Copyright (C) 2006,2007,2008,2010,2013,2017,2019,2021,2023 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
21 #include <config.h>
23 #include "utf8convert.h"
25 #include <algorithm>
26 #include <cerrno>
27 #include <string>
29 #ifdef HAVE_ICONV
30 # include <iconv.h>
31 #endif
32 #include <xapian.h>
33 #include "strcasecmp.h"
34 #include "stringutils.h"
36 using namespace std;
38 bool
39 convert_to_utf8_(string_view text, const string& charset, string& output)
41 // Shortcut if it's already in utf8!
42 if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0)
43 return false;
44 if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0)
45 return false;
46 if (charset.size() == 8 && strcasecmp(charset.c_str(), "us-ascii") == 0)
47 return false;
49 // Nobody has told us what charset it's in, so do as little work as
50 // possible!
51 if (charset.empty())
52 return false;
54 char buf[1024];
55 string tmp;
57 /* Handle iso-8859-1/iso-8859-15//windows-1252/cp-1252, utf-16/ucs-2,
58 * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
59 const char * p = charset.c_str();
61 bool utf16 = false;
62 if (strncasecmp(p, "utf", 3) == 0) {
63 p += 3;
64 if (*p == '-' || *p == '_' || *p == ' ') ++p;
65 if (*p != '1' || p[1] != '6') goto try_iconv;
66 p += 2;
67 utf16 = true;
68 } else if (strncasecmp(p, "ucs", 3) == 0) {
69 p += 3;
70 if (*p == '-' || *p == '_' || *p == ' ') ++p;
71 if (*p != '2') goto try_iconv;
72 ++p;
73 utf16 = true;
76 if (utf16) {
77 if (text.size() < 2) return false;
79 bool big_endian = true;
80 auto i = text.begin();
81 if (*p == '\0') {
82 // GNU iconv doesn't seem to handle BOMs.
83 if (startswith(text, "\xfe\xff")) {
84 i += 2;
85 } else if (startswith(text, "\xff\xfe")) {
86 big_endian = false;
87 i += 2;
89 // UTF-16 with no BOM is meant to be assumed to be BE. Strictly
90 // speaking, we're not meant to assume anything for UCS-2 with
91 // no BOM, but we've got to do something, so we might as well
92 // assume it's UTF-16 mislabelled, which is easy and sane.
93 } else if (strcasecmp(p, "LE") == 0) {
94 big_endian = false;
95 } else if (!(strcasecmp(p, "BE") == 0)) {
96 goto try_iconv;
99 tmp.reserve(text.size() / 2);
101 size_t start = 0;
102 auto text_end = text.end();
103 if (text.size() & 1) {
104 // If there's a half-character at the end, nuke it now to make the
105 // conversion loop below simpler.
106 --text_end;
109 while (i != text_end) {
110 unsigned ch = static_cast<unsigned char>(*i++);
111 unsigned ch2 = static_cast<unsigned char>(*i++);
112 if (big_endian) {
113 ch = (ch << 8) | ch2;
114 } else {
115 ch = (ch2 << 8) | ch;
117 if (ch >> 10 == 0xd800 >> 10) {
118 // Surrogate pair.
119 if (i == text_end) break;
120 unsigned hi = (ch & 0x3ff);
121 ch = static_cast<unsigned char>(*i++);
122 ch2 = static_cast<unsigned char>(*i++);
123 if (big_endian) {
124 ch = (ch << 8) | ch2;
125 } else {
126 ch = (ch2 << 8) | ch;
128 if (ch >> 10 == 0xdc00 >> 10) {
129 ch &= 0x3ff;
130 ch |= (hi << 10);
131 ch += 0x10000;
134 start += Xapian::Unicode::to_utf8(ch, buf + start);
135 if (start >= sizeof(buf) - 4) {
136 tmp.append(buf, start);
137 start = 0;
140 if (start) tmp.append(buf, start);
141 } else {
142 // Assume windows-1252 if iso-8859-1 is specified. The only
143 // differences are in the range 128-159 which are control characters in
144 // iso-8859-1, and a lot of content is mislabelled. We use our own
145 // conversion code for this case, as GNU iconv fails if it sees one of
146 // the unassigned code points in windows-1252, whereas it would accept
147 // the same input as iso-8859-1, and it seems undesirable to be
148 // rejecting input due to this behind-the-scenes character set
149 // shenanigans.
150 const char * q = NULL;
151 if (strncasecmp(p, "windows", 7) == 0) {
152 q = p + 7;
153 } else if (strncasecmp(p, "cp", 2) == 0) {
154 q = p + 2;
156 if (q) {
157 if (*q == '-' || *q == '_' || *q == ' ') ++q;
158 if (strcmp(q, "1252") != 0)
159 goto try_iconv;
160 } else {
161 if (strncasecmp(p, "iso", 3) == 0) {
162 p += 3;
163 if (*p == '-' || *p == '_' || *p == ' ') ++p;
165 if (strncmp(p, "8859", 4) != 0) goto try_iconv;
166 p += 4;
167 if (*p == '-' || *p == '_' || *p == ' ') ++p;
168 if (*p != '1') goto try_iconv;
169 if (strcmp(p + 1, "5") == 0) goto iso8859_15;
170 if (p[1] != '\0') goto try_iconv;
173 // FIXME: pull this out as a standard "normalise utf-8" function?
174 tmp.reserve(text.size());
176 size_t start = 0;
177 for (unsigned char ch : text) {
178 static const unsigned cp1252_to_unicode[32] = {
179 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
180 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
181 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
182 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
184 unsigned code_point = ch;
185 unsigned i = UNSIGNED_OVERFLOW_OK(code_point - 128);
186 if (i < std::size(cp1252_to_unicode))
187 code_point = cp1252_to_unicode[i];
188 start += Xapian::Unicode::to_utf8(code_point, buf + start);
189 if (start >= sizeof(buf) - 4) {
190 tmp.append(buf, start);
191 start = 0;
194 if (start) tmp.append(buf, start);
197 if (false) {
198 try_iconv:
199 #ifdef HAVE_ICONV
200 iconv_t conv = iconv_open("UTF-8", charset.c_str());
201 if (conv == reinterpret_cast<iconv_t>(-1))
202 return false;
203 ICONV_CONST char* in = const_cast<char *>(text.data());
204 size_t in_len = text.size();
205 while (in_len) {
206 char * out = buf;
207 size_t out_len = sizeof(buf);
208 if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) &&
209 errno != E2BIG) {
210 // FIXME: how to handle this?
211 break;
213 tmp.append(buf, out - buf);
216 (void)iconv_close(conv);
217 #else
218 return false;
219 #endif
222 if (false) {
223 iso8859_15:
224 tmp.reserve(text.size());
226 size_t start = 0;
227 for (unsigned char ch : text) {
228 static const unsigned iso8859_15_to_unicode[] = {
229 0x20ac, 0x00a5, 0x0160, 0x00a7, 0x0161, 0x00a9, 0x00aa, 0x00ab,
230 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
231 0x017d, 0x00b5, 0x00b6, 0x00b7, 0x017e, 0x00b9, 0x00ba, 0x00bb,
232 0x0152, 0x0153, 0x0178
234 unsigned code_point = ch;
235 unsigned i = UNSIGNED_OVERFLOW_OK(code_point - 164);
236 if (i < std::size(iso8859_15_to_unicode))
237 code_point = iso8859_15_to_unicode[i];
238 start += Xapian::Unicode::to_utf8(code_point, buf + start);
239 if (start >= sizeof(buf) - 4) {
240 tmp.append(buf, start);
241 start = 0;
244 if (start) tmp.append(buf, start);
247 // `output` may be a reference to the same string object as `text` so we
248 // only switch after we've done converting.
249 output = std::move(tmp);
250 return true;