2 * @brief Convert a string to UTF-8 encoding.
4 /* Copyright (C) 2006,2007,2008,2010,2013,2017,2019,2021 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "utf8convert.h"
33 #include "strcasecmp.h"
34 #include "stringutils.h"
39 convert_to_utf8_(const string
& text
, const string
& charset
, string
& output
)
41 // Shortcut if it's already in utf8!
42 if (charset
.size() == 5 && strcasecmp(charset
.c_str(), "utf-8") == 0)
44 if (charset
.size() == 4 && strcasecmp(charset
.c_str(), "utf8") == 0)
46 if (charset
.size() == 8 && strcasecmp(charset
.c_str(), "us-ascii") == 0)
49 // Nobody has told us what charset it's in, so do as little work as possible.
57 /* Handle iso-8859-1/iso-8859-15/windows-1252/cp-1252, utf-16/ucs-2,
58 * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
59 const char * p
= charset
.c_str();
62 if (strncasecmp(p
, "utf", 3) == 0) {
64 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
65 if (*p
!= '1' || p
[1] != '6') goto try_iconv
;
68 } else if (strncasecmp(p
, "ucs", 3) == 0) {
70 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
71 if (*p
!= '2') goto try_iconv
;
77 if (text
.size() < 2) return false;
79 bool big_endian
= true;
80 string::const_iterator i
= text
.begin();
82 // GNU iconv doesn't seem to handle BOMs.
83 if (startswith(text
, "\xfe\xff")) {
85 } else if (startswith(text
, "\xff\xfe")) {
89 // UTF-16 with no BOM is meant to be assumed to be BE. Strictly
90 // speaking, we're not meant to assume anything for UCS-2 with
91 // no BOM, but we've got to do something, so we might as well
92 // assume it's UTF-16 mislabelled, which is easy and sane.
93 } else if (strcasecmp(p
, "LE") == 0) {
95 } else if (!(strcasecmp(p
, "BE") == 0)) {
99 tmp
.reserve(text
.size() / 2);
102 auto text_end
= text
.end();
103 if (text
.size() & 1) {
104 // If there's a half-character at the end, nuke it now to make the
105 // conversion loop below simpler.
109 while (i
!= text_end
) {
110 unsigned ch
= static_cast<unsigned char>(*i
++);
111 unsigned ch2
= static_cast<unsigned char>(*i
++);
113 ch
= (ch
<< 8) | ch2
;
115 ch
= (ch2
<< 8) | ch
;
117 if (ch
>> 10 == 0xd800 >> 10) {
119 if (i
== text_end
) break;
120 unsigned hi
= (ch
& 0x3ff);
121 ch
= static_cast<unsigned char>(*i
++);
122 ch2
= static_cast<unsigned char>(*i
++);
124 ch
= (ch
<< 8) | ch2
;
126 ch
= (ch2
<< 8) | ch
;
128 if (ch
>> 10 == 0xdc00 >> 10) {
134 start
+= Xapian::Unicode::to_utf8(ch
, buf
+ start
);
135 if (start
>= sizeof(buf
) - 4) {
136 tmp
.append(buf
, start
);
140 if (start
) tmp
.append(buf
, start
);
142 // Assume windows-1252 if iso-8859-1 is specified. The only
143 // differences are in the range 128-159 which are control characters in
144 // iso-8859-1, and a lot of content is mislabelled. We use our own
145 // conversion code for this case, as GNU iconv fails if it sees one of
146 // the unassigned code points in windows-1252, whereas it would accept
147 // the same input as iso-8859-1, and it seems undesirable to be
148 // rejecting input due to this behind-the-scenes character set substitution.
150 const char * q
= NULL
;
151 if (strncasecmp(p
, "windows", 7) == 0) {
153 } else if (strncasecmp(p
, "cp", 2) == 0) {
157 if (*q
== '-' || *q
== '_' || *q
== ' ') ++q
;
158 if (strcmp(q
, "1252") != 0)
161 if (strncasecmp(p
, "iso", 3) == 0) {
163 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
165 if (strncmp(p
, "8859", 4) != 0) goto try_iconv
;
167 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
168 if (*p
!= '1') goto try_iconv
;
169 if (strcmp(p
+ 1, "5") == 0) goto iso8859_15
;
170 if (p
[1] != '\0') goto try_iconv
;
173 // FIXME: pull this out as a standard "normalise utf-8" function?
174 tmp
.reserve(text
.size());
177 for (string::const_iterator i
= text
.begin(); i
!= text
.end(); ++i
) {
178 static const unsigned cp1252_to_unicode
[32] = {
179 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
180 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
181 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
182 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
184 const size_t CP1252_TO_UNICODE_ENTRIES
=
185 sizeof(cp1252_to_unicode
) / sizeof(*cp1252_to_unicode
);
186 unsigned ch
= static_cast<unsigned char>(*i
);
187 if (ch
- 128 < CP1252_TO_UNICODE_ENTRIES
)
188 ch
= cp1252_to_unicode
[ch
- 128];
189 start
+= Xapian::Unicode::to_utf8(ch
, buf
+ start
);
190 if (start
>= sizeof(buf
) - 4) {
191 tmp
.append(buf
, start
);
195 if (start
) tmp
.append(buf
, start
);
201 iconv_t conv
= iconv_open("UTF-8", charset
.c_str());
202 if (conv
== reinterpret_cast<iconv_t
>(-1))
204 ICONV_CONST
char* in
= const_cast<char *>(text
.c_str());
205 size_t in_len
= text
.size();
208 size_t out_len
= sizeof(buf
);
209 if (iconv(conv
, &in
, &in_len
, &out
, &out_len
) == size_t(-1) &&
211 // FIXME: how to handle this?
214 tmp
.append(buf
, out
- buf
);
217 (void)iconv_close(conv
);
225 tmp
.reserve(text
.size());
228 for (string::const_iterator i
= text
.begin(); i
!= text
.end(); ++i
) {
229 static const unsigned iso8859_15_to_unicode
[] = {
230 0x20ac, 0x00a5, 0x0160, 0x00a7, 0x0161, 0x00a9, 0x00aa, 0x00ab,
231 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
232 0x017d, 0x00b5, 0x00b6, 0x00b7, 0x017e, 0x00b9, 0x00ba, 0x00bb,
233 0x0152, 0x0153, 0x0178
235 const size_t ISO8859_15_TO_UNICODE_ENTRIES
=
236 sizeof(iso8859_15_to_unicode
) / sizeof(*iso8859_15_to_unicode
);
237 unsigned ch
= static_cast<unsigned char>(*i
);
238 if (ch
- 164 < ISO8859_15_TO_UNICODE_ENTRIES
)
239 ch
= iso8859_15_to_unicode
[ch
- 164];
240 start
+= Xapian::Unicode::to_utf8(ch
, buf
+ start
);
241 if (start
>= sizeof(buf
) - 4) {
242 tmp
.append(buf
, start
);
246 if (start
) tmp
.append(buf
, start
);
249 // `output` may be a reference to the same string object as `text` so we
250 // only switch after we've done converting.
251 output
= std::move(tmp
);