2 * @brief convert a string to UTF-8 encoding.
4 /* Copyright (C) 2006,2007,2008,2010,2013,2017,2019,2021,2023 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "utf8convert.h"
33 #include "strcasecmp.h"
34 #include "stringutils.h"
39 convert_to_utf8_(string_view text
, const string
& charset
, string
& output
)
41 // Shortcut if it's already in utf8!
42 if (charset
.size() == 5 && strcasecmp(charset
.c_str(), "utf-8") == 0)
44 if (charset
.size() == 4 && strcasecmp(charset
.c_str(), "utf8") == 0)
46 if (charset
.size() == 8 && strcasecmp(charset
.c_str(), "us-ascii") == 0)
49 // Nobody has told us what charset it's in, so do as little work as possible.
57 /* Handle iso-8859-1/iso-8859-15/windows-1252/cp-1252, utf-16/ucs-2,
58 * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
59 const char * p
= charset
.c_str();
62 if (strncasecmp(p
, "utf", 3) == 0) {
64 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
65 if (*p
!= '1' || p
[1] != '6') goto try_iconv
;
68 } else if (strncasecmp(p
, "ucs", 3) == 0) {
70 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
71 if (*p
!= '2') goto try_iconv
;
77 if (text
.size() < 2) return false;
79 bool big_endian
= true;
80 auto i
= text
.begin();
82 // GNU iconv doesn't seem to handle BOMs.
83 if (startswith(text
, "\xfe\xff")) {
85 } else if (startswith(text
, "\xff\xfe")) {
89 // UTF-16 with no BOM is meant to be assumed to be BE. Strictly
90 // speaking, we're not meant to assume anything for UCS-2 with
91 // no BOM, but we've got to do something, so we might as well
92 // assume it's UTF-16 mislabelled, which is easy and sane.
93 } else if (strcasecmp(p
, "LE") == 0) {
95 } else if (!(strcasecmp(p
, "BE") == 0)) {
99 tmp
.reserve(text
.size() / 2);
102 auto text_end
= text
.end();
103 if (text
.size() & 1) {
104 // If there's a half-character at the end, nuke it now to make the
105 // conversion loop below simpler.
109 while (i
!= text_end
) {
110 unsigned ch
= static_cast<unsigned char>(*i
++);
111 unsigned ch2
= static_cast<unsigned char>(*i
++);
113 ch
= (ch
<< 8) | ch2
;
115 ch
= (ch2
<< 8) | ch
;
117 if (ch
>> 10 == 0xd800 >> 10) {
119 if (i
== text_end
) break;
120 unsigned hi
= (ch
& 0x3ff);
121 ch
= static_cast<unsigned char>(*i
++);
122 ch2
= static_cast<unsigned char>(*i
++);
124 ch
= (ch
<< 8) | ch2
;
126 ch
= (ch2
<< 8) | ch
;
128 if (ch
>> 10 == 0xdc00 >> 10) {
134 start
+= Xapian::Unicode::to_utf8(ch
, buf
+ start
);
135 if (start
>= sizeof(buf
) - 4) {
136 tmp
.append(buf
, start
);
140 if (start
) tmp
.append(buf
, start
);
142 // Assume windows-1252 if iso-8859-1 is specified. The only
143 // differences are in the range 128-159 which are control characters in
144 // iso-8859-1, and a lot of content is mislabelled. We use our own
145 // conversion code for this case, as GNU iconv fails if it sees one of
146 // the unassigned code points in windows-1252, whereas it would accept
147 // the same input as iso-8859-1, and it seems undesirable to be
148 // rejecting input due to this behind-the-scenes character set substitution.
150 const char * q
= NULL
;
151 if (strncasecmp(p
, "windows", 7) == 0) {
153 } else if (strncasecmp(p
, "cp", 2) == 0) {
157 if (*q
== '-' || *q
== '_' || *q
== ' ') ++q
;
158 if (strcmp(q
, "1252") != 0)
161 if (strncasecmp(p
, "iso", 3) == 0) {
163 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
165 if (strncmp(p
, "8859", 4) != 0) goto try_iconv
;
167 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
168 if (*p
!= '1') goto try_iconv
;
169 if (strcmp(p
+ 1, "5") == 0) goto iso8859_15
;
170 if (p
[1] != '\0') goto try_iconv
;
173 // FIXME: pull this out as a standard "normalise utf-8" function?
174 tmp
.reserve(text
.size());
177 for (unsigned char ch
: text
) {
178 static const unsigned cp1252_to_unicode
[32] = {
179 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
180 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
181 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
182 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
184 unsigned code_point
= ch
;
185 unsigned i
= UNSIGNED_OVERFLOW_OK(code_point
- 128);
186 if (i
< std::size(cp1252_to_unicode
))
187 code_point
= cp1252_to_unicode
[i
];
188 start
+= Xapian::Unicode::to_utf8(code_point
, buf
+ start
);
189 if (start
>= sizeof(buf
) - 4) {
190 tmp
.append(buf
, start
);
194 if (start
) tmp
.append(buf
, start
);
200 iconv_t conv
= iconv_open("UTF-8", charset
.c_str());
201 if (conv
== reinterpret_cast<iconv_t
>(-1))
203 ICONV_CONST
char* in
= const_cast<char *>(text
.data());
204 size_t in_len
= text
.size();
207 size_t out_len
= sizeof(buf
);
208 if (iconv(conv
, &in
, &in_len
, &out
, &out_len
) == size_t(-1) &&
210 // FIXME: how to handle this?
213 tmp
.append(buf
, out
- buf
);
216 (void)iconv_close(conv
);
224 tmp
.reserve(text
.size());
227 for (unsigned char ch
: text
) {
228 static const unsigned iso8859_15_to_unicode
[] = {
229 0x20ac, 0x00a5, 0x0160, 0x00a7, 0x0161, 0x00a9, 0x00aa, 0x00ab,
230 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
231 0x017d, 0x00b5, 0x00b6, 0x00b7, 0x017e, 0x00b9, 0x00ba, 0x00bb,
232 0x0152, 0x0153, 0x0178
234 unsigned code_point
= ch
;
235 unsigned i
= UNSIGNED_OVERFLOW_OK(code_point
- 164);
236 if (i
< std::size(iso8859_15_to_unicode
))
237 code_point
= iso8859_15_to_unicode
[i
];
238 start
+= Xapian::Unicode::to_utf8(code_point
, buf
+ start
);
239 if (start
>= sizeof(buf
) - 4) {
240 tmp
.append(buf
, start
);
244 if (start
) tmp
.append(buf
, start
);
247 // `output` may be the very string whose data `text` is a view of, so we
248 // only assign to it once the conversion has finished.
249 output
= std::move(tmp
);