1 Add code support for ICU.
3 diff --git a/third_party/libxml/encoding.c b/third_party/libxml/encoding.c
4 index b86a547..0f41df9 100644
5 --- a/third_party/libxml/encoding.c
6 +++ b/third_party/libxml/encoding.c
7 @@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
8 static int xmlCharEncodingAliasesNb = 0;
9 static int xmlCharEncodingAliasesMax = 0;
11 -#ifdef LIBXML_ICONV_ENABLED
12 +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
14 #define DEBUG_ENCODING /* Define this to get encoding traces */
16 @@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
17 NULL, 0, val, NULL, NULL, 0, 0, msg, val);
20 +#ifdef LIBXML_ICU_ENABLED
22 +openIcuConverter(const char* name, int toUnicode)
24 + UErrorCode status = U_ZERO_ERROR;
25 + uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
29 + conv->uconv = ucnv_open(name, &status);
30 + if (U_FAILURE(status))
33 + status = U_ZERO_ERROR;
35 + ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
36 + NULL, NULL, NULL, &status);
39 + ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
40 + NULL, NULL, NULL, &status);
42 + if (U_FAILURE(status))
45 + status = U_ZERO_ERROR;
46 + conv->utf8 = ucnv_open("UTF-8", &status);
47 + if (U_SUCCESS(status))
52 + ucnv_close(conv->uconv);
58 +closeIcuConverter(uconv_t *conv)
61 + ucnv_close(conv->uconv);
62 + ucnv_close(conv->utf8);
66 +#endif /* LIBXML_ICU_ENABLED */
68 /************************************************************************
70 * Conversions To/From UTF8 encoding *
71 @@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name,
72 #ifdef LIBXML_ICONV_ENABLED
73 handler->iconv_in = NULL;
74 handler->iconv_out = NULL;
75 -#endif /* LIBXML_ICONV_ENABLED */
77 +#ifdef LIBXML_ICU_ENABLED
78 + handler->uconv_in = NULL;
79 + handler->uconv_out = NULL;
83 * registers and returns the handler.
84 @@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) {
85 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
86 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
87 #endif /* LIBXML_OUTPUT_ENABLED */
88 -#ifndef LIBXML_ICONV_ENABLED
89 +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
90 #ifdef LIBXML_ISO8859X_ENABLED
91 xmlRegisterCharEncodingHandlersISO8859x ();
93 @@ -1578,6 +1630,10 @@ xmlFindCharEncodingHandler(const char *name) {
94 xmlCharEncodingHandlerPtr enc;
95 iconv_t icv_in, icv_out;
96 #endif /* LIBXML_ICONV_ENABLED */
97 +#ifdef LIBXML_ICU_ENABLED
98 + xmlCharEncodingHandlerPtr enc;
99 + uconv_t *ucv_in, *ucv_out;
100 +#endif /* LIBXML_ICU_ENABLED */
104 @@ -1647,6 +1703,35 @@ xmlFindCharEncodingHandler(const char *name) {
105 "iconv : problems with filters for '%s'\n", name);
107 #endif /* LIBXML_ICONV_ENABLED */
108 +#ifdef LIBXML_ICU_ENABLED
109 + /* check whether icu can handle this */
110 + ucv_in = openIcuConverter(name, 1);
111 + ucv_out = openIcuConverter(name, 0);
112 + if (ucv_in != NULL && ucv_out != NULL) {
113 + enc = (xmlCharEncodingHandlerPtr)
114 + xmlMalloc(sizeof(xmlCharEncodingHandler));
116 + closeIcuConverter(ucv_in);
117 + closeIcuConverter(ucv_out);
120 + enc->name = xmlMemStrdup(name);
122 + enc->output = NULL;
123 + enc->uconv_in = ucv_in;
124 + enc->uconv_out = ucv_out;
125 +#ifdef DEBUG_ENCODING
126 + xmlGenericError(xmlGenericErrorContext,
127 + "Found ICU converter handler for encoding %s\n", name);
130 + } else if (ucv_in != NULL || ucv_out != NULL) {
131 + closeIcuConverter(ucv_in);
132 + closeIcuConverter(ucv_out);
133 + xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
134 + "ICU converter : problems with filters for '%s'\n", name);
136 +#endif /* LIBXML_ICU_ENABLED */
138 #ifdef DEBUG_ENCODING
139 xmlGenericError(xmlGenericErrorContext,
140 @@ -1737,6 +1822,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
142 /************************************************************************
144 + * ICU based generic conversion functions *
146 + ************************************************************************/
148 +#ifdef LIBXML_ICU_ENABLED
151 + * @cd: ICU uconverter data structure
152 + * @toUnicode : non-zero if toUnicode. 0 otherwise.
153 + * @out: a pointer to an array of bytes to store the result
154 + * @outlen: the length of @out
155 + * @in: a pointer to an array of input bytes to convert
156 + * @inlen: the length of @in
158 + * Returns 0 if success, or
159 + * -1 by lack of space, or
160 + * -2 if the transcoding fails (for *in is not valid utf8 string or
161 + * the result of transformation can't fit into the encoding we want), or
162 + * -3 if the last byte can't form a single output char.
164 + * The value of @inlen after return is the number of octets consumed
165 + * as the return value is positive, else unpredictable.
166 + * The value of @outlen after return is the number of octets written.
169 +xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
170 + const unsigned char *in, int *inlen) {
171 + const char *ucv_in = (const char *) in;
172 + char *ucv_out = (char *) out;
173 + UErrorCode err = U_ZERO_ERROR;
175 + if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
176 + if (outlen != NULL) *outlen = 0;
182 + * 1. is ucnv_convert(To|From)Algorithmic better?
183 + * 2. had we better use an explicit pivot buffer?
184 + * 3. error returned comes from 'fromUnicode' only even
185 + * when toUnicode is true !
188 + /* encoding => UTF-16 => UTF-8 */
189 + ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
190 + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
193 + /* UTF-8 => UTF-16 => encoding */
194 + ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
195 + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
198 + *inlen = ucv_in - (const char*) in;
199 + *outlen = ucv_out - (char *) out;
200 + if (U_SUCCESS(err))
202 + if (err == U_BUFFER_OVERFLOW_ERROR)
204 + if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
206 + /* if (err == U_TRUNCATED_CHAR_FOUND) */
209 +#endif /* LIBXML_ICU_ENABLED */
211 +/************************************************************************
213 * The real API used by libxml for on-the-fly conversion *
215 ************************************************************************/
216 @@ -1810,6 +1964,16 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
217 if (ret == -1) ret = -3;
219 #endif /* LIBXML_ICONV_ENABLED */
220 +#ifdef LIBXML_ICU_ENABLED
221 + else if (handler->uconv_in != NULL) {
222 + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
223 + &written, in->content, &toconv);
224 + xmlBufferShrink(in, toconv);
225 + out->use += written;
226 + out->content[out->use] = 0;
227 + if (ret == -1) ret = -3;
229 +#endif /* LIBXML_ICU_ENABLED */
230 #ifdef DEBUG_ENCODING
233 @@ -1915,6 +2079,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
236 #endif /* LIBXML_ICONV_ENABLED */
237 +#ifdef LIBXML_ICU_ENABLED
238 + else if (handler->uconv_in != NULL) {
239 + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
240 + &written, in->content, &toconv);
241 + xmlBufferShrink(in, toconv);
242 + out->use += written;
243 + out->content[out->use] = 0;
247 +#endif /* LIBXML_ICU_ENABLED */
250 #ifdef DEBUG_ENCODING
251 @@ -2015,6 +2190,15 @@ retry:
252 out->content[out->use] = 0;
254 #endif /* LIBXML_ICONV_ENABLED */
255 +#ifdef LIBXML_ICU_ENABLED
256 + else if (handler->uconv_out != NULL) {
257 + ret = xmlUconvWrapper(handler->uconv_out, 0,
258 + &out->content[out->use],
259 + &written, NULL, &toconv);
260 + out->use += written;
261 + out->content[out->use] = 0;
263 +#endif /* LIBXML_ICU_ENABLED */
264 #ifdef DEBUG_ENCODING
265 xmlGenericError(xmlGenericErrorContext,
266 "initialized encoder\n");
267 @@ -2061,6 +2245,26 @@ retry:
270 #endif /* LIBXML_ICONV_ENABLED */
271 +#ifdef LIBXML_ICU_ENABLED
272 + else if (handler->uconv_out != NULL) {
273 + ret = xmlUconvWrapper(handler->uconv_out, 0,
274 + &out->content[out->use],
275 + &written, in->content, &toconv);
276 + xmlBufferShrink(in, toconv);
277 + out->use += written;
278 + writtentot += written;
279 + out->content[out->use] = 0;
283 + * Can be a limitation of iconv
290 +#endif /* LIBXML_ICU_ENABLED */
292 xmlEncodingErr(XML_I18N_NO_OUTPUT,
293 "xmlCharEncOutFunc: no output function !\n", NULL);
294 @@ -2173,6 +2377,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
297 #endif /* LIBXML_ICONV_ENABLED */
298 +#ifdef LIBXML_ICU_ENABLED
299 + if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) {
300 + if (handler->name != NULL)
301 + xmlFree(handler->name);
302 + handler->name = NULL;
303 + if (handler->uconv_out != NULL) {
304 + closeIcuConverter(handler->uconv_out);
305 + handler->uconv_out = NULL;
307 + if (handler->uconv_in != NULL) {
308 + closeIcuConverter(handler->uconv_in);
309 + handler->uconv_in = NULL;
314 #ifdef DEBUG_ENCODING
316 xmlGenericError(xmlGenericErrorContext,
317 @@ -2248,6 +2468,22 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
321 +#ifdef LIBXML_ICU_ENABLED
322 + } else if (handler->uconv_out != NULL) {
324 + toconv = in->end - cur;
326 + ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0],
327 + &written, cur, &toconv);
336 + } while (ret == -2);
338 /* could not find a converter */
340 @@ -2259,8 +2495,9 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
342 return(in->consumed + (in->cur - in->base));
346 -#ifndef LIBXML_ICONV_ENABLED
347 +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
348 #ifdef LIBXML_ISO8859X_ENABLED
351 diff --git a/third_party/libxml/include/libxml/encoding.h b/third_party/libxml/include/libxml/encoding.h
352 index c74b25f..b5f8b48 100644
353 --- a/third_party/libxml/include/libxml/encoding.h
354 +++ b/third_party/libxml/include/libxml/encoding.h
357 #ifdef LIBXML_ICONV_ENABLED
360 +#ifdef LIBXML_ICU_ENABLED
361 +#include <unicode/ucnv.h>
363 +/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h>
364 + * to prevent unwanted ICU symbols being exposed to users of libxml2.
365 + * One particular case is Qt4 conflicting on UChar32.
369 +typedef struct UConverter UConverter;
371 +typedef wchar_t UChar;
373 +typedef uint16_t UChar;
380 @@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
381 * Block defining the handlers for non UTF-8 encodings.
382 * If iconv is supported, there are two extra fields.
384 +#ifdef LIBXML_ICU_ENABLED
386 + UConverter *uconv; /* for conversion between an encoding and UTF-16 */
387 + UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
389 +typedef struct _uconv_t uconv_t;
392 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
393 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
394 @@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler {
397 #endif /* LIBXML_ICONV_ENABLED */
398 +#ifdef LIBXML_ICU_ENABLED
400 + uconv_t *uconv_out;
401 +#endif /* LIBXML_ICU_ENABLED */
405 diff --git a/third_party/libxml/include/libxml/parser.h b/third_party/libxml/include/libxml/parser.h
406 index dd79c42..3580b63 100644
407 --- a/third_party/libxml/include/libxml/parser.h
408 +++ b/third_party/libxml/include/libxml/parser.h
409 @@ -1222,6 +1222,7 @@ typedef enum {
410 XML_WITH_DEBUG_MEM = 29,
411 XML_WITH_DEBUG_RUN = 30,
414 XML_WITH_NONE = 99999 /* just to be sure of allocation size */
417 diff --git a/third_party/libxml/include/libxml/xmlversion.h.in b/third_party/libxml/include/libxml/xmlversion.h.in
418 index 4739f3a..de310ab 100644
419 --- a/third_party/libxml/include/libxml/xmlversion.h.in
420 +++ b/third_party/libxml/include/libxml/xmlversion.h.in
421 @@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
425 + * LIBXML_ICU_ENABLED:
427 + * Whether icu support is available
430 +#define LIBXML_ICU_ENABLED
434 * LIBXML_ISO8859X_ENABLED:
436 * Whether ISO-8859-* support is made available in case iconv is not
437 diff --git a/third_party/libxml/parser.c b/third_party/libxml/parser.c
438 index 85e7599..3ba2a06 100644
439 --- a/third_party/libxml/parser.c
440 +++ b/third_party/libxml/parser.c
441 @@ -954,6 +954,12 @@ xmlHasFeature(xmlFeature feature)
446 +#ifdef LIBXML_ICU_ENABLED