2 * Test the UTF-8 decoding routines
4 * author: Daniel Veillard
5 * copy: see Copyright for the status of this software.
10 #include <libxml/parser.h>
11 #include <libxml/parserInternals.h>
17 static void errorHandler(void *unused
, xmlErrorPtr err
) {
18 if ((unused
== NULL
) && (err
!= NULL
) && (lastError
== 0)) {
19 lastError
= err
->code
;
23 char document1
[100] = "<doc>XXXX</doc>";
24 char document2
[100] = "<doc foo='XXXX'/>";
26 static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt
, char *document
,
27 int len
, char *data
, int forbid1
, int forbid2
) {
31 for (i
= 0;i
<= 0xFF;i
++) {
37 res
= xmlReadMemory(document
, len
, "test", NULL
, 0);
39 if ((i
== forbid1
) || (i
== forbid2
)) {
40 if ((lastError
== 0) || (res
!= NULL
))
42 "Failed to detect invalid char for Byte 0x%02X: %c\n",
46 else if ((i
== '<') || (i
== '&')) {
47 if ((lastError
== 0) || (res
!= NULL
))
49 "Failed to detect illegal char %c for Byte 0x%02X\n", i
, i
);
51 else if (((i
< 0x20) || (i
>= 0x80)) &&
52 (i
!= 0x9) && (i
!= 0xA) && (i
!= 0xD)) {
53 if ((lastError
!= XML_ERR_INVALID_CHAR
) && (res
!= NULL
))
55 "Failed to detect invalid char for Byte 0x%02X\n", i
);
57 else if (res
== NULL
) {
59 "Failed to parse valid char for Byte 0x%02X : %c\n", i
, i
);
66 static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt
, char *document
,
67 int len
, char *data
) {
71 for (i
= 0x80;i
<= 0xFF;i
++) {
72 for (j
= 0;j
<= 0xFF;j
++) {
79 res
= xmlReadMemory(document
, len
, "test", NULL
, 0);
81 /* if first bit of first char is set, then second bit must too */
82 if ((i
& 0x80) && ((i
& 0x40) == 0)) {
83 if ((lastError
== 0) || (res
!= NULL
))
85 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
90 * if first bit of first char is set, then second char first
93 else if ((i
& 0x80) && ((j
& 0xC0) != 0x80)) {
94 if ((lastError
== 0) || (res
!= NULL
))
96 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
101 * if using a 2 byte encoding then the value must be at least
102 * 0x80, i.e. one of bits 4 to 1 of i must be set
104 else if ((i
& 0x80) && ((i
& 0x1E) == 0)) {
105 if ((lastError
== 0) || (res
!= NULL
))
107 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
112 * if third bit of first char is set, then the sequence would need
113 * at least 3 bytes, but we give only 2 !
115 else if ((i
& 0xE0) == 0xE0) {
116 if ((lastError
== 0) || (res
!= NULL
))
118 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
123 * We should see no error in remaining cases
125 else if ((lastError
!= 0) || (res
== NULL
)) {
127 "Failed to parse document for Bytes 0x%02X 0x%02X\n", i
, j
);
136 * testDocumentRanges:
138 * Test the correct UTF8 character parsing in the context of XML documents.
139 * Those are in-context injection tests checking the parser behaviour on
140 * edge case values at different points in content, beginning and end of
141 * CDATA in text or in attribute values.
144 static void testDocumentRanges(void) {
145 xmlParserCtxtPtr ctxt
;
149 * Set up a parsing context using the first document as
150 * the current input source.
152 ctxt
= xmlNewParserCtxt();
154 fprintf(stderr
, "Failed to allocate parser context\n");
158 printf("testing 1 byte char in document: 1");
160 data
= &document1
[5];
165 /* test 1 byte injection at beginning of area */
166 testDocumentRangeByte1(ctxt
, &document1
[0], strlen(document1
),
174 /* test 1 byte injection at end of area */
175 testDocumentRangeByte1(ctxt
, &document1
[0], strlen(document1
),
180 data
= &document2
[10];
185 /* test 1 byte injection at beginning of area */
186 testDocumentRangeByte1(ctxt
, &document2
[0], strlen(document2
),
194 /* test 1 byte injection at end of area */
195 testDocumentRangeByte1(ctxt
, &document2
[0], strlen(document2
),
199 printf("testing 2 byte char in document: 1");
201 data
= &document1
[5];
206 /* test 2 byte injection at beginning of area */
207 testDocumentRangeByte2(ctxt
, &document1
[0], strlen(document1
),
215 /* test 2 byte injection at end of area */
216 testDocumentRangeByte2(ctxt
, &document1
[0], strlen(document1
),
221 data
= &document2
[10];
226 /* test 2 byte injection at beginning of area */
227 testDocumentRangeByte2(ctxt
, &document2
[0], strlen(document2
),
235 /* test 2 byte injection at end of area */
236 testDocumentRangeByte2(ctxt
, &document2
[0], strlen(document2
),
240 xmlFreeParserCtxt(ctxt
);
243 static void testCharRangeByte1(xmlParserCtxtPtr ctxt
, char *data
) {
250 for (i
= 0;i
<= 0xFF;i
++) {
252 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
255 c
= xmlCurrentChar(ctxt
, &len
);
256 if ((i
== 0) || (i
>= 0x80)) {
257 /* we must see an error there */
258 if (lastError
!= XML_ERR_INVALID_CHAR
)
260 "Failed to detect invalid char for Byte 0x%02X\n", i
);
261 } else if (i
== 0xD) {
262 if ((c
!= 0xA) || (len
!= 1))
263 fprintf(stderr
, "Failed to convert char for Byte 0x%02X\n", i
);
264 } else if ((c
!= i
) || (len
!= 1)) {
265 fprintf(stderr
, "Failed to parse char for Byte 0x%02X\n", i
);
270 static void testCharRangeByte2(xmlParserCtxtPtr ctxt
, char *data
) {
276 for (i
= 0x80;i
<= 0xFF;i
++) {
277 for (j
= 0;j
<= 0xFF;j
++) {
280 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
283 c
= xmlCurrentChar(ctxt
, &len
);
285 /* if first bit of first char is set, then second bit must too */
286 if ((i
& 0x80) && ((i
& 0x40) == 0)) {
287 if (lastError
!= XML_ERR_INVALID_CHAR
)
289 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
294 * if first bit of first char is set, then second char first
297 else if ((i
& 0x80) && ((j
& 0xC0) != 0x80)) {
298 if (lastError
!= XML_ERR_INVALID_CHAR
)
300 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
305 * if using a 2 byte encoding then the value must be at least
306 * 0x80, i.e. one of bits 4 to 1 of i must be set
308 else if ((i
& 0x80) && ((i
& 0x1E) == 0)) {
309 if (lastError
!= XML_ERR_INVALID_CHAR
)
311 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
316 * if third bit of first char is set, then the sequence would need
317 * at least 3 bytes, but we give only 2 !
319 else if ((i
& 0xE0) == 0xE0) {
320 if (lastError
!= XML_ERR_INVALID_CHAR
)
322 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
327 * We should see no error in remaining cases
329 else if ((lastError
!= 0) || (len
!= 2)) {
331 "Failed to parse char for Bytes 0x%02X 0x%02X\n", i
, j
);
335 * Finally check the value is right
337 else if (c
!= (j
& 0x3F) + ((i
& 0x1F) << 6)) {
339 "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
340 i
, j
, ((j
& 0x3F) + ((i
& 0x1F) << 6)), c
);
346 static void testCharRangeByte3(xmlParserCtxtPtr ctxt
, char *data
) {
349 unsigned char lows
[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
353 for (i
= 0xE0;i
<= 0xFF;i
++) {
354 for (j
= 0;j
<= 0xFF;j
++) {
355 for (k
= 0;k
< 6;k
++) {
360 value
= (K
& 0x3F) + ((j
& 0x3F) << 6) + ((i
& 0xF) << 12);
361 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
364 c
= xmlCurrentChar(ctxt
, &len
);
367 * if fourth bit of first char is set, then the sequence would need
368 * at least 4 bytes, but we give only 3 !
370 if ((i
& 0xF0) == 0xF0) {
371 if (lastError
!= XML_ERR_INVALID_CHAR
)
373 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
378 * The second and the third bytes must start with 10
380 else if (((j
& 0xC0) != 0x80) || ((K
& 0xC0) != 0x80)) {
381 if (lastError
!= XML_ERR_INVALID_CHAR
)
383 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
388 * if using a 3 byte encoding then the value must be at least
389 * 0x800, i.e. one of bits 3 to 0 of i must be set or
390 * the 6th bit of data[1] must be set
392 else if (((i
& 0xF) == 0) && ((j
& 0x20) == 0)) {
393 if (lastError
!= XML_ERR_INVALID_CHAR
)
395 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
400 * There are values in that range that are not allowed in XML-1.0
402 else if (((value
> 0xD7FF) && (value
<0xE000)) ||
403 ((value
> 0xFFFD) && (value
<0x10000))) {
404 if (lastError
!= XML_ERR_INVALID_CHAR
)
406 "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
411 * We should see no error in remaining cases
413 else if ((lastError
!= 0) || (len
!= 3)) {
415 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
420 * Finally check the value is right
422 else if (c
!= value
) {
424 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
425 i
, j
, data
[2], value
, c
);
432 static void testCharRangeByte4(xmlParserCtxtPtr ctxt
, char *data
) {
433 int i
, j
, k
, K
, l
, L
;
435 unsigned char lows
[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
439 for (i
= 0xF0;i
<= 0xFF;i
++) {
440 for (j
= 0;j
<= 0xFF;j
++) {
441 for (k
= 0;k
< 6;k
++) {
442 for (l
= 0;l
< 6;l
++) {
449 value
= (L
& 0x3F) + ((K
& 0x3F) << 6) + ((j
& 0x3F) << 12) +
451 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
454 c
= xmlCurrentChar(ctxt
, &len
);
457 * if fifth bit of first char is set, then the sequence would need
458 * at least 5 bytes, but we give only 4 !
460 if ((i
& 0xF8) == 0xF8) {
461 if (lastError
!= XML_ERR_INVALID_CHAR
)
463 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
468 * The second, third and fourth bytes must start with 10
470 else if (((j
& 0xC0) != 0x80) || ((K
& 0xC0) != 0x80) ||
471 ((L
& 0xC0) != 0x80)) {
472 if (lastError
!= XML_ERR_INVALID_CHAR
)
474 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
479 * if using a 4 byte encoding then the value must be at least
480 * 0x10000, i.e. one of bits 2 to 0 of i must be set or
481 * the 6th or 5th bit of j must be set
483 else if (((i
& 0x7) == 0) && ((j
& 0x30) == 0)) {
484 if (lastError
!= XML_ERR_INVALID_CHAR
)
486 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
491 * There are values in that range that are not allowed in XML-1.0
493 else if (((value
> 0xD7FF) && (value
<0xE000)) ||
494 ((value
> 0xFFFD) && (value
<0x10000)) ||
495 (value
> 0x10FFFF)) {
496 if (lastError
!= XML_ERR_INVALID_CHAR
)
498 "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
503 * We should see no error in remaining cases
505 else if ((lastError
!= 0) || (len
!= 4)) {
507 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
512 * Finally check the value is right
514 else if (c
!= value
) {
516 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
517 i
, j
, data
[2], value
, c
);
528 * Test the correct UTF8 character parsing in isolation i.e.
529 * not when parsing a full document, this is less expensive and we can
530 * cover the full range of UTF-8 chars accepted by XML-1.0
533 static void testCharRanges(void) {
535 xmlParserCtxtPtr ctxt
;
536 xmlParserInputBufferPtr buf
;
537 xmlParserInputPtr input
;
542 * Set up a parsing context using the above data buffer as
543 * the current input source.
545 ctxt
= xmlNewParserCtxt();
547 fprintf(stderr
, "Failed to allocate parser context\n");
550 buf
= xmlParserInputBufferCreateStatic(data
, sizeof(data
),
551 XML_CHAR_ENCODING_NONE
);
553 fprintf(stderr
, "Failed to allocate input buffer\n");
556 input
= xmlNewInputStream(ctxt
);
558 xmlFreeParserInputBuffer(buf
);
561 input
->filename
= NULL
;
564 input
->base
= xmlBufContent(input
->buf
->buffer
);
565 input
->end
= input
->base
+ 4;
566 inputPush(ctxt
, input
);
568 printf("testing char range: 1");
570 testCharRangeByte1(ctxt
, data
);
573 testCharRangeByte2(ctxt
, data
);
576 testCharRangeByte3(ctxt
, data
);
579 testCharRangeByte4(ctxt
, data
);
584 xmlFreeParserCtxt(ctxt
);
590 * this initializes the library and checks potential ABI mismatches
591 * between the version it was compiled for and the actual shared
597 * Catch errors separately
600 xmlSetStructuredErrorFunc(NULL
, errorHandler
);
606 testDocumentRanges();
609 * Cleanup function for the XML library.
613 * this is to debug memory for regression tests