2 * helper_utf8.c - Raptor UTF-8 and Unicode support
4 * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
5 * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
6 * Copyright (C) 2009 Miriam Ruiz <little_miry@yahoo.es>
8 * This package is Free Software and part of Redland http://librdf.org/
10 * It is licensed under the following three licenses as alternatives:
11 * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
12 * 2. GNU General Public License (GPL) V2 or any newer version
13 * 3. Apache License, V2.0 or any newer version
15 * You may not use this file except in compliance with at least one of
16 * the above three licenses.
31 * @c: Unicode character
32 * @output: UTF-8 string buffer or NULL
34 * Convert a Unicode character to UTF-8 encoding.
36 * Based on librdf_unicode_char_to_utf8() with no need to calculate
37 * length since the encoded character is always copied into a buffer
38 * with sufficient size.
40 * Return value: bytes encoded to output buffer or <0 on failure
/* Encode the code point c as UTF-8 into output (see the stores to
 * output[0]..output[5] below).  Several interior lines of this function are
 * not present in this listing, so only the visible statements are described.
 * NOTE(review): the size ladder goes up to 0x80000000 and a sixth byte is
 * written, so the legacy 5- and 6-byte forms appear to be produced for
 * values above U+10FFFF -- confirm callers never pass such values. */
42 int char_to_utf8(unsigned long c
, char *output
)
/* Size ladder: each bound is the first value that no longer fits in the
 * 2-, 3-, 4-, 5- and 6-byte forms (11, 16, 21, 26 and 31 payload bits).
 * Presumably each branch selects the encoded length; the assignments are
 * not visible here. */
48 else if (c
< 0x00000800)
50 else if (c
< 0x00010000)
52 else if (c
< 0x00200000)
54 else if (c
< 0x04000000)
56 else if (c
< 0x80000000)
/* Each trailing byte is a continuation byte of the form 10xxxxxx:
 * 0x80 plus the low six bits of c. */
64 output
[5]=0x80 | (unsigned char)(c
& 0x3F);
66 /* set bit 2 (bits 7,6,5,4,3,2 less 7,6,5,4,3 set below) on last byte */
67 c
|= 0x4000000; /* 0x4000000 = 0x04 << 24 */
70 output
[4]=0x80 | (unsigned char)(c
& 0x3F);
72 /* set bit 3 (bits 7,6,5,4,3 less 7,6,5,4 set below) on last byte */
73 c
|= 0x200000; /* 0x200000 = 0x08 << 18 */
76 output
[3]=0x80 | (unsigned char)(c
& 0x3F);
78 /* set bit 4 (bits 7,6,5,4 less 7,6,5 set below) on last byte */
79 c
|= 0x10000; /* 0x10000 = 0x10 << 12 */
82 output
[2]=0x80 | (unsigned char)(c
& 0x3F);
84 /* set bit 5 (bits 7,6,5 less 7,6 set below) on last byte */
85 c
|= 0x800; /* 0x800 = 0x20 << 6 */
88 output
[1]=0x80 | (unsigned char)(c
& 0x3F);
90 /* set bits 7,6 on last byte */
/* The first output byte receives whatever remains in c, including the
 * length-marker bits OR-ed in above. */
94 output
[0]=(unsigned char)c
;
102 * utf8_to_char:
103 * @output: Pointer to the Unicode character or NULL
104 * @input: UTF-8 string buffer
105 * @length: buffer size
107 * Convert a UTF-8 encoded buffer to a Unicode character.
109 * If output is NULL, then will calculate the number of bytes that
110 * will be used from the input buffer and not perform the conversion.
112 * Return value: bytes used from input buffer or <0 on failure:
113 * -1 input buffer too short or length error
114 * -2 overlong UTF-8 sequence
115 * -3 illegal code positions
116 * -4 code out of range U+0000 to U+10FFFF.
117 * In cases -2, -3 and -4 the coded character is stored in the output.
/* Decode one UTF-8 sequence from input into *output.
 * The lead byte is classified by its high bits: the masks below match the
 * lead-byte patterns of 2- to 6-byte sequences.  The statements executed for
 * each case are not visible in this listing.
 * The range test further down treats U+D800..U+DFFF, U+FFFE and U+FFFF as
 * illegal code positions. */
119 int utf8_to_char(unsigned long *output
, const char *input
, int length
)
/* 110xxxxx: lead byte of a 2-byte sequence */
134 else if((in
& 0xe0) == 0xc0)
/* 1110xxxx: lead byte of a 3-byte sequence */
139 else if((in
& 0xf0) == 0xe0)
/* 11110xxx: lead byte of a 4-byte sequence */
144 else if((in
& 0xf8) == 0xf0)
/* 111110xx: lead byte of a (legacy) 5-byte sequence */
149 else if((in
& 0xfc) == 0xf8)
/* 1111110x: lead byte of a (legacy) 6-byte sequence */
154 else if((in
& 0xfe) == 0xfc)
200 /* check for overlong UTF-8 sequences */
220 /* check for illegal code positions:
221 * U+D800 to U+DFFF (UTF-16 surrogates)
224 if((c
> 0xD7FF && c
< 0xE000) || c
== 0xFFFE || c
== 0xFFFF)
227 /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
228 /* of course this makes some 4 byte forms illegal */
237 * is_xml11_namestartchar:
238 * @c: Unicode character to check
240 * Check if Unicode character is legal to start an XML 1.1 Name
242 * Namespaces in XML 1.1 REC 2004-02-04
243 * http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar
245 * Extensible Markup Language (XML) 1.1 REC 2004-02-04
246 * http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
249 * Return value: non-0 if legal
251 int is_xml11_namestartchar(long c
)
254 return (((c
>= 0x0041) && (c
<= 0x005A)) ||
255 (c
== 0x005F) || /* '_' */
257 ((c
>= 0x0061) && (c
<= 0x007A)) ||
258 ((c
>= 0x00C0) && (c
<= 0x00D6)) ||
259 ((c
>= 0x00D8) && (c
<= 0x00F6)) ||
260 ((c
>= 0x00F8) && (c
<= 0x02FF)) ||
261 ((c
>= 0x0370) && (c
<= 0x037D)) ||
262 ((c
>= 0x037F) && (c
<= 0x1FFF)) ||
263 ((c
>= 0x200C) && (c
<= 0x200D)) ||
264 ((c
>= 0x2070) && (c
<= 0x218F)) ||
265 ((c
>= 0x2C00) && (c
<= 0x2FEF)) ||
266 ((c
>= 0x3001) && (c
<= 0xD7FF)) ||
267 ((c
>= 0xF900) && (c
<= 0xFDCF)) ||
268 ((c
>= 0xFDF0) && (c
<= 0xFFFD)) ||
269 ((c
>= 0x10000) && (c
<= 0xEFFFF)));
274 * is_xml10_namestartchar:
275 * @c: Unicode character to check
277 * Check if Unicode character is legal to start an XML 1.0 Name
279 * Namespaces in XML REC 1999-01-14
280 * http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
282 * Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
283 * http://www.w3.org/TR/2004/REC-xml-20040204/
286 * Return value: non-0 if legal
288 int is_xml10_namestartchar(long c
)
290 return (is_letter(c
) ||
297 * @c: Unicode character
299 * Check if a Unicode codepoint is legal to continue an XML 1.1 Name
301 * Namespaces in XML 1.1 REC 2004-02-04
302 * http://www.w3.org/TR/2004/REC-xml11-20040204/
304 * Extensible Markup Language (XML) 1.1 REC 2004-02-04
305 * http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
308 * Return value: non-0 if legal
310 int is_xml11_namechar(long c
)
312 return (is_xml11_namestartchar(c
) ||
313 (c
== 0x002D) || /* '-' */
314 (c
== 0x002E) || /* '.' */
316 (c
>= 0x0030 && c
<= 0x0039) ||
318 (c
>= 0x0300 && c
<=0x036F) ||
319 (c
>= 0x203F && c
<=0x2040));
325 * @c: Unicode character
327 * Check if a Unicode codepoint is legal to continue an XML 1.0 Name
329 * Namespaces in XML REC 1999-01-14
330 * http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCNameChar
332 * Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
333 * http://www.w3.org/TR/2004/REC-xml-20040204/
336 * Return value: non-0 if legal
338 int is_xml10_namechar(long c
)
340 return (is_letter(c
) ||
342 (c
== 0x002E) || /* '.' */
343 (c
== 0x002D) || /* '-' */
344 (c
== 0x005F) || /* '_' */
345 is_combiningchar(c
) ||
351 * All this below was derived by machine-transforming the classes in Appendix B
352 * of http://www.w3.org/TR/2000/REC-xml-20001006
355 int is_letter(long c
)
357 return(is_basechar(c
) ||
362 int is_basechar(long c
)
364 /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-BaseChar */
365 return((c
>= 0x0041 && c
<= 0x005A ) ||
366 (c
>= 0x0061 && c
<= 0x007A ) ||
367 (c
>= 0x00C0 && c
<= 0x00D6 ) ||
368 (c
>= 0x00D8 && c
<= 0x00F6 ) ||
369 (c
>= 0x00F8 && c
<= 0x00FF ) ||
370 (c
>= 0x0100 && c
<= 0x0131 ) ||
371 (c
>= 0x0134 && c
<= 0x013E ) ||
372 (c
>= 0x0141 && c
<= 0x0148 ) ||
373 (c
>= 0x014A && c
<= 0x017E ) ||
374 (c
>= 0x0180 && c
<= 0x01C3 ) ||
375 (c
>= 0x01CD && c
<= 0x01F0 ) ||
376 (c
>= 0x01F4 && c
<= 0x01F5 ) ||
377 (c
>= 0x01FA && c
<= 0x0217 ) ||
378 (c
>= 0x0250 && c
<= 0x02A8 ) ||
379 (c
>= 0x02BB && c
<= 0x02C1 ) ||
381 (c
>= 0x0388 && c
<= 0x038A ) ||
383 (c
>= 0x038E && c
<= 0x03A1 ) ||
384 (c
>= 0x03A3 && c
<= 0x03CE ) ||
385 (c
>= 0x03D0 && c
<= 0x03D6 ) ||
390 (c
>= 0x03E2 && c
<= 0x03F3 ) ||
391 (c
>= 0x0401 && c
<= 0x040C ) ||
392 (c
>= 0x040E && c
<= 0x044F ) ||
393 (c
>= 0x0451 && c
<= 0x045C ) ||
394 (c
>= 0x045E && c
<= 0x0481 ) ||
395 (c
>= 0x0490 && c
<= 0x04C4 ) ||
396 (c
>= 0x04C7 && c
<= 0x04C8 ) ||
397 (c
>= 0x04CB && c
<= 0x04CC ) ||
398 (c
>= 0x04D0 && c
<= 0x04EB ) ||
399 (c
>= 0x04EE && c
<= 0x04F5 ) ||
400 (c
>= 0x04F8 && c
<= 0x04F9 ) ||
401 (c
>= 0x0531 && c
<= 0x0556 ) ||
403 (c
>= 0x0561 && c
<= 0x0586 ) ||
404 (c
>= 0x05D0 && c
<= 0x05EA ) ||
405 (c
>= 0x05F0 && c
<= 0x05F2 ) ||
406 (c
>= 0x0621 && c
<= 0x063A ) ||
407 (c
>= 0x0641 && c
<= 0x064A ) ||
408 (c
>= 0x0671 && c
<= 0x06B7 ) ||
409 (c
>= 0x06BA && c
<= 0x06BE ) ||
410 (c
>= 0x06C0 && c
<= 0x06CE ) ||
411 (c
>= 0x06D0 && c
<= 0x06D3 ) ||
413 (c
>= 0x06E5 && c
<= 0x06E6 ) ||
414 (c
>= 0x0905 && c
<= 0x0939 ) ||
416 (c
>= 0x0958 && c
<= 0x0961 ) ||
417 (c
>= 0x0985 && c
<= 0x098C ) ||
418 (c
>= 0x098F && c
<= 0x0990 ) ||
419 (c
>= 0x0993 && c
<= 0x09A8 ) ||
420 (c
>= 0x09AA && c
<= 0x09B0 ) ||
422 (c
>= 0x09B6 && c
<= 0x09B9 ) ||
423 (c
>= 0x09DC && c
<= 0x09DD ) ||
424 (c
>= 0x09DF && c
<= 0x09E1 ) ||
425 (c
>= 0x09F0 && c
<= 0x09F1 ) ||
426 (c
>= 0x0A05 && c
<= 0x0A0A ) ||
427 (c
>= 0x0A0F && c
<= 0x0A10 ) ||
428 (c
>= 0x0A13 && c
<= 0x0A28 ) ||
429 (c
>= 0x0A2A && c
<= 0x0A30 ) ||
430 (c
>= 0x0A32 && c
<= 0x0A33 ) ||
431 (c
>= 0x0A35 && c
<= 0x0A36 ) ||
432 (c
>= 0x0A38 && c
<= 0x0A39 ) ||
433 (c
>= 0x0A59 && c
<= 0x0A5C ) ||
435 (c
>= 0x0A72 && c
<= 0x0A74 ) ||
436 (c
>= 0x0A85 && c
<= 0x0A8B ) ||
438 (c
>= 0x0A8F && c
<= 0x0A91 ) ||
439 (c
>= 0x0A93 && c
<= 0x0AA8 ) ||
440 (c
>= 0x0AAA && c
<= 0x0AB0 ) ||
441 (c
>= 0x0AB2 && c
<= 0x0AB3 ) ||
442 (c
>= 0x0AB5 && c
<= 0x0AB9 ) ||
445 (c
>= 0x0B05 && c
<= 0x0B0C ) ||
446 (c
>= 0x0B0F && c
<= 0x0B10 ) ||
447 (c
>= 0x0B13 && c
<= 0x0B28 ) ||
448 (c
>= 0x0B2A && c
<= 0x0B30 ) ||
449 (c
>= 0x0B32 && c
<= 0x0B33 ) ||
450 (c
>= 0x0B36 && c
<= 0x0B39 ) ||
452 (c
>= 0x0B5C && c
<= 0x0B5D ) ||
453 (c
>= 0x0B5F && c
<= 0x0B61 ) ||
454 (c
>= 0x0B85 && c
<= 0x0B8A ) ||
455 (c
>= 0x0B8E && c
<= 0x0B90 ) ||
456 (c
>= 0x0B92 && c
<= 0x0B95 ) ||
457 (c
>= 0x0B99 && c
<= 0x0B9A ) ||
459 (c
>= 0x0B9E && c
<= 0x0B9F ) ||
460 (c
>= 0x0BA3 && c
<= 0x0BA4 ) ||
461 (c
>= 0x0BA8 && c
<= 0x0BAA ) ||
462 (c
>= 0x0BAE && c
<= 0x0BB5 ) ||
463 (c
>= 0x0BB7 && c
<= 0x0BB9 ) ||
464 (c
>= 0x0C05 && c
<= 0x0C0C ) ||
465 (c
>= 0x0C0E && c
<= 0x0C10 ) ||
466 (c
>= 0x0C12 && c
<= 0x0C28 ) ||
467 (c
>= 0x0C2A && c
<= 0x0C33 ) ||
468 (c
>= 0x0C35 && c
<= 0x0C39 ) ||
469 (c
>= 0x0C60 && c
<= 0x0C61 ) ||
470 (c
>= 0x0C85 && c
<= 0x0C8C ) ||
471 (c
>= 0x0C8E && c
<= 0x0C90 ) ||
472 (c
>= 0x0C92 && c
<= 0x0CA8 ) ||
473 (c
>= 0x0CAA && c
<= 0x0CB3 ) ||
474 (c
>= 0x0CB5 && c
<= 0x0CB9 ) ||
476 (c
>= 0x0CE0 && c
<= 0x0CE1 ) ||
477 (c
>= 0x0D05 && c
<= 0x0D0C ) ||
478 (c
>= 0x0D0E && c
<= 0x0D10 ) ||
479 (c
>= 0x0D12 && c
<= 0x0D28 ) ||
480 (c
>= 0x0D2A && c
<= 0x0D39 ) ||
481 (c
>= 0x0D60 && c
<= 0x0D61 ) ||
482 (c
>= 0x0E01 && c
<= 0x0E2E ) ||
484 (c
>= 0x0E32 && c
<= 0x0E33 ) ||
485 (c
>= 0x0E40 && c
<= 0x0E45 ) ||
486 (c
>= 0x0E81 && c
<= 0x0E82 ) ||
488 (c
>= 0x0E87 && c
<= 0x0E88 ) ||
491 (c
>= 0x0E94 && c
<= 0x0E97 ) ||
492 (c
>= 0x0E99 && c
<= 0x0E9F ) ||
493 (c
>= 0x0EA1 && c
<= 0x0EA3 ) ||
496 (c
>= 0x0EAA && c
<= 0x0EAB ) ||
497 (c
>= 0x0EAD && c
<= 0x0EAE ) ||
499 (c
>= 0x0EB2 && c
<= 0x0EB3 ) ||
501 (c
>= 0x0EC0 && c
<= 0x0EC4 ) ||
502 (c
>= 0x0F40 && c
<= 0x0F47 ) ||
503 (c
>= 0x0F49 && c
<= 0x0F69 ) ||
504 (c
>= 0x10A0 && c
<= 0x10C5 ) ||
505 (c
>= 0x10D0 && c
<= 0x10F6 ) ||
507 (c
>= 0x1102 && c
<= 0x1103 ) ||
508 (c
>= 0x1105 && c
<= 0x1107 ) ||
510 (c
>= 0x110B && c
<= 0x110C ) ||
511 (c
>= 0x110E && c
<= 0x1112 ) ||
518 (c
>= 0x1154 && c
<= 0x1155 ) ||
520 (c
>= 0x115F && c
<= 0x1161 ) ||
525 (c
>= 0x116D && c
<= 0x116E ) ||
526 (c
>= 0x1172 && c
<= 0x1173 ) ||
531 (c
>= 0x11AE && c
<= 0x11AF ) ||
532 (c
>= 0x11B7 && c
<= 0x11B8 ) ||
534 (c
>= 0x11BC && c
<= 0x11C2 ) ||
538 (c
>= 0x1E00 && c
<= 0x1E9B ) ||
539 (c
>= 0x1EA0 && c
<= 0x1EF9 ) ||
540 (c
>= 0x1F00 && c
<= 0x1F15 ) ||
541 (c
>= 0x1F18 && c
<= 0x1F1D ) ||
542 (c
>= 0x1F20 && c
<= 0x1F45 ) ||
543 (c
>= 0x1F48 && c
<= 0x1F4D ) ||
544 (c
>= 0x1F50 && c
<= 0x1F57 ) ||
548 (c
>= 0x1F5F && c
<= 0x1F7D ) ||
549 (c
>= 0x1F80 && c
<= 0x1FB4 ) ||
550 (c
>= 0x1FB6 && c
<= 0x1FBC ) ||
552 (c
>= 0x1FC2 && c
<= 0x1FC4 ) ||
553 (c
>= 0x1FC6 && c
<= 0x1FCC ) ||
554 (c
>= 0x1FD0 && c
<= 0x1FD3 ) ||
555 (c
>= 0x1FD6 && c
<= 0x1FDB ) ||
556 (c
>= 0x1FE0 && c
<= 0x1FEC ) ||
557 (c
>= 0x1FF2 && c
<= 0x1FF4 ) ||
558 (c
>= 0x1FF6 && c
<= 0x1FFC ) ||
560 (c
>= 0x212A && c
<= 0x212B ) ||
562 (c
>= 0x2180 && c
<= 0x2182 ) ||
563 (c
>= 0x3041 && c
<= 0x3094 ) ||
564 (c
>= 0x30A1 && c
<= 0x30FA ) ||
565 (c
>= 0x3105 && c
<= 0x312C ) ||
566 (c
>= 0xAC00 && c
<= 0xD7A3 )
571 int is_ideographic(long c
)
573 /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Ideographic */
574 return((c
>= 0x4E00 && c
<= 0x9FA5 ) ||
576 (c
>= 0x3021 && c
<= 0x3029 ));
580 int is_combiningchar(long c
)
582 /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-CombiningChar */
583 return((c
>= 0x0300 && c
<= 0x0345 ) ||
584 (c
>= 0x0360 && c
<= 0x0361 ) ||
585 (c
>= 0x0483 && c
<= 0x0486 ) ||
586 (c
>= 0x0591 && c
<= 0x05A1 ) ||
587 (c
>= 0x05A3 && c
<= 0x05B9 ) ||
588 (c
>= 0x05BB && c
<= 0x05BD ) ||
590 (c
>= 0x05C1 && c
<= 0x05C2 ) ||
592 (c
>= 0x064B && c
<= 0x0652 ) ||
594 (c
>= 0x06D6 && c
<= 0x06DC ) ||
595 (c
>= 0x06DD && c
<= 0x06DF ) ||
596 (c
>= 0x06E0 && c
<= 0x06E4 ) ||
597 (c
>= 0x06E7 && c
<= 0x06E8 ) ||
598 (c
>= 0x06EA && c
<= 0x06ED ) ||
599 (c
>= 0x0901 && c
<= 0x0903 ) ||
601 (c
>= 0x093E && c
<= 0x094C ) ||
603 (c
>= 0x0951 && c
<= 0x0954 ) ||
604 (c
>= 0x0962 && c
<= 0x0963 ) ||
605 (c
>= 0x0981 && c
<= 0x0983 ) ||
609 (c
>= 0x09C0 && c
<= 0x09C4 ) ||
610 (c
>= 0x09C7 && c
<= 0x09C8 ) ||
611 (c
>= 0x09CB && c
<= 0x09CD ) ||
613 (c
>= 0x09E2 && c
<= 0x09E3 ) ||
618 (c
>= 0x0A40 && c
<= 0x0A42 ) ||
619 (c
>= 0x0A47 && c
<= 0x0A48 ) ||
620 (c
>= 0x0A4B && c
<= 0x0A4D ) ||
621 (c
>= 0x0A70 && c
<= 0x0A71 ) ||
622 (c
>= 0x0A81 && c
<= 0x0A83 ) ||
624 (c
>= 0x0ABE && c
<= 0x0AC5 ) ||
625 (c
>= 0x0AC7 && c
<= 0x0AC9 ) ||
626 (c
>= 0x0ACB && c
<= 0x0ACD ) ||
627 (c
>= 0x0B01 && c
<= 0x0B03 ) ||
629 (c
>= 0x0B3E && c
<= 0x0B43 ) ||
630 (c
>= 0x0B47 && c
<= 0x0B48 ) ||
631 (c
>= 0x0B4B && c
<= 0x0B4D ) ||
632 (c
>= 0x0B56 && c
<= 0x0B57 ) ||
633 (c
>= 0x0B82 && c
<= 0x0B83 ) ||
634 (c
>= 0x0BBE && c
<= 0x0BC2 ) ||
635 (c
>= 0x0BC6 && c
<= 0x0BC8 ) ||
636 (c
>= 0x0BCA && c
<= 0x0BCD ) ||
638 (c
>= 0x0C01 && c
<= 0x0C03 ) ||
639 (c
>= 0x0C3E && c
<= 0x0C44 ) ||
640 (c
>= 0x0C46 && c
<= 0x0C48 ) ||
641 (c
>= 0x0C4A && c
<= 0x0C4D ) ||
642 (c
>= 0x0C55 && c
<= 0x0C56 ) ||
643 (c
>= 0x0C82 && c
<= 0x0C83 ) ||
644 (c
>= 0x0CBE && c
<= 0x0CC4 ) ||
645 (c
>= 0x0CC6 && c
<= 0x0CC8 ) ||
646 (c
>= 0x0CCA && c
<= 0x0CCD ) ||
647 (c
>= 0x0CD5 && c
<= 0x0CD6 ) ||
648 (c
>= 0x0D02 && c
<= 0x0D03 ) ||
649 (c
>= 0x0D3E && c
<= 0x0D43 ) ||
650 (c
>= 0x0D46 && c
<= 0x0D48 ) ||
651 (c
>= 0x0D4A && c
<= 0x0D4D ) ||
654 (c
>= 0x0E34 && c
<= 0x0E3A ) ||
655 (c
>= 0x0E47 && c
<= 0x0E4E ) ||
657 (c
>= 0x0EB4 && c
<= 0x0EB9 ) ||
658 (c
>= 0x0EBB && c
<= 0x0EBC ) ||
659 (c
>= 0x0EC8 && c
<= 0x0ECD ) ||
660 (c
>= 0x0F18 && c
<= 0x0F19 ) ||
666 (c
>= 0x0F71 && c
<= 0x0F84 ) ||
667 (c
>= 0x0F86 && c
<= 0x0F8B ) ||
668 (c
>= 0x0F90 && c
<= 0x0F95 ) ||
670 (c
>= 0x0F99 && c
<= 0x0FAD ) ||
671 (c
>= 0x0FB1 && c
<= 0x0FB7 ) ||
673 (c
>= 0x20D0 && c
<= 0x20DC ) ||
675 (c
>= 0x302A && c
<= 0x302F ) ||
683 /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit */
684 return((c
>= 0x0030 && c
<= 0x0039 ) ||
685 (c
>= 0x0660 && c
<= 0x0669 ) ||
686 (c
>= 0x06F0 && c
<= 0x06F9 ) ||
687 (c
>= 0x0966 && c
<= 0x096F ) ||
688 (c
>= 0x09E6 && c
<= 0x09EF ) ||
689 (c
>= 0x0A66 && c
<= 0x0A6F ) ||
690 (c
>= 0x0AE6 && c
<= 0x0AEF ) ||
691 (c
>= 0x0B66 && c
<= 0x0B6F ) ||
692 (c
>= 0x0BE7 && c
<= 0x0BEF ) ||
693 (c
>= 0x0C66 && c
<= 0x0C6F ) ||
694 (c
>= 0x0CE6 && c
<= 0x0CEF ) ||
695 (c
>= 0x0D66 && c
<= 0x0D6F ) ||
696 (c
>= 0x0E50 && c
<= 0x0E59 ) ||
697 (c
>= 0x0ED0 && c
<= 0x0ED9 ) ||
698 (c
>= 0x0F20 && c
<= 0x0F29 ));
702 int is_extender(long c
)
704 /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Extender */
705 return((c
== 0x00B7) ||
713 (c
>= 0x3031 && c
<= 0x3035 ) ||
714 (c
>= 0x309D && c
<= 0x309E ) ||
715 (c
>= 0x30FC && c
<= 0x30FE ));
// Non-zero when c is one of the six ASCII whitespace characters
// (space, form feed, carriage return, line feed, horizontal and vertical tab).
721 return((c
== 0x0020) || // Space
722 (c
== 0x000C) || // Form feed: \f
723 (c
== 0x000D) || // Carriage return: \r
724 (c
== 0x000A) || // Line feed: \n
725 (c
== 0x0009) || // Horizontal tab: \t
726 (c
== 0x000B) ); // Vertical tab: \v
731 * is_nfc:
732 * @input: UTF-8 string
733 * @length: length of string
735 * Check a string is in Unicode Normal Form C.
737 * Return value: Non 0 if the string is NFC
/* Report whether the UTF-8 buffer input (length bytes) is in Unicode
 * Normal Form C.  Parts of the body are not present in this listing. */
739 int is_nfc(const char *input
, size_t length
)
/* Walks the length bytes of input; the loop body is not visible here. */
744 for(i
=0; i
<length
; i
++)
/* The delegated check through helper_nfc_check() is compiled in only when
 * helper_NFC_CHECK is defined; NULL is passed as its third argument. */
754 #ifdef helper_NFC_CHECK
755 return helper_nfc_check(input
, length
, NULL
);
764 * @string: UTF-8 string
765 * @length: length of string
767 * Check a string is UTF-8.
769 * Return value: Non 0 if the string is UTF-8
/* Validate that string (length bytes) is well-formed UTF-8 by decoding it one
 * character at a time with utf8_to_char().  The enclosing loop and the
 * return statements are not present in this listing. */
771 int check(const char *string
, size_t length
)
775 unsigned long unichar
=0;
/* Decode the next character.
 * NOTE(review): length is a size_t here but utf8_to_char() takes an int, so
 * buffers longer than INT_MAX would be narrowed -- confirm this is acceptable. */
777 int unichar_len
= utf8_to_char(&unichar
, string
, length
);
/* Decoder error (negative result), or a sequence claiming more bytes than
 * remain in the buffer. */
778 if(unichar_len
< 0 || unichar_len
> (int)length
)
/* Code point beyond the Unicode range U+0000..U+10FFFF. */
781 if(unichar
> 0x10ffff)
/* Step past the bytes consumed by the character just decoded. */
784 string
+= unichar_len
;
785 length
-= unichar_len
;
796 unsigned long int UTF8
;
// Compile-time UTF-8 encoding helpers used to build the entity table.
//
// UTF8SIZE(Ch)   number of UTF-8 bytes needed for code point Ch (1, 2 or 3).
// UTF8BYTEn(Ch)  value of the n-th encoded byte, or 0 when the encoding is
//                shorter than n bytes.
// UTF8ENTITY(Name, Ch)
//                brace initializer { Name, Ch, size, packed } where `packed`
//                holds the encoded bytes with the first byte in the least
//                significant 8 bits.
//
// Only the 1- to 3-byte forms are generated, so Ch must be below 0x10000.
// Every use of Ch is parenthesised so that an expression argument such as
// `a | b` is evaluated as a whole before it is compared, shifted or masked.
#define UTF8SIZE(Ch)  ( ((Ch) >= 0x800) ? 3 : ( ((Ch) >= 0x80) ? 2 : 1 ) )
#define UTF8BYTE1(Ch) ( ((Ch) >= 0x800) ? (0xE0 | (((Ch) >> 12) & 0x0F)) : ( ((Ch) >= 0x80) ? (0xC0 | (((Ch) >> 6) & 0x1F)) : (Ch) ) )
#define UTF8BYTE2(Ch) ( ((Ch) >= 0x800) ? (0x80 | (((Ch) >> 6) & 0x3F)) : ( ((Ch) >= 0x80) ? (0x80 | ((Ch) & 0x3F)) : 0 ) )
#define UTF8BYTE3(Ch) ( ((Ch) >= 0x800) ? (0x80 | ((Ch) & 0x3F)) : 0 )
#define UTF8ENTITY(Name, Ch) { Name, (Ch), UTF8SIZE(Ch), UTF8BYTE1(Ch) + (UTF8BYTE2(Ch) << 8) + (UTF8BYTE3(Ch) << 16) }
805 static const EntityDataType EntityData
[] =
806 { // List of entities defined in the HTML 4.0 spec
807 UTF8ENTITY("nbsp", 160),
808 UTF8ENTITY("iexcl", 161),
809 UTF8ENTITY("cent", 162),
810 UTF8ENTITY("pound", 163),
811 UTF8ENTITY("curren", 164),
812 UTF8ENTITY("yen", 165),
813 UTF8ENTITY("brvbar", 166),
814 UTF8ENTITY("sect", 167),
815 UTF8ENTITY("uml", 168),
816 UTF8ENTITY("copy", 169),
817 UTF8ENTITY("ordf", 170),
818 UTF8ENTITY("laquo", 171),
819 UTF8ENTITY("not", 172),
820 UTF8ENTITY("shy", 173),
821 UTF8ENTITY("reg", 174),
822 UTF8ENTITY("macr", 175),
823 UTF8ENTITY("deg", 176),
824 UTF8ENTITY("plusmn", 177),
825 UTF8ENTITY("sup2", 178),
826 UTF8ENTITY("sup3", 179),
827 UTF8ENTITY("acute", 180),
828 UTF8ENTITY("micro", 181),
829 UTF8ENTITY("para", 182),
830 UTF8ENTITY("middot", 183),
831 UTF8ENTITY("cedil", 184),
832 UTF8ENTITY("sup1", 185),
833 UTF8ENTITY("ordm", 186),
834 UTF8ENTITY("raquo", 187),
835 UTF8ENTITY("frac14", 188),
836 UTF8ENTITY("frac12", 189),
837 UTF8ENTITY("frac34", 190),
838 UTF8ENTITY("iquest", 191),
839 UTF8ENTITY("Agrave", 192),
840 UTF8ENTITY("Aacute", 193),
841 UTF8ENTITY("Acirc", 194),
842 UTF8ENTITY("Atilde", 195),
843 UTF8ENTITY("Auml", 196),
844 UTF8ENTITY("Aring", 197),
845 UTF8ENTITY("AElig", 198),
846 UTF8ENTITY("Ccedil", 199),
847 UTF8ENTITY("Egrave", 200),
848 UTF8ENTITY("Eacute", 201),
849 UTF8ENTITY("Ecirc", 202),
850 UTF8ENTITY("Euml", 203),
851 UTF8ENTITY("Igrave", 204),
852 UTF8ENTITY("Iacute", 205),
853 UTF8ENTITY("Icirc", 206),
854 UTF8ENTITY("Iuml", 207),
855 UTF8ENTITY("ETH", 208),
856 UTF8ENTITY("Ntilde", 209),
857 UTF8ENTITY("Ograve", 210),
858 UTF8ENTITY("Oacute", 211),
859 UTF8ENTITY("Ocirc", 212),
860 UTF8ENTITY("Otilde", 213),
861 UTF8ENTITY("Ouml", 214),
862 UTF8ENTITY("times", 215),
863 UTF8ENTITY("Oslash", 216),
864 UTF8ENTITY("Ugrave", 217),
865 UTF8ENTITY("Uacute", 218),
866 UTF8ENTITY("Ucirc", 219),
867 UTF8ENTITY("Uuml", 220),
868 UTF8ENTITY("Yacute", 221),
869 UTF8ENTITY("THORN", 222),
870 UTF8ENTITY("szlig", 223),
871 UTF8ENTITY("agrave", 224),
872 UTF8ENTITY("aacute", 225),
873 UTF8ENTITY("acirc", 226),
874 UTF8ENTITY("atilde", 227),
875 UTF8ENTITY("auml", 228),
876 UTF8ENTITY("aring", 229),
877 UTF8ENTITY("aelig", 230),
878 UTF8ENTITY("ccedil", 231),
879 UTF8ENTITY("egrave", 232),
880 UTF8ENTITY("eacute", 233),
881 UTF8ENTITY("ecirc", 234),
882 UTF8ENTITY("euml", 235),
883 UTF8ENTITY("igrave", 236),
884 UTF8ENTITY("iacute", 237),
885 UTF8ENTITY("icirc", 238),
886 UTF8ENTITY("iuml", 239),
887 UTF8ENTITY("eth", 240),
888 UTF8ENTITY("ntilde", 241),
889 UTF8ENTITY("ograve", 242),
890 UTF8ENTITY("oacute", 243),
891 UTF8ENTITY("ocirc", 244),
892 UTF8ENTITY("otilde", 245),
893 UTF8ENTITY("ouml", 246),
894 UTF8ENTITY("divide", 247),
895 UTF8ENTITY("oslash", 248),
896 UTF8ENTITY("ugrave", 249),
897 UTF8ENTITY("uacute", 250),
898 UTF8ENTITY("ucirc", 251),
899 UTF8ENTITY("uuml", 252),
900 UTF8ENTITY("yacute", 253),
901 UTF8ENTITY("thorn", 254),
902 UTF8ENTITY("yuml", 255),
903 UTF8ENTITY("fnof", 402),
905 UTF8ENTITY("Alpha", 913),
906 UTF8ENTITY("Beta", 914),
907 UTF8ENTITY("Gamma", 915),
908 UTF8ENTITY("Delta", 916),
909 UTF8ENTITY("Epsilon", 917),
910 UTF8ENTITY("Zeta", 918),
911 UTF8ENTITY("Eta", 919),
912 UTF8ENTITY("Theta", 920),
913 UTF8ENTITY("Iota", 921),
914 UTF8ENTITY("Kappa", 922),
915 UTF8ENTITY("Lambda", 923),
916 UTF8ENTITY("Mu", 924),
917 UTF8ENTITY("Nu", 925),
918 UTF8ENTITY("Xi", 926),
919 UTF8ENTITY("Omicron", 927),
920 UTF8ENTITY("Pi", 928),
921 UTF8ENTITY("Rho", 929),
922 UTF8ENTITY("Sigma", 931),
923 UTF8ENTITY("Tau", 932),
924 UTF8ENTITY("Upsilon", 933),
925 UTF8ENTITY("Phi", 934),
926 UTF8ENTITY("Chi", 935),
927 UTF8ENTITY("Psi", 936),
928 UTF8ENTITY("Omega", 937),
929 UTF8ENTITY("alpha", 945),
930 UTF8ENTITY("beta", 946),
931 UTF8ENTITY("gamma", 947),
932 UTF8ENTITY("delta", 948),
933 UTF8ENTITY("epsilon", 949),
934 UTF8ENTITY("zeta", 950),
935 UTF8ENTITY("eta", 951),
936 UTF8ENTITY("theta", 952),
937 UTF8ENTITY("iota", 953),
938 UTF8ENTITY("kappa", 954),
939 UTF8ENTITY("lambda", 955),
940 UTF8ENTITY("mu", 956),
941 UTF8ENTITY("nu", 957),
942 UTF8ENTITY("xi", 958),
943 UTF8ENTITY("omicron", 959),
944 UTF8ENTITY("pi", 960),
945 UTF8ENTITY("rho", 961),
946 UTF8ENTITY("sigmaf", 962),
947 UTF8ENTITY("sigma", 963),
948 UTF8ENTITY("tau", 964),
949 UTF8ENTITY("upsilon", 965),
950 UTF8ENTITY("phi", 966),
951 UTF8ENTITY("chi", 967),
952 UTF8ENTITY("psi", 968),
953 UTF8ENTITY("omega", 969),
954 UTF8ENTITY("thetasym", 977),
955 UTF8ENTITY("upsih", 978),
956 UTF8ENTITY("piv", 982),
957 // General Punctuation
958 UTF8ENTITY("bull", 8226),
959 UTF8ENTITY("hellip", 8230),
960 UTF8ENTITY("prime", 8242),
961 UTF8ENTITY("Prime", 8243),
962 UTF8ENTITY("oline", 8254),
963 UTF8ENTITY("frasl", 8260),
964 // Letterlike Symbols
965 UTF8ENTITY("weierp", 8472),
966 UTF8ENTITY("image", 8465),
967 UTF8ENTITY("real", 8476),
968 UTF8ENTITY("trade", 8482),
969 UTF8ENTITY("alefsym", 8501),
971 UTF8ENTITY("larr", 8592),
972 UTF8ENTITY("uarr", 8593),
973 UTF8ENTITY("rarr", 8594),
974 UTF8ENTITY("darr", 8595),
975 UTF8ENTITY("harr", 8596),
976 UTF8ENTITY("crarr", 8629),
977 UTF8ENTITY("lArr", 8656),
978 UTF8ENTITY("uArr", 8657),
979 UTF8ENTITY("rArr", 8658),
980 UTF8ENTITY("dArr", 8659),
981 UTF8ENTITY("hArr", 8660),
982 // Mathematical Operators
983 UTF8ENTITY("forall", 8704),
984 UTF8ENTITY("part", 8706),
985 UTF8ENTITY("exist", 8707),
986 UTF8ENTITY("empty", 8709),
987 UTF8ENTITY("nabla", 8711),
988 UTF8ENTITY("isin", 8712),
989 UTF8ENTITY("notin", 8713),
990 UTF8ENTITY("ni", 8715),
991 UTF8ENTITY("prod", 8719),
992 UTF8ENTITY("sum", 8721),
993 UTF8ENTITY("minus", 8722),
994 UTF8ENTITY("lowast", 8727),
995 UTF8ENTITY("radic", 8730),
996 UTF8ENTITY("prop", 8733),
997 UTF8ENTITY("infin", 8734),
998 UTF8ENTITY("and", 8743),
999 UTF8ENTITY("or", 8744),
1000 UTF8ENTITY("cap", 8745),
1001 UTF8ENTITY("cup", 8746),
1002 UTF8ENTITY("int", 8747),
1003 UTF8ENTITY("there4", 8756),
1004 UTF8ENTITY("sim", 8764),
1005 UTF8ENTITY("cong", 8773),
1006 UTF8ENTITY("asymp", 8776),
1007 UTF8ENTITY("ne", 8800),
1008 UTF8ENTITY("equiv", 8801),
1009 UTF8ENTITY("le", 8804),
1010 UTF8ENTITY("ge", 8805),
1011 UTF8ENTITY("sub", 8834),
1012 UTF8ENTITY("sup", 8835),
1013 UTF8ENTITY("nsub", 8836),
1014 UTF8ENTITY("sube", 8838),
1015 UTF8ENTITY("supe", 8839),
1016 UTF8ENTITY("oplus", 8853),
1017 UTF8ENTITY("otimes", 8855),
1018 UTF8ENTITY("perp", 8869),
1019 UTF8ENTITY("sdot", 8901),
1020 // Miscellaneous Technical
1021 UTF8ENTITY("lceil", 8968),
1022 UTF8ENTITY("rceil", 8969),
1023 UTF8ENTITY("lfloor", 8970),
1024 UTF8ENTITY("rfloor", 8971),
1025 UTF8ENTITY("lang", 9001),
1026 UTF8ENTITY("rang", 9002),
1028 UTF8ENTITY("loz", 9674),
1029 // Miscellaneous Symbols
1030 UTF8ENTITY("spades", 9824),
1031 UTF8ENTITY("clubs", 9827),
1032 UTF8ENTITY("hearts", 9829),
1033 UTF8ENTITY("diams", 9830),
1034 UTF8ENTITY("quot", 34),
1035 UTF8ENTITY("amp", 38),
1036 UTF8ENTITY("lt", 60),
1037 UTF8ENTITY("gt", 62),
1039 UTF8ENTITY("OElig", 338),
1040 UTF8ENTITY("oelig", 339),
1041 UTF8ENTITY("Scaron", 352),
1042 UTF8ENTITY("scaron", 353),
1043 UTF8ENTITY("Yuml", 376),
1044 // Spacing Modifier Letters
1045 UTF8ENTITY("circ", 710),
1046 UTF8ENTITY("tilde", 732),
1047 // General Punctuation
1048 UTF8ENTITY("ensp", 8194),
1049 UTF8ENTITY("emsp", 8195),
1050 UTF8ENTITY("thinsp", 8201),
1051 UTF8ENTITY("zwnj", 8204),
1052 UTF8ENTITY("zwj", 8205),
1053 UTF8ENTITY("lrm", 8206),
1054 UTF8ENTITY("rlm", 8207),
1055 UTF8ENTITY("ndash", 8211),
1056 UTF8ENTITY("mdash", 8212),
1057 UTF8ENTITY("lsquo", 8216),
1058 UTF8ENTITY("rsquo", 8217),
1059 UTF8ENTITY("sbquo", 8218),
1060 UTF8ENTITY("ldquo", 8220),
1061 UTF8ENTITY("rdquo", 8221),
1062 UTF8ENTITY("bdquo", 8222),
1063 UTF8ENTITY("dagger", 8224),
1064 UTF8ENTITY("Dagger", 8225),
1065 UTF8ENTITY("permil", 8240),
1066 UTF8ENTITY("lsaquo", 8249),
1067 UTF8ENTITY("rsaquo", 8250),
1068 UTF8ENTITY("euro", 8364),
1069 UTF8ENTITY(NULL
, 0) // End of the list
1070 }; // End of HTMLEntityData
// Lookup table from a Unicode code point to the name of its HTML entity.
// The class derives from std::map without an access specifier, which for a
// `class` means private inheritance; end() and find() below forward to the
// base-class members of the same name.
// NOTE(review): despite the class name, the mapped value inserted below is
// the entity name (ent->Name), not a UTF-8 byte sequence.
// Braces, access specifiers and the loop around insert() are not present in
// this listing.
1072 class Char2UTF8Map
: std::map
<unsigned long int, std::string
>
1075 typedef std::map
<unsigned long int, std::string
>::iterator iterator
;
1076 typedef std::pair
<unsigned long int, std::string
> pair
;
1077 Char2UTF8Map(const EntityDataType
*entities
)
// NOTE(review): iteration starts from the global EntityData table rather
// than from the `entities` argument -- confirm whether the parameter is
// meant to be used here.
1079 const EntityDataType
*ent
= EntityData
;
// Key is the code point, value is the entity name.
1082 insert ( pair(ent
->Char
,ent
->Name
) );
// Forwards to std::map::end().
1086 inline iterator
end()
1088 return std::map
<unsigned long int, std::string
>::end();
// Forwards to std::map::find() for the code point ch.
1090 inline iterator
find(unsigned long int ch
)
1092 return std::map
<unsigned long int, std::string
>::find(ch
);
1096 static Char2UTF8Map
char2utf8(EntityData
);
// Write the UTF-8 text `in` to `out`, replacing characters with HTML
// character references.  The loop structure and the remaining branches are
// not present in this listing; only the visible statements are described.
1098 void escape_text(std::ostream
&out
, const char *in
)
1100 const char *ptr
= in
;
// NOTE(review): strlen() returns size_t; the result is narrowed to int here.
1101 int len
= strlen(in
);
1104 unsigned long int ch
;
// Decode the next character; adv is the number of bytes it occupies.
1105 int adv
= utf8_to_char(&ch
, ptr
, len
);
// NOTE(review): utf8_to_char() is documented to return a negative value on
// failure, and a negative adv also satisfies this condition -- confirm that
// decoding errors are handled on the lines not shown.
1106 if (adv
&& len
>= adv
)
// Look the code point up in the entity-name table.
1108 Char2UTF8Map::iterator it
= char2utf8
.find(ch
);
1109 if (it
!= char2utf8
.end())
// Known entity: emit the named reference "&name;".
1111 out
<< "&" << it
->second
<< ";";
// Emit a numeric character reference "&#N;" using the stream's current
// integer formatting.  The condition selecting this branch is not visible.
1119 out
<< "&#" << ch
<< ";";
1129 } // Close namespace